Модель основывалась на данных прогноза погоды

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import pickle

from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Weather-station id lists shared by several target stations. Every entry of
# `name_point` receives its own copy, so the entries stay independent lists.
_METEO_KIRENSK = [30028, 30219, 30328, 30337, 30433, 30230]
_METEO_VITIM_TO_LENSK = [
    24713, 24726, 24817, 30356, 30471, 30372, 30069, 30253, 30252, 24923,
]
_METEO_OLEKMINSK = [
    24538, 24738, 24641, 24933, 30089, 30385, 30493, 30393, 31102, 31004,
    24951, 24944,
]
_METEO_POKROVSK_TO_SANGAR = [
    31137, 31026, 24967, 24966, 24641, 24643, 24661, 24671, 24763,
]

# Maps a target `station_id` to [display name, weather-station ids]. The id list
# is what `merge` uses to pick rows of the aggregated meteo table for that station.
name_point = {
    3019: ['Киренск', list(_METEO_KIRENSK)],
    3027: ['Витим', list(_METEO_VITIM_TO_LENSK)],
    3028: ['Пеледуй', list(_METEO_VITIM_TO_LENSK)],
    3029: ['Крестовский Лесоучасток', list(_METEO_VITIM_TO_LENSK)],
    3030: ['Ленск', list(_METEO_VITIM_TO_LENSK)],
    3035: ['Олекминск', list(_METEO_OLEKMINSK)],
    3041: ['Покровск', list(_METEO_POKROVSK_TO_SANGAR)],
    3045: ['Якутск', list(_METEO_POKROVSK_TO_SANGAR)],
    3230: ['Батамай', list(_METEO_POKROVSK_TO_SANGAR)],
    3050: ['Сангар', list(_METEO_POKROVSK_TO_SANGAR)],
}

In [3]:
def _read_dated_csv(path):
    """Read the CSV at *path* and index it by its `date` column, parsed day-first."""
    frame = pd.read_csv(path)
    frame['date'] = pd.to_datetime(frame['date'], dayfirst=True)
    return frame.set_index('date')


# Tables with a `date` column are loaded as date-indexed frames.
test = _read_dated_csv('Данные/2_track_cp4/test.csv')
extra_train = _read_dated_csv('Данные/2_track_cp4/extra_train.csv')
train = _read_dated_csv('Данные/track_2_package/train.csv')

# The 3-hourly meteo tables are kept exactly as read from disk.
forecast_meteo_3hours = pd.read_csv('Данные/2_track_cp4/forecast_meteo_3hours.csv')
extra_meteo_3hours = pd.read_csv('Данные/2_track_cp4/extra_meteo_3hours.csv')

In [4]:
# Weather measurements that get aggregated per station and date.
col_to_agg = ['air_temperature', 'precipitation', 'wind_speed_aver', 'wind_direction']
# The same measurements plus the station id in front and the local date at the end.
need_col = ['station_id', *col_to_agg, 'date_local']
meteo = extra_meteo_3hours[need_col]

In [5]:
def merge(train, meteo_aggregate, name_point):
    """Attach averaged weather features to the rows of every target station.

    For each key of *name_point*:
      * the rows of *train* whose `station_id` equals the key are taken;
      * the rows of *meteo_aggregate* labelled by the ids in ``name_point[key][1]``
        are selected with ``.loc``, grouped by their `date` column and averaged;
      * the averaged frame is left-joined onto the target rows, index against index.

    The per-station results are concatenated in the iteration order of *name_point*.

    Args:
        train: frame with a `station_id` column; its index is the join key.
        meteo_aggregate: frame with a `date` column whose index can be looked up
            with the id lists stored in *name_point*.
        name_point: mapping ``station_id -> [name, list of ids]``.

    Returns:
        The concatenated frame of all joined per-station pieces.
    """
    per_station = []
    for point, info in name_point.items():
        targets = train.loc[train['station_id'] == point]
        # One row per date: mean over all selected weather stations.
        weather = meteo_aggregate.loc[info[1]].groupby('date').agg('mean')
        per_station.append(
            targets.merge(weather, how='left', left_index=True, right_index=True)
        )
    return pd.concat(per_station)

def aggr_mean(data: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame with the NaN-ignoring mean of every column of *data*.

    The result columns carry the names of *data* with `_mean` appended.
    """
    column_means = np.nanmean(data, axis=0)
    row = pd.Series(column_means).to_frame().T.copy()
    row.columns = data.columns
    return row.add_suffix('_mean')

def aggr_min(data: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame with the column-wise minimum of *data*.

    The minimum is taken with ``np.min(data, axis=0)``, so NaN handling is whatever
    that call does for the given input type. The result columns carry the names of
    *data* with `_min` appended.
    """
    column_minima = np.min(data, axis=0)
    row = pd.Series(column_minima).to_frame().T.copy()
    row.columns = data.columns
    return row.add_suffix('_min')

def aggr_max(data: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame with the column-wise maximum of *data*.

    The maximum is taken with ``np.max(data, axis=0)``, so NaN handling is whatever
    that call does for the given input type. The result columns carry the names of
    *data* with `_max` appended.
    """
    column_maxima = np.max(data, axis=0)
    row = pd.Series(column_maxima).to_frame().T.copy()
    row.columns = data.columns
    return row.add_suffix('_max')
    
def aggr_std(data: pd.DataFrame) -> pd.DataFrame:
    """Return a one-row frame with the NaN-ignoring standard deviation per column.

    Computed with ``np.nanstd(data, axis=0)`` using its default arguments. The
    result columns carry the names of *data* with `_std` appended.
    """
    column_spread = np.nanstd(data, axis=0)
    row = pd.Series(column_spread).to_frame().T.copy()
    row.columns = data.columns
    return row.add_suffix('_std')

In [6]:
# Aggregators applied to every group of meteo rows. Their one-row outputs are
# concatenated side by side in this order, so the order here fixes the column order.
aggr = [aggr_mean, aggr_min, aggr_max, aggr_std]

In [7]:
%%time
# Collapse the 3-hourly rows into one feature row per (station_id, date_local):
# every aggregator in `aggr` is applied to the `col_to_agg` columns of the group
# and the one-row results are placed side by side.
meteo_aggregate_test = extra_meteo_3hours.groupby(['station_id', 'date_local']).apply(
    lambda x: pd.concat([func(x[col_to_agg]) for func in aggr], axis=1)
)
# Expose the second index level (`date_local`) as a `date` column and parse it
# day-first, the same way the `date` columns of the target tables are parsed.
# `merge` joins on this value against a datetime index, so both sides must be
# datetimes instead of relying on an implicit conversion of the values read from
# the CSV.
meteo_aggregate_test['date'] = pd.to_datetime(
    meteo_aggregate_test.index.get_level_values(1), dayfirst=True
)

Wall time: 8.99 s


In [8]:
# Pre-processed file with the historical weather
with open('meteo3_hour_agg.pickle', 'rb') as f:
    meteo_aggregate_train = pickle.load(f)

# Second index level becomes the `date` column, parsed day-first.
meteo_aggregate_train['date'] = pd.to_datetime(
    meteo_aggregate_train.index.get_level_values(1), dayfirst=True
)

In [9]:
# Join the historical weather aggregates onto the training targets, then order
# the rows by index.
data_train = merge(train, meteo_aggregate_train, name_point).sort_index()

In [10]:
# Join the extra-period weather aggregates onto the extra targets, then order
# the rows by index.
data_test = merge(extra_train, meteo_aggregate_test, name_point).sort_index()

# Обучение модели

In [11]:
# Keep only the rows whose `month` is 4, 5 or 6.
data_train = data_train.loc[data_train['month'].isin([4, 5, 6])]

In [12]:
# Gradient-boosted tree regressor; all other hyper-parameters stay at library defaults.
model = XGBRegressor(
    max_depth=4,
    n_estimators=100,
    n_jobs=10,
    subsample=0.91,
)

In [13]:
# Feature columns fed to the model, in the exact order used for fit and predict:
# three `_mean` columns, then min/max/std for all four measurements, and
# `wind_direction_mean` last.
columns_model = [
    *[
        f'{measure}_mean'
        for measure in ('air_temperature', 'precipitation', 'wind_speed_aver')
    ],
    *[
        f'{measure}_{stat}'
        for stat in ('min', 'max', 'std')
        for measure in (
            'air_temperature', 'precipitation', 'wind_speed_aver', 'wind_direction',
        )
    ],
    'wind_direction_mean',
]

In [14]:
%%time
# Fit on the aggregated weather features against the `delta_stage_max` target.
features_train = data_train[columns_model]
target_train = data_train['delta_stage_max']
model.fit(features_train, target_train)

Wall time: 1.42 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=10, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.91,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# Model prediction for every row of the evaluation frame.
data_test['predict'] = model.predict(data_test[columns_model])

# Baseline: for each (day, station) pair, the mean `delta_stage_max` of the
# training rows with the same day and station. Pairs without training rows
# get the mean of an empty selection.
data_test['stats'] = np.nan
eval_days = data_test['day'].unique()
eval_stations = data_test['station_id'].unique()
for day in eval_days:
    for station in eval_stations:
        target_rows = (data_test['day'] == day) & (data_test['station_id'] == station)
        history_rows = (data_train['day'] == day) & (data_train['station_id'] == station)
        data_test.loc[target_rows, 'stats'] = (
            data_train.loc[history_rows]['delta_stage_max'].mean()
        )

In [16]:
# Per station: mean squared error of the model prediction and of the baseline,
# each rounded to two decimals.
for station in data_test['station_id'].unique():
    station_rows = data_test.loc[data_test['station_id'] == station]
    actual = station_rows['delta_stage_max']
    mse_model = mean_squared_error(actual, station_rows['predict'])
    print(f'{station}_predict: ', round(mse_model, 2))
    mse_stats = mean_squared_error(actual, station_rows['stats'])
    print(f'{station}_stat: ', round(mse_stats, 2), end='\n\n')

3041_predict:  1266.87
3041_stat:  1075.03

3230_predict:  891.55
3230_stat:  751.33

3045_predict:  1015.84
3045_stat:  763.09

3027_predict:  2696.11
3027_stat:  2880.4

3029_predict:  2513.65
3029_stat:  2612.22

3050_predict:  2338.48
3050_stat:  1867.19

3019_predict:  470.03
3019_stat:  478.79

3030_predict:  1803.81
3030_stat:  2138.14

3028_predict:  2489.13
3028_stat:  2499.7

3035_predict:  2791.01
3035_stat:  2600.48



In [27]:
# Persist the fitted model together with the feature list it expects.
model_bundle = [model, columns_model]
with open('model.pickle', 'wb') as f:
    pickle.dump(model_bundle, f)

# На сабмит

In [17]:
%%time
# Collapse the 3-hourly forecast rows into one feature row per
# (station_id, date_local): every aggregator in `aggr` is applied to the
# `col_to_agg` columns of the group and the one-row results are placed side by side.
meteo_aggregate_ = forecast_meteo_3hours.groupby(['station_id', 'date_local']).apply(
    lambda x: pd.concat([func(x[col_to_agg]) for func in aggr], axis=1)
)
# Expose the second index level (`date_local`) as a `date` column and parse it
# day-first, the same way the `date` columns of the target tables are parsed.
# `merge` joins on this value against a datetime index, so both sides must be
# datetimes instead of relying on an implicit conversion of the values read from
# the CSV.
meteo_aggregate_['date'] = pd.to_datetime(
    meteo_aggregate_.index.get_level_values(1), dayfirst=True
)

  
  keepdims=keepdims)


Wall time: 23.8 s


In [18]:
# Join the forecast weather aggregates onto the submission rows.
test_ = merge(train=test, meteo_aggregate=meteo_aggregate_, name_point=name_point)

In [19]:
# Stack training, evaluation and submission rows into one frame ordered by index.
data_all = pd.concat([data_train, data_test, test_]).sort_index()

In [20]:
# Columns carried through the interpolation step: the model features in their
# usual order, followed by the target and the `day` column.
columns = [
    *[
        f'{measure}_mean'
        for measure in ('air_temperature', 'precipitation', 'wind_speed_aver')
    ],
    *[
        f'{measure}_{stat}'
        for stat in ('min', 'max', 'std')
        for measure in (
            'air_temperature', 'precipitation', 'wind_speed_aver', 'wind_direction',
        )
    ],
    'wind_direction_mean',
    'delta_stage_max',
    'day',
]

In [21]:
# Since the meteo data could not be obtained for the test rows, the features are
# interpolated over time within each (station_id, year) group instead.
#
# The groups are iterated explicitly rather than through `groupby.apply`.
# Whether `apply` prepends the group keys to the result index when the function
# returns a frame indexed like its input depends on the pandas version
# (NOTE(review): see the `group_keys` notes in the pandas docs), and reading the
# keys back from a three-level index only works when it does.
pieces = []
for (station_id, year), group in data_all.groupby(['station_id', 'year']):
    # `method='time'` interpolates according to the spacing of the index values.
    piece = group.sort_index()[columns].interpolate(method='time')
    piece['station_id'] = station_id
    piece['year'] = year
    pieces.append(piece)
# Result: original row index, `columns` plus the two group-key columns.
data_inter = pd.concat(pieces)

In [22]:
# Baseline for the submission rows: for each (day, station) pair, the mean
# `delta_stage_max` of the training rows with the same day and station.
submit_days = test['day'].unique()
submit_stations = test['station_id'].unique()
for day in submit_days:
    for station in submit_stations:
        target_rows = (test['day'] == day) & (test['station_id'] == station)
        history_rows = (data_train['day'] == day) & (data_train['station_id'] == station)
        test.loc[target_rows, 'stats'] = (
            data_train.loc[history_rows]['delta_stage_max'].mean()
        )

In [23]:
# Predict on the interpolated features; keep the index as an explicit `date`
# column on both frames so rows can be matched by date and station.
data_inter['date'] = data_inter.index
data_inter['predict'] = model.predict(data_inter[columns_model])
test['date'] = test.index

# Copy the prediction of each (date, station) pair into the submission frame.
# The right-hand side is a Series, so the assignment aligns on the index.
stations_to_fill = test['station_id'].unique()
for day in test.index.unique():
    for station in stations_to_fill:
        wanted = (test['date'] == day) & (test['station_id'] == station)
        source = (data_inter['date'] == day) & (data_inter['station_id'] == station)
        test.loc[wanted, 'delta_stage_max'] = data_inter.loc[source]['predict']

In [25]:
# Final value: equal-weight average of the current `delta_stage_max` and `stats`.
test['delta_stage_max'] = test['delta_stage_max'].add(test['stats']).div(2)

In [26]:
# Write the submission file; the frame index is written as well (to_csv default).
submission_columns = ['year', 'station_id', 'month', 'day', 'date', 'delta_stage_max']
test[submission_columns].to_csv('Sub_9.csv', sep=',')