# Библиотеки

In [30]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

# Константы

In [9]:
RAWDATADIR = '../data/raw'
DATADIR = '../data'

## Словарь по целевым постам

In [4]:
kirensk_dict = {
    'name': 'Киренск',
    'id': 3019,
    'meteo_st': [30028, 30219, 30328, 30337, 30433, 30230],
    'gydro_st': [3087, 3021],
    'id_influencer': []
}

vitim_dict = {
    'name': 'Витим',
    'id': 3027,
    'meteo_st': [24713, 24726, 24817, 30356, 30471, 30372, 30069, 30253, 30252, 24923],
    'gydro_st': [3106, 3555, 3024, 3554, 3028, 3029, 3030, 3031, 3032],
    'id_influencer': [3019]
}

peledui_dict = {
    'name': 'Пеледуй',
    'id': 3028,
    'meteo_st': [24713, 24726, 24817, 30356, 30471, 30372, 30069, 30253, 30252, 24923],
    'gydro_st': [3106, 3555, 3024, 3554, 3027, 3029, 3030, 3031, 3032],
    'id_influencer': [3019, 3027]
}

krestovskiy_dict = {
    'name': 'Крестовский Лесоучасток',
    'id': 3029,
    'meteo_st': [24713, 24726, 24817, 30356, 30471, 30372, 30069, 30253, 30252, 24923],
    'gydro_st': [3106, 3555, 3024, 3554, 3027, 3028, 3030, 3031, 3032],
    'id_influencer': [3019, 3027, 3028]
}

lensk_dict = {
    'name': 'Ленск',
    'id': 3030,
    'meteo_st': [24713, 24726, 24817, 30356, 30471, 30372, 30069, 30253, 30252, 24923],
    'gydro_st': [3106, 3555, 3024, 3554, 3027, 3028, 3029, 3031, 3032],
    'id_influencer': [3019, 3027, 3028, 3029]
}

olekminsk_dict = {
    'name': 'Олекминск',
    'id': 3035,
    'meteo_st': [24538,24738,24641,24933,30089,30385,30493,30393,31102,31004,24951,24944],
    'gydro_st': [3180,3169,3036,3037,3038],
    'id_influencer': [3019, 3027, 3028, 3029, 3030]
}

pokrovsk_dict = {
    'name': 'Покровск',
    'id': 3041,
    'meteo_st': [31137,31026,24967,24966,24641,24643,24661,24671,24763],
    'gydro_st': [3042,3045,3047,3048],
    'id_influencer': [3019, 3027, 3028, 3029, 3030, 3035]
}

yakutsk_dict = {
    'name': 'Якутск',
    'id': 3045,
    'meteo_st': [31137,31026,24967,24966,24641,24643,24661,24671,24763],
    'gydro_st': [3042,3041,3047,3048],
    'id_influencer': [3019, 3027, 3028, 3029, 3030, 3035, 3041]
}

batamay_dict = {
    'name': 'Батамай',
    'id': 3230,
    'meteo_st': [31137,31026,24967,24966,24641,24643,24661,24671,24763],
    'gydro_st': [3229,3050],
    'id_influencer': [3019, 3027, 3028, 3029, 3030, 3035, 3041, 3045]
}

sangar_dict = {
    'name': 'Сангар',
    'id': 3050,
    'meteo_st': [31137,31026,24967,24966,24641,24643,24661,24671,24763],
    'gydro_st': [3229,3230],
    'id_influencer': [3019, 3027, 3028, 3029, 3030, 3035, 3041, 3045, ]
}

# Методы

## make_gydro_df

In [34]:
def make_gydro_df(train_df, station_dict, verbose=False):
    """
    Формирует датасет с гидрологическими данными
    """

    tr_data = df_preprocessing(train_df, station_dict['id'])

    # соберём данные со смежных постов, выделенных в словаре по данному целевому посту
    mapper = dict()
    use_feature = ['date', 'stage_avg', 'stage_min','stage_max', 'temp', 'water_code', 'ice_thickness','snow_height', 
                   'place', 'discharge']

    for n, st in enumerate(station_dict['gydro_st']):
        if n == 0:
            # данные с поста
            df = df_preprocessing(train_df, st)
            df = df[use_feature]
            for col in [x for x in use_feature if x not in ['date']]:
                mapper[col] = f'{col}_{st}'
            gydro_st = df.rename(columns=mapper)
        else:
            # данные с поста
            df = df_preprocessing(train_df, st)
            df = df[use_feature]
            for col in [x for x in use_feature if x not in ['date']]:
                mapper[col] = f'{col}_{st}'
            df.rename(columns=mapper, inplace=True)
            gydro_st = pd.merge(gydro_st, df, on='date')
            
    # сводные данные по целевому посту и его окресных гидропостов
    return pd.merge(tr_data, gydro_st, on='date')

## make_meteo_df

In [32]:
def make_meteo_df(meteo_df, stations_id, verbose=False):
    """
    Формирует датасет с метеорологическим данными
    """
    # Замена np.nan'ами неподтверждённых или опровергнутых данных
    _qual_cols = list(meteo_df.filter(regex='_qual$').columns)
    if verbose: print(meteo_df.shape)
    for col in _qual_cols:
        meteo_df = mute_untrastable(meteo_df, col)
    if verbose: print(meteo_df.shape)

    features = pd.Series(_qual_cols)
    spl_feat = []
    for n, f in enumerate(features):
        spl = features.values[n].split('_')
        spl_feat.append(('_'.join(spl[:len(spl)-1])))

    use_feature = ['station_id', 'date_local', 'month_local'] + spl_feat

    meteo_data = meteo_df[use_feature].copy()
    if verbose: print(meteo_data.shape)
    del meteo_df
    gc.collect()
    
    # аггрегация данных с дискретностью 1 день
    meteo_1day = meteo_data.groupby(['station_id', 'date_local']).median()
    meteo_1day.reset_index(inplace=True)
    if verbose: print(meteo_1day.shape)
    del meteo_data
    gc.collect()
    
    mapper = dict()
    use_feature = ['date_local', 'month_local'] + spl_feat

    # берём данные по нужным станциям
    for n, st in enumerate(stations_id):
        if verbose: print(n, st)
        if n == 0:
            df = meteo_data_processing(meteo_1day, st)
            df = df[use_feature]
            if verbose: print(df.shape)
            for col in spl_feat:
                mapper[col] = f'{col}_{st}'
            meteo_st = df.rename(columns=mapper)
        else:
            df = meteo_data_processing(meteo_1day, st)
            df = df[use_feature]
            if verbose: print(df.shape)
            for col in spl_feat:
                mapper[col] = f'{col}_{st}'
            df.rename(columns=mapper, inplace=True)
            meteo_st = pd.merge(meteo_st, df, how='outer', on=['date_local', 'month_local'])
            
        
    # находим значения для заполнения праметров с пропусками, учитывая специфику каждого месяца
    features_dict = dict()
    features = [x for x in meteo_st.columns if x not in ['date_local', 'month_local']]
    for f in features:
        features_dict[f] = dict()
    for feature in features:
        for i in range(1, 13):
            val = meteo_st.loc[meteo_st.month_local == i, feature].median()
            if (val is not np.nan)|(val is not pd.NA)|(val != 'nan'):
                features_dict[feature][i] = val
            else:
                features_dict[feature][i] = -777
    # заполняем пропуски
    for col in features:
        for i in range(1,13):
            meteo_st.loc[(meteo_st.month_local == i)&(meteo_st[col].isna()), col] = features_dict[col][i]
                
    meteo_st.fillna(-777, inplace=True)
    
    return meteo_st

## df_preprocessing

In [5]:
def df_preprocessing(df, station_id, verbose=False):
    """
    Предобработка данных по указанной гидростанции станции
    """
    tr_data = df[(df.station_id == station_id)].copy()

    # если замер льда не делали, то пусть 0 будет такой категорией
    tr_data['place'].fillna(0, inplace=True)
    # water_code и place должны быть категориальными
    tr_data['water_code'] = tr_data['water_code'].astype('str')
    tr_data['place'] = tr_data['place'].astype('int8')
    
    # проверка признаков на наличие пропусков
    na_features = []
    if verbose: print('Пропуски:')
    for col in tr_data.columns:
        cnt = tr_data[col].isna().sum()
        if cnt > 0:
            if verbose: print(col, ':', cnt)
            na_features.append(col)
    if verbose: print()
    if len(na_features) > 0:
        # находим значения для заполнения праметров с пропусками, учитывая специфику каждого месяца
        na_features_dict = dict()
        for f in na_features:
            na_features_dict[f] = dict()
        for feature in na_features:
            for i in range(1, 13):
                val = tr_data.loc[tr_data.month == i, feature].median()
                # признак если среднего расхода воды не знаем или не замеряется на посту
                if (val is np.nan)&(feature == 'discharge'):
                    na_features_dict[feature][i] = -1
                elif val is np.nan:
                    na_features_dict[feature][i] = 0.0
                else:
                    na_features_dict[feature][i] = val
        # заполняем пропуски
        for col in na_features:
            for i in range(1,13):
                tr_data.loc[(tr_data.month == i)&(tr_data[col].isna()), col] = na_features_dict[col][i]
            
    return tr_data

## mute_untrastable

In [6]:
def mute_untrastable(df, feature):
    """
    Замена NaN'ами неподтверждённых или опровергнутых данных
    feature - фича с суфиксом _qual
    """
    spl = feature.split('_')
    spl_feat = ('_').join(spl[:len(spl)-1])
    
    df.loc[df[feature] > 2, spl_feat] = np.nan
    
    return df

## meteo_data_processing

In [7]:
def meteo_data_processing(meteo_1day, station_id, verbose=False):
    """
    Предобработка метеоданных
    meteo_1day - dataframe с метеоданными, аггрегированные за 1 день
    """
    
    df = meteo_1day[(meteo_1day.station_id == station_id)].copy()
    
    # проверка признаков на наличие пропусков
    na_features = []
    if verbose: print('Пропуски:')
    for col in df.columns:
        cnt = df[col].isna().sum()
        if cnt > 0:
            if verbose: print(col, ':', cnt)
            na_features.append(col)
            
    if len(na_features) > 0:
        # находим значения для заполнения праметров с пропусками, учитывая специфику каждого месяца
        na_features_dict = dict()
        for f in na_features:
            na_features_dict[f] = dict()
        for feature in na_features:
            for i in range(1, 13):
                val = df.loc[df.month_local == i, feature].median()
                if (val is np.nan):
                    na_features_dict[feature][i] = -777
                else:
                    na_features_dict[feature][i] = val
                        
        # заполняем пропуски
        for col in na_features:
            for i in range(1,13):
                df.loc[(df.month_local == i)&(df[col].isna()), col] = na_features_dict[col][i]
            
    return df

# Baseline

In [35]:
# Киренск
gydro_df = make_gydro_df(
    pd.read_csv(f'{RAWDATADIR}/track_2_package/train.csv', parse_dates=['date']),
    kirensk_dict
)
gydro_df.shape

(11604, 33)

In [36]:
meteo_df = make_meteo_df(
    pd.read_csv(f'{RAWDATADIR}/track_2_package/meteo_3hours.csv', parse_dates=['date_local']),
    kirensk_dict['meteo_st']
)
meteo_df.shape

(12310, 218)

In [87]:
data = pd.merge(gydro_df, meteo_df.rename(columns={'date_local': 'date'}), how='left', on='date')
data.shape

(11604, 250)

In [42]:
sum(data.isna().sum())

70631

In [88]:
data.fillna(-777, inplace=True)
sum(data.isna().sum())

0

data['delta_stage_max_1'] = data.delta_stage_max.shift(1)
data['delta_stage_max_2'] = data.delta_stage_max.shift(2)
data['delta_stage_max_3'] = data.delta_stage_max.shift(3)
data['delta_stage_max_4'] = data.delta_stage_max.shift(4)
data['delta_stage_max_5'] = data.delta_stage_max.shift(5)
data['delta_stage_max_6'] = data.delta_stage_max.shift(6)
data['delta_stage_max_7'] = data.delta_stage_max.shift(7)
data.dropna(inplace=True)
data.shape

In [77]:
import pickle

with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

In [63]:
import catboost

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [65]:
list(data.columns)

['station_id',
 'date',
 'stage_avg',
 'stage_min',
 'stage_max',
 'temp',
 'water_code',
 'ice_thickness',
 'snow_height',
 'place',
 'discharge',
 'year',
 'month',
 'day',
 'delta_stage_max',
 'stage_avg_3087',
 'stage_min_3087',
 'stage_max_3087',
 'temp_3087',
 'water_code_3087',
 'ice_thickness_3087',
 'snow_height_3087',
 'place_3087',
 'discharge_3087',
 'stage_avg_3021',
 'stage_min_3021',
 'stage_max_3021',
 'temp_3021',
 'water_code_3021',
 'ice_thickness_3021',
 'snow_height_3021',
 'place_3021',
 'discharge_3021',
 'month_local',
 'horizontal_visibility_30028',
 'cloud_amount_total_30028',
 'cloud_amount_low_level_30028',
 'cloud_form_high_level_30028',
 'cloud_form_middle_level_30028',
 'cloud_form_vertical_develop_30028',
 'cloud_form_strat_stratocum_30028',
 'cloud_form_strat_rain_30028',
 'cloud_base_altitude_30028',
 'cloud_below_station_30028',
 'soil_surface_condition_30028',
 'weather_before_30028',
 'weather_30028',
 'wind_direction_30028',
 'wind_speed_aver_30028

In [89]:
wc = list(data.filter(regex='water_code').columns)
# target = list(data.filter(regex='delta_stage_max').columns)
# X = data.drop(['station_id','date']+wc+target,axis=1).to_numpy()
X = data.drop(['station_id','date','delta_stage_max']+wc,axis=1).to_numpy()
y = data.delta_stage_max.to_numpy()
# y = data[target].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, train_size=0.2, random_state=13)

In [90]:
params = {
    'iterations': 3000,
    'eval_metric': 'RMSE',
    'random_seed': 13
}

model = catboost.CatBoostRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=300,
)

Learning rate set to 0.027247
0:	learn: 12.1493701	test: 12.4069636	best: 12.4069636 (0)	total: 7.26ms	remaining: 21.8s
300:	learn: 8.2536051	test: 11.9282734	best: 11.9238833 (290)	total: 1.74s	remaining: 15.6s
600:	learn: 6.2211977	test: 11.9211369	best: 11.8943374 (391)	total: 3.44s	remaining: 13.7s
900:	learn: 4.6444998	test: 11.9836054	best: 11.8943374 (391)	total: 5.14s	remaining: 12s
1200:	learn: 3.6471358	test: 12.0252858	best: 11.8943374 (391)	total: 6.87s	remaining: 10.3s
1500:	learn: 2.9080811	test: 12.0548815	best: 11.8943374 (391)	total: 8.56s	remaining: 8.55s
1800:	learn: 2.3345122	test: 12.0749865	best: 11.8943374 (391)	total: 10.2s	remaining: 6.82s
2100:	learn: 1.9135354	test: 12.0934866	best: 11.8943374 (391)	total: 12s	remaining: 5.12s
2400:	learn: 1.5710098	test: 12.1046676	best: 11.8943374 (391)	total: 13.7s	remaining: 3.41s
2700:	learn: 1.2934982	test: 12.1101531	best: 11.8943374 (391)	total: 15.5s	remaining: 1.72s
2999:	learn: 1.0887356	test: 12.1177916	best: 11.8

<catboost.core.CatBoostRegressor at 0x7fc6ec4ba358>

In [71]:
model.tree_count_

392

In [72]:
kirensk_dict['id']

3019

In [91]:
metric = mean_squared_error(y_test, model.predict(X_test))/185.35707752426708
metric

0.7632579392014218

# Test

# Отработка методов и подходов

In [144]:
train_df = pd.read_csv(f'{RAWDATADIR}/track_2_package/train.csv', parse_dates=['date'])

# удаляем наблюдения за 1985-01-01, т.к. изменение уровня воды NaN из-за отсуютсвтия более ранних исторических данных
train_df.drop(train_df[train_df.date == pd.to_datetime('1985-01-01', format='%Y-%m-%d')].index, inplace=True)

### Киренск

In [128]:
tr_data = train_df[(train_df.station_id == kirensk_dict['id'])].copy()

# если замер льда не делали, то пусть 0 будет такой категорией
tr_data['place'].fillna(0, inplace=True)
# water_code и place должны быть категориальными
tr_data['water_code'] = tr_data['water_code'].astype('str')
tr_data['place'] = tr_data['place'].astype('int8')

tr_data.head()

Unnamed: 0,station_id,date,stage_avg,stage_min,stage_max,temp,water_code,ice_thickness,snow_height,place,discharge,year,month,day,delta_stage_max
0,3019,1985-01-01,-23.0,-23.0,-23.0,,16,,,0,,1985,1,1,
1,3019,1985-01-02,-23.0,-23.0,-23.0,,16,,,0,,1985,1,2,0.0
2,3019,1985-01-03,-23.0,-23.0,-23.0,,16,,,0,,1985,1,3,0.0
3,3019,1985-01-04,-24.0,-24.0,-24.0,,16,,,0,,1985,1,4,-1.0
4,3019,1985-01-05,-24.0,-24.0,-24.0,,16,53.0,29.0,1,,1985,1,5,0.0


In [130]:
# проверка признаков на наличие пропусков
na_features = []
for col in tr_data.columns:
    cnt = tr_data[col].isna().sum()
    if cnt > 0:
        print(col, cnt)
        na_features.append(col)

temp 5372
ice_thickness 10945
snow_height 10968
discharge 11603


In [131]:
na_features

['temp', 'ice_thickness', 'snow_height', 'discharge']

In [132]:
# находим значения для заполнения праметров с пропусками, учитывая специфику каждого месяца

na_features_dict = dict()
for f in na_features:
    na_features_dict[f] = dict()

for feature in na_features:
    for i in range(1, 13):
#         if feature == 'place':
#             na_features_dict[feature][i] = tr_data.loc[tr_data.month == i, feature].mode().values[0]
#         else:
            val = tr_data.loc[tr_data.month == i, feature].mean()
            if (val is np.nan)&(feature == 'discharge'):
                na_features_dict[feature][i] = -1  # признак если среднего расхода воды не знаем или не замеряется на посту
            elif val is np.nan:
                na_features_dict[feature][i] = 0.0
            else:
                na_features_dict[feature][i] = val

na_features_dict

{'temp': {1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.09090909090909091,
  5: 4.593290322580645,
  6: 16.093793103448277,
  7: 20.633577712609974,
  8: 18.358797653958945,
  9: 10.244141414141415,
  10: 2.1803519061583576,
  11: 0.025,
  12: 0.0},
 'ice_thickness': {1: 53.78861788617886,
  2: 62.458333333333336,
  3: 65.61538461538461,
  4: 59.13414634146341,
  5: 41.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 10.0,
  11: 21.8125,
  12: 39.77777777777778},
 'snow_height': {1: 31.6260162601626,
  2: 36.94117647058823,
  3: 33.034188034188034,
  4: 14.603174603174603,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 5.5,
  11: 10.648936170212766,
  12: 24.384615384615383},
 'discharge': {1: -1,
  2: -1,
  3: -1,
  4: -1,
  5: -1,
  6: -1,
  7: -1,
  8: -1,
  9: -1,
  10: -1,
  11: -1,
  12: -1}}

In [133]:
# заполняем пропуски
for col in na_features:
    for i in range(1,13):
#         tr_data.loc[tr_data.month == i, col].fillna(value=na_features_dict[col][i], inplace=True)  # не работает...
        tr_data.loc[(tr_data.month == i)&(tr_data[col].isna()), col] = na_features_dict[col][i]
display(tr_data.head())
tr_data.info()

Unnamed: 0,station_id,date,stage_avg,stage_min,stage_max,temp,water_code,ice_thickness,snow_height,place,discharge,year,month,day,delta_stage_max
1,3019,1985-01-02,-23.0,-23.0,-23.0,0.0,16,53.788618,31.626016,0,-1.0,1985,1,2,0.0
2,3019,1985-01-03,-23.0,-23.0,-23.0,0.0,16,53.788618,31.626016,0,-1.0,1985,1,3,0.0
3,3019,1985-01-04,-24.0,-24.0,-24.0,0.0,16,53.788618,31.626016,0,-1.0,1985,1,4,-1.0
4,3019,1985-01-05,-24.0,-24.0,-24.0,0.0,16,53.0,29.0,1,-1.0,1985,1,5,0.0
5,3019,1985-01-06,-24.0,-24.0,-24.0,0.0,16,53.788618,31.626016,0,-1.0,1985,1,6,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11603 entries, 1 to 11603
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   station_id       11603 non-null  int64         
 1   date             11603 non-null  datetime64[ns]
 2   stage_avg        11603 non-null  float64       
 3   stage_min        11603 non-null  float64       
 4   stage_max        11603 non-null  float64       
 5   temp             11603 non-null  float64       
 6   water_code       11603 non-null  object        
 7   ice_thickness    11603 non-null  float64       
 8   snow_height      11603 non-null  float64       
 9   place            11603 non-null  int8          
 10  discharge        11603 non-null  float64       
 11  year             11603 non-null  int64         
 12  month            11603 non-null  int64         
 13  day              11603 non-null  int64         
 14  delta_stage_max  11603 non-null  float

### Гидрологические данные

In [176]:
tr_data = df_preprocessing(train_df, kirensk_dict['id'])
tr_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 11603 entries, 1 to 11603
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   station_id       11603 non-null  int64         
 1   date             11603 non-null  datetime64[ns]
 2   stage_avg        11603 non-null  float64       
 3   stage_min        11603 non-null  float64       
 4   stage_max        11603 non-null  float64       
 5   temp             11603 non-null  float64       
 6   water_code       11603 non-null  object        
 7   ice_thickness    11603 non-null  float64       
 8   snow_height      11603 non-null  float64       
 9   place            11603 non-null  int8          
 10  discharge        11603 non-null  float64       
 11  year             11603 non-null  int64         
 12  month            11603 non-null  int64         
 13  day              11603 non-null  int64         
 14  delta_stage_max  11603 non-null  floa

In [177]:
# соберём данные со смежных постов, выделенных в словаре по данному целевому посту
mapper = dict()
use_feature = ['date', 'stage_avg', 'stage_min','stage_max', 'temp', 'water_code', 'ice_thickness','snow_height', 
               'place', 'discharge']

for n, st in enumerate(kirensk_dict['gydro_st']):
    if n == 0:
        # данные с поста
        df = df_preprocessing(train_df, st)
        df = df[use_feature]
        for col in [x for x in use_feature if x not in ['date']]:
            mapper[col] = f'{col}_{st}'
        gydro_st = df.rename(columns=mapper)
    else:
        # данные с поста
        df = df_preprocessing(train_df, st)
        df = df[use_feature]
        for col in [x for x in use_feature if x not in ['date']]:
            mapper[col] = f'{col}_{st}'
        df.rename(columns=mapper, inplace=True)
        gydro_st = pd.merge(gydro_st, df, on='date')

gydro_st.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 11603 entries, 0 to 11602
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                11603 non-null  datetime64[ns]
 1   stage_avg_3087      11603 non-null  float64       
 2   stage_min_3087      11603 non-null  float64       
 3   stage_max_3087      11603 non-null  float64       
 4   temp_3087           11603 non-null  float64       
 5   water_code_3087     11603 non-null  object        
 6   ice_thickness_3087  11603 non-null  float64       
 7   snow_height_3087    11603 non-null  float64       
 8   place_3087          11603 non-null  int8          
 9   discharge_3087      11603 non-null  float64       
 10  stage_avg_3021      11603 non-null  float64       
 11  stage_min_3021      11603 non-null  float64       
 12  stage_max_3021      11603 non-null  float64       
 13  temp_3021           11603 non-null  float64 

In [179]:
# сводные данные по целевому посту и его окресных гидропостов
kirensk_df = pd.merge(tr_data, gydro_st, on='date')
kirensk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11603 entries, 0 to 11602
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   station_id          11603 non-null  int64         
 1   date                11603 non-null  datetime64[ns]
 2   stage_avg           11603 non-null  float64       
 3   stage_min           11603 non-null  float64       
 4   stage_max           11603 non-null  float64       
 5   temp                11603 non-null  float64       
 6   water_code          11603 non-null  object        
 7   ice_thickness       11603 non-null  float64       
 8   snow_height         11603 non-null  float64       
 9   place               11603 non-null  int8          
 10  discharge           11603 non-null  float64       
 11  year                11603 non-null  int64         
 12  month               11603 non-null  int64         
 13  day                 11603 non-null  int64     

### Метеорологические данные

In [31]:
meteo_st = make_meteo_df(
    pd.read_csv(f'{RAWDATADIR}/track_2_package/meteo_3hours.csv', parse_dates=['date_local']),
    kirensk_dict['meteo_st']
)
sum(meteo_st.isna().sum())

0 30028
(12279, 38)
1 30219
(11363, 38)
2 30328
(11834, 38)
3 30337
(12279, 38)
4 30433
(12310, 38)
5 30230
(12279, 38)


0

In [10]:
# метеоданные
meteo_df = pd.read_csv(f'{RAWDATADIR}/track_2_package/meteo_3hours.csv', parse_dates=['date_local'])

# Замена np.nan'ами неподтверждённых или опровергнутых данных
_qual_cols = list(meteo_df.filter(regex='_qual$').columns)
print(meteo_df.shape)
for col in _qual_cols:
    meteo_df = mute_untrastable(meteo_df, col)
print(meteo_df.shape)

features = pd.Series(_qual_cols)
spl_feat = []
for n, f in enumerate(features):
    spl = features.values[n].split('_')
    spl_feat.append(('_'.join(spl[:len(spl)-1])))
    
use_feature = ['station_id', 'date_local', 'month_local'] + spl_feat

meteo_data = meteo_df[use_feature].copy()
print(meteo_data.shape)

del meteo_df
gc.collect()

(3666552, 94)
(3666552, 94)
(3666552, 39)


193

In [11]:
# аггрегация данных с дискретностью 1 день
meteo_1day = meteo_data.groupby(['station_id', 'date_local']).median()
meteo_1day.reset_index(inplace=True)
print(meteo_1day.shape)

del meteo_data
gc.collect()

(458319, 39)


0

In [18]:
mapper = dict()
use_feature = ['date_local', 'month_local'] + spl_feat

# берём данные по нужным станциям
for n, st in enumerate(kirensk_dict['meteo_st']):
    print(n, st)
    if n == 0:
        df = meteo_data_processing(meteo_1day, st)
        df = df[use_feature]
        print(df.shape)
        for col in spl_feat:
            mapper[col] = f'{col}_{st}'
        meteo_st = df.rename(columns=mapper)
    else:
        df = meteo_data_processing(meteo_1day, st)
        df = df[use_feature]
        print(df.shape)
        for col in spl_feat:
            mapper[col] = f'{col}_{st}'
        df.rename(columns=mapper, inplace=True)
        meteo_st = pd.merge(meteo_st, df, how='outer', on=['date_local', 'month_local'])

meteo_st.shape

0 30028


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(12279, 38)
1 30219


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(11363, 38)
2 30328


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(11834, 38)
3 30337


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(12279, 38)
4 30433


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(12310, 38)
5 30230


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


(12279, 38)


(12310, 218)

In [19]:
meteo_st.shape

(12310, 218)

In [20]:
meteo_st.tail()

Unnamed: 0,date_local,month_local,horizontal_visibility_30028,cloud_amount_total_30028,cloud_amount_low_level_30028,cloud_form_high_level_30028,cloud_form_middle_level_30028,cloud_form_vertical_develop_30028,cloud_form_strat_stratocum_30028,cloud_form_strat_rain_30028,...,air_temperature_max_before_30230,air_max_temperature_30230,water_vapour_partial_pressure_30230,relative_humidity_30230,vapour_pressure_deficit_30230,dew_point_temperature_30230,pressure_30230,pressure_sea_level_30230,barometric_tendency_characteristic_30230,barometric_tendency_30230
12305,2020-01-27,1,,,,,,,,,...,,,,,,,,,,
12306,2020-01-28,1,,,,,,,,,...,,,,,,,,,,
12307,2020-01-29,1,,,,,,,,,...,,,,,,,,,,
12308,2020-01-30,1,,,,,,,,,...,,,,,,,,,,
12309,2020-01-31,1,,,,,,,,,...,,,,,,,,,,


In [25]:
        # находим значения для заполнения праметров с пропусками, учитывая специфику каждого месяца
        features_dict = dict()
        features = [x for x in meteo_st.columns if x not in ['date_local', 'month_local']]
        for f in features:
            features_dict[f] = dict()
        for feature in features:
            for i in range(1, 13):
                val = meteo_st.loc[meteo_st.month_local == i, feature].median()
                if (val is not np.nan)|(val is not pd.NA)|(val != 'nan'):
                    features_dict[feature][i] = val
                else:
                    features_dict[feature][i] = -777
#         features_dict                
        # заполняем пропуски
        for col in features:
            for i in range(1,13):
                meteo_st.loc[(meteo_st.month_local == i)&(meteo_st[col].isna()), col] = features_dict[col][i]
                
        meteo_st.fillna(-777, inplace=True)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [26]:
meteo_st.tail()

Unnamed: 0,date_local,month_local,horizontal_visibility_30028,cloud_amount_total_30028,cloud_amount_low_level_30028,cloud_form_high_level_30028,cloud_form_middle_level_30028,cloud_form_vertical_develop_30028,cloud_form_strat_stratocum_30028,cloud_form_strat_rain_30028,...,air_temperature_max_before_30230,air_max_temperature_30230,water_vapour_partial_pressure_30230,relative_humidity_30230,vapour_pressure_deficit_30230,dew_point_temperature_30230,pressure_30230,pressure_sea_level_30230,barometric_tendency_characteristic_30230,barometric_tendency_30230
12305,2020-01-27,1,97.0,6.5,0.0,1.0,0.0,0.0,0.0,0.0,...,-23.15,-24.35,0.57,77.5,0.135,-29.05,996.05,1032.4,5.0,0.8
12306,2020-01-28,1,97.0,6.5,0.0,1.0,0.0,0.0,0.0,0.0,...,-23.15,-24.35,0.57,77.5,0.135,-29.05,996.05,1032.4,5.0,0.8
12307,2020-01-29,1,97.0,6.5,0.0,1.0,0.0,0.0,0.0,0.0,...,-23.15,-24.35,0.57,77.5,0.135,-29.05,996.05,1032.4,5.0,0.8
12308,2020-01-30,1,97.0,6.5,0.0,1.0,0.0,0.0,0.0,0.0,...,-23.15,-24.35,0.57,77.5,0.135,-29.05,996.05,1032.4,5.0,0.8
12309,2020-01-31,1,97.0,6.5,0.0,1.0,0.0,0.0,0.0,0.0,...,-23.15,-24.35,0.57,77.5,0.135,-29.05,996.05,1032.4,5.0,0.8


In [27]:
sum(meteo_st.isna().sum())

0

In [33]:
kirensk_dict['meteo_st']

[30028, 30219, 30328, 30337, 30433, 30230]