In [None]:
import pandas as pd
import os
import glob
import re
import numpy as np

In [None]:
# Directory holding the train/test CSV files, relative to the notebook's working directory.
path_to_data = '..//00_data/train_test_data'

In [None]:
# Load X_train.csv from the train/test data directory.
X_train = pd.read_csv(os.path.join(path_to_data, 'X_train.csv'))

## Cleaning

In [None]:
def drop_mostly_empty_columns(dataframe, missing_frac_threshold, to_keep):
    """Return ``dataframe`` without its mostly-empty columns.

    A column is removed when its fraction of missing values is strictly
    greater than ``missing_frac_threshold`` and its name is not listed in
    ``to_keep``. The input frame itself is left untouched; the list of
    removed columns is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean.
    missing_frac_threshold : float
        Fraction of missing values above which a column is removed.
    to_keep : container of column labels
        Columns that must never be removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the removed columns.
    """
    # Share of missing values per column.
    missing_share = dataframe.isna().mean()

    columns_to_drop = [
        name
        for name, share in missing_share.items()
        if share > missing_frac_threshold and name not in to_keep
    ]

    print('Dropping {} columns: {}'.format(len(columns_to_drop), columns_to_drop))
    return dataframe.drop(columns=columns_to_drop)

In [None]:
# Drop every column with more than 10% missing values; the empty list means no column is exempt.
X_train = drop_mostly_empty_columns(X_train, 0.1, [])

In [None]:
def drop_columns_with_low_variance(dataframe, threshold):
    """Return ``dataframe`` without numeric columns that barely vary.

    Despite the function name, the spread is measured with the standard
    deviation (``DataFrame.std``): a numeric column is removed when its
    standard deviation is strictly lower than ``threshold``. Non-numeric
    columns are never removed.

    The input frame is not modified; a new frame is returned, in line with
    ``drop_mostly_empty_columns``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean.
    threshold : float
        Minimum standard deviation a numeric column needs in order to stay.

    Returns
    -------
    pandas.DataFrame
        A new frame without the low-spread columns.
    """
    # numeric_only=True: text columns have no standard deviation and make
    # std() raise on recent pandas versions.
    column_std = dataframe.std(numeric_only=True)
    low_spread_columns = column_std[column_std < threshold].index.tolist()

    print('Dropping {} columns with standard deviation lower than {}: {}'.format(
        len(low_spread_columns), threshold, low_spread_columns))
    return dataframe.drop(columns=low_spread_columns)

In [None]:
# Drop columns whose standard deviation (the function compares DataFrame.std()) is below 0.3.
X_train = drop_columns_with_low_variance(X_train, 0.3)

In [None]:
def drop_highly_correlated_columns(dataframe, threshold):
    """Return ``dataframe`` without columns that duplicate an earlier column.

    Columns are scanned in order. A numeric column is removed when its
    correlation with any earlier, still-kept numeric column is greater than
    or equal to ``threshold``. Only the signed correlation is compared, so
    strongly *negative* correlations do not trigger a removal. Non-numeric
    columns are ignored by the correlation and always kept.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean.
    threshold : float
        Correlation at or above which the later column of a pair is removed.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns.
    """
    # Restrict to numeric columns explicitly: corr() raises on text columns
    # in recent pandas versions.
    corr_matrix = dataframe.select_dtypes(include='number').corr()
    column_names = list(corr_matrix.columns)
    # Plain ndarray lookups are much cheaper than .iloc inside a double loop.
    corr_values = corr_matrix.to_numpy()

    dropped = set()  # names of columns selected for removal
    for i, candidate in enumerate(column_names):
        for j in range(i):
            # An already-dropped column must not cause further removals.
            if corr_values[i, j] >= threshold and column_names[j] not in dropped:
                dropped.add(candidate)
                break

    return dataframe.drop(columns=[name for name in column_names if name in dropped])

In [None]:
# Remove a column when its correlation with an earlier, still-kept column is >= 0.9.
X_train = drop_highly_correlated_columns(X_train, 0.9)

In [None]:
# Rebind path_to_data to the track_1 directory (it previously pointed at the train/test data)
# and load meteo_1day.csv into `temp`, then preview the first rows.
path_to_data = '..//00_data/track_1'
temp = pd.read_csv(os.path.join(path_to_data, 'meteo_1day.csv'))
temp.head()

In [None]:
# Cross-tabulate the `temperature_20cm_qual` flag against the `temperature_20cm` values
# to see which values occur under each flag code.
pd.crosstab(temp.temperature_20cm_qual, temp.temperature_20cm)

In [None]:
# 1, 2, 3, 4, 8 - erroneous value
# 5, 6, 7 - doubtful value
# 9 - value is missing
# Print how often each code (NaN included) occurs in every column whose name contains 'qual'.
for col in list(temp.filter(regex='qual')):
    print(temp[col].value_counts(dropna = False))

In [None]:
def replace_incorrect_or_missing_values(dataframe):
    """Blank out flagged measurements and remove the flag columns.

    Every column whose name ends with ``_qual`` is treated as the quality
    flag of the column with the same name minus that suffix (for example
    ``temperature_20cm_qual`` flags ``temperature_20cm``). Wherever the flag
    is anything other than ``0`` (including a missing flag), the measurement
    is replaced with NaN. All ``_qual`` columns are then dropped.

    The input frame is not modified; a cleaned copy is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing measurement columns and their ``*_qual`` flags.

    Returns
    -------
    pandas.DataFrame
        Copy of the frame with flagged values set to NaN and without the
        ``*_qual`` columns.
    """
    suffix = '_qual'
    cleaned = dataframe.copy()

    # Find the columns holding the quality flags.
    quality_columns = [
        col for col in cleaned.columns
        if isinstance(col, str) and col.endswith(suffix)
    ]

    for quality_col in quality_columns:
        # Slice the suffix off: str.rstrip('_qual') strips a *set of characters*
        # and would mangle names such as 'pressure_station_level_qual'.
        value_col = quality_col[:-len(suffix)]
        if value_col not in cleaned.columns:
            # A flag without a matching measurement: nothing to mask.
            continue
        # Keep the value only where the flag is exactly 0; everything else becomes NaN.
        cleaned[value_col] = cleaned[value_col].where(cleaned[quality_col] == 0.0)

    return cleaned.drop(columns=quality_columns)

In [None]:
# Apply the quality-flag cleaning to the meteo frame and keep the result under a separate name.
temp_2 = replace_incorrect_or_missing_values(temp)

In [None]:
# Summary statistics of the cleaned meteo frame.
temp_2.describe()

In [None]:
# One-hot encoding of the categorical columns of X_train.
# With `columns` left at its default, get_dummies encodes every object/string/category
# column, so no hand-maintained column list is needed.
# NOTE(review): pass columns=[...] explicitly if only a subset should be encoded.
X_train = pd.get_dummies(X_train, drop_first=False)

In [None]:
# Fill missing values in the numeric columns by interpolating within each station.
# NOTE(review): `mlm` is not created in any cell visible here; it must be loaded before this cell runs.
mlm = mlm.sort_values(['station_id', 'year', 'month'])
num_cols = ['precipitation_observed',
            'precipitation_corrected',
            'precipitation_corrected_liquid',
            'precipitation_corrected_mixed',
            'precipitation_corrected_solid',
            'sunshine_hours']
# transform() returns a result aligned to mlm's own index, so the assignment works
# regardless of how groupby.apply treats group keys / grouping columns in the
# installed pandas version.
# NOTE(review): method='index' uses the row index labels as x-coordinates, not
# year/month; confirm that is intended after the sort above.
mlm[num_cols] = mlm.groupby('station_id')[num_cols].transform(
    lambda column: column.interpolate(method='index'))

## Feature engineering

In [None]:
# Directory with the track_1 input files used in this section.
path_to_data = '..//00_data/track_1'

In [None]:
# Load hydro_coord.csv from the track_1 directory.
hydro_coord = pd.read_csv(os.path.join(path_to_data, 'hydro_coord.csv'))

In [None]:
# calculate_height_difference
# distance_from_source - distance from the river source
# z_null - elevation of the gauge zero at the hydrological post
def calculate_height_difference(dataframe_with_stations):
    """Add height and distance differences between consecutive stations.

    Rows are ordered by ``distance_from_source`` and three columns are added,
    each relative to the previous row in that order:

    * ``height_difference`` - ``z_null`` minus the previous row's ``z_null``
    * ``distance_to_previous`` - ``distance_from_source`` minus the previous
      row's ``distance_from_source``
    * ``height_diff_by_dist`` - ``height_difference / distance_to_previous``

    The first row has no predecessor, so its three derived values are set to
    0. Missing values in the other columns are left as they are. The input
    frame is not modified.

    Parameters
    ----------
    dataframe_with_stations : pandas.DataFrame
        Must contain the columns ``distance_from_source`` and ``z_null``.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``distance_from_source`` with the three derived
        columns appended.
    """
    ordered = dataframe_with_stations.sort_values(by='distance_from_source')

    height_difference = ordered['z_null'] - ordered['z_null'].shift(1)
    distance_to_previous = (
        ordered['distance_from_source'] - ordered['distance_from_source'].shift(1)
    )
    # NOTE(review): two rows with the same distance_from_source give a zero
    # denominator here (inf, or NaN -> 0 when the height difference is also 0).
    height_diff_by_dist = height_difference / distance_to_previous

    # Fill only the derived columns, so gaps in the original data are not
    # silently turned into zeros.
    return ordered.assign(
        height_difference=height_difference.fillna(0),
        distance_to_previous=distance_to_previous.fillna(0),
        height_diff_by_dist=height_diff_by_dist.fillna(0),
    )

In [None]:
# Rebind hydro_coord to the output of calculate_height_difference.
# Re-running this cell applies the function to its own previous output.
hydro_coord = calculate_height_difference(hydro_coord)

In [None]:
# Preview the first rows of hydro_coord.
hydro_coord.head()

In [None]:
# Count the distance_from_source values recorded for each station_id, NaN included.
hydro_coord.groupby(['station_id'])['distance_from_source'].value_counts(dropna = False)