In [1]:
import pandas as pd
import os
import glob
import re
import numpy as np

In [91]:
path_to_data = '..//00_data/train_test_data'

In [92]:
X_train = pd.read_csv(os.path.join(path_to_data, 'X_train.csv'))

In [51]:
# Средний уклон русла: разницу по высоте / расстояние между пунками
# Удалить инвариантные колонки
# Значения, в которых не уверены? (Threshold)

## Cleaning

In [99]:
def drop_mostly_empty_columns(dataframe, missing_frac_threshold, to_keep):
    frac_missing = pd.isna(dataframe).mean()
    mostly_empty_columns = frac_missing[frac_missing > missing_frac_threshold].index

    mostly_empty_columns = [col for col in mostly_empty_columns if col not in to_keep]

    print('Dropping {} columns: {}'.format(len(mostly_empty_columns), mostly_empty_columns))
    dataframe = dataframe.drop(mostly_empty_columns, axis=1)
    return dataframe

In [104]:
X_train = drop_mostly_empty_columns(X_train, 0.1, [])

Dropping 0 columns: []


In [89]:
def drop_columns_with_low_variance(dataframe, threshold):
    print('Dropped columns with variance low than {}'.format(threshold))
    dataframe.drop(dataframe.std()[dataframe.std() < threshold].index.values, axis=1, inplace = True)
    return dataframe

In [94]:
X_train = drop_columns_with_low_variance(X_train, 0.3)

Dropped columns with variance low than 0.3


In [141]:
def drop_highly_correlated_columns(dataframe, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataframe.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr):
                colname = corr_matrix.columns[i] # getting the name of column
                col_corr.add(colname)
                if colname in dataframe.columns:
                    del dataset[colname] # deleting the column from the dataset
    return dataset

In [143]:
X_train = drop_highly_correlated_columns(X_train, 0.9)

NameError: name 'dataset' is not defined

In [None]:
X_train.head()

In [None]:
# One-hot-encoding for categorical values (define categorical)
X_train = pd.get_dummies(data, columns=[''], drop_first=False)

## Feature engineering

In [116]:
path_to_data = '..//00_data/track_1'

In [118]:
hydro_coord = pd.read_csv(os.path.join(path_to_data, 'hydro_coord.csv'))

In [128]:
# calculate_height_difference
# distance_from_source - расстояние от истока
# z_null - высотная отметка нуля графика на гидропосте
hydro_coord=hydro_coord.sort_values(by = 'distance_from_source')
hydro_coord['height_difference'] = hydro_coord['z_null'] - hydro_coord['z_null'].shift(1) 
hydro_coord['distance_to_previous'] = hydro_coord['distance_from_source'] - hydro_coord['distance_from_source'].shift(1)
hydro_coord['height_diff_by_dist'] = hydro_coord['height_difference']/hydro_coord['distance_from_source']
hydro_coord.fillna(0, inplace = True)

In [137]:
hydro_coord.head()

Unnamed: 0,station_id,name,lat,lon,distance_from_source,drainage_area,z_null,height_difference,distance_to_previous,height_diff_by_dist
25,3554,ЧУЯ - Р.ЧУЯ,59.26,112.41,509.9,18400,179.0,0.0,0.0,0.0
19,3087,ШОРОХОВА - Р.КИРЕНГА,57.63,108.11,728.0,46500,254.24,75.24,218.1,0.103352
22,3180,ТОККО - Р.ЧАРА,60.0,119.88,730.0,62500,143.8,-110.44,2.0,-0.151288
0,3019,КИРЕНСК - Р.ЛЕНА,57.77,108.07,1140.0,92200,249.38,105.58,410.0,0.092614
1,3021,ЗМЕИНОВО - Р.ЛЕНА,57.78,108.32,1157.0,140000,245.25,-4.13,17.0,-0.00357


In [139]:
hydro_coord.groupby(['station_id'])['distance_from_source'].value_counts(dropna = False)

station_id  distance_from_source
3019        1140.0                  1
3021        1157.0                  1
3024        1456.0                  1
3027        1575.0                  1
3028        1601.0                  1
3029        1639.0                  1
3030        1786.0                  1
3031        1870.0                  1
3032        1981.0                  1
3035        2191.0                  1
3036        2216.0                  1
3037        2343.0                  1
3038        2409.0                  1
3041        2720.0                  1
3042        2767.0                  1
3045        2803.0                  1
3047        2837.0                  1
3048        2891.0                  1
3050        3117.0                  1
3087        728.0                   1
3106        1543.0                  1
3169        1280.0                  1
3180        730.0                   1
3229        2122.0                  1
3230        2272.0                  1
3554        509.9

In [112]:
# Print dropped
#list(X_train.loc[:,X_train.apply(pd.Series.nunique) == 1].columns)

In [31]:
X_train = X_train.loc[:,X_train.apply(pd.Series.nunique) != 1]

In [32]:
X_train.describe()

Unnamed: 0.1,Unnamed: 0,year,station_id,day,drainage_area,z_null,1_mean_stage_min,1_mean_stage_max,1_mean_ice_thickness,1_mean_snow_height_x,...,12_std_temperature_20cm,12_std_temperature_40cm,12_std_temperature_40cm_qual,12_std_temperature_80cm,12_std_temperature_80cm_qual,12_std_temperature_120cm,12_std_temperature_160cm,12_std_temperature_160cm_qual,12_std_temperature_240cm,12_std_temperature_320cm
count,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,...,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0,6116.0
mean,3057.5,2001.122302,3047.52518,21.794964,-0.018848,0.057178,-0.05411,-0.054099,0.014485,0.006891,...,0.051831,0.073598,0.072362,-0.001914,0.000635,0.041738,-0.016915,-0.009645,0.09463,0.063792
std,1765.681455,11.235795,48.903739,12.70765,1.024864,0.990888,0.964532,0.964525,0.996678,0.888984,...,1.094583,1.273186,1.272645,0.97518,0.97745,1.070226,0.803811,0.813019,1.155316,1.073874
min,0.0,1986.0,3019.0,0.0,-1.250208,-1.391692,-3.049477,-3.049866,-1.665515,-1.40712,...,-0.519018,-0.120256,-0.119836,-0.098347,-0.104139,-0.351941,-0.092165,-0.101222,-0.298322,-0.35203
25%,1528.75,1990.0,3029.0,11.0,-0.485736,-0.82879,-0.366726,-0.366981,-0.744286,-0.686197,...,-0.519018,-0.120256,-0.119836,-0.098347,-0.104139,-0.351941,-0.092165,-0.101222,-0.298322,-0.35203
50%,3057.5,2000.0,3035.0,22.0,-0.428309,0.302352,0.052202,0.051968,-0.09823,-0.089571,...,-0.519018,-0.120256,-0.119836,-0.098347,-0.104139,-0.351941,-0.092165,-0.101222,-0.298322,-0.35203
75%,4586.25,2010.0,3045.0,33.0,0.587005,0.637459,0.518892,0.518681,0.631575,0.432476,...,0.376591,-0.111126,-0.119836,-0.095026,-0.104139,-0.351941,-0.090351,-0.101222,-0.298322,-0.35203
max,6115.0,2019.0,3230.0,44.0,2.397112,2.027005,1.870338,1.870194,4.220777,3.142151,...,5.766657,9.194273,9.181381,11.045442,10.919234,8.259717,13.299272,13.1909,5.496598,4.183021


In [36]:
# Drop duplicated columns

In [43]:
X_train['temperature_20cm_qual'].value_counts()

KeyError: 'temperature_20cm_qual'

In [45]:
temp = list(X_train.columns)

In [49]:
X_train['water_in_snow']

KeyError: 'water_in_snow'

In [50]:
meteo = pd.read_csv(os.path.join(path_to_data, 'track_1/meteo_1day'))

FileNotFoundError: [Errno 2] No such file or directory: '..//00_data/train_test_data\\track_1/meteo_1day'