# Исследование алгоритмов

### Проверка и установка рабочей директории, должен быть корень проекта

In [1]:
%pwd

'C:\\Users\\Kuroha\\source\\repos_py\\bauman_final_project\\notebooks'

In [2]:
%cd ..

C:\Users\Kuroha\source\repos_py\bauman_final_project


### Загрузка датасета

In [3]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils import *

In [4]:
path = get_filepath(DATA_PROCESSED_TRAIN, is_raw=False)
df = pd.read_csv(path, parse_dates=['date'])

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
uid,9386,9386,9386,9386,9386
date,2008-01-01 00:00:00,2008-01-02 00:00:00,2008-01-03 00:00:00,2008-01-04 00:00:00,2008-01-05 00:00:00
latitude,0.312439,0.312439,0.312439,0.312439,0.312439
longitude,0.699807,0.699807,0.699807,0.699807,0.699807
temperature,0.438202,0.314607,0.146067,0.168539,0.325843
pressure,0.414474,0.519737,0.546053,0.546053,0.361842
cloud,1.0,0.0,0.0,0.666667,1.0
wind_spd,0.333333,0.166667,0.0,0.0,0.666667
is_fallback_data,1,1,1,1,1
water_level,138.0,138.0,138.0,138.0,138.0


### Подготовка данных

Необходимо проверить, зависит ли качество моделей от способа кодирования погоды: одной колонкой с наличием осадков, двумя - дождь и снег, либо тремя - дождь, гроза и снег.

Датасеты с разные вариантами кодирования погоды будут именоваться как **v1**, **v2**, **v3**.

In [6]:
df.columns

Index(['uid', 'date', 'latitude', 'longitude', 'temperature', 'pressure',
       'cloud', 'wind_spd', 'is_fallback_data', 'water_level', 'uid_0',
       'uid_1', 'uid_2', 'uid_3', 'uid_4', 'uid_5', 'uid_6', 'uid_7', 'uid_8',
       'uid_9', 'uid_10', 'uid_11', 'uid_12', 'uid_13', 'uid_14', 'year',
       'day_sin', 'day_cos', 'weather_v1_precip', 'weather_v2_rain',
       'weather_snow', 'weather_v3_rain', 'weather_v3_storm', 'north', 'south',
       'west', 'east', 'pressure_upper', 'pressure_lower', 'wind_spd_upper'],
      dtype='object')

Для обучения модели используем данные за 2008-2016 года, а для тестирования - за 2017.

In [7]:
df_X_train = df.loc[(df['date'] < '2017-01-01')]
df_X_test = df.loc[(df['date'] >= '2017-01-01')]
y_train = df_X_train['water_level']
y_test = df_X_test['water_level']

print(f'''Размерность оригинального датасета: {df.shape}
Размерность тренировочного датасета: {df_X_train.shape}
Размерность тестового датасета: {df_X_test.shape}
Размер тестовой выборки: {df_X_test.shape[0] / df_X_train.shape[0] * 100:.2f}%''')

Размерность оригинального датасета: (71159, 40)
Размерность тренировочного датасета: (61535, 40)
Размерность тестового датасета: (9624, 40)
Размер тестовой выборки: 15.64%


In [8]:
def get_all_df(input_df):
    df_all_base = input_df.drop(['date', 'water_level', 'uid'], axis=1)
    df_all_v1 = df_all_base.drop(['weather_v2_rain', 'weather_snow', 'weather_v3_rain', 'weather_v3_storm'], axis=1)
    df_all_v2 = df_all_base.drop(['weather_v1_precip', 'weather_v3_rain', 'weather_v3_storm'], axis=1)
    df_all_v3 = df_all_base.drop(['weather_v1_precip', 'weather_v2_rain'], axis=1)
    return [df_all_v1, df_all_v2, df_all_v3]

df_X_train_all = get_all_df(df_X_train)
df_X_test_all = get_all_df(df_X_test)

### Применение алгоритмов машинного обучения

Для оценки качества моделей будут использованы 2 меры качества: средняя квадратичная ошибка (MSE) и коэффициент детерминации ($R^2$).

In [9]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

results = {}  # словарь с сохранёнными метриками
algos = {}  # словарь со всеми алгоритмами

In [10]:
# Сохранение модели вместе с оценкой предсказания
def add_result(model, predict, model_name):
    mae_result = round(mae(y_test, predict), 3)
    r2_score_result = round(r2_score(y_test, predict), 3)
    #mae_result = round(mae(y_test, predict), 3)
    #r2_score_result = round(r2_score(y_test, predict), 3)
    print(model_name)
    print(f'MAE: {mae_result}')
    print(f'R2_score: {r2_score_result}')
    print('=' * 20)
    
    
    results[model_name] = {'MAE': mae_result, 'R2_score': r2_score_result, 'model': model}

# Предсказание по заранее обученной модели
def predict_result(model, X_test, model_name):
    predict = model.predict(X_test)    
    add_result(model, predict, model_name)

# Применение алгоритма ко всем вариациям датасета с последующей оценкой качества
def apply_to_all(algo, model_name):
    algos[model_name] = algo
    for i in range(len(df_X_train_all)):
        model = algo(df_X_train_all[i], df_X_test_all[i])    
        predict_result(model, df_X_test_all[i], f'{model_name} (df_v{i+1})')

#### 1. Заглушка

Представим, что модель всегда возвращает средний уровень воды. Полученные ошибки будут использованы для оценки качества реальных моделей.

In [11]:
y_predicted = pd.Series([y_train.mean()] * y_test.shape[0])
y_predicted

0       266.42995
1       266.42995
2       266.42995
3       266.42995
4       266.42995
          ...    
9619    266.42995
9620    266.42995
9621    266.42995
9622    266.42995
9623    266.42995
Length: 9624, dtype: float64

In [12]:
add_result(None, y_predicted, 'Mean')

Mean
MAE: 153.267
R2_score: -0.014


#### 2. Линейная регрессия

In [13]:
from sklearn.linear_model import LinearRegression 

def linear_regression(X_train, _):    
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)  
    
    return linear_regression

In [14]:
apply_to_all(linear_regression, 'Linear Regression')

Linear Regression (df_v1)
MAE: 109.123
R2_score: 0.412
Linear Regression (df_v2)
MAE: 109.105
R2_score: 0.412
Linear Regression (df_v3)
MAE: 109.104
R2_score: 0.412


In [15]:
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,MAE,R2_score,model
Mean,153.267,-0.014,
Linear Regression (df_v1),109.123,0.412,LinearRegression()
Linear Regression (df_v2),109.105,0.412,LinearRegression()
Linear Regression (df_v3),109.104,0.412,LinearRegression()


#### 3. Дерево решений

In [16]:
from sklearn.tree import DecisionTreeRegressor

def decision_tree(X_train, _):
    decision_tree = DecisionTreeRegressor()
    decision_tree.fit(X_train, y_train)
    
    return decision_tree

In [17]:
apply_to_all(decision_tree, 'Decision Tree')

Decision Tree (df_v1)
MAE: 70.218
R2_score: 0.593
Decision Tree (df_v2)
MAE: 70.47
R2_score: 0.591
Decision Tree (df_v3)
MAE: 70.8
R2_score: 0.588


#### 4. Гребневая регрессия

In [18]:
from sklearn.linear_model import Ridge

def ridge(X_train, _):
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    
    return ridge

In [19]:
apply_to_all(ridge, 'Ridge')

Ridge (df_v1)
MAE: 109.121
R2_score: 0.411
Ridge (df_v2)
MAE: 109.104
R2_score: 0.412
Ridge (df_v3)
MAE: 109.103
R2_score: 0.412


#### 5. Лассо

In [20]:
from sklearn.linear_model import Lasso

def lasso(X_train, _):
    lasso = Lasso()
    lasso.fit(X_train, y_train)
    
    return lasso

In [21]:
apply_to_all(lasso, 'Lasso')

Lasso (df_v1)
MAE: 117.914
R2_score: 0.365
Lasso (df_v2)
MAE: 117.914
R2_score: 0.365
Lasso (df_v3)
MAE: 117.914
R2_score: 0.365


#### 6. Простая нейронная сеть

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import mean_absolute_error


def simple_neural_network(X_train, X_test):
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=[X_train.shape[1]]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1))
    
    model.compile(loss=mean_absolute_error, optimizer='Adam', metrics=['mae'])
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=10,
              verbose=1)
    
    return model

In [23]:
apply_to_all(simple_neural_network, 'Neural Network (simple)')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network (simple) (df_v1)
MAE: 65.183
R2_score: 0.639
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network (simple) (df_v2)
MAE: 66.984
R2_score: 0.65
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network (simple) (df_v3)
MAE: 63.495
R2_score: 0.651


#### 7. Нейронная сеть с ранней остановкой

In [24]:
from tensorflow.keras.callbacks import EarlyStopping


early_stopping = EarlyStopping(
    min_delta=0.1,
    patience=20,
    restore_best_weights=True,
)

def neural_network_with_ES(X_train, X_test):
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=[X_train.shape[1]]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(1))
    
    model.compile(loss=mean_absolute_error, optimizer='Adam', metrics=['mae'])
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=100,
              callbacks=[early_stopping],
              verbose=1)
    
    return model

In [25]:
apply_to_all(neural_network_with_ES, 'Neural Network (early stopping)')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Neural Network (early stopping) (df_v1)
MAE: 65.124
R2_score: 0.636
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Neural Network (early stopping) (df_v2)
MAE: 64.109
R2_score: 0.656
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Neural Network (early stopping) (df_v3)
MAE: 63.725
R2_score: 0.652


#### 8. Нейронная сеть со слоями Batch Normalization и Dropout

In [26]:
from tensorflow.keras.layers import BatchNormalization, Dropout


def neural_network_with_Dropout(X_train, X_test):
    model = Sequential()
    model.add(Dense(1024, activation='relu', input_shape=[X_train.shape[1]]))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(1))
    
    model.compile(loss=mean_absolute_error, optimizer='Adam', metrics=['mae'])
    model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=100,
              callbacks=[early_stopping],
              verbose=1)
    
    return model

In [27]:
apply_to_all(neural_network_with_Dropout, 'Neural Network (Dropout)')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Neural Network (Dropout) (df_v1)
MAE: 61.561
R2_score: 0.621
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Neural Network (Dropout) (df_v2)
MAE: 61.897
R2_score: 0.619
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Neural Network (Dropout) (df_v3)
MAE: 62.461
R2_score: 0.616


#### 9. Эластичная сеть

In [28]:
from sklearn.linear_model import ElasticNet

def elastic_net(X_train, _):
    elastic_net = ElasticNet()
    elastic_net.fit(X_train, y_train)
    
    return elastic_net

In [29]:
apply_to_all(elastic_net, 'Elastic Net')

Elastic Net (df_v1)
MAE: 141.368
R2_score: 0.149
Elastic Net (df_v2)
MAE: 141.373
R2_score: 0.149
Elastic Net (df_v3)
MAE: 141.372
R2_score: 0.149


#### 10. Регрессия опорных векторов

In [30]:
from sklearn.svm import SVR

def svr(X_train, _):
    svr = SVR()
    svr.fit(X_train, y_train)
    
    return svr

In [31]:
apply_to_all(svr, 'SVR')

SVR (df_v1)
MAE: 86.954
R2_score: 0.385
SVR (df_v2)
MAE: 87.009
R2_score: 0.385
SVR (df_v3)
MAE: 87.041
R2_score: 0.384


#### 11. Градиентный бустинг

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

def gbr(X_train, _):
    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, y_train)
    
    return gbr

In [33]:
apply_to_all(gbr, 'Gradient Boosting')

Gradient Boosting (df_v1)
MAE: 66.839
R2_score: 0.682
Gradient Boosting (df_v2)
MAE: 66.835
R2_score: 0.682
Gradient Boosting (df_v3)
MAE: 66.845
R2_score: 0.682


#### 12. Случайный лес 

In [34]:
from sklearn.ensemble import RandomForestRegressor

def random_forest(X_train, _):
    random_forest = RandomForestRegressor()
    random_forest.fit(X_train, y_train)
    
    return random_forest

In [35]:
apply_to_all(random_forest, 'Random Forest')

Random Forest (df_v1)
MAE: 67.223
R2_score: 0.648
Random Forest (df_v2)
MAE: 67.524
R2_score: 0.643
Random Forest (df_v3)
MAE: 67.4
R2_score: 0.646


In [36]:
frame = pd.DataFrame.from_dict(results, orient='index')
frame.sort_values(by='R2_score', ascending=False)

Unnamed: 0,MAE,R2_score,model
Gradient Boosting (df_v3),66.845,0.682,([DecisionTreeRegressor(criterion='friedman_ms...
Gradient Boosting (df_v2),66.835,0.682,([DecisionTreeRegressor(criterion='friedman_ms...
Gradient Boosting (df_v1),66.839,0.682,([DecisionTreeRegressor(criterion='friedman_ms...
Neural Network (early stopping) (df_v2),64.109,0.656,<keras.engine.sequential.Sequential object at ...
Neural Network (early stopping) (df_v3),63.725,0.652,<keras.engine.sequential.Sequential object at ...
Neural Network (simple) (df_v3),63.495,0.651,<keras.engine.sequential.Sequential object at ...
Neural Network (simple) (df_v2),66.984,0.65,<keras.engine.sequential.Sequential object at ...
Random Forest (df_v1),67.223,0.648,"(DecisionTreeRegressor(max_features=1.0, rando..."
Random Forest (df_v3),67.4,0.646,"(DecisionTreeRegressor(max_features=1.0, rando..."
Random Forest (df_v2),67.524,0.643,"(DecisionTreeRegressor(max_features=1.0, rando..."
