# Регрессия 

## Импорт библиотек

In [36]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this presumably also hides solver convergence warnings
# from the models fitted below - confirm that is intended.
warnings.filterwarnings('ignore')

## Подготовка данных

In [43]:
# Relative path to the preprocessed traffic-volume CSV.
path_to_file = '../datasets/regression/metro_interstate_traffic_volume_preprocessed.csv'
# Load the whole file into a DataFrame.
dataset = pd.read_csv(path_to_file)

In [4]:
# Preview the first rows of the freshly loaded dataset.
dataset.head()

Unnamed: 0.1,Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,date_time,traffic_volume,h_Christmas Day,h_Columbus Day,h_Independence Day,...,wd_sleet,wd_smoke,wd_snow,wd_thunderstorm,wd_thunderstorm with drizzle,wd_thunderstorm with heavy rain,wd_thunderstorm with light drizzle,wd_thunderstorm with light rain,wd_thunderstorm with rain,wd_very heavy rain
0,0,288.28,0.0,0.0,40,2012-10-02 09:00:00,5545,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,289.36,0.0,0.0,75,2012-10-02 10:00:00,4516,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,289.58,0.0,0.0,90,2012-10-02 11:00:00,4767,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,290.13,0.0,0.0,90,2012-10-02 12:00:00,5026,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,291.14,0.0,0.0,75,2012-10-02 13:00:00,4918,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Удалим признаки *Unnamed: 0* (старая индексация) и *date_time* (дата/время), так как они не нужны для решения задачи

In [44]:
# Remove the leftover index column and the timestamp by column name,
# then display the resulting frame as the cell output.
dataset = dataset.drop(columns=['Unnamed: 0', 'date_time'])
dataset

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume,h_Christmas Day,h_Columbus Day,h_Independence Day,h_Labor Day,h_Martin Luther King Jr Day,...,wd_sleet,wd_smoke,wd_snow,wd_thunderstorm,wd_thunderstorm with drizzle,wd_thunderstorm with heavy rain,wd_thunderstorm with light drizzle,wd_thunderstorm with light rain,wd_thunderstorm with rain,wd_very heavy rain
0,288.28,0.0,0.0,40,5545,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,289.36,0.0,0.0,75,4516,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,289.58,0.0,0.0,90,4767,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,290.13,0.0,0.0,90,5026,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,291.14,0.0,0.0,75,4918,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,0.0,75,3543,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48200,282.76,0.0,0.0,90,2781,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48201,282.73,0.0,0.0,90,2159,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48202,282.09,0.0,0.0,90,1450,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Разделим датасет на матрицу "объект-признак" и вектор-столбец ответов 

In [45]:
# Select the target by name instead of by hard-coded column positions, so the
# split stays correct if the CSV gains, loses or reorders columns.
# X keeps every remaining column in its original order.
X = dataset.drop(columns='traffic_volume')
y = dataset['traffic_volume']

In [13]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,h_Christmas Day,h_Columbus Day,h_Independence Day,h_Labor Day,h_Martin Luther King Jr Day,h_Memorial Day,...,wd_sleet,wd_smoke,wd_snow,wd_thunderstorm,wd_thunderstorm with drizzle,wd_thunderstorm with heavy rain,wd_thunderstorm with light drizzle,wd_thunderstorm with light rain,wd_thunderstorm with rain,wd_very heavy rain
0,288.28,0.0,0.0,40,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,289.36,0.0,0.0,75,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,289.58,0.0,0.0,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,290.13,0.0,0.0,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,291.14,0.0,0.0,75,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Preview the first values of the target vector.
y.head()

0    5545
1    4516
2    4767
3    5026
4    4918
Name: traffic_volume, dtype: int64

Разделим данные на обучающую и тестовую выборки

In [46]:
# Hold out 20% of the rows for testing. The fixed seed makes the split
# reproducible, so every model below is scored on the same test rows even
# when cells are re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Теперь **масштабируем** признаки

In [47]:
# Fit the scaler on the training split only, so the test split does not
# influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [48]:
# Apply the already fitted scaler to both splits, replacing the originals.
X_train, X_test = map(scaler.transform, (X_train, X_test))

## Решение задачи регрессии

### Вспомогательные функции

Функция *fit_reg_by_grid_search_cv* обучает переданный алгоритм на обучающей выборке, подбирая параметры по сетке с кросс-валидацией, и возвращает обученный объект GridSearchCV

In [18]:
def fit_reg_by_grid_search_cv(estimator, params):
    """Run a cross-validated grid search for ``estimator`` on the training data.

    Args:
        estimator: Regressor instance handed to ``GridSearchCV``.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The ``GridSearchCV`` object after fitting on the module-level
        ``X_train`` / ``y_train``.
    """
    # ``fit`` returns the search object itself, so the call can be chained.
    return GridSearchCV(estimator, params).fit(X_train, y_train)

Функция *print_metrics_score* выводит оценки алгоритма на тестовой выборке по метрикам R^2, Mean Absolute Error (MAE), Mean Squared Error (MSE) и Root Mean Squared Error (RMSE)

In [24]:
def print_metrics_score(regressor):
    """Print test-set quality metrics of a fitted regressor.

    The report contains the value of ``regressor.score`` (labelled R^2),
    the mean absolute error, the mean squared error and its square root.
    Predictions are made on the module-level ``X_test`` and compared
    against ``y_test``. Nothing is returned.

    Args:
        regressor: Fitted model exposing ``predict`` and ``score``.
    """
    predicted = regressor.predict(X_test)

    r2 = regressor.score(X_test, y_test)
    mae, mse = (
        metric(y_test, predicted)
        for metric in (mean_absolute_error, mean_squared_error)
    )
    # RMSE is derived from MSE rather than computed separately.
    rmse = np.sqrt(mse)

    print(f'R^2: {r2}\nMean Absolute Error: {mae}\nMean Squared Error: {mse}\nRoot Mean Squared Error: {rmse}')

### Линейная регрессия

Обучим линейную регрессию на наших данных, посмотрим на значения метрик

In [25]:
# Unregularised least-squares baseline: fit on the training split,
# then report the test-set metrics.
lin_reg = LinearRegression().fit(X_train, y_train)
print_metrics_score(lin_reg)

R^2: -7.243954332276073e+21
Mean Absolute Error: 1720467851671.8792
Mean Squared Error: 2.8537452773949284e+28
Root Mean Squared Error: 168930319285643.0


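Видно, что линейная регрессия без регуляризации плохо подходит для решения задачи на наших данных, потому что ошибки слишком большие. Вероятная причина — линейно зависимые признаки (столбцы one-hot кодирования), из-за которых решение численно неустойчиво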

### Дерево решений

Посмотрим на то, как справится с задачей дерево решений. Подберём параметры max_depth, min_samples_leaf

In [30]:
# Grid for the decision tree: maximum depth and minimum samples per leaf.
# NOTE(review): depth limits of 500+ are presumably never reached by the tree,
# so the four max_depth values likely behave identically - confirm and
# consider much smaller depths.
params = dict(
    max_depth=[500, 1000, 1500, 2000],
    min_samples_leaf=list(range(1, 6)),
)

# Cross-validated search on the training split, then test-set metrics.
reg = fit_reg_by_grid_search_cv(DecisionTreeRegressor(), params)
print_metrics_score(reg)

R^2: -0.05584058300711536
Mean Absolute Error: 1653.900238204975
Mean Squared Error: 4159468.6261526146
Root Mean Squared Error: 2039.4775375454897


Посмотрим на оптимальные параметры

In [31]:
# Parameter combination selected by the decision-tree grid search.
reg.best_params_

{'max_depth': 2000, 'min_samples_leaf': 5}

Ошибка меньше, чем на линейной регрессии, но всё равно большая

### Lasso регрессия

Теперь посмотрим на Lasso (линейная модель с L1-регуляризацией)   
Подберём параметры: alpha — коэффициент при слагаемом регуляризации, max_iter — максимальное число итераций алгоритма

In [37]:
# Grid for Lasso: regularisation strength and iteration cap.
# NOTE(review): the recorded best alpha (0.8) is the largest value in the
# grid, so the optimum may lie beyond it - consider widening the range.
params = dict(
    alpha=[0.2, 0.4, 0.5, 0.6, 0.8],
    max_iter=[1000, 2000],
)

# Cross-validated search on the training split, then test-set metrics.
reg = fit_reg_by_grid_search_cv(Lasso(), params)
print_metrics_score(reg)

R^2: 0.05111016965195614
Mean Absolute Error: 1678.1001850025923
Mean Squared Error: 3738137.6909826235
Root Mean Squared Error: 1933.4264120939858


Посмотрим на оптимальные параметры

In [38]:
# Parameter combination selected by the Lasso grid search.
reg.best_params_

{'alpha': 0.8, 'max_iter': 1000}

Теперь посмотрим на веса коэффициентов

In [39]:
# Coefficients of the best Lasso model found by the search.
reg.best_estimator_.coef_

array([ 2.64984176e+02,  1.13285020e+01,  1.28350758e+01,  3.26002976e+02,
       -2.65315824e+01, -3.03551597e+01, -1.78797936e+01, -2.65978824e+01,
       -2.85953888e+01, -2.76257869e+01, -1.67666960e+01, -2.63472195e+01,
       -2.52055200e+01, -2.94515327e+01, -2.11351287e+01,  1.44187337e+02,
        1.60441352e+02, -3.34229447e+00, -4.70295830e+01,  7.66425272e+01,
       -9.07908445e+01, -0.00000000e+00,  3.35778096e+00, -4.60489628e+00,
       -1.43127619e+01, -5.53190387e+01, -8.26333812e-02,  4.43206949e+01,
        1.50726672e-01, -3.90684707e+01,  8.10836373e+01, -1.33872615e+00,
        6.61735247e+00,  2.93925447e+00,  3.12517441e-01, -4.64619354e+01,
       -1.43621226e+01, -0.00000000e+00,  9.61196812e+00,  0.00000000e+00,
        1.42825788e+01,  1.82466873e+01, -0.00000000e+00, -1.55594934e+00,
       -4.22405594e+01, -1.03549440e+02,  5.43333185e+01, -0.00000000e+00,
        1.18286558e+00, -5.66769194e+00,  1.00532651e+02,  2.74165564e+00,
        0.00000000e+00,  

Видно, что веса при некоторых признаках равны 0 или близки к нему: L1-регуляризация провела отбор признаков

Ошибка примерно такая же, как и при использовании дерева решений

### Ridge регрессия

Посмотрим на Ridge (линейная модель с L2-регуляризацией). Будем подбирать параметры alpha (аналогично предыдущему) и max_iter

In [49]:
# Grid for Ridge: the same alpha and max_iter values as used for Lasso.
# NOTE(review): the recorded best alpha (0.8) is the largest value in the
# grid, so the optimum may lie beyond it - consider widening the range.
params = dict(
    alpha=[0.2, 0.4, 0.5, 0.6, 0.8],
    max_iter=[1000, 2000],
)

# Cross-validated search on the training split, then test-set metrics.
reg = fit_reg_by_grid_search_cv(Ridge(), params)
print_metrics_score(reg)

R^2: 0.04953961001916707
Mean Absolute Error: 1669.546780102418
Mean Squared Error: 3714557.7653065016
Root Mean Squared Error: 1927.3188021981473


Результат аналогичен предыдущему   
Посмотрим на оптимальные параметры и коэффициенты модели

In [50]:
# Parameter combination selected by the Ridge grid search.
reg.best_params_

{'alpha': 0.8, 'max_iter': 1000}

In [51]:
# Coefficients of the best Ridge model found by the search.
reg.best_estimator_.coef_

array([263.1201082 ,  12.79795794,   1.98952536, 332.97858673,
       -24.996593  , -23.93975523, -23.22082142, -27.64933624,
       -24.74780352, -25.02023919, -16.21114066, -28.84483862,
       -24.31052326, -26.56819458, -28.437431  ,  53.73676339,
        66.98704055, -26.92238296, -39.43419401,  26.3438889 ,
       -69.39305333, -39.3460812 ,  -2.17297974, -30.8958456 ,
        -8.29280376, -42.1609461 ,  -8.29280376,  53.36368058,
        24.4214967 , -43.41254137, 104.2266636 , -39.43419401,
         7.26273663,  26.3438889 ,  -7.54582294, -36.55791693,
       -20.19306125,   0.91988757,   7.1574637 , -11.47518876,
         4.00856245,  22.37557697, -12.63538172, -69.39305333,
       -50.22581474, -91.87740747,  57.71829414, -27.49601661,
        -1.69791039,  -8.91089348, 121.24198854,   3.28971202,
         0.        ,  33.07272323,   8.46229372,  -2.17297974,
       -38.33212364, -22.57290423,  -8.14709481, -16.35050752,
        -6.38344742, -14.99421511,  -4.83880084, -17.32

### ElasticNet регрессия

Протестируем работу алгоритма ElasticNet (линейная модель, совмещающая в себе L1- и L2-регуляризации) на наших данных   
Будем использовать класс ElasticNetCV для подбора параметров на кросс-валидации

In [55]:
# ElasticNetCV picks its own regularisation settings by cross-validation,
# so no external grid search is needed: fit it directly and report metrics.
elastic_net_reg = ElasticNetCV().fit(X_train, y_train)
print_metrics_score(elastic_net_reg)

R^2: 0.04507284293669578
Mean Absolute Error: 1679.4645093773856
Mean Squared Error: 3732014.63623654
Root Mean Squared Error: 1931.842290725757


Посмотрим на коэффициенты модели

In [56]:
# Coefficients of the fitted ElasticNetCV model.
elastic_net_reg.coef_

array([199.81531906,   9.61695951,   1.62725387, 144.84198447,
       -19.69634766, -18.74315963, -18.41757616, -22.37965808,
       -20.07264257, -19.52308994, -12.84700067, -22.09490454,
       -19.45193327, -19.79458944, -22.49737876,  -1.12016797,
        75.55425961,  -9.13360767, -35.2091813 ,  24.29316059,
       -52.54614706, -15.55607322,  -1.50116563, -18.4544976 ,
        -6.43576871, -28.62246018,  -6.43602317,  30.94002591,
        40.01827139, -26.29929938,  58.23696683, -35.20948215,
         5.99581583,  24.28961176,  -2.50400998, -20.6698962 ,
       -10.47033589,   7.81857653,   7.25054908,  -1.58920204,
         3.58021691,  17.705596  ,  -7.37343808, -52.55312673,
       -27.82516741, -36.05699855,  49.88919775, -18.42844301,
         0.        ,  -5.01877901,  87.88418665,   2.96593406,
         0.        , -15.03222109,   7.28935029,  -1.5016315 ,
       -25.33460854, -15.83654515,  -5.53417822, -10.13290685,
        -3.16193998, -10.09086848,  -1.09984925, -12.90

## Выводы

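Если не брать в расчёт линейную регрессию, на всех алгоритмах ошибка **большая**, но **примерно одинаковая** (RMSE примерно 1900–2050)  
При этом R^2 близок к 0 (у дерева решений он отрицательный), то есть модели предсказывают немногим лучше среднего значения целевой переменной  
С точки зрения качества можно использовать любой из этих алгоритмов (кроме линейной регрессии)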