# ДЗ: Знакомство с машинным обучением

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline

In [2]:
# Load the Boston housing dataset from sklearn.datasets and print its bundled description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn release — confirm the pinned version.
boston_data = load_boston()
print(boston_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
# Target values are taken as provided by the dataset object; the feature matrix is
# wrapped in a DataFrame whose columns carry the dataset's feature names.
y = boston_data.target
X = pd.DataFrame(data=boston_data.data, columns=boston_data.feature_names)

In [4]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=5,
)
# Show which original row labels ended up in the training part.
X_train.index

Int64Index([ 33, 283, 418, 502, 402, 368, 201, 310, 343, 230,
            ...
            228,   8,  73, 400, 118, 486, 189, 495, 206, 355],
           dtype='int64', length=404)

In [6]:
# Display the training feature table.
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
33,1.15172,0.0,8.14,0.0,0.538,5.701,95.0,3.7872,4.0,307.0,21.0,358.77,18.35
283,0.01501,90.0,1.21,1.0,0.401,7.923,24.8,5.8850,1.0,198.0,13.6,395.52,3.16
418,73.53410,0.0,18.10,0.0,0.679,5.957,100.0,1.8026,24.0,666.0,20.2,16.45,20.62
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
402,9.59571,0.0,18.10,0.0,0.693,6.404,100.0,1.6390,24.0,666.0,20.2,376.11,20.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,5.69175,0.0,18.10,0.0,0.583,6.114,79.8,3.5459,24.0,666.0,20.2,392.68,14.98
189,0.08370,45.0,3.44,0.0,0.437,7.185,38.9,4.5667,5.0,398.0,15.2,396.90,5.39
495,0.17899,0.0,9.69,0.0,0.585,5.670,28.8,2.7986,6.0,391.0,19.2,393.29,17.60
206,0.22969,0.0,10.59,0.0,0.489,6.326,52.5,4.3549,4.0,277.0,18.6,394.87,10.97


In [7]:
# Display the training target values.
y_train

array([13.1, 50. ,  8.8, 20.6, 12.1, 50. , 24.1, 16.1, 23.9, 24.3, 13.1,
       30.3, 15.2, 13.8, 26.4, 16.6, 18.9, 17.6, 18.7, 33.4, 20.7, 17.1,
       23.4, 26.5, 21.4, 21.5, 19.2, 50. , 50. , 23. , 10.5, 17.8, 10.9,
       21. , 13.8, 10.5, 22.2, 30.5, 19.4, 15.6, 20.2, 19.3, 34.6, 50. ,
       24. , 18.7, 19.8, 22.5, 13.3, 50. , 11.8, 11. , 23.7, 35.4, 15.2,
       24.4, 33.4, 31.6, 13.4, 34.9, 14.4, 35.4, 25.3, 18.3, 16.6, 13.4,
       23.6, 27.5, 22.2, 17.7, 14.3, 21.7,  8.4, 15.3, 20.3, 32. , 20. ,
       19.1, 28.7, 46. , 22.6, 23.9, 21.9, 15.6, 50. , 25. , 37.9, 21.6,
       19.3, 17.5, 22.9, 15. , 27.5, 10.2, 23.8, 23.9, 20.1, 16.5, 33.1,
       14.6, 28.4, 23.7, 12.3, 31.5, 22. , 12.5, 35.1, 14.9, 22.9, 22.9,
       19.3, 19.8, 20. , 29.6, 20.5, 29. , 20.7, 19.9, 11.9,  5. , 23.3,
       20.6, 22.9, 19.6, 14.1, 30.8, 43.1, 19.9, 13.9, 22.3, 14.3, 23.9,
       16. , 20.5, 10.2, 20.1, 12.8, 18.9, 22. , 20.4, 17.5, 13.1, 22. ,
       45.4, 18.8, 20. , 20.1, 21.4, 17.4, 21.1, 28

## Обучить модель

In [8]:
# Ordinary linear regression with default settings.
model = LinearRegression()

In [9]:
# Fit the linear model on the training split.
model.fit(X_train, y_train)

LinearRegression()

## Оценить точность модели на обучающей и тестовой выборках

In [10]:
# Predictions for the rows the model was trained on.
pred_train = model.predict(X_train)

In [11]:
# Predictions for the held-out test rows.
pred_test = model.predict(X_test)

In [12]:
# Report MSE and R^2 for both splits: one line per metric/split, with a blank
# line separating the MSE pair from the R^2 pair.
print(
    f"Средняя квадратичная ошибка (MSE) модели на обучающей выборке {mean_squared_error(y_train, pred_train)}",
    f"Средняя квадратичная ошибка (MSE) модели на тестовой выборке {mean_squared_error(y_test, pred_test)}",
    "",
    f"Коэффициент детерминации (R^2) модели на обучающей выборке {r2_score(y_train, pred_train)}",
    f"Коэффициент детерминации (R^2) модели на тестовой выборке {r2_score(y_test, pred_test)}",
    sep="\n",
)

Средняя квадратичная ошибка (MSE) модели на обучающей выборке 22.477090408387628
Средняя квадратичная ошибка (MSE) модели на тестовой выборке 20.869292183770682

Коэффициент детерминации (R^2) модели на обучающей выборке 0.738339392059052
Коэффициент детерминации (R^2) модели на тестовой выборке 0.7334492147453092


In [13]:
# Compare four regressors that all share the same preprocessing:
# StandardScaler followed by PolynomialFeatures (default settings).
# LassoCV / RidgeCV / ElasticNetCV pick their regularisation parameters themselves
# from the candidates configured here.
candidates = {
    'linear_regression': LinearRegression(),
    'lasso': LassoCV(cv=5, n_jobs=-1),
    'ridge': RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]),
    'elastic_net': ElasticNetCV(n_jobs=-1, l1_ratio=[i/100 for i in range(1, 101)]),
}
metric_names = ['MSE_train', 'MSE_test', 'R2_train', 'R2_test']

# One row per model, one column per metric; float dtype keeps the table numeric.
results = pd.DataFrame(columns=metric_names, index=list(candidates), dtype=float)

# NOTE: the loop rebinds `model`, `pred_train` and `pred_test`; after this cell they
# refer to the last pipeline (elastic_net), not to the plain LinearRegression above.
for name, estimator in candidates.items():
    model = make_pipeline(StandardScaler(), PolynomialFeatures(), estimator)
    print(f"{name}:")
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    print(f"SCORE = {model.score(X_test, y_test)}")

    # Fill the row with a single .loc[row, cols] assignment. Chained indexing
    # (results.loc[name][col] = ...) writes into an intermediate object and can
    # leave `results` unchanged, e.g. under pandas Copy-on-Write.
    results.loc[name, metric_names] = [
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test),
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test),
    ]

    # Show the hyper-parameters selected by the *CV estimators (last pipeline step).
    if name in ('lasso', 'ridge'):
        print(f"\talpha={model[-1].alpha_}")
    elif name == 'elastic_net':
        print(f"\talpha={model[-1].alpha_}, l1_ratio={model[-1].l1_ratio_}")
    print("="*80)
results

linear_regression:
SCORE = 0.869967225640876
lasso:
SCORE = 0.8922968684000933
	alpha=0.014259493323556414
ridge:
SCORE = 0.889254375486358
	alpha=1.0
elastic_net:
SCORE = 0.8944933922162833
	alpha=0.025702921480391803, l1_ratio=0.45


Unnamed: 0,MSE_train,MSE_test,R2_train,R2_test
linear_regression,5.905266,10.180769,0.931256,0.869967
lasso,6.587117,8.432495,0.923318,0.892297
ridge,6.121455,8.670703,0.928739,0.889254
elastic_net,7.129038,8.26052,0.917009,0.894493


In [14]:
# Grid-search over the final estimator of a two-step pipeline:
# 'preprocessing' (StandardScaler + PolynomialFeatures) followed by 'estimator'.
alpha_grid = [0.001, 0.01, 0.1, 1, 10, 100]
l1_ratio_grid = [step / 100 for step in range(1, 101)]

pipe = Pipeline(steps=[
    ('preprocessing', make_pipeline(StandardScaler(), PolynomialFeatures())),
    ('estimator', Lasso()),  # placeholder; every grid below supplies its own estimator
])

# Each dict is searched independently: one per estimator family.
param_grid = [
    {
        'preprocessing': [make_pipeline(StandardScaler(), PolynomialFeatures())],
        'estimator': [Lasso()],
        'estimator__alpha': alpha_grid,
    },
    {
        'preprocessing': [make_pipeline(StandardScaler(), PolynomialFeatures())],
        'estimator': [Ridge()],
        'estimator__alpha': alpha_grid,
    },
    {
        # NOTE(review): this grid varies only l1_ratio; alpha is left at whatever
        # ElasticNet() defaults to, unlike the two grids above — confirm this is intended.
        'preprocessing': [make_pipeline(StandardScaler(), PolynomialFeatures())],
        'estimator': [ElasticNet()],
        'estimator__l1_ratio': l1_ratio_grid,
    },
]

# 5-fold cross-validation on the training split only; the test split is used
# once at the end to score the selected configuration.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'estimator': Ridge(alpha=10), 'estimator__alpha': 10, 'preprocessing': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('polynomialfeatures', PolynomialFeatures())])}

Best cross-validation score: 0.86
Test-set score: 0.90
