# Топик 4

In [2]:
%matplotlib inline
from matplotlib import pyplot as plt

# Default size (width, height), in inches, for matplotlib figures created later.
plt.rcParams["figure.figsize"] = (10, 8)

import collections

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Read the dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
# NOTE: despite the "_test" suffix, these two objects hold the complete
# dataset; the train/holdout split is derived from them afterwards.
x_test = df.drop(columns="quality")  # features: every column except the target
y_test = df["quality"]  # target column

### Разбиваем данные на две части: обучающую (train) и отложенную (holdout)

In [6]:
# Set aside 30% of the rows as a holdout set; the fixed seed makes the split
# reproducible.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_test,
    y_test,
    random_state=17,
    test_size=0.3,
)

In [7]:
# Learn the standardisation statistics from the training rows only, then apply
# that same transformation to both splits so the holdout rows do not influence
# the scaling.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_holdout_scaled = scaler.transform(x_holdout)

## Линейная регрессия

### Обучение простой модели линейной регрессии

In [8]:
# fit() returns the estimator itself, so construction and training are chained;
# being an assignment, the cell still produces no output.
linreg = LinearRegression().fit(x_train_scaled, y_train)

In [9]:
# Mean squared error of the linear model on the training and holdout splits.
linreg_mse_train = mean_squared_error(y_train, linreg.predict(x_train_scaled))
linreg_mse_holdout = mean_squared_error(y_holdout, linreg.predict(x_holdout_scaled))
print("Среднеквадратичная ошибка (train): %.3f" % linreg_mse_train)
print("Среднеквадратичная ошибка (test): %.3f" % linreg_mse_holdout)

Среднеквадратичная ошибка (train): 0.425
Среднеквадратичная ошибка (test): 0.399


### Признак, сильнее всего влияющий на качество вина по модели линейной регрессии

In [10]:
# One row per feature: the signed coefficient and its magnitude. The model was
# fitted on standardised inputs, so the magnitudes can be compared directly.
linreg_coef = pd.DataFrame(
    {"coef": linreg.coef_}, index=df.columns.drop("quality")
).assign(coef_abs=lambda frame: frame["coef"].abs())
# Show the features ordered from largest to smallest absolute coefficient.
linreg_coef.sort_values("coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
alcohol,0.288473,0.288473
volatile acidity,-0.196166,0.196166
sulphates,0.136241,0.136241
total sulfur dioxide,-0.09268,0.09268
chlorides,-0.071022,0.071022
fixed acidity,0.070145,0.070145
free sulfur dioxide,0.052133,0.052133
citric acid,-0.049277,0.049277
pH,-0.044656,0.044656
density,-0.03418,0.03418


## Lasso

### Обучение модели LASSO с 𝛼=0,01 (слабая регуляризация) и масштабированными данными. 

In [11]:
# Lasso with alpha=0.01, trained on the standardised training features.
lasso1 = Lasso(alpha=0.01, random_state=17).fit(x_train_scaled, y_train)
lasso1  # echo the fitted estimator, as the bare fit() call would

Lasso(alpha=0.01, random_state=17)

### Какой признак является наименее информативным для прогнозирования качества вина в соответствии с этой моделью LASSO?

In [12]:
# Signed coefficient and magnitude per feature for the alpha=0.01 Lasso model.
lasso1_coef = pd.DataFrame(
    {"coef": lasso1.coef_}, index=df.columns.drop("quality")
).assign(coef_abs=lambda frame: frame["coef"].abs())
# Largest magnitude first, so the least informative features end up last.
lasso1_coef.sort_values("coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
alcohol,0.29589,0.29589
volatile acidity,-0.180571,0.180571
sulphates,0.118258,0.118258
total sulfur dioxide,-0.072435,0.072435
chlorides,-0.064996,0.064996
pH,-0.038977,0.038977
free sulfur dioxide,0.02902,0.02902
fixed acidity,0.016821,0.016821
citric acid,-0.0,0.0
residual sugar,0.0,0.0


### Обучите LassoCV с random_state=17, чтобы выбрать лучшее значение 𝛼 в 5-кратной перекрестной проверке

In [13]:
# Candidate penalties: 200 values evenly spaced on a log scale from 1e-6 to 1e2.
alphas = np.logspace(-6, 2, num=200)
# 5-fold cross-validation over the candidates selects the alpha to keep.
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17).fit(x_train_scaled, y_train)
lasso_cv  # echo the fitted estimator, as the bare fit() call would

LassoCV(alphas=array([1.00000000e-06, 1.09698580e-06, 1.20337784e-06, 1.32008840e-06,
       1.44811823e-06, 1.58856513e-06, 1.74263339e-06, 1.91164408e-06,
       2.09704640e-06, 2.30043012e-06, 2.52353917e-06, 2.76828663e-06,
       3.03677112e-06, 3.33129479e-06, 3.65438307e-06, 4.00880633e-06,
       4.39760361e-06, 4.82410870e-06, 5.29197874e-06, 5.80522552e-06,
       6.36824994e-06, 6.98587975e-0...
       1.18953407e+01, 1.30490198e+01, 1.43145894e+01, 1.57029012e+01,
       1.72258597e+01, 1.88965234e+01, 2.07292178e+01, 2.27396575e+01,
       2.49450814e+01, 2.73644000e+01, 3.00183581e+01, 3.29297126e+01,
       3.61234270e+01, 3.96268864e+01, 4.34701316e+01, 4.76861170e+01,
       5.23109931e+01, 5.73844165e+01, 6.29498899e+01, 6.90551352e+01,
       7.57525026e+01, 8.30994195e+01, 9.11588830e+01, 1.00000000e+02]),
        cv=5, random_state=17)

In [14]:
# Penalty strength chosen by the cross-validated search.
lasso_cv.alpha_

0.001135733358343105

### Какой признак наименее информативен для прогнозирования качества вина по настроенной модели LASSO?

In [15]:
# Signed coefficient and magnitude per feature for the cross-validated Lasso.
lasso_cv_coef = pd.DataFrame(
    {"coef": lasso_cv.coef_}, index=df.columns.drop("quality")
).assign(coef_abs=lambda frame: frame["coef"].abs())
# Largest magnitude first, so the least informative features end up last.
lasso_cv_coef.sort_values("coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
alcohol,0.291851,0.291851
volatile acidity,-0.194686,0.194686
sulphates,0.133492,0.133492
total sulfur dioxide,-0.09047,0.09047
chlorides,-0.070557,0.070557
fixed acidity,0.059823,0.059823
free sulfur dioxide,0.049825,0.049825
pH,-0.045997,0.045997
citric acid,-0.04338,0.04338
density,-0.025397,0.025397


### Среднеквадратичные ошибки

In [16]:
# Mean squared error of the tuned Lasso on the training and holdout splits.
lasso_mse_train = mean_squared_error(y_train, lasso_cv.predict(x_train_scaled))
lasso_mse_holdout = mean_squared_error(y_holdout, lasso_cv.predict(x_holdout_scaled))
print("Среднеквадратичная ошибка (train): %.3f" % lasso_mse_train)
print("Среднеквадратичная ошибка (test): %.3f" % lasso_mse_holdout)

Среднеквадратичная ошибка (train): 0.425
Среднеквадратичная ошибка (test): 0.399


## Random Forest

### Обучение Random Forest с параметрами по умолчанию

In [17]:
# Random forest with library defaults; only the seed is fixed.
forest = RandomForestRegressor(random_state=17).fit(x_train_scaled, y_train)
forest  # echo the fitted estimator, as the bare fit() call would

RandomForestRegressor(random_state=17)

In [18]:
# Training, cross-validated and holdout MSE of the default forest.
forest_mse_train = mean_squared_error(y_train, forest.predict(x_train_scaled))

# The scorer yields negated MSE values, so take magnitudes before averaging.
forest_cv_scores = cross_val_score(
    forest, x_train_scaled, y_train, scoring="neg_mean_squared_error"
)
forest_mse_cv = np.abs(forest_cv_scores).mean()

forest_mse_holdout = mean_squared_error(y_holdout, forest.predict(x_holdout_scaled))

print("Среднеквадратичная ошибка (train): %.3f" % forest_mse_train)
print("Среднеквадратичная ошибка (cv): %.3f" % forest_mse_cv)
print("Среднеквадратичная ошибка (test): %.3f" % forest_mse_holdout)

Среднеквадратичная ошибка (train): 0.048
Среднеквадратичная ошибка (cv): 0.356
Среднеквадратичная ошибка (test): 0.356


### Настройка гиперпараметров
Подбор гиперпараметров max_features и max_depth перебором по сетке с 5-кратной перекрёстной проверкой

In [19]:
# Search grid: max_depth 10..24 and max_features 6..11 (15 x 6 = 90 candidates).
forest_params = {
    "max_depth": [*range(10, 25)],
    "max_features": [*range(6, 12)],
}

# Exhaustive 5-fold search over the grid, scored by negated MSE.
locally_best_forest = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
    param_grid=forest_params,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=True,
)
locally_best_forest.fit(x_train_scaled, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11]},
             scoring='neg_mean_squared_error', verbose=True)

In [20]:
# Winning hyperparameters and their mean cross-validated score (negated MSE,
# per the scoring passed to the search).
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 19, 'max_features': 6}, -0.3502156757419382)

### Каковы среднеквадратичные ошибки настроенной модели Random Forest?

In [21]:
# The grid search already stores the mean 5-fold score of the winning
# configuration in best_score_. It was obtained on the same training data with
# the same scorer and fold count, so reuse it instead of refitting five more
# forests through cross_val_score. The score is negated MSE, hence the sign flip.
tuned_mse_cv = -locally_best_forest.best_score_

# predict() on the search object delegates to the refitted best estimator.
tuned_mse_holdout = mean_squared_error(
    y_holdout, locally_best_forest.predict(x_holdout_scaled)
)

print("Среднеквадратичная ошибка (cv): %.3f" % tuned_mse_cv)
print("Среднеквадратичная ошибка (test): %.3f" % tuned_mse_holdout)

Среднеквадратичная ошибка (cv): 0.350
Среднеквадратичная ошибка (test): 0.345
