In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn.datasets import load_boston

In [3]:
boston_data = load_boston()
X = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
y = boston_data.target

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train_lin = scaler.transform(X_train)
X_test_lin = scaler.transform(X_test)

# Кросс-валидация для линейной регрессии

In [7]:
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(X_train_lin, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
from sklearn.model_selection import cross_val_score
scores_train = cross_val_score(LR_model, X_train_lin, y_train, cv=10)
scores_train.mean()

0.7060076733666493

In [9]:
LR_model.score(X_test_lin, y_test)

0.7035984390534467

In [10]:
scores_train

array([0.67984886, 0.60747438, 0.53844851, 0.76343917, 0.77780307,
       0.63597571, 0.83567841, 0.7969538 , 0.72968804, 0.69476679])

# GridSearchCV

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
k_range = list(range(10, 110, 10))
k_range

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [14]:
param_grid = {'n_estimators': k_range, 'max_depth': [1, 3, 5, 7, 9, None]}
param_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_depth': [1, 3, 5, 7, 9, None]}

In [15]:
grid_RF = GridSearchCV(RandomForestRegressor(), param_grid, cv=10)

In [16]:
grid_RF.fit(X_train_lin, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_depth': [1, 3, 5, 7, 9, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
print(grid_RF.best_params_)
print(grid_RF.best_score_)
print(grid_RF.best_estimator_)

{'max_depth': None, 'n_estimators': 100}
0.8346739112514507
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [18]:
from sklearn import linear_model

model = linear_model.SGDRegressor()

param_grid = {
    'alpha': [0.1, 0.001, 0.0001],
    'loss': ['squared_loss', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
}
grid_SGD = GridSearchCV(model, param_grid, cv=10)
grid_SGD.fit(X_train_lin, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.1, 0.001, 0.0001], 'loss': ['squared_loss', 'huber', 'epsilon_insensitive'], 'penalty': ['l2', 'l1', 'elasticnet'], 'learning_rate': ['constant', 'optimal', 'invscaling']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
print(grid_SGD.best_params_)
print(grid_SGD.best_score_)
print(grid_SGD.best_estimator_)

{'alpha': 0.0001, 'learning_rate': 'invscaling', 'loss': 'squared_loss', 'penalty': 'l1'}
0.6974802123880638
SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l1', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)


# Оцениваем на валидационной выборке лучшие модели

In [20]:
score_RF = grid_RF.best_estimator_.score(X_test_lin, y_test)
score_SGD = grid_SGD.best_estimator_.score(X_test_lin, y_test)

In [21]:
score_RF

0.8718963785653266

In [22]:
score_SGD

0.6727720624326131