# Домашняя работа
Взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество)

In [37]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [23]:
import numpy as np
import pandas as pd

In [30]:
X, y = load_boston(return_X_y=True)

#### Пробуем для начала простую модель линейной регрессии:

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [32]:
reg_basic = LinearRegression()

In [33]:
reg_basic.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [34]:
y_pred = reg_basic.predict(X_test)

In [36]:
print("Basic Linear Regression R^2: {}".format(reg_basic.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Basic Linear Regression R^2: 0.711226005748496
Root Mean Squared Error: 4.638689926172797


#### Теперь попробуем то же самое, но уже с cross-validation:

In [108]:
cv_scores = cross_val_score(reg_basic, X, y, cv = 5)
rmse_cv = np.sqrt(-cross_val_score(reg_basic, X, y,  scoring="neg_mean_squared_error", cv = 5))

In [109]:
print("Mean 5-Fold CV Score: {}".format(np.mean(cv_scores)))
print("Root Mean Squared Error: {}".format(np.mean(rmse)))

Mean 5-Fold CV Score: 0.3532759243958772
Root Mean Squared Error: 5.074830769055445


#### Попробуем GridSearch c Ridge регрессией:

In [91]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [111]:
ridge = Ridge()

In [112]:
parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

In [113]:
ridge_reg = GridSearchCV(ridge, parameters, scoring = 'neg_mean_squared_error', cv = 5)

In [116]:
ridge_reg.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [118]:
print(ridge_reg.best_params_)

{'alpha': 20}


In [120]:
ridge_best = Ridge(alpha = 20)

In [121]:
ridge_best.fit(X_train, y_train)

Ridge(alpha=20, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [122]:
y_pred = ridge_best.predict(X_test)

In [123]:
print("Basic Linear Regression R^2: {}".format(ridge_best.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Basic Linear Regression R^2: 0.694304367508078
Root Mean Squared Error: 4.772664651800935


#### Попробуем GridSearch c Lasso регрессией:

In [124]:
lasso = Lasso()

In [125]:
lasso_reg = GridSearchCV(lasso, parameters, scoring = 'neg_mean_squared_error', cv = 5)

In [126]:
lasso_reg.fit(X, y)

  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [127]:
print(lasso_reg.best_params_)

{'alpha': 1}


In [128]:
lasso_best = Ridge(alpha = 1)

In [129]:
lasso_best.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [130]:
y_pred = lasso_best.predict(X_test)

In [131]:
print("Basic Linear Regression R^2: {}".format(lasso_best.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Basic Linear Regression R^2: 0.7041586727559436
Root Mean Squared Error: 4.695109486461525


Вывод: среди тех моделей, которые были применены лучший результат показала обычная LinearRegression