In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


%matplotlib inline

## 1. Cross Val Score

In [95]:
# Pull the already-preprocessed house-prices table from GitHub and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer='https://github.com/mbburova/MDS/raw/main/house_prices_prep.csv'
)
data.head(n=5)

Unnamed: 0,SalePrice,LotArea,OverallQual,MasVnrArea,TotalBsmtSF,GrLivArea,FullBath,GarageCars,Fireplaces,WoodDeckSF,...,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_other,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None
0,208500,8450.0,7,196.0,856.0,1710.0,2,2,0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,181500,9600.0,6,0.0,1262.0,1262.0,2,2,1,298.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,223500,11250.0,7,162.0,920.0,1786.0,2,2,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,140000,9550.0,7,0.0,756.0,1717.0,1,3,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,250000,14260.0,8,350.0,1145.0,2198.0,2,3,1,192.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [96]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split reproducible.
tr, te = train_test_split(data, test_size=0.2, random_state=42)

# Split each partition into the feature matrix and the `SalePrice` target.
X_train, y_train = tr.drop(columns='SalePrice'), tr['SalePrice']
X_test, y_test = te.drop(columns='SalePrice'), te['SalePrice']

All the preprocessing was already done, so the only thing we need to do is scale numerical features. For example, we can use `StandardScaler` for that.

In [97]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Numerical features that get rescaled.
num_cols = [
    'LotArea',
    'MasVnrArea',
    'TotalBsmtSF',
    'GrLivArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'Age',
    'RemodAge',
]

# Standardize the numerical columns; every other column is passed through unchanged.
column_transforms = ColumnTransformer(
    transformers=[('scaling', StandardScaler(), num_cols)],
    remainder='passthrough',
)

Total Pipeline:
 - Column Transformer
 - Linear Regression

In [98]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Baseline pipeline: column preprocessing followed by ordinary least squares.
linear_model = LinearRegression()
pipe = make_pipeline(column_transforms, linear_model)

But what if we want to try different preprocessing? For example, we could use `MinMaxScaler` for the numerical features instead of `StandardScaler`.


In [99]:
from sklearn.preprocessing import MinMaxScaler

# Option 2: same layout as the first transformer, but the numerical columns
# go through MinMaxScaler instead of StandardScaler.
column_transforms_2 = ColumnTransformer(
    transformers=[('scaling', MinMaxScaler(), num_cols)],
    remainder='passthrough',
)

# Second candidate pipeline: min-max preprocessing + linear regression.
pipe_2 = make_pipeline(column_transforms_2, LinearRegression())


We would like to compare Linear Regression with these two types of preprocessing **before** evaluating the model on the test set. Cross-validation is very useful in this case.

![im](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

The `sklearn.model_selection` module has a function `cross_val_score`.

**Parameters**:
 - estimator (model or the whole pipeline)
 - training data
 - number of folds or custom CV object
 - scorer 

In [100]:
# possible scorers
import sklearn.metrics

# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.1 and removed in 1.3;
# `get_scorer_names()` is the supported replacement. Fall back to the old dict only
# on installations that predate `get_scorer_names`.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()

sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we

We will use K-Fold cross validation. But there are other, more sophisticated options available. You can read about them [here](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators)

CV-score of the first pipeline:

In [101]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split: one negated MSE per fold.
cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

array([-7.45549592e+08, -1.85002686e+09, -8.77324619e+08, -2.08352484e+09,
       -3.73386123e+09, -1.50531901e+09, -7.92459716e+08, -7.57815939e+08,
       -6.11313256e+08, -7.93048319e+08])

In [103]:
# Flip the sign of each fold's negated MSE, take the square root to get a
# per-fold RMSE, then average over the 10 folds.
rmse = (
    (-cross_val_score(pipe, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) ** 0.5
).mean()
rmse

35405.07542373271

CV-score of the second pipeline:

In [104]:
# Mean per-fold RMSE for the MinMaxScaler pipeline.
# An unregularized linear regression gives the same predictions under any affine
# rescaling of its features, so this is expected to match the StandardScaler
# pipeline up to floating-point error.
rmse = (
    (-cross_val_score(pipe_2, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) ** 0.5
).mean()
rmse

35405.075423732706

---

## 2. Linear Regression with Regularization

**Lasso**
$$
\min_{w} MSE + \lambda \|w\|_1
$$


**Ridge**
$$
\min_{w} MSE + \lambda \|w\|_2^2
$$

Let us use cross-validation to compare Lasso and Ridge regression.

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge

# Two regularized pipelines that share the same preprocessing step.
# `make_pipeline` names each step after its lowercased class name.
pipe_lasso = make_pipeline(
    column_transforms,
    Lasso(max_iter=2000),
)
# Equivalent with explicit step names:
# pipe_lasso = Pipeline([('transform', column_transforms), ('lasso', Lasso(max_iter=2000))])

pipe_ridge = make_pipeline(
    column_transforms,
    Ridge(),
)
# Equivalent with explicit step names:
# pipe_ridge = Pipeline([('transform', column_transforms), ('ridge', Ridge())])

In [106]:
# Lasso: mean of the per-fold RMSEs from 10-fold cross-validation.
rmse = (
    (-cross_val_score(pipe_lasso, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) ** 0.5
).mean()
rmse

35404.292939388324

In [107]:
# Ridge: mean of the per-fold RMSEs from 10-fold cross-validation.
rmse = (
    (-cross_val_score(pipe_ridge, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) ** 0.5
).mean()
rmse

35390.22070413702

In [108]:
# take a look at our pipeline: `steps` lists the (name, estimator) pairs;
# no names were passed to `make_pipeline`, so it generated them itself
pipe_lasso.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('scaling', StandardScaler(),
                                   ['LotArea', 'MasVnrArea', 'TotalBsmtSF',
                                    'GrLivArea', 'WoodDeckSF', 'OpenPorchSF',
                                    'Age', 'RemodAge'])])),
 ('lasso', Lasso(max_iter=2000))]

In [109]:
# same inspection for the ridge pipeline; the step name 'ridge' shown here
# is the prefix used in the `ridge__alpha` grid key
pipe_ridge.steps

[('columntransformer',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('scaling', StandardScaler(),
                                   ['LotArea', 'MasVnrArea', 'TotalBsmtSF',
                                    'GrLivArea', 'WoodDeckSF', 'OpenPorchSF',
                                    'Age', 'RemodAge'])])),
 ('ridge', Ridge())]

But now we also want to try different values of the regularization coefficient. Creating a new pipeline for each option would be too much, so we need a better solution. `GridSearchCV` will help us.

In [110]:
from sklearn.model_selection import GridSearchCV

In [112]:
# Candidate regularization strengths. The key follows the `<step name>__<parameter>`
# convention, so it targets `alpha` of the pipeline step named 'ridge'.
param_grid = {'ridge__alpha': [1e-4, 1e-2, 1e-1, 1, 10]}

# Grid search over the ridge pipeline: 10-fold CV, scored by negated MSE.
pipe_cv = GridSearchCV(
    estimator=pipe_ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [113]:
# fit `pipe_cv`: runs the 10-fold cross-validated search over every `ridge__alpha` candidate
pipe_cv.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scaling',
                                                                         StandardScaler(),
                                                                         ['LotArea',
                                                                          'MasVnrArea',
                                                                          'TotalBsmtSF',
                                                                          'GrLivArea',
                                                                          'WoodDeckSF',
                                                                          'OpenPorchSF',
                                                                          'Age',
                                                      

In [114]:
# get best estimator: the pipeline configured with the `ridge__alpha` the search selected
pipe_cv.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaling', StandardScaler(),
                                                  ['LotArea', 'MasVnrArea',
                                                   'TotalBsmtSF', 'GrLivArea',
                                                   'WoodDeckSF', 'OpenPorchSF',
                                                   'Age', 'RemodAge'])])),
                ('ridge', Ridge(alpha=10))])

### Compare models with GridSearchCV
So far we've used cross-validation to:
- Compare two different models
- Select best set of hyperparameters within one model

But what if we want to do both? We can use `GridSearchCV` to compare different models with different sets of hyperparameters and select the best one. 

To do that, we need to add different models into the parameter grid. 

In [115]:
from sklearn.pipeline import Pipeline

# Pipeline with a named regressor slot. The Ridge() placed here is only a
# placeholder: the grid below replaces the whole 'reg' step.
pipe = Pipeline(steps=[
    ('preprocess', column_transforms),
    ('reg', Ridge()),
])

# Search space: which estimator fills the 'reg' step, crossed with its `alpha`.
param_grid = dict(
    reg=[Ridge(max_iter=1000000), Lasso(max_iter=1000000)],
    reg__alpha=[1e-2, 1e-1, 1, 10],
)

# Grid search across both models and all alphas: 10-fold CV, negated MSE.
pipe_cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [116]:
# fit: cross-validate every (model, alpha) combination from the grid on the training split
pipe_cv.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scaling',
                                                                         StandardScaler(),
                                                                         ['LotArea',
                                                                          'MasVnrArea',
                                                                          'TotalBsmtSF',
                                                                          'GrLivArea',
                                                                          'WoodDeckSF',
                                                                          'OpenPorchSF',
                                                                          'Age',
                                                             

In [117]:
# show the best model found by the grid search (the winning pipeline, not its score)
pipe_cv.best_estimator_

Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaling', StandardScaler(),
                                                  ['LotArea', 'MasVnrArea',
                                                   'TotalBsmtSF', 'GrLivArea',
                                                   'WoodDeckSF', 'OpenPorchSF',
                                                   'Age', 'RemodAge'])])),
                ('reg', Ridge(alpha=10, max_iter=1000000))])

Finally, what if we also want to compare the Linear Regression model with Ridge and Lasso? 

We cannot add it to the list of models in the parameter grid above, because it does not have an `alpha` parameter.
It turns out `GridSearchCV` can deal with this situation as well: we can pass a **list of dictionaries** as the param grid.

In [118]:
# Pipeline whose 'reg' step is swapped out by the grid search.
pipe = Pipeline(steps=[
    ('preprocess', column_transforms),
    ('reg', Ridge()),
])

# Sub-grid for the regularized models: estimator choice crossed with `alpha`.
regularized_grid = dict(
    reg=[Ridge(max_iter=1000000), Lasso(max_iter=1000000)],
    reg__alpha=[1e-2, 1e-1, 1, 10],
)
# Sub-grid for plain linear regression: only the model itself is listed.
linear_grid = dict(reg=[LinearRegression()])

# A list of dictionaries is searched as the union of the individual grids.
param_grid = [regularized_grid, linear_grid]

# Grid search over all three model families: 10-fold CV, negated MSE.
pipe_cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [119]:
# fit the grid search over all candidate models; the best estimator is retrieved in a later cell
pipe_cv.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scaling',
                                                                         StandardScaler(),
                                                                         ['LotArea',
                                                                          'MasVnrArea',
                                                                          'TotalBsmtSF',
                                                                          'GrLivArea',
                                                                          'WoodDeckSF',
                                                                          'OpenPorchSF',
                                                                          'Age',
                                                             

### Train best model on the whole train and evaluate on test

Now we can take the best estimator found by the grid search, train it on the whole training dataset, and evaluate it on the test dataset.

In [120]:
# Take the winning pipeline out of the finished grid search.
best_m = pipe_cv.best_estimator_

# NOTE(review): `pipe_cv` was built without a `refit=` argument. With scikit-learn's
# default (refit=True), `best_estimator_` should already be fitted on all of X_train,
# so this explicit fit is likely redundant — confirm before removing it.
# `fit` returns the estimator itself, so the test-set prediction can be chained.
predict = best_m.fit(X_train, y_train).predict(X_test)

In [121]:
# calculate root mean squared error on the test set
from sklearn.metrics import mean_squared_error

In [122]:
# Test-set RMSE: square root of the mean squared error of the held-out predictions.
mean_squared_error(y_true=y_test, y_pred=predict) ** 0.5

36459.483295899