# Boston - Cross Validation

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
#from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder

## Data Loading

In [None]:
# Load the Boston housing dataset through scikit-learn's dataset helpers.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
boston = datasets.load_boston()
# List the attributes of the returned object; later cells read `data`,
# `target` and `feature_names` from it.
dir(boston)

In [None]:
#print(boston.DESCR)

## Data Exploration

In [None]:
# Feature matrix and regression target, taken straight from the dataset object.
X = boston.data
y = boston.target

# Hold out a test split; the fixed seed keeps the split reproducible
# between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# Display the shapes of the full feature matrix and the target vector.
tuple(arr.shape for arr in (X, y))

In [None]:
# Display the shapes of the four arrays produced by the train/test split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Data Visualization

In [None]:
# Scatter every feature column against the target, one panel per feature.
# The grid is 3x5, so any panel beyond the last feature is hidden. The cut-off
# is taken from the data (X.shape[1]) instead of a hard-coded column count, so
# the cell stays correct if the feature matrix changes width.
fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for i, ax in enumerate(axes.flat):
    if i >= X.shape[1]:
        # No feature left for this panel.
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")

In [None]:
# Box plot of each raw (unscaled) feature column, labelled with the feature
# names, then written to disk.
plt.boxplot(X)
plt.xticks(
    np.arange(X.shape[1]) + 1,  # tick positions 1..n_features
    boston.feature_names,
    rotation=30,
    ha="right",
)
plt.savefig("../images/boston_unscaled_box.png")

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so the test data never influences the
# fitted scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Box plot of each standardised training feature column, labelled with the
# feature names, then written to disk.
plt.boxplot(X_train)
plt.xticks(
    np.arange(X.shape[1]) + 1,  # tick positions 1..n_features
    boston.feature_names,
    rotation=30,
    ha="right",
)
plt.savefig("../images/boston_scaled_box.png")

## Linear Regression

### Train-Test Score

In [None]:
# Fit ordinary least squares on the training split (fit returns the estimator
# itself) and display its score on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

### Cross Validation Score

In [None]:
# 5-fold cross-validation of a fresh LinearRegression on the training split;
# the displayed value is the mean of the per-fold scores.
scores = cross_val_score(
    LinearRegression(),
    X_train,
    y_train,
    cv=5,
)
scores.mean()

## Ridge Regression

### Train-Test Score

In [None]:
# Fit a Ridge model with its default settings on the training split (fit
# returns the estimator itself) and display its score on the test split.
r = Ridge().fit(X_train, y_train)
r.score(X_test, y_test)

### Cross Validation Score

In [None]:
# 5-fold cross-validation of a fresh default Ridge model on the training
# split; the displayed value is the mean of the per-fold scores.
scores = cross_val_score(
    Ridge(),
    X_train,
    y_train,
    cv=5,
)
scores.mean()

### Grid Search: Parameter Alpha

#### Parameter Definition

In [None]:
# Candidate regularisation strengths for the grid search: 14 values spaced
# evenly on a log scale from 1e-3 to 1e3.
param_grid = dict(alpha=np.logspace(-3, 3, num=14))
print(param_grid)

#### Grid Creation & Training

In [None]:
# 10-fold grid search over the Ridge `alpha` values in `param_grid`.
# Training scores are kept (return_train_score=True) because the results
# plot below compares them with the validation scores.
# The former `iid=True` argument is intentionally not passed: it was
# deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
# raises a TypeError.
grid = GridSearchCV(Ridge(), param_grid, cv=10, return_train_score=True)
grid.fit(X_train, y_train)

#### Grid Best Params

In [None]:
# Report the best parameter combination found by the search, followed by its
# mean cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

#### Grid Results Visualization

In [None]:
# Tabulate the grid-search results and plot mean train and mean validation
# score against alpha on a shared, log-scaled x axis.
results = pd.DataFrame(grid.cv_results_)
fig, axes = plt.subplots(figsize=(8, 5))
results.plot(x='param_alpha', y='mean_train_score', ax=axes)
results.plot(x='param_alpha', y='mean_test_score', ax=axes)

axes.legend()
plt.xscale("log")

In [None]:
# Show, per alpha, the mean and spread of the validation score and its rank.
results.loc[
    :,
    ['param_alpha', 'mean_test_score', 'std_test_score', 'rank_test_score'],
]

## Lasso Regression

### Train-Test Score

In [None]:
# Fit a Lasso model with its default settings on the training split and
# display its score on the held-out test split.
# The score must come from the Lasso model `l`; the previous version scored
# the Ridge model `r` from the earlier section, so this cell reported Ridge's
# test score under the Lasso heading.
l = Lasso()
l.fit(X_train, y_train)
l.score(X_test, y_test)

### Cross Validation Score

In [None]:
# 5-fold cross-validation of a fresh default Lasso model on the training
# split; the displayed value is the mean of the per-fold scores.
scores = cross_val_score(
    Lasso(),
    X_train,
    y_train,
    cv=5,
)
scores.mean()

### Automatic Feature Selection

In [None]:
# Display the names of all feature columns, for comparison with the Lasso
# coefficients inspected in the following cells.
boston.feature_names

In [None]:
# Display the coefficients of the fitted Lasso model `l`; the next cell keeps
# only the features whose coefficient is non-zero.
l.coef_

In [None]:
# Names of the features the Lasso model kept, i.e. those whose coefficient is
# non-zero (selected here by position rather than by boolean mask).
boston.feature_names[np.flatnonzero(l.coef_)]

### Grid Search: Parameter Alpha

#### Parameter Definition

In [None]:
# Candidate regularisation strengths for the Lasso grid search: 14 values
# spaced evenly on a log scale from 1e-3 to 1e3.
param_grid = dict(alpha=np.logspace(-3, 3, num=14))
print(param_grid)

#### Grid Creation & Training

In [None]:
# 10-fold grid search over the Lasso `alpha` values in `param_grid`.
# Training scores are kept (return_train_score=True) because the results
# plot below compares them with the validation scores.
# The former `iid=True` argument is intentionally not passed: it was
# deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
# raises a TypeError.
grid = GridSearchCV(Lasso(), param_grid, cv=10, return_train_score=True)
grid.fit(X_train, y_train)

#### Grid Best Params

In [None]:
# Report the best parameter combination found by the search, followed by its
# mean cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

#### Grid Results Visualization

In [None]:
# Tabulate the grid-search results and plot mean train and mean validation
# score against alpha on a shared, log-scaled x axis.
results = pd.DataFrame(grid.cv_results_)
fig, axes = plt.subplots(figsize=(8, 5))
results.plot(x='param_alpha', y='mean_train_score', ax=axes)
results.plot(x='param_alpha', y='mean_test_score', ax=axes)

axes.legend()
plt.xscale("log")

In [None]:
# Show, per alpha, the mean and spread of the validation score and its rank.
results.loc[
    :,
    ['param_alpha', 'mean_test_score', 'std_test_score', 'rank_test_score'],
]