# Reference: Python Data Science Handbook

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


%autosave 9999
%matplotlib inline

In [None]:
# Model validation done the wrong way -- first, get the data.
# Loading route one: the iris dataset object shipped with scikit-learn.
from sklearn.datasets import load_iris

iris1 = load_iris()
X1, y1 = iris1.data, iris1.target

# Report the shape of the feature matrix, then of the target vector.
print(X1.shape)
print(y1.shape)

In [None]:
# Loading route two: ask seaborn for its 'iris' dataset and preview
# the first five rows.
iris2 = sns.load_dataset('iris')
iris2.head(n=5)

In [None]:
# Distinct values found in the species column.
iris2.species.unique()

In [None]:
def s2n(x):
    """Map a species name to an integer class label.

    Returns 0 for 'setosa', 1 for 'versicolor', and 2 for any other value.
    """
    if x == 'setosa':
        return 0
    return 1 if x == 'versicolor' else 2
# Encode the species names as integer labels in a new column.
iris2['species_n'] = iris2['species'].map(s2n)

# Select the columns *before* converting to arrays. Calling `.values` on the
# whole frame (string species column next to numeric columns) produces a
# single object-dtype array, so slices of it are object-dtype too.
# Assumes the first four columns are the numeric measurements -- confirm
# against the `iris2.head()` preview.
X2 = iris2.iloc[:, 0:4].values
# Pick the label column by name rather than by position 5, so the result does
# not depend on where the new column happens to sit in the frame.
y2 = iris2['species_n'].values
print(X2.shape,y2.shape,sep='\n')

In [None]:
# Fit a k-nearest-neighbours classifier with n_neighbors=1.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

model = KNeighborsClassifier(n_neighbors=1)
# The flaw being demonstrated: the model is scored on the very same samples
# (X1, y1) it was fitted on, i.e. training and testing use one dataset.
y_pred = model.fit(X1, y1).predict(X1)
accuracy_score(y1, y_pred)

In [None]:
# Hold-out validation: fit on one half of the data, score on the other half.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitter
# is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state = 0, train_size = 0.5)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Splitting costs us training data, which can be a problem when the original
# training set is small. Cross-validation addresses this: here each half
# takes a turn as the training set while the other half is used for scoring.
model.fit(X_train, y_train)
y_pred1 = model.predict(X_test)
model.fit(X_test, y_test)
y_pred2 = model.predict(X_train)
accuracy_score(y_test, y_pred1), accuracy_score(y_train, y_pred2)

In [None]:
# Cross-validation with cv=5 on the full iris data.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use
# `sklearn.model_selection` instead.
from sklearn.model_selection import cross_val_score

cross_val_score(model,X1,y1, cv=5)

In [None]:
# Leave-one-out cross-validation.
# `sklearn.cross_validation` was removed in scikit-learn 0.20. The
# `sklearn.model_selection.LeaveOneOut` splitter takes no constructor
# arguments (the number of samples is derived from the data at split time),
# and must be paired with the `cross_val_score` from the same module.
from sklearn.model_selection import LeaveOneOut, cross_val_score

score = cross_val_score(model, X1, y1, cv=LeaveOneOut())

In [None]:
# Average of the per-split scores computed in the previous cell.
np.mean(score)

In [None]:
# Validation curves in Scikit-Learn
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2,**kwargs):
    """Build a polynomial-regression pipeline.

    Parameters
    ----------
    degree : passed to ``PolynomialFeatures``.
    **kwargs : forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    The pipeline produced by ``make_pipeline``: the feature expansion step
    followed by the linear regression step.
    """
    expand = PolynomialFeatures(degree)
    regress = LinearRegression(**kwargs)
    return make_pipeline(expand, regress)

def make_data(N,err=1.0, rseed=1):
    """Generate a small synthetic 1-D regression dataset.

    Parameters
    ----------
    N : int
        Number of samples.
    err : float, default 1.0
        Scale of the Gaussian noise added to ``y``; no noise is added when
        ``err <= 0``.
    rseed : int, default 1
        Seed for the random number generator.

    Returns
    -------
    X : ndarray of shape (N, 1)
        Squared uniform samples.
    y : ndarray of shape (N,)
        ``10 - 1 / (X + 0.1)`` plus optional noise.
    """
    # Use a private legacy generator instead of np.random.seed(): it yields
    # the same number stream for a given seed, but does not reseed the global
    # NumPy RNG behind the caller's back.
    rng = np.random.RandomState(rseed)
    X = rng.rand(N,1)**2
    y = 10 - 1./(X.ravel()+0.1)
    if err>0:
        y = y + err*rng.randn(N)
    return X,y
# Working dataset for the validation-curve examples: 40 samples.
X, y = make_data(N=40)

In [None]:
# Overlay polynomial fits of several degrees on the training points.
sns.set()

# Evaluation grid: 500 evenly spaced x-values as a single-column matrix.
X_test = np.linspace(-0.1, 1.1, 500)[:, np.newaxis]

plt.figure(figsize=(12, 8))
plt.scatter(X.ravel(), y, color='black')
axis = plt.axis()
for degree in (1, 5, 9, 30):
    fitted = PolynomialRegression(degree).fit(X, y)
    y_test = fitted.predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={}'.format(degree))

# Fixed view so that wild high-degree curves do not dominate the plot.
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best')

In [None]:
# Validation curve: training and validation score as a function of the
# polynomial degree, using cv=7.
# `sklearn.learning_curve` was removed in scikit-learn 0.20; the function is
# provided by `sklearn.model_selection`.
from sklearn.model_selection import validation_curve

degree = np.arange(0,21)
# param_name / param_range are passed by keyword: recent scikit-learn
# releases do not accept them positionally.
train_score, val_score = validation_curve(PolynomialRegression(), X, y,
                                          param_name='polynomialfeatures__degree',
                                          param_range=degree, cv=7)
plt.figure(figsize=(12,8))
# Median across the CV folds (axis 1) for each degree.
plt.plot(degree,np.median(train_score,1), color='blue',label='training score')
plt.plot(degree,np.median(val_score,1), color='red',label='validation score')
# `loc=` is required: a bare positional 'best' is interpreted as the legend
# labels, not as the legend location.
plt.legend(loc='best')
plt.ylim(0,1)
plt.xlabel('degree')
plt.ylabel('score')

In [None]:
# Draw the degree-3 fit on top of the data, keeping the axis limits that the
# scatter plot established so the fitted curve cannot rescale the view.
plt.figure(figsize=(12, 8))
plt.scatter(X[:, 0], y)
lim = plt.axis()
y_test = PolynomialRegression(degree=3).fit(X, y).predict(X_test)
plt.plot(X_test[:, 0], y_test)
plt.axis(lim)

In [None]:
# The optimal model depends on how much training data is available:
# draw a larger sample (200 points) from the same generator and plot it.
X2, y2 = make_data(N=200)
plt.figure(figsize=(12, 8))
plt.scatter(X2[:, 0], y2)

In [None]:
# Validation curve for the 200-point sample (solid lines), with the curves
# computed earlier for the 40-point sample overlaid as faded dashed lines.
degree = np.arange(0,21)
# param_name / param_range are passed by keyword: recent scikit-learn
# releases do not accept them positionally.
train_score2, val_score2 = validation_curve(PolynomialRegression(), X2, y2,
                                            param_name='polynomialfeatures__degree',
                                            param_range=degree, cv=7)
plt.figure(figsize=(12,8))
plt.plot(degree,np.median(train_score2,1), color='blue',label='training score')
plt.plot(degree,np.median(val_score2,1), color='red',label='validation score')
plt.plot(degree,np.median(train_score,1), color='blue',alpha=0.3,linestyle='dashed')
plt.plot(degree,np.median(val_score,1), color='red',alpha=0.3,linestyle='dashed')
# `loc=` is required: a bare positional 'best' is interpreted as the legend
# labels, not as the legend location.
plt.legend(loc='best')
plt.ylim(0,1)
plt.xlabel('degree')
plt.ylabel('score')

In [None]:
# Learning curves: training and validation score as the training set grows,
# shown side by side for a degree-2 and a degree-9 polynomial model.
# `sklearn.learning_curve` was removed in scikit-learn 0.20; the function is
# provided by `sklearn.model_selection`.
from sklearn.model_selection import learning_curve

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for i, degree in enumerate([2,9]):
    N, train_lc, val_lc = learning_curve(PolynomialRegression(degree),
                                         X, y, cv=7,
                                         train_sizes=np.linspace(0.3, 1, 25))

    # Average the scores across the CV folds (axis 1) at each training size.
    ax[i].plot(N, np.mean(train_lc, 1), color='blue', label='training score')
    ax[i].plot(N, np.mean(val_lc, 1), color='red', label='validation score')
    # Dashed reference line: mean of the training and validation scores at
    # the largest training size.
    ax[i].hlines(np.mean([train_lc[-1], val_lc[-1]]), N[0], N[-1],
                 color='gray', linestyle='dashed')

    ax[i].set_ylim(0, 1)
    ax[i].set_xlim(N[0], N[-1])
    ax[i].set_xlabel('training size')
    ax[i].set_ylabel('score')
    ax[i].set_title('degree = {0}'.format(degree), size=14)
    ax[i].legend(loc='best')

In [None]:
# Find the best model via grid search over the pipeline's hyper-parameters.
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV is
# provided by `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

param_grid = {'polynomialfeatures__degree': np.arange(21),
              'linearregression__fit_intercept': [True, False]}
# LinearRegression's `normalize` option no longer exists in recent
# scikit-learn releases; search over it only when the installed estimator
# still exposes it, so the grid is valid on old and new versions alike.
if 'linearregression__normalize' in PolynomialRegression().get_params():
    param_grid['linearregression__normalize'] = [True, False]

grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)
grid.fit(X, y);
print(grid.best_params_)

# Plot the selected model over the data, keeping the scatter plot's limits.
model = grid.best_estimator_
plt.scatter(X.ravel(), y)
lim = plt.axis()
y_test = model.fit(X, y).predict(X_test)
# The `hold=True` keyword was dropped: Matplotlib 3.0 removed it, and drawing
# onto the current axes is the default behaviour.
plt.plot(X_test.ravel(), y_test);
plt.axis(lim);