# Cross Validation

## Import Libraries

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydataset

from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

## Exercises 

1. Use the cross validation techniques described in the lesson to find the best model for predicting transmission type with the mpg dataset.
1. Use cross validation techniques to determine the best model for predicting survival with the titanic dataset.
1. Use cross validation techniques to determine the best model for predicting tip amount with the tips dataset.

## MPG Dataset

### Wrangle MPG Dataset

In [2]:
mpg = pydataset.data('mpg')
mpg.trans = np.where(mpg.trans.str.startswith('auto'), 'auto', 'manual')

In [3]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto,f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual,f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual,f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto,f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto,f,16,26,p,compact


### Split the Data

In [4]:
X, y = mpg[['displ', 'cyl', 'cty', 'hwy']], mpg.trans

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

###  Cross Validation Models

#### Decision Tree

In [14]:
clf = DecisionTreeClassifier()
grid = GridSearchCV(clf, {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [15]:
results_decision_tree = pd.DataFrame(grid.cv_results_['params'])
results_decision_tree['score'] = grid.cv_results_['mean_test_score']

In [16]:
results_decision_tree.sort_values(by='score').tail(10)

Unnamed: 0,max_depth,min_samples_leaf,score
90,10,1,0.696405
100,11,1,0.70098
60,7,1,0.701634
80,9,1,0.70719
140,15,1,0.70719
120,13,1,0.70719
110,12,1,0.707516
150,16,1,0.712745
170,18,1,0.718954
180,19,1,0.718954


In [17]:
cross_validate(clf, X_train, y_train, cv=5, return_estimator=True)

{'fit_time': array([0.00308084, 0.00239992, 0.0024581 , 0.00255704, 0.00245309]),
 'score_time': array([0.00177598, 0.0015502 , 0.00151873, 0.00184584, 0.00165892]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_score': array([0.6       , 0.85714286, 0.62857143, 0.54285714, 0.65714286])}

#### Random Forest Classifier

In [22]:
clf = RandomForestClassifier()
grid = GridSearchCV(clf, {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [24]:
results_random_forest = pd.DataFrame(grid.cv_results_['params'])
results_random_forest['score'] = grid.cv_results_['mean_test_score']

In [25]:
results_random_forest.sort_values(by='score').tail(10)

Unnamed: 0,max_depth,min_samples_leaf,score
160,17,1,0.683987
120,13,1,0.684314
100,11,1,0.684314
150,16,1,0.689542
140,15,1,0.689869
70,8,1,0.689869
110,12,1,0.689869
190,20,1,0.690196
80,9,1,0.695752
130,14,1,0.701634


#### K Nearest Neighbors Classifier

In [27]:
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)})

In [28]:
results_knn = pd.DataFrame(grid.cv_results_['params'])
results_knn['score'] = grid.cv_results_['mean_test_score']

In [29]:
results_knn.sort_values(by='score').tail(10)

Unnamed: 0,n_neighbors,score
1,2,0.628571
3,4,0.628571
12,13,0.628571
17,18,0.634286
15,16,0.64
9,10,0.64
14,15,0.64
19,20,0.64
16,17,0.651429
11,12,0.651429


### Select Best Model to Use on the Test Dataset

Of our three model types, our best performing model is the Decision Tree with max depth 19, min sample leaf 1. We will run that model on our test dataset.

In [30]:
clf = DecisionTreeClassifier(max_depth=19, min_samples_leaf=1)
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=19)

In [31]:
clf.score(X_test, y_test)

0.6949152542372882

## Titanic Dataset

### Wrangle Titanic Dataset

In [53]:
titanic = pydataset.data('titanic')

In [54]:
titanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [56]:
titanic = pd.get_dummies(titanic, prefix = ['class',  'age', 'sex', 'survived'])

In [57]:
titanic.head()

Unnamed: 0,class_1st class,class_2nd class,class_3rd class,age_adults,age_child,sex_man,sex_women,survived_no,survived_yes
1,1,0,0,1,0,1,0,0,1
2,1,0,0,1,0,1,0,0,1
3,1,0,0,1,0,1,0,0,1
4,1,0,0,1,0,1,0,0,1
5,1,0,0,1,0,1,0,0,1


### Split the Data

In [59]:
X = titanic.drop(columns = ['survived_no', 'survived_yes', 'sex_man'])

In [60]:
y = titanic.survived_yes

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

### Cross Validation Models

#### Decision Tree

In [62]:
clf = DecisionTreeClassifier()
grid = GridSearchCV(clf, {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [63]:
results_decision_tree = pd.DataFrame(grid.cv_results_['params'])
results_decision_tree['score'] = grid.cv_results_['mean_test_score']

In [68]:
results_decision_tree.sort_values(by='score').head(10)

Unnamed: 0,max_depth,min_samples_leaf,score
0,1,1,0.774057
8,1,9,0.774057
7,1,8,0.774057
6,1,7,0.774057
5,1,6,0.774057
9,1,10,0.774057
3,1,4,0.774057
2,1,3,0.774057
1,1,2,0.774057
4,1,5,0.774057


In [67]:
results_decision_tree.sort_values(by='score').tail(10)

Unnamed: 0,max_depth,min_samples_leaf,score
84,9,5,0.797341
30,4,1,0.797341
31,4,2,0.797341
53,6,4,0.797341
125,13,6,0.797341
124,13,5,0.797341
123,13,4,0.797341
122,13,3,0.797341
120,13,1,0.797341
54,6,5,0.797341


#### Random Forest Classifier

In [69]:
clf = RandomForestClassifier()
grid = GridSearchCV(clf, {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11)})

In [70]:
results_random_forest = pd.DataFrame(grid.cv_results_['params'])
results_random_forest['score'] = grid.cv_results_['mean_test_score']

In [71]:
results_random_forest.sort_values(by='score').head(10)

Unnamed: 0,max_depth,min_samples_leaf,score
3,1,4,0.740548
7,1,8,0.74965
1,1,2,0.749691
0,1,1,0.754762
5,1,6,0.758844
8,1,9,0.761822
6,1,7,0.766924
2,1,3,0.769965
9,1,10,0.770924
4,1,5,0.777025


In [72]:
results_random_forest.sort_values(by='score').tail(10)

Unnamed: 0,max_depth,min_samples_leaf,score
162,17,3,0.800402
50,6,1,0.800402
24,3,5,0.800402
173,18,4,0.800402
40,5,1,0.800402
23,3,4,0.801412
22,3,3,0.802422
20,3,1,0.802422
190,20,1,0.802422
21,3,2,0.802422


#### K Nearest Neighbors

In [73]:
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)})

In [74]:
results_knn = pd.DataFrame(grid.cv_results_['params'])
results_knn['score'] = grid.cv_results_['mean_test_score']

In [75]:
results_knn.sort_values(by='score').tail(10)

Unnamed: 0,n_neighbors,score
18,19,0.79636
12,13,0.79636
13,14,0.79636
14,15,0.79636
15,16,0.79636
16,17,0.79636
17,18,0.79636
19,20,0.79636
11,12,0.79939
3,4,0.802436


In [76]:
results_knn.sort_values(by='score').head(10)

Unnamed: 0,n_neighbors,score
0,1,0.731523
7,8,0.771061
8,9,0.773091
6,7,0.774107
10,11,0.780198
9,10,0.780198
1,2,0.782234
4,5,0.790268
2,3,0.79235
5,6,0.793298


### Select Best Model for the Test Dataset

Our best-performing model is K Nearest neighbors with 4 neighbors.

In [77]:
clf = KNeighborsClassifier(n_neighbors = 4)
clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [78]:
clf.score(X_test, y_test)

0.790273556231003

## Tips Dataset

### Wrangle the Tips Dataset

In [79]:
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [81]:
tips_df = pd.get_dummies(tips, prefix = ['sex',  'smoker', 'time', 'day'])

In [83]:
tips_df.head()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,time_Fri,time_Sat,time_Sun,time_Thur,day_Dinner,day_Lunch
1,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
2,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
3,21.01,3.5,3,0,1,1,0,0,0,1,0,1,0
4,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
5,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0


### Split the Data

In [87]:
X = tips_df.drop(columns = ['sex_Male', 'smoker_No', 'day_Lunch', 'tip'])

In [88]:
y = tips_df.tip

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

### Cross Validation Models

#### Linear Regression

In [92]:
lm = LinearRegression(normalize=True)
cross_val_score(lm, X_train, y_train, cv=5).mean()

0.2919566931359487

In [105]:
np.linspace(0,1,11)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

#### Tweedie Regressor

In [114]:
glm = TweedieRegressor()
grid = GridSearchCV(glm, {'power': range(1,10), 'alpha': np.linspace(0,1,11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=TweedieRegressor(),
             param_grid={'alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'power': range(1, 10)})

In [115]:
results_tweedie = pd.DataFrame(grid.cv_results_['params'])
results_tweedie['score'] = grid.cv_results_['mean_test_score']

In [116]:
results_tweedie.sort_values(by='score').head(10)

Unnamed: 0,alpha,power,score
16,0.1,8,-1.164154
15,0.1,7,-0.58439
24,0.2,7,-0.372331
14,0.1,6,-0.217992
33,0.3,7,-0.185208
23,0.2,6,-0.1549
32,0.3,6,-0.090259
42,0.4,7,-0.071608
41,0.4,6,-0.038736
51,0.5,7,-0.016203


In [117]:
results_tweedie.sort_values(by='score').tail(10)

Unnamed: 0,alpha,power,score
20,0.2,3,0.303143
19,0.2,2,0.308169
28,0.3,2,0.311073
37,0.4,2,0.312563
46,0.5,2,0.313397
55,0.6,2,0.313809
64,0.7,2,0.314068
91,1.0,2,0.314179
73,0.8,2,0.31418
82,0.9,2,0.314224


#### LassoLars

In [140]:
lars = LassoLars()
grid = GridSearchCV(lars, {'alpha':   np.linspace(0,1,11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=LassoLars(),
             param_grid={'alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])})

In [141]:
results_lars = pd.DataFrame(grid.cv_results_['params'])
results_lars['score'] = grid.cv_results_['mean_test_score']

In [142]:
results_lars.sort_values(by='score').head(10)

Unnamed: 0,alpha,score
1,0.1,-0.222231
2,0.2,-0.222231
3,0.3,-0.222231
4,0.4,-0.222231
5,0.5,-0.222231
6,0.6,-0.222231
7,0.7,-0.222231
8,0.8,-0.222231
9,0.9,-0.222231
10,1.0,-0.222231


In [143]:
results_lars.sort_values(by='score').tail(10)

Unnamed: 0,alpha,score
2,0.2,-0.222231
3,0.3,-0.222231
4,0.4,-0.222231
5,0.5,-0.222231
6,0.6,-0.222231
7,0.7,-0.222231
8,0.8,-0.222231
9,0.9,-0.222231
10,1.0,-0.222231
0,0.0,0.216818


#### Polynomial Degree 2

In [147]:
pf = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)

X_test_degree2 = pf.transform(X_test)

In [148]:
# create the model object
lm2 = LinearRegression(normalize=True)
cross_val_score(lm2, X_train_degree2, y_train, cv=5).mean()

-8.958481626990052e+22

### Select Best Model for Test Dataset

TweedieRegressor (alpha = 0.9, power = 2) has the highest score. 

In [150]:
glm = TweedieRegressor(alpha = 0.9, power = 2)
glm.fit(X_train, y_train)

TweedieRegressor(alpha=0.9, power=2)

In [151]:
glm.score(X_test, y_test)

0.4972104899235992