# Cross Validation and Model Selection

## Imports

In [15]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics

## Load Data

In [16]:
# Load the iris dataset and assemble one DataFrame: the four measurement
# columns (given short snake_case names) plus the species name as "target".
iris = datasets.load_iris()
species = [iris.target_names[label] for label in iris.target]  # class index -> class name
iris_df = pd.DataFrame(
    iris.data,
    columns=["sepal_length", "sepal_width", "petal_length", "petal_width"],
).assign(target=species)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [49]:
# Separate the feature matrix from the label column.
X = iris_df.drop(columns="target")
y = iris_df["target"]
# Fix random_state so the fitted tree, and every score derived from it,
# is the same on each Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=42)

In [54]:
# Hold out one third of the rows for testing.
# random_state makes the split repeatable; stratify=y keeps the class
# proportions of y in both partitions.
(X_train, X_test, y_train, y_test) = model_selection.train_test_split(
    X, y, test_size=1/3, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape)

(100, 4) (50, 4)


In [55]:
# Fit the classifier on the training split, then label the held-out rows.
# fit() returns the estimator itself, so the two steps are chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [56]:
# Report weighted-average precision, recall and F1 on the hold-out split.
for template, scorer in (
    ("precision={}", metrics.precision_score),
    ("recall=   {}", metrics.recall_score),
    ("f1=       {}", metrics.f1_score),
):
    print(template.format(scorer(y_test, y_pred, average="weighted")))


precision=0.9408552631578948
recall=   0.94
f1=       0.9399017199017199


## Cross Validation

In [38]:
# Manual 5-fold cross validation.
# random_state pins the shuffle so the folds are the same on every run.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = (
    ("precision={}", metrics.precision_score),
    ("recall=   {}", metrics.recall_score),
    ("f1=       {}", metrics.f1_score),
)
# Split on X (the frame that is actually indexed below) rather than iris_df.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    # Fold-local names: do not overwrite the hold-out split
    # (X_train, X_test, y_train, y_test, y_pred) created earlier.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]
    # Train an unfitted copy so the shared `clf` is not refitted by each fold.
    fold_clf = sklearn.base.clone(clf)
    fold_clf.fit(X_fold_train, y_fold_train)
    y_fold_pred = fold_clf.predict(X_fold_test)
    for template, scorer in fold_metrics:
        print(template.format(scorer(y_fold_test, y_fold_pred, average="weighted")))

    print()
    

TRAIN: [0 2 3 4 5] TEST: [ 1  6 11 12 19]
precision=0.9466666666666667
recall=   0.9333333333333333
f1=       0.934006734006734

TRAIN: [0 1 3 4 6] TEST: [ 2  5  7 14 17]
precision=0.9454545454545454
recall=   0.9333333333333333
f1=       0.9333333333333333

TRAIN: [0 1 2 4 5] TEST: [ 3 27 28 45 47]
precision=0.9333333333333333
recall=   0.9333333333333333
f1=       0.9333333333333333

TRAIN: [0 1 2 3 5] TEST: [ 4  9 21 22 25]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [1 2 3 4 5] TEST: [ 0  8 10 13 15]
precision=0.9696969696969696
recall=   0.9666666666666667
f1=       0.9656084656084656



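`model_selection.cross_validate` performs the fit-and-score loop in a single call. The metrics to compute are selected by name; see the [Scoring Parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter) documentation for the available names.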

In [62]:
# Let scikit-learn run 5-fold cross validation and collect several metrics at once.
# The result is a dict of per-fold arrays: timings plus one "test_<metric>" entry per metric.
scoring_names = ["precision_weighted", "recall_weighted", "f1_weighted"]
scores = model_selection.cross_validate(clf, X, y, cv=5, scoring=scoring_names)
scores

{'fit_time': array([0.00285697, 0.0018208 , 0.00186205, 0.00211024, 0.00166392]),
 'score_time': array([0.00538611, 0.00458908, 0.00443101, 0.00433278, 0.00432014]),
 'test_precision_weighted': array([0.96969697, 0.96969697, 0.9023569 , 0.93333333, 1.        ]),
 'test_recall_weighted': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'test_f1_weighted': array([0.96658312, 0.96658312, 0.89974937, 0.93333333, 1.        ])}

In [63]:
# Average the per-fold weighted precision into a single summary number.
scores["test_precision_weighted"].mean()

0.9550168350168351