<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week07/cross_validation_in_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

In [3]:
# Load the iris data set bundled with scikit-learn.
# The class labels go into `y`; the measurements go into a DataFrame `X`
# whose columns are named after the data set's feature names.
iris = load_iris()
y = iris.target
X = pd.DataFrame(data = iris.data, columns = iris.feature_names)

# Preview the first rows of the feature table
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Check the balance of the data set:
# count how many samples carry each distinct label in y and report one line per class.
classes, counts = np.unique(y, return_counts = True)

for label, size in zip(classes, counts):
  print(f'Class {label} has {size} members.')

Class 0 has 50 members.
Class 1 has 50 members.
Class 2 has 50 members.


In [8]:
# Split the data into a training part (used for cross-validation and tuning)
# and a hold-out test part. No test_size is given, so scikit-learn's default
# proportion applies; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  random_state = 4,
)

In [11]:
# Construct a preprocessing and modeling pipeline:
# step 1 standardises the features, step 2 is a k-nearest-neighbours
# classifier left at its default hyper-parameters.
scaler = StandardScaler()
knn = KNeighborsClassifier()

# Bundling both steps lets the scaler be fitted together with the model on
# whatever data the pipeline is trained on.
knn_pipe = make_pipeline(scaler, knn)

In [14]:
# Use cross-validation on the model: 5 folds over the training split only,
# so the hold-out test set stays untouched.
scores = cross_val_score(estimator = knn_pipe, X = X_train, y = y_train, cv = 5)

# Per-fold scores (rounded for readability), then their mean
print(np.round(scores, 2))
print(scores.mean())

[0.96 0.87 1.   1.   0.95]
0.9561264822134387


In [21]:
# Tune the model by k
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
ks = range(1, 20, 2)

# For every candidate k, cross-validate a freshly built pipeline on the
# training split. scores[i] holds the array of per-fold scores for the i-th
# entry of ks.
#
# The pipeline is built inline and never bound to `knn_pipe`, so the baseline
# pipeline from the earlier cell is not silently replaced by the k = 19 model
# (which would change the result of re-running the baseline cell). Each
# pipeline also gets its own StandardScaler instead of sharing one instance,
# and cv = 5 is spelled out to match the baseline cross-validation call.
scores = [
  cross_val_score(
    make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = k)),
    X_train,
    y_train,
    cv = 5,
  )
  for k in ks
]

# Raw per-fold scores, followed by the worst and best single-fold score
# observed across all candidates.
print(scores)
print(np.min(scores))
print(np.max(scores))

[array([0.95652174, 0.86956522, 0.95454545, 0.95454545, 1.        ]), array([0.95652174, 0.91304348, 0.95454545, 1.        , 1.        ]), array([0.95652174, 0.86956522, 1.        , 1.        , 0.95454545]), array([0.95652174, 0.91304348, 1.        , 1.        , 1.        ]), array([0.95652174, 0.95652174, 1.        , 1.        , 1.        ]), array([0.95652174, 0.91304348, 1.        , 1.        , 1.        ]), array([0.95652174, 0.86956522, 0.95454545, 1.        , 1.        ]), array([0.95652174, 0.86956522, 0.95454545, 1.        , 1.        ]), array([0.95652174, 0.86956522, 0.95454545, 1.        , 1.        ]), array([0.95652174, 0.86956522, 0.95454545, 1.        , 1.        ])]
0.8695652173913043
1.0


In [24]:
# Find the best n_neighbors:
# pair each candidate k with its per-fold scores and print the mean,
# so the highest-scoring k can be read off the listing.
for k, fold_scores in zip(ks, scores):
  print(f'Score for n_neighbors = {k} is {np.mean(fold_scores)}')

Score for n_neighbors = 1 is 0.9470355731225297
Score for n_neighbors = 3 is 0.9648221343873518
Score for n_neighbors = 5 is 0.9561264822134387
Score for n_neighbors = 7 is 0.9739130434782609
Score for n_neighbors = 9 is 0.9826086956521738
Score for n_neighbors = 11 is 0.9739130434782609
Score for n_neighbors = 13 is 0.9561264822134387
Score for n_neighbors = 15 is 0.9561264822134387
Score for n_neighbors = 17 is 0.9561264822134387
Score for n_neighbors = 19 is 0.9561264822134387


In [26]:
# Evaluate the final model on the hold-out test set.
# n_neighbors = 9 is hard-coded: it is the candidate with the highest mean
# cross-validation score in the tuning listing.
final_model = KNeighborsClassifier(n_neighbors = 9)

# Build the scaler + classifier pipeline and fit it on the full training split
final_pipe = make_pipeline(StandardScaler(), final_model).fit(X_train, y_train)

# Score on data that played no part in tuning (displayed as the cell output)
final_pipe.score(X_test, y_test)

0.9473684210526315