In [0]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

In [0]:
X, y = load_iris(return_X_y= True)
# Holdout
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 33)

In [0]:
# Standarization
X_train_standart = StandardScaler().fit_transform(X_train)
X_test_standart = StandardScaler().fit_transform(X_test)

In [0]:
knn = KNeighborsClassifier(n_neighbors = 5,
                            weights = 'uniform',
                            p = 2,
                            metric = 'minkowski',
                            n_jobs = 1)
knn.fit(X_train_standart, y_train)
knn_predictions = knn.predict(X_test_standart)
knn_performance = knn.score(X_test_standart, y_test)

**Repeated holdout**

In [8]:
t_knn = KNeighborsClassifier(n_neighbors = 5,
                              weights = 'uniform',
                              p = 2,
                              metric = 'minkowski',
                              n_jobs = 1)
epoch = 50
accuracy = []
for i in range(epoch):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size = 0.3,
                                                        random_state = i)
    X_train_standart = StandardScaler().fit_transform(X_train)
    X_test_standart = StandardScaler().fit_transform(X_test)

    t_knn.fit(X_train, y_train)
    t_knn_accuracy = t_knn.score(X_test, y_test)
    accuracy.append(t_knn_accuracy)

accuracy = np.asarray(accuracy)
print('Repeated holdout performance: ', accuracy.mean())
t_knn.fit(X, y)

Repeated holdout performance:  0.9640000000000001


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

**k-fold cross-validation**

In [0]:
hiperParameters = range(1, 20)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 3)
X_train_standart = StandardScaler().fit_transform(X_train)
X_test_standart = StandardScaler().fit_transform(X_test)

In [11]:
cv = KFold(n_splits= 10,
           shuffle= True)

cv_accuracy, cv_standard_dev, cv_standard_err = [], [], []

for k in hiperParameters:
    k_cv_knn = KNeighborsClassifier(n_neighbors = k,
                                    weights = 'uniform',
                                    p = 2,
                                    metric = 'minkowski',
                                    n_jobs = 1) 
    mean_squared_error = []
    for train_index, ver_index, in cv.split(X_train, y_train):
        predictions = k_cv_knn.fit(X_train[train_index],
                                   y_train[train_index]).predict(X_train[ver_index])
        
        mean_squared_error_cv = np.sqrt(np.mean(np.square(y_train[ver_index] - predictions)))
        mean_squared_error.append(mean_squared_error_cv)

    mean_squared_error = np.array(mean_squared_error)

    y_prediction_cv10_mean = mean_squared_error.mean()
    y_prediction_cv10_standard_dev = mean_squared_error.std()
    y_prediciton_cv10_standard_err = y_prediction_cv10_standard_dev / np.sqrt(10)

    cv_accuracy.append(y_prediction_cv10_mean)
    cv_standard_dev.append(y_prediction_cv10_standard_dev)
    cv_standard_err.append(y_prediciton_cv10_standard_err)

best_K = np.argmax(cv_accuracy)
print(best_K)

3


In [0]:
# create new model with best K value and standarized train dataset
k_cv_knn = KNeighborsClassifier(n_neighbors = hiperParameters[best_K],
                                weights = 'uniform',
                                p = 2,
                                metric = 'minkowski',
                                n_jobs = 1)
k_cv_knn.fit(X_train_standart, y_train)
k_cv_knn_y_train_accuracy = k_cv_knn.score(X_test_standart, y_test)

In [15]:
X_standard = StandardScaler().fit_transform(X)
k_cv_knn.fit(X_standard, y)
print('Cross validation performance: ', k_cv_knn.score(X_standard, y))

Cross validation performance:  0.96


**repeated k-fold cross validation**

In [0]:
hiperParameters = range(1, 20)
hiperParameters_general_accuracy = []
cv_accuracy = []

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2)
X_train_standart = StandardScaler().fit_transform(X_train)
X_test_standart = StandardScaler().fit_transform(X_test)

In [0]:
epoch = 5
for i in range(epoch):
    cv = KFold(n_splits = 10, shuffle = True)
    hiperParameters_accuracy = []
    for c in hiperParameters:
        r_k_cv_knn = KNeighborsClassifier(n_neighbors = c,
                                          weights = 'uniform',
                                          p = 2,
                                          metric = 'minkowski',
                                          n_jobs = 1)
        mean_squared_error = []
        for train_index, accuracy_index in cv.split(X_train, y_train):
            prediction = r_k_cv_knn.fit(X_train[train_index],
                                        y_train[train_index]).predict(X_train[accuracy_index])
            
            mean_squared_error_cv = np.sqrt(np.mean(np.square(y_train[accuracy_index] - prediction)))
            mean_squared_error.append(mean_squared_error_cv)
        
        mean_squared_error = np.array(mean_squared_error)
    
    hiperParameters_accuracy.append(mean_squared_error.mean())
    hiperParameters_general_accuracy.append(hiperParameters_accuracy)

best_K = np.argmax(np.mean(hiperParameters_general_accuracy))

In [0]:
# create new model with best K value and standarized train dataset
r_k_cv_knn = KNeighborsClassifier(n_neighbors = hiperParameters[best_K],
                                  weights = 'uniform',
                                  metric = 'minkowski',
                                  n_jobs = 1)
r_k_cv_knn.fit(X_train_standart, y_train)
r_k_cv_knn_y_prediction_train = r_k_cv_knn.score(X_test_standart, y_test)

In [19]:
X_standard = StandardScaler().fit_transform(X)
r_k_cv_knn.fit(X_standard, y)
print('Repeated cross validation performance: ', r_k_cv_knn.score(X_standard, y))

Repeated cross validation performance:  1.0
