## Librairies import

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Split the data to n-folds, return the indices

In [187]:
def kfolds(X, n):
    the_list = []
    length = X.shape[0]
    size = np.floor(length / n)
    rest = length % n
    
    for i in range(0,n):
        test = np.arange(i * size, (i+1) * size)
        if (i == 0):
            train = np.arange((i+1) * size, length)
        else:
            train = np.arange(0, i * size)
            if (length > ((i+1) * size)):
                train = np.concatenate([train, np.arange((i+1) * size, length)])
                    
        the_list.append((train.astype(int), test.astype(int)))
        
    return the_list

## Read data

White Wine Quality from there: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/

In [53]:
data = pd.read_csv("data/winequality-white.csv", sep=";")

In [54]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [55]:
data.shape

(4898, 12)

## Prepare X / y

In [56]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values
y_class = np.where(y<6, 0, 1)

In [57]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y_class, test_size=0.3)

In [58]:
from sklearn import preprocessing
std_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

## sklearn cross-validation with folds

Note: folds are created by my own function kfolds()

In [186]:
from sklearn import neighbors, metrics

param_grid = {'n_neighbors':np.arange(1,16)}
score = 'accuracy'

cv = kfolds(X_train_std, 5)

clf = model_selection.GridSearchCV(
                                    neighbors.KNeighborsClassifier(),
                                    param_grid,
                                    cv=cv,
                                    scoring=score
)

clf.fit(X_train_std, y_train)
print(clf.best_params_)

print("Cross-validation results")
for mean, std, params in zip(
                            clf.cv_results_['mean_test_score'],
                            clf.cv_results_['std_test_score'],
                            clf.cv_results_['params']):
    
    print(f"{score} - {mean} - {params}")

{'n_neighbors': 1}
Cross-validation results
accuracy - 0.7728467153284672 - {'n_neighbors': 1}
accuracy - 0.7197080291970803 - {'n_neighbors': 2}
accuracy - 0.7532846715328467 - {'n_neighbors': 3}
accuracy - 0.7407299270072992 - {'n_neighbors': 4}
accuracy - 0.76 - {'n_neighbors': 5}
accuracy - 0.7541605839416058 - {'n_neighbors': 6}
accuracy - 0.7576642335766424 - {'n_neighbors': 7}
accuracy - 0.7611678832116788 - {'n_neighbors': 8}
accuracy - 0.7553284671532847 - {'n_neighbors': 9}
accuracy - 0.7562043795620438 - {'n_neighbors': 10}
accuracy - 0.7567883211678832 - {'n_neighbors': 11}
accuracy - 0.7614598540145986 - {'n_neighbors': 12}
accuracy - 0.7573722627737226 - {'n_neighbors': 13}
accuracy - 0.7623357664233577 - {'n_neighbors': 14}
accuracy - 0.7594160583941606 - {'n_neighbors': 15}


In [145]:
y_pred = clf.predict(X_test_std)
metrics.accuracy_score(y_test, y_pred)

0.8081632653061225

## "Manually" cross-validation with folds

In [194]:
def crossvalidation(X, y, n_folds, n_neighbors):
    sets = kfolds(X, n_folds)
    results = {
        'neighbors':[],
        'accuracy':[]
    }
    
    for neighbor in np.arange(1,n_neighbors+1):
        nbclf = neighbors.KNeighborsClassifier(n_neighbors=neighbor)
        score = []
        for (i_train, i_test) in sets:
            X_train = X[i_train]
            y_train = y[i_train]
            X_test = X[i_test,:]
            y_test = y[i_test]
            nbclf.fit(X_train, y_train)
            y_pred = nbclf.predict(X_test)
            score.append(metrics.accuracy_score(y_pred, y_test))
            
        results['neighbors'].append(neighbor)
        results['accuracy'].append(np.array(score).mean())
        
    return pd.DataFrame(results)

In [195]:
crossvalidation(X_train_std, y_train, 5, 15)

Unnamed: 0,neighbors,accuracy
0,1,0.772847
1,2,0.719708
2,3,0.753285
3,4,0.74073
4,5,0.76
5,6,0.754161
6,7,0.757664
7,8,0.761168
8,9,0.755328
9,10,0.756204


## Conclusion

The results are exactly the same! :) I found the same accuracy for each neighbors from 1 to 15 between sklearn cross-validation and the cross-validation with my own function.

The method to pass folds both to sklearn and my function is the same (it's also my own function called kfolds()).