In [87]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# CV
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    RepeatedKFold,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    RandomizedSearchCV,
    GridSearchCV,
)

from scipy.stats import uniform, randint, loguniform

# preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

# KNN
from sklearn.neighbors import KNeighborsClassifier


# metrics
from sklearn.metrics import f1_score, accuracy_score


In [88]:
# Single seed reused by the K-fold splitters and the randomized search so reruns give the same splits.
RANDOM_STATE = 42


In [89]:
# Load the whitespace-delimited training file; the file has no header row, so
# column names are supplied explicitly.
# The separator is a raw string: in a plain literal "\s" is an invalid escape
# sequence that newer Python versions warn about.
df = pd.read_csv(
    'monks-1.train',
    sep=r'\s+',
    skip_blank_lines=False,
    skipinitialspace=False,
    names=["class", 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'ID'],
)


In [90]:
# (rows, columns) of the frame just loaded.
df.shape


(124, 8)

In [91]:
# Distinct label values present in the target column.
df['class'].unique()


array([1, 0])

In [92]:
# Features: every column except the label and the row identifier,
# with each remaining attribute one-hot encoded.
X_df = df.drop(columns=['class', 'ID'])
X_df = pd.get_dummies(X_df, columns=X_df.columns)

# Target kept as a one-column frame, then flattened to a 1-D label vector below.
y_df = df[['class']]

X = X_df.values            # feature matrix
y = y_df.values.ravel()    # 1-D label vector

X.shape, y.shape


((124, 17), (124,))

In [93]:
# Metric columns of the summary table.
col = ['Accuracy', 'F1']

# Summary table: kfold_cv adds one row per experiment, indexed by the experiment name.
df_results_y = pd.DataFrame(columns=col)

def kfold_cv(name, to_fit, target=y, grid=False, pipe=False, reduce_dim=False):
    """Score ``to_fit`` with a 5-fold outer cross-validation and record the result.

    For every outer split the estimator is fitted on the development rows of the
    module-level ``X`` and evaluated on the held-out rows. The parameters and the
    metrics of each split are printed, and the "mean±std" of accuracy and F1 over
    all splits is written to the module-level ``df_results_y`` under ``name``.

    Parameters
    ----------
    name
        Row label used in ``df_results_y``.
    to_fit
        Estimator, pipeline or search object exposing ``fit``.
    target
        Labels aligned with the rows of ``X``. The default is the ``y`` that
        existed when this function was defined.
    grid : bool
        True when ``to_fit`` is a search object; predictions then come from its
        ``best_estimator_``.
    pipe : bool
        True when the (best) estimator is a pipeline with a ``'clf'`` step; only
        that step's parameters are printed.
    reduce_dim : bool
        When ``grid`` and ``pipe`` are also set, print the selected
        ``'reduce_dim'`` step of the best pipeline.

    Returns
    -------
    None
        Results are reported through ``print`` and ``df_results_y``.
    """
    accuracy_lst = []
    f1_lst = []

    outer_kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    for outer_kfold_counter, (dev_idx, test_idx) in enumerate(outer_kfold.split(X), start=1):
        X_dev, X_test = X[dev_idx], X[test_idx]
        y_dev, y_test = target[dev_idx], target[test_idx]

        print("--------------------------------------------------------------")
        print(f"Results of the outer k_fold split num: {outer_kfold_counter}")

        to_fit.fit(X_dev, y_dev)

        # A search object predicts through the estimator it refitted on the dev set.
        model = to_fit.best_estimator_ if grid else to_fit

        if pipe:
            clf = model['clf']
            if grid and reduce_dim:
                # Separate local name: the `reduce_dim` flag must keep its value
                # for the remaining splits.
                reducer = model['reduce_dim']
                print(f'Dimensionality reduction with {reducer}')
        else:
            # Plain estimator (also covers a search over a non-pipeline estimator,
            # which previously left `params` undefined).
            clf = model
        params = clf.get_params()

        print('Parameters')
        print("\n".join("{:<23}\t{}".format(k, v) for k, v in params.items() if v is not None))

        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        accuracy_lst.append(accuracy)
        f1_lst.append(f1)

        print(f'Accuracy: {accuracy}')
        print(f'F1: {f1}')
        print()

    # Aggregate the per-split scores into "mean±std" strings (3 decimals).
    accuracy_mean = round(np.mean(accuracy_lst), 3)
    accuracy_std = round(np.std(accuracy_lst), 3)
    accuracy_across_splits = f"{accuracy_mean}±{accuracy_std}"

    f1_mean = round(np.mean(f1_lst), 3)
    f1_std = round(np.std(f1_lst), 3)
    f1_across_splits = f"{f1_mean}±{f1_std}"

    print(f"Accuracy across splits: {accuracy_across_splits}")
    print(f"F1 across splits: {f1_across_splits}")

    results = {'Accuracy': accuracy_across_splits, 'F1': f1_across_splits}

    # Adds the row, or overwrites it when the same experiment name is rerun.
    df_results_y.loc[name] = results
        

## Default parameters

In [94]:
# Baseline experiment: standardize the inputs, then KNN constructed with no
# arguments (library defaults).
scaler = StandardScaler()
knn = KNeighborsClassifier()

name = 'default parameters'
pipe = Pipeline(steps=[('scaler', scaler), ('clf', knn)])

# pipe=True makes kfold_cv report the parameters of the 'clf' step.
kfold_cv(name, to_fit=pipe, pipe=True)

df_results_y


--------------------------------------------------------------
Results of the outer k_fold split num: 1
Parameters
algorithm              	auto
leaf_size              	30
metric                 	minkowski
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.8
F1: 0.7058823529411764

--------------------------------------------------------------
Results of the outer k_fold split num: 2
Parameters
algorithm              	auto
leaf_size              	30
metric                 	minkowski
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.76
F1: 0.7500000000000001

--------------------------------------------------------------
Results of the outer k_fold split num: 3
Parameters
algorithm              	auto
leaf_size              	30
metric                 	minkowski
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.88
F1: 0.888888888888889

------------------

Unnamed: 0,Accuracy,F1
default parameters,0.831±0.046,0.814±0.072


## Grid Search

In [96]:
name = 'grid search'

# Inner splitter: the search uses it to choose hyperparameters on each
# development set handed over by kfold_cv.
inner_kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

pipe = Pipeline(steps=[('scaler', scaler), ('clf', knn)])

# Odd neighbour counts below 10, crossed with both weightings and both metrics.
param_grid = {
    "clf__n_neighbors": np.arange(1, 10, 2),
    "clf__weights": ["uniform", "distance"],
    "clf__metric": ["euclidean", "cityblock"],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=inner_kfold,
    n_jobs=-1,
    refit=True,  # kfold_cv predicts with best_estimator_, which requires the refit
)

kfold_cv(name, to_fit=grid, grid=True, pipe=True)

df_results_y


--------------------------------------------------------------
Results of the outer k_fold split num: 1
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.8
F1: 0.7058823529411764

--------------------------------------------------------------
Results of the outer k_fold split num: 2
Parameters
algorithm              	auto
leaf_size              	30
metric                 	cityblock
n_neighbors            	5
p                      	2
weights                	distance
Accuracy: 0.8
F1: 0.7826086956521738

--------------------------------------------------------------
Results of the outer k_fold split num: 3
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.88
F1: 0.888888888888889

------------------

Unnamed: 0,Accuracy,F1
default parameters,0.831±0.046,0.814±0.072
grid search,0.815±0.04,0.793±0.06


## Grid search + kbest

In [97]:
name = 'grid search + kbest'

pca = PCA()
scaler = StandardScaler()
kbest = SelectKBest()

# Same pipeline as before with a univariate feature-selection step before the classifier.
pipe = Pipeline(steps=[('scaler', scaler), ('reduce_dim', kbest), ('clf', knn)])

# The number of kept features ranges from 1 to all columns of X;
# f_classif is the only scoring function searched.
param_grid = {
    'reduce_dim__score_func': [f_classif],
    'reduce_dim__k': list(range(1, X.shape[1] + 1)),
    "clf__n_neighbors": np.arange(1, 10, 2),
    "clf__weights": ["uniform", "distance"],
    "clf__metric": ["euclidean", "cityblock"],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=inner_kfold,
    n_jobs=-1,
    refit=True,
)

# reduce_dim=True additionally prints the selector chosen for each outer split.
kfold_cv(name, to_fit=grid, grid=True, pipe=True, reduce_dim=True)

df_results_y


--------------------------------------------------------------
Results of the outer k_fold split num: 1
Dimensionality reduction with SelectKBest(k=17)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	uniform
Accuracy: 0.8
F1: 0.7058823529411764

--------------------------------------------------------------
Results of the outer k_fold split num: 2
Dimensionality reduction with SelectKBest(k=16)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	distance
Accuracy: 0.76
F1: 0.7272727272727273

--------------------------------------------------------------
Results of the outer k_fold split num: 3
Dimensionality reduction with SelectKBest(k=15)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	eucli

Unnamed: 0,Accuracy,F1
default parameters,0.831±0.046,0.814±0.072
grid search,0.815±0.04,0.793±0.06
grid search + kbest,0.864±0.074,0.844±0.106


## Random search + kbest or PCA

In [100]:
name = 'random search + kbest or PCA'

# The 'reduce_dim' slot starts as a no-op placeholder; the search space below
# swaps in either PCA or SelectKBest.
pipe = Pipeline(
    steps=[
        ('scaler', scaler),
        ("reduce_dim", "passthrough"),
        ('clf', knn),
    ]
)

# Two alternative sub-spaces sharing the same KNN settings:
#   1. PCA with 1 .. (number of columns - 1) components
#   2. SelectKBest (f_classif) keeping 1 .. all columns
param_grid = [
    {
        "reduce_dim": [PCA()],
        "reduce_dim__n_components": list(range(1, X.shape[1])),
        "clf__n_neighbors": np.arange(1, 10, 2),
        "clf__weights": ["uniform", "distance"],
        "clf__metric": ["euclidean", "cityblock"],
    },
    {
        "reduce_dim": [SelectKBest()],
        'reduce_dim__score_func': [f_classif],
        'reduce_dim__k': list(range(1, X.shape[1] + 1)),
        "clf__n_neighbors": np.arange(1, 10, 2),
        "clf__weights": ["uniform", "distance"],
        "clf__metric": ["euclidean", "cityblock"],
    },
]

# Randomized search: evaluates n_iter sampled candidates instead of the full grid.
grid = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_iter=300,
    cv=inner_kfold,
    n_jobs=-1,
    refit=True,
    random_state=RANDOM_STATE,
)

kfold_cv(name, to_fit=grid, grid=True, pipe=True, reduce_dim=True)

df_results_y


--------------------------------------------------------------
Results of the outer k_fold split num: 1
Dimensionality reduction with PCA(n_components=11)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	distance
Accuracy: 0.8
F1: 0.7058823529411764

--------------------------------------------------------------
Results of the outer k_fold split num: 2
Dimensionality reduction with SelectKBest(k=16)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	euclidean
n_neighbors            	5
p                      	2
weights                	distance
Accuracy: 0.76
F1: 0.7272727272727273

--------------------------------------------------------------
Results of the outer k_fold split num: 3
Dimensionality reduction with SelectKBest(k=16)
Parameters
algorithm              	auto
leaf_size              	30
metric                 	c

Unnamed: 0,Accuracy,F1
default parameters,0.831±0.046,0.814±0.072
grid search,0.815±0.04,0.793±0.06
grid search + kbest,0.864±0.074,0.844±0.106
random search + kbest or PCA,0.84±0.071,0.819±0.098
