In [1]:
import pandas as pd

from sklearn.metrics import accuracy_score, make_scorer, precision_recall_fscore_support, cohen_kappa_score, precision_recall_fscore_support, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

import sys
import time

sys.path.insert(1, "../src")
from knn import *
from pca import *

N_SPLITS = 5

train = pd.read_csv("../data/train.csv")

X = train.drop(columns='label').to_numpy()
y = train.label.to_numpy()

# Corremos un gridsearch sobre los parámetros de PCA+KNN (parámetros k y α)

In [2]:
def grid_search_step(k, a, splits_dataset, metrics_):
    score_values = []
    cohen_kappa_values = []
    precision_recall_values = []
    f1_values = []
    confusion_matrix_values = []
    knn_time_values = []
    pca_train_transform_time_values = []
    pca_test_transform_time_values = []
    print(f"Running gridsearch for: k={k} - alpha={a}")

    print("\tSplits:")
    for idx, _pca, _x_train, _x_test, _y_train, _y_test in splits_dataset:
        print(f"\t\t{idx} time:", end="\t\t")

        _knn = KNNClassifier(k=k)
        
        # PCA train transform
        _pca_train_transform_st = time.time()

        _x_train_transformed = _pca.transform(_x_train, truncate=a)

        _pca_train_transform_et = time.time()

        # PCA test transform
        _pca_test_transform_st = time.time()

        _x_test_transformed = _pca.transform(_x_test, truncate=a)

        _pca_test_transform_et = time.time()
        
        # KNN fit and predict
        _knn_st = time.time()
        
        _knn.fit(_x_train_transformed, _y_train)
        _pred = _knn.predict(_x_test_transformed)

        _knn_et = time.time()

        print(f"{(_knn_et - _knn_st):.4f}")

        _score_metric = accuracy_score(_y_test, _pred)
        _cohen_kappa_metric = cohen_kappa_score(_y_test, _pred)
        _precision_recall_metric = precision_recall_fscore_support(_y_test, _pred)
        _f1_metric = f1_score(_y_test, _pred, average='weighted')
        _confusion_matrix_metric = confusion_matrix(_y_test, _pred)      

        score_values.append(_score_metric)
        cohen_kappa_values.append(_cohen_kappa_metric)
        precision_recall_values.append(_precision_recall_metric)
        f1_values.append(_f1_metric)
        confusion_matrix_values.append(_confusion_matrix_metric)
        knn_time_values.append(_knn_et - _knn_st)
        pca_train_transform_time_values.append(_pca_train_transform_et - _pca_train_transform_st)
        pca_test_transform_time_values.append(_pca_test_transform_et - _pca_test_transform_st)

    print("")

    metrics_[f"{k} - {a}"] = dict()
    metrics_[f"{k} - {a}"]['score'] = np.mean(score_values)
    metrics_[f"{k} - {a}"]['cohen-kappa'] = np.mean(cohen_kappa_values)
    metrics_[f"{k} - {a}"]['recall'] = np.mean(precision_recall_values)
    metrics_[f"{k} - {a}"]['f1'] = np.mean(f1_values)
    metrics_[f"{k} - {a}"]['confusion-matrix'] = confusion_matrix_values
    metrics_[f"{k} - {a}"]['knn-time'] = np.mean(knn_time_values)
    metrics_[f"{k} - {a}"]['pca-train-transform-time'] = np.mean(pca_train_transform_time_values)
    metrics_[f"{k} - {a}"]['pca-test-transform-time'] = np.mean(pca_test_transform_time_values)

In [3]:
def precalculate_pca(splits_dataset_, kfold_splits_, metrics_, alpha_=153):
    print("\tCalculating PCA for splits:")
    
    pca_times_values = []
    for idx, (train_index, test_index) in enumerate(kfold_splits_):
        print(f"\t\t{idx} time:", end="\t\t")

        _x_train, _y_train = X[train_index], y[train_index]
        _x_test, _y_test = X[test_index], y[test_index]

        # PCA fit with train
        _pca = PCA(alpha=alpha_)

        _pca_st = time.time()

        _pca.fit(_x_train)

        _pca_et = time.time()

        pca_times_values.append(_pca_et - _pca_st)

        splits_dataset_.append(tuple([idx, _pca, _x_train, _x_test, _y_train, _y_test]))

        print(f"{(_pca_et - _pca_st):.4f}")
    
    metrics_['pca-fit-time'] = np.mean(pca_times_values)

In [None]:
grid = {
    "k": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    "alpha": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 153]
}

skf = StratifiedKFold(n_splits=N_SPLITS)
splits = list(skf.split(X, y))

metrics = dict()

splits_dataset = []
precalculate_pca(splits_dataset, splits, metrics)

for a in grid["alpha"]:
    for k in grid["k"]:
        grid_search_step(k,a,splits_dataset,metrics)

# Corremos un gridsearch sobre KNN (parámetro k)

In [4]:
def grid_search_step_without_pca(k, splits_knn, metrics_knn):
    score_values = []
    cohen_kappa_values = []
    precision_recall_values = []
    f1_values = []
    confusion_matrix_values = []
    knn_time_values = []
    print(f"Running gridsearch for: k={k}")

    print("\tSplits:")
    for idx, (train_index, test_index) in enumerate(splits_knn):
        print(f"\t\t{idx} time:", end="\t\t")
        _x_train, _y_train = X[train_index], y[train_index]
        _x_test, _y_test = X[test_index], y[test_index]

        _knn = KNNClassifier(k=k)
        
        # KNN fit and predict
        _knn_st = time.time()
        
        _knn.fit(_x_train, _y_train)
        _pred = _knn.predict(_x_test)

        _knn_et = time.time()

        print(f"{(_knn_et - _knn_st):.4f}")

        _score_metric = accuracy_score(_y_test, _pred)
        _cohen_kappa_metric = cohen_kappa_score(_y_test, _pred)
        _precision_recall_metric = precision_recall_fscore_support(_y_test, _pred)
        _f1_metric = f1_score(_y_test, _pred, average='weighted')
        _confusion_matrix_metric = confusion_matrix(_y_test, _pred)      

        score_values.append(_score_metric)
        cohen_kappa_values.append(_cohen_kappa_metric)
        precision_recall_values.append(_precision_recall_metric)
        f1_values.append(_f1_metric)
        confusion_matrix_values.append(_confusion_matrix_metric)
        knn_time_values.append(_knn_et - _knn_st)

    print("")

    metrics_knn[f"{k}"] = dict()
    metrics_knn[f"{k}"]['score'] = np.mean(score_values)
    metrics_knn[f"{k}"]['cohen-kappa'] = np.mean(cohen_kappa_values)
    metrics_knn[f"{k}"]['recall'] = np.mean(precision_recall_values)
    metrics_knn[f"{k}"]['f1'] = np.mean(f1_values)
    metrics_knn[f"{k}"]['confusion-matrix'] = confusion_matrix_values
    metrics_knn[f"{k}"]['knn-time'] = np.mean(knn_time_values)

In [None]:
grid_knn = {
    "k": [20, 18, 16, 14, 12, 10, 8, 6, 4, 2]
}

skf_knn = StratifiedKFold(n_splits=N_SPLITS)
splits_knn = list(skf_knn.split(X, y))

metrics_knn = dict()

for k in grid_knn["k"]:
    grid_search_step_without_pca(k, splits_knn, metrics_knn)

# Corremos un gridsearch sobre la cantidad de imágenes con PCA+KNN

## Fijamos con los mejores parámetros el gridsearch, tomamos:
- k = 6
- alpha = 40

In [4]:
grid = {
    "k": [6],
    "alpha": [40],
    "images_amount": [5000, 10000, 15000, 20000, 25000, 30000, 35000, 42000]
}

metrics_image_variable = dict()
splits_dataset = []

skf = StratifiedKFold(n_splits=N_SPLITS)

for images_amount in grid["images_amount"]:
    variable_train = pd.read_csv("../data/train.csv")[:images_amount]

    variable_X = variable_train.drop(columns='label').to_numpy()
    variable_y = variable_train.label.to_numpy()
    
    splits = list(skf.split(variable_X, variable_y))
    metrics_image_variable[f"amount-{images_amount}"] = dict()
    
    splits_dataset = []
    precalculate_pca(splits_dataset, splits, metrics_image_variable[f"amount-{images_amount}"], 40)
    
    
    for a in grid["alpha"]:
        for k in grid["k"]:
            grid_search_step(k, a, splits_dataset, metrics_image_variable[f'amount-{images_amount}'])

	Calculating PCA for splits:
		0 time:		25.9788
		1 time:		26.9877
		2 time:		25.2497
		3 time:		30.0484
		4 time:		28.1079
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		0.5503
		1 time:		0.5978
		2 time:		0.5507
		3 time:		0.5525
		4 time:		0.5488

	Calculating PCA for splits:
		0 time:		28.3016
		1 time:		26.6303
		2 time:		30.9109
		3 time:		29.4988
		4 time:		28.1042
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		2.2600
		1 time:		2.2996
		2 time:		2.3303
		3 time:		2.3191
		4 time:		2.3090

	Calculating PCA for splits:
		0 time:		31.6760
		1 time:		34.3168
		2 time:		34.1300
		3 time:		34.2411
		4 time:		35.8752
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		5.0517
		1 time:		4.9019
		2 time:		5.2862
		3 time:		5.2763
		4 time:		4.8780

	Calculating PCA for splits:
		0 time:		36.0108
		1 time:		35.2274
		2 time:		35.2702
		3 time:		33.1858
		4 time:		35.8313
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		8.7256
		1 time:		8.6077
		2

In [5]:
metrics_image_variable

{'amount-5000': {'pca-fit-time': 27.274480676651002,
  '6 - 40': {'score': 0.9443999999999999,
   'cohen-kappa': 0.9381942440546434,
   'recall': 25.708066292787844,
   'f1': 0.9442655366246525,
   'confusion-matrix': [array([[ 98,   0,   0,   0,   0,   0,   1,   0,   0,   0],
           [  0, 110,   1,   0,   0,   0,   0,   0,   1,   0],
           [  0,   1, 101,   1,   0,   0,   0,   3,   3,   0],
           [  0,   0,   1,  92,   0,   1,   0,   1,   1,   0],
           [  0,   0,   0,   0,  89,   0,   2,   0,   0,   4],
           [  0,   1,   0,   0,   0,  92,   1,   0,   0,   0],
           [  1,   0,   0,   0,   0,   0, 102,   0,   0,   0],
           [  0,   3,   0,   0,   0,   0,   1,  95,   0,   2],
           [  1,   1,   0,   4,   1,   3,   1,   1,  84,   0],
           [  2,   1,   1,   1,   1,   0,   0,   2,   0,  87]]),
    array([[ 98,   0,   0,   0,   0,   0,   1,   0,   0,   0],
           [  0, 109,   0,   1,   0,   0,   2,   0,   0,   0],
           [  0,   5,  96, 

# Corremos un gridsearch sobre k (de K-fold), viendo como varía el accuracy y tiempo

In [10]:
grid = {
    "knn_k": [6],
    "alpha": [40],
    "kfold_k": [2,5,10,15,20]
}

metrics_kfold = dict()

for kfold_k in grid["kfold_k"]:
    skf = StratifiedKFold(n_splits=kfold_k)
    splits = list(skf.split(X, y))
    
    metrics_kfold[f"fold-{kfold_k}"] = dict()
    
    splits_dataset = []
    precalculate_pca(splits_dataset, splits, metrics_kfold[f"fold-{kfold_k}"], 40)
    
    for a in grid["alpha"]:
        for knn_k in grid["knn_k"]:
            grid_search_step(knn_k, a, splits_dataset, metrics_kfold[f"fold-{kfold_k}"])

	Calculating PCA for splits:
		0 time:		41.0260
		1 time:		36.9967
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		57.8407
		1 time:		57.5465

	Calculating PCA for splits:
		0 time:		52.3627
		1 time:		54.5466
		2 time:		51.1500
		3 time:		52.3012
		4 time:		49.2461
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		38.6875
		1 time:		37.5783
		2 time:		37.3114
		3 time:		37.7089
		4 time:		38.0740

	Calculating PCA for splits:
		0 time:		56.0514
		1 time:		53.2206
		2 time:		55.8813
		3 time:		51.0804
		4 time:		57.9162
		5 time:		51.8742
		6 time:		53.5744
		7 time:		60.0114
		8 time:		55.9819
		9 time:		54.2248
Running gridsearch for: k=6 - alpha=40
	Splits:
		0 time:		21.1280
		1 time:		21.0108
		2 time:		21.6994
		3 time:		21.7691
		4 time:		21.4126
		5 time:		21.3475
		6 time:		21.8362
		7 time:		21.8823
		8 time:		21.5010
		9 time:		21.0735

	Calculating PCA for splits:
		0 time:		52.5518
		1 time:		54.9795
		2 time:		53.4080
		3 time:		53.8523
		4 time:		54.7

In [11]:
metrics_kfold

{'fold-2': {'pca-fit-time': 39.011345982551575,
  '6 - 40': {'score': 0.9686666666666667,
   'cohen-kappa': 0.9651723769077636,
   'recall': 525.7264108596546,
   'f1': 0.9686369836166244,
   'confusion-matrix': [array([[2049,    0,    1,    0,    0,    3,   11,    1,    1,    0],
           [   0, 2327,    4,    2,    1,    0,    3,    1,    1,    3],
           [   6,   13, 2022,    3,    1,    0,    2,   34,    5,    3],
           [   0,    5,   15, 2091,    0,   22,    2,   10,   24,    6],
           [   0,   14,    0,    0, 1955,    1,    8,    6,    1,   51],
           [   3,    3,    2,   12,    1, 1830,   24,    2,    7,   14],
           [  10,    2,    0,    1,    1,    6, 2043,    0,    5,    0],
           [   1,   29,    9,    0,    4,    0,    0, 2130,    2,   26],
           [   4,    4,    6,   30,    3,   21,   10,    6, 1931,   16],
           [   6,    5,    3,   13,   27,    8,    4,   23,    6, 1999]]),
    array([[2046,    0,    3,    0,    0,    3,   14,    0,

## Vemos como evoluciona el score y el tiempo a medida que crece el k, desde 1 hasta el máximo 21000 (21000 considerando que tomamos 2-fold de un dataset con 42000 datos)

In [17]:
grid = {
    "k": [1,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000,11000,12000,13000,14000,15000,16000,17000,18000,19000,20000,21000],
    "alpha": [40]
}

skf = StratifiedKFold(n_splits=2)
splits = list(skf.split(X, y))

metrics_k_by_total_train = dict()

splits_dataset = []
precalculate_pca(splits_dataset, splits, metrics_k_by_total_train, 40)

for a in grid["alpha"]:
    for k in grid["k"]:
        grid_search_step(k, a, splits_dataset, metrics_k_by_total_train)

	Calculating PCA for splits:
		0 time:		40.4818
		1 time:		40.4071
Running gridsearch for: k=1 - alpha=40
	Splits:
		0 time:		57.9855
		1 time:		58.7945

Running gridsearch for: k=1000 - alpha=40
	Splits:
		0 time:		58.9745
		1 time:		58.3157

Running gridsearch for: k=2000 - alpha=40
	Splits:
		0 time:		57.9522
		1 time:		58.3977

Running gridsearch for: k=3000 - alpha=40
	Splits:
		0 time:		58.5202
		1 time:		57.6760

Running gridsearch for: k=4000 - alpha=40
	Splits:
		0 time:		58.5718
		1 time:		60.7824

Running gridsearch for: k=5000 - alpha=40
	Splits:
		0 time:		60.9638
		1 time:		61.1815

Running gridsearch for: k=6000 - alpha=40
	Splits:
		0 time:		58.5786
		1 time:		58.4038

Running gridsearch for: k=7000 - alpha=40
	Splits:
		0 time:		58.4268
		1 time:		58.6287

Running gridsearch for: k=8000 - alpha=40
	Splits:
		0 time:		58.8813
		1 time:		58.5024

Running gridsearch for: k=9000 - alpha=40
	Splits:
		0 time:		61.3669
		1 time:		61.8536

Running gridsearch for: k=10000 - al

  _warn_prf(average, modifier, msg_start, len(result))


63.4909

Running gridsearch for: k=18000 - alpha=40
	Splits:
		0 time:		

  _warn_prf(average, modifier, msg_start, len(result))


64.5731
		1 time:		

  _warn_prf(average, modifier, msg_start, len(result))


63.6926

Running gridsearch for: k=19000 - alpha=40
	Splits:
		0 time:		

  _warn_prf(average, modifier, msg_start, len(result))


64.7888
		1 time:		

  _warn_prf(average, modifier, msg_start, len(result))


63.4354

Running gridsearch for: k=20000 - alpha=40
	Splits:
		0 time:		

  _warn_prf(average, modifier, msg_start, len(result))


61.4105
		1 time:		

  _warn_prf(average, modifier, msg_start, len(result))


61.2855

Running gridsearch for: k=21000 - alpha=40
	Splits:
		0 time:		

  _warn_prf(average, modifier, msg_start, len(result))


61.3917
		1 time:		

  _warn_prf(average, modifier, msg_start, len(result))


61.6451



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
metrics_k_by_total_train

{'pca-fit-time': 40.44447696208954,
 '1 - 40': {'score': 0.9659047619047619,
  'cohen-kappa': 0.96210316314373,
  'recall': 525.7242436419714,
  'f1': 0.9658688094830796,
  'confusion-matrix': [array([[2045,    1,    1,    0,    0,    5,   12,    1,    1,    0],
          [   0, 2324,    6,    2,    2,    0,    2,    2,    1,    3],
          [   7,    4, 2036,    4,    3,    0,    1,   27,    4,    3],
          [   6,    3,   15, 2069,    0,   35,    1,    7,   32,    7],
          [   0,    8,    1,    0, 1946,    2,    7,    8,    1,   63],
          [   2,    2,    2,   20,    1, 1820,   23,    4,    9,   15],
          [   7,    4,    1,    0,    2,    9, 2043,    0,    2,    0],
          [   2,   27,   13,    1,    7,    0,    0, 2118,    1,   32],
          [   1,    4,    9,   34,    2,   22,   10,    5, 1925,   19],
          [   6,    7,    2,   13,   43,    7,    3,   20,   10, 1983]]),
   array([[2041,    0,    4,    2,    1,    0,   12,    3,    1,    2],
          [   0