In [1]:
import pandas as pd

from sklearn.metrics import accuracy_score, make_scorer, precision_recall_fscore_support, cohen_kappa_score, precision_recall_fscore_support, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold

import sys
import time

sys.path.insert(1, "../src")
from knn import *
from pca import *

N_SPLITS = 5

train = pd.read_csv("../data/train.csv")

X = train.drop(columns='label').to_numpy()
y = train.label.to_numpy()

# Corremos un gridsearch sobre los parámetros de PCA+KNN (parámetros k y α)

In [5]:
def grid_search_step(k, a, splits_dataset, metrics):
    score_values = []
    cohen_kappa_values = []
    precision_recall_values = []
    f1_values = []
    confusion_matrix_values = []
    knn_time_values = []
    pca_train_transform_time_values = []
    pca_test_transform_time_values = []
    print(f"Running gridsearch for: k={k} - alpha={a}")

    print("\tSplits:")
    for idx, _pca, _x_train, _x_test, _y_train, _y_test in splits_dataset:
        print(f"\t\t{idx} time:", end="\t\t")

        _knn = KNNClassifier(k=k)
        
        # PCA train transform
        _pca_train_transform_st = time.time()

        _x_train_transformed = _pca.transform(_x_train, truncate=a)

        _pca_train_transform_et = time.time()

        # PCA test transform
        _pca_test_transform_st = time.time()

        _x_test_transformed = _pca.transform(_x_test, truncate=a)

        _pca_test_transform_et = time.time()
        
        # KNN fit and predict
        _knn_st = time.time()
        
        _knn.fit(_x_train_transformed, _y_train)
        _pred = _knn.predict(_x_test_transformed)

        _knn_et = time.time()

        print(f"{(_knn_et - _knn_st):.4f}")

        _score_metric = accuracy_score(_y_test, _pred)
        _cohen_kappa_metric = cohen_kappa_score(_y_test, _pred)
        _precision_recall_metric = precision_recall_fscore_support(_y_test, _pred)
        _f1_metric = f1_score(_y_test, _pred, average='weighted')
        _confusion_matrix_metric = confusion_matrix(_y_test, _pred)      

        score_values.append(_score_metric)
        cohen_kappa_values.append(_cohen_kappa_metric)
        precision_recall_values.append(_precision_recall_metric)
        f1_values.append(_f1_metric)
        confusion_matrix_values.append(_confusion_matrix_metric)
        knn_time_values.append(_knn_et - _knn_st)
        pca_train_transform_time_values.append(_pca_train_transform_et - _pca_train_transform_st)
        pca_test_transform_time_values.append(_pca_test_transform_et - _pca_test_transform_st)

    print("")

    metrics[f"{k} - {a}"] = dict()
    metrics[f"{k} - {a}"]['score'] = np.mean(score_values)
    metrics[f"{k} - {a}"]['cohen-kappa'] = np.mean(cohen_kappa_values)
    metrics[f"{k} - {a}"]['recall'] = np.mean(precision_recall_values)
    metrics[f"{k} - {a}"]['f1'] = np.mean(f1_values)
    metrics[f"{k} - {a}"]['confusion-matrix'] = confusion_matrix_values
    metrics[f"{k} - {a}"]['knn-time'] = np.mean(knn_time_values)
    metrics[f"{k} - {a}"]['pca-train-transform-time'] = np.mean(pca_train_transform_time_values)
    metrics[f"{k} - {a}"]['pca-test-transform-time'] = np.mean(pca_test_transform_time_values)
    print(f"\tMean time: {np.mean(score_values)}")

In [6]:
grid = {
    "k": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    "alpha": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 153]
}

skf = StratifiedKFold(n_splits=N_SPLITS)
splits = list(skf.split(X, y))

metrics = dict()

splits_dataset = []
pca_times_values = []
print("\tCalculating PCA for splits:")
for idx, (train_index, test_index) in enumerate(splits):
    print(f"\t\t{idx} time:", end="\t\t")
    
    _x_train, _y_train = X[train_index], y[train_index]
    _x_test, _y_test = X[test_index], y[test_index]
    
    # PCA fit with train
    _pca = PCA(alpha=153)
    
    _pca_st = time.time()
            
    _pca.fit(_x_train)
            
    _pca_et = time.time()
    
    pca_times_values.append(_pca_et - _pca_st)
    
    splits_dataset.append(tuple([idx, _pca, _x_train, _x_test, _y_train, _y_test]))

    print(f"{(_pca_et - _pca_st):.4f}")
    
metrics['pca-fit-time'] = np.mean(pca_times_values)

for a in grid["alpha"]:
    for k in grid["k"]:
        grid_search_step(k,a,splits_dataset,metrics)

	Calculating PCA for splits:
		0 time:		148.8692
		1 time:		142.2256
		2 time:		143.9005
		3 time:		146.1806
		4 time:		153.7498
Running gridsearch for: k=2 - alpha=10
	Splits:
		0 time:		25.2077
		1 time:		25.3800
		2 time:		25.4439
		3 time:		25.4514
		4 time:		25.2653

	Mean time: 0.9093809523809524
Running gridsearch for: k=4 - alpha=10
	Splits:
		0 time:		25.2223
		1 time:		25.2092
		2 time:		25.3150
		3 time:		25.5074
		4 time:		25.7419

	Mean time: 0.9244761904761905
Running gridsearch for: k=6 - alpha=10
	Splits:
		0 time:		25.2673
		1 time:		25.3854
		2 time:		25.3434
		3 time:		25.3204
		4 time:		25.5561

	Mean time: 0.9248571428571429
Running gridsearch for: k=8 - alpha=10
	Splits:
		0 time:		25.5884
		1 time:		25.4103
		2 time:		25.6611
		3 time:		25.6375
		4 time:		25.6381

	Mean time: 0.9248571428571429
Running gridsearch for: k=10 - alpha=10
	Splits:
		0 time:		25.2400
		1 time:		25.6307
		2 time:		25.2953
		3 time:		25.5382
		4 time:		25.4433

	Mean time: 0.925476190476

In [7]:
metrics

{'pca-fit-time': 146.98513202667237,
 '2 - 10': {'score': 0.9093809523809524,
  'cohen-kappa': 0.8992820339190011,
  'recall': 210.68125829535683,
  'f1': 0.9093380737769404,
  'confusion-matrix': [array([[785,   0,   3,   5,   2,  15,   9,   1,   5,   2],
          [  0, 916,   5,   4,   2,   0,   3,   2,   2,   3],
          [  3,   2, 790,   9,   4,   3,   3,  10,  11,   0],
          [  2,   4,  11, 733,   4,  31,   2,   2,  70,  11],
          [  1,   1,   5,   0, 701,   5,   5,   6,   2,  88],
          [ 12,   3,   0,  21,   4, 669,  11,   2,  27,  10],
          [  7,   0,   4,   0,   6,   8, 801,   1,   0,   0],
          [  0,   4,  11,   3,   9,   2,   1, 812,   4,  34],
          [  3,   2,   8,  39,   7,  15,   8,   1, 717,  13],
          [  3,   4,   1,  13,  80,   6,   6,  30,  10, 685]]),
   array([[801,   0,   2,   1,   0,  13,   3,   1,   2,   3],
          [  0, 928,   1,   3,   0,   0,   1,   1,   0,   3],
          [  6,   1, 785,   5,   4,   1,   7,  12,  12,   3

# Corremos un gridsearch sobre KNN (parámetro k)

In [4]:
def grid_search_step_without_pca(k, splits_knn, metrics_knn):
    score_values = []
    cohen_kappa_values = []
    precision_recall_values = []
    f1_values = []
    confusion_matrix_values = []
    knn_time_values = []
    print(f"Running gridsearch for: k={k}")

    print("\tSplits:")
    for idx, (train_index, test_index) in enumerate(splits_knn):
        print(f"\t\t{idx} time:", end="\t\t")
        _x_train, _y_train = X[train_index], y[train_index]
        _x_test, _y_test = X[test_index], y[test_index]

        _knn = KNNClassifier(k=k)
        
        # KNN fit and predict
        _knn_st = time.time()
        
        _knn.fit(_x_train, _y_train)
        _pred = _knn.predict(_x_test)

        _knn_et = time.time()

        print(f"{(_knn_et - _knn_st):.4f}")

        _score_metric = accuracy_score(_y_test, _pred)
        _cohen_kappa_metric = cohen_kappa_score(_y_test, _pred)
        _precision_recall_metric = precision_recall_fscore_support(_y_test, _pred)
        _f1_metric = f1_score(_y_test, _pred, average='weighted')
        _confusion_matrix_metric = confusion_matrix(_y_test, _pred)      

        score_values.append(_score_metric)
        cohen_kappa_values.append(_cohen_kappa_metric)
        precision_recall_values.append(_precision_recall_metric)
        f1_values.append(_f1_metric)
        confusion_matrix_values.append(_confusion_matrix_metric)
        knn_time_values.append(_knn_et - _knn_st)

    print("")

    metrics_knn[f"{k}"] = dict()
    metrics_knn[f"{k}"]['score'] = np.mean(score_values)
    metrics_knn[f"{k}"]['cohen-kappa'] = np.mean(cohen_kappa_values)
    metrics_knn[f"{k}"]['recall'] = np.mean(precision_recall_values)
    metrics_knn[f"{k}"]['f1'] = np.mean(f1_values)
    metrics_knn[f"{k}"]['confusion-matrix'] = confusion_matrix_values
    metrics_knn[f"{k}"]['knn-time'] = np.mean(knn_time_values)
    print(f"\tMean time: {np.mean(score_values)}")

In [5]:
grid_knn = {
    "k": [20, 18, 16, 14, 12, 10, 8, 6, 4, 2]
}

skf_knn = StratifiedKFold(n_splits=N_SPLITS)
splits_knn = list(skf_knn.split(X, y))

metrics_knn = dict()

for k in grid_knn["k"]:
    grid_search_step_without_pca(k, splits_knn, metrics_knn)

Running gridsearch for: k=20
	Splits:
		0 time:		397.7364
		1 time:		394.0131
		2 time:		406.2146
		3 time:		386.3378
		4 time:		405.8040

	Mean time: 0.9573095238095238
Running gridsearch for: k=18
	Splits:
		0 time:		417.2139
		1 time:		424.0498
		2 time:		417.1253
		3 time:		410.1973
		4 time:		393.2071

	Mean time: 0.9588809523809523
Running gridsearch for: k=16
	Splits:
		0 time:		387.0089
		1 time:		385.6609
		2 time:		389.2041
		3 time:		396.2202
		4 time:		391.8705

	Mean time: 0.9599285714285715
Running gridsearch for: k=14
	Splits:
		0 time:		384.9052
		1 time:		385.9819
		2 time:		389.6350
		3 time:		393.5244
		4 time:		391.0261

	Mean time: 0.9614285714285714
Running gridsearch for: k=12
	Splits:
		0 time:		393.6519
		1 time:		397.4578
		2 time:		411.8997
		3 time:		389.9731
		4 time:		415.5881

	Mean time: 0.9633333333333333
Running gridsearch for: k=10
	Splits:
		0 time:		416.3848
		1 time:		410.9708
		2 time:		421.7465
		3 time:		399.0204
		4 time:		395.0648

	Mean time:

In [6]:
metrics_knn

{'20': {'score': 0.9573095238095238,
  'cohen-kappa': 0.9525431001038708,
  'recall': 210.71819520762972,
  'f1': 0.9572494070409222,
  'confusion-matrix': [array([[816,   1,   1,   0,   0,   0,   8,   1,   0,   0],
          [  0, 927,   1,   1,   1,   0,   3,   1,   1,   2],
          [  5,  21, 775,   5,   2,   1,   1,  18,   5,   2],
          [  1,   6,   2, 838,   0,   8,   1,   4,   4,   6],
          [  0,   7,   0,   0, 779,   0,   1,   1,   0,  26],
          [  2,   7,   0,   5,   0, 721,  11,   0,   2,  11],
          [  6,   1,   0,   0,   2,   1, 816,   0,   1,   0],
          [  0,  18,   1,   0,   3,   0,   0, 848,   0,  10],
          [  8,  14,   2,  22,   1,  13,   4,   4, 730,  15],
          [  3,   5,   1,   6,   9,   1,   2,  18,   0, 793]]),
   array([[822,   0,   0,   0,   0,   3,   0,   0,   1,   0],
          [  0, 934,   1,   1,   1,   0,   0,   0,   0,   0],
          [  8,  14, 773,   2,   2,   1,   2,  27,   4,   3],
          [  0,   0,   6, 841,   0,  1