In [8]:
import pandas as pd

from sklearn.metrics import accuracy_score, make_scorer, precision_recall_fscore_support, cohen_kappa_score, precision_recall_fscore_support, f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from joblib import Parallel, delayed

import sys
import time

sys.path.insert(1, "../src")
from knn import *
from pca import *

N_SPLITS = 2

train = pd.read_csv("../data/train.csv")[:1000]

X = train.drop(columns='label').to_numpy()
y = train.label.to_numpy()

# Corremos un gridsearch sobre los parámetros de PCA+KNN (parámetros k y α)

In [9]:
def grid_search_step(k, a, metrics):
    score_values = []
    cohen_kappa_values = []
    precision_recall_values = []
    f1_values = []
    confusion_matrix_values = []
    time_values = []
    print(f"Running gridsearch for: k={k} - alpha={a}")

    print("\tSplits:")
    for idx, (train_index, test_index) in enumerate(splits):
        print(f"\t\t{idx} time:", end="\t\t")
        _x_train, _y_train = X[train_index], y[train_index]
        _x_test, _y_test = X[test_index], y[test_index]

        _pca, _knn = PCA(alpha=a), KNNClassifier(k=k)

        _st = time.time()

        _pca.fit(_x_train)
        _x_train_transformed = _pca.transform(_x_train)
        _knn.fit(_x_train_transformed, _y_train)
        _x_test_transformed = _pca.transform(_x_test)
        _pred = _knn.predict(_x_test_transformed)

        _et = time.time()

        print(f"{(_et - _st):.4f}")

        _score_metric = accuracy_score(_y_test, _pred)
        _cohen_kappa_metric = cohen_kappa_score(_y_test, _pred)
        _precision_recall_metric = precision_recall_fscore_support(_y_test, _pred)
        _f1_metric = f1_score(_y_test, _pred, average='weighted')
        _confusion_matrix_metric = confusion_matrix(_y_test, _pred)      

        score_values.append(_score_metric)
        cohen_kappa_values.append(_cohen_kappa_metric)
        precision_recall_values.append(_precision_recall_metric)
        f1_values.append(_f1_metric)
        confusion_matrix_values.append(_confusion_matrix_metric)
        time_values.append(_et - _st)

    print("")

    metrics[f"{k} - {a}"] = dict()
    metrics[f"{k} - {a}"]['score'] = np.mean(score_values)
    metrics[f"{k} - {a}"]['cohen-kappa'] = np.mean(cohen_kappa_values)
    metrics[f"{k} - {a}"]['recall'] = np.mean(precision_recall_values)
    metrics[f"{k} - {a}"]['f1'] = np.mean(f1_values)
    metrics[f"{k} - {a}"]['confusion-matrix'] = confusion_matrix_values
    metrics[f"{k} - {a}"]['time'] = np.mean(time_values)
    print(f"\tMean time: {np.mean(score_values)}")

In [11]:
grid = {
#       "k": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
#       "alpha": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 153]
    "k": [2],
      "alpha": [1,2,5]
}

skf = StratifiedKFold(n_splits=N_SPLITS)
splits = list(skf.split(X, y))

metrics = dict()

for k in grid["k"]:
    for a in grid["alpha"]:
        grid_search_step(k,a,metrics)

Running gridsearch for: k=2 - alpha=1
	Splits:
		0 time:		0.9044
		1 time:		0.7255

	Mean time: 0.217
Running gridsearch for: k=2 - alpha=2
	Splits:
		0 time:		0.9661
		1 time:		1.0620

	Mean time: 0.36
Running gridsearch for: k=2 - alpha=5
	Splits:
		0 time:		2.0247
		1 time:		2.2874

	Mean time: 0.626


In [12]:
metrics

{'2 - 1': {'score': 0.217,
  'cohen-kappa': 0.1285753377436878,
  'recall': 12.658599553921587,
  'f1': 0.21276196928464403,
  'confusion-matrix': [array([[30,  0,  5,  2,  1,  4,  7,  2,  1,  2],
          [ 0, 34,  0,  1,  0,  1,  0,  4,  0,  8],
          [ 8,  3,  7,  5,  4,  7,  7,  7, 11,  3],
          [ 1,  1,  6,  3,  5,  6,  9,  4,  6,  4],
          [ 1,  3,  8, 10,  5,  7,  7,  4,  3,  3],
          [ 5,  2,  6,  4,  8,  3,  6,  3,  3,  5],
          [ 8,  2,  9,  5,  3,  2,  5,  3,  6,  5],
          [ 1,  6,  6,  4,  3,  5,  2,  9,  8,  8],
          [ 1,  1,  8,  4,  6,  5,  6,  5,  4,  6],
          [ 2,  2,  4,  4,  7,  5,  3,  7,  5, 10]]),
   array([[34,  0,  4,  5,  1,  1,  4,  0,  2,  2],
          [ 0, 32,  3,  1,  0,  3,  1,  3,  2,  3],
          [11,  0, 10,  7,  8,  7,  5,  4,  6,  4],
          [ 3,  1, 11,  5,  4,  6,  5,  2,  5,  3],
          [ 1,  1, 13,  7,  5,  2,  7,  3,  4,  8],
          [ 5,  0, 10,  3,  4,  4,  5,  4,  1,  8],
          [ 7,  1, 12

# Corremos un gridsearch sobre KNN (parámetro k)

In [None]:
471