In [36]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import warnings
warnings.filterwarnings("ignore")

# Data Management
import json
import numpy as np
import pandas as pd

# Modelling
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

# Opti
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *

In [38]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

In [39]:
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [40]:
train_data.shape

(12303, 275)

In [41]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [42]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [43]:
space = {
    'n_components': hp.quniform('n_components', 2, 20, 2),
    'n_neighbors':  hp.quniform('n_neighbors', 1, 100, 2),
    'weights':      hp.choice('weights', ['uniform', 'distance']),
    'algorithm':    hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
    'leaf_size':    hp.quniform('leaf_size', 10, 200, 10),
    'p':            hp.uniform('p', 1.0, 2.0)
}

In [44]:
n_splits = 10

def objective(params):
    """
    Hyperopt objective for KNN with:
      - 1st StandardScaler → PCA → 2nd StandardScaler
      - Stratified 3-fold CV
    """
    # Cast integer parameters
    n_comp     = int(params['n_components'])
    n_neighbors= int(params['n_neighbors'])
    leaf_size  = int(params['leaf_size'])
    weights    = params['weights']
    algorithm  = params['algorithm']
    p          = params['p']

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, valid_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        # 1) First scaling
        scaler1 = StandardScaler()
        X_tr_s  = scaler1.fit_transform(X_tr)
        X_val_s = scaler1.transform(X_val)

        # 2) PCA reduction
        pca = PCA(n_components=n_comp, random_state=42)
        X_tr_p  = pca.fit_transform(X_tr_s)
        X_val_p = pca.transform(X_val_s)

        # 3) Second scaling
        scaler2 = StandardScaler()
        X_tr_f  = scaler2.fit_transform(X_tr_p)
        X_val_f = scaler2.transform(X_val_p)

        # 4) KNN classifier
        model = KNeighborsClassifier(
            n_neighbors = n_neighbors,
            weights     = weights,
            algorithm   = algorithm,
            leaf_size   = leaf_size,
            p           = p,
            n_jobs      = -1
        )
        model.fit(X_tr_f, y_tr)

        # 5) Evaluate
        y_pred = model.predict(X_val_f)
        accuracies.append(accuracy_score(y_val, y_pred))

    return {'loss': 1 - np.mean(accuracies), 'status': STATUS_OK}

In [45]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  
            trials=trials)

print("Best hyperparameters:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [02:59<00:00,  1.79s/trial, best loss: 0.5127001855967965]
Best hyperparameters: {'algorithm': 2, 'leaf_size': 140.0, 'n_components': 4.0, 'n_neighbors': 80.0, 'p': 1.8386988365354668, 'weights': 1}


In [46]:
# Convert NumPy types to native Python types for JSON serialization
def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)  
    elif isinstance(obj, np.floating):
        return float(obj)  
    elif isinstance(obj, list):  
        return [convert_numpy(i) for i in obj]  
    elif isinstance(obj, dict):  
        return {key: convert_numpy(value) for key, value in obj.items()}  
    else:
        return obj

results_with_params = [
    {
        **convert_numpy(trial['result']),
        'params': {key: convert_numpy(value[0]) if value else None for key, value in trial['misc']['vals'].items()}  # Convert hyperparameters
    }
    for trial in trials.trials
]

with open('knn_model.json', 'w') as f:
    json.dump(results_with_params, f, indent=4)

print("Optimization results saved to 'knn_model.json'")

Optimization results saved to 'knn_model.json'


In [47]:
with open("knn_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'algorithm': 2, 'leaf_size': 140.0, 'n_components': 4.0, 'n_neighbors': 80.0, 'p': 1.8386988365354668, 'weights': 1}
Best loss: 0.5127001855967965


In [48]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()

train_data_scaled = scaler1.fit_transform(train_data)

pca_object = PCA(n_components=int(best_trial["params"]['n_components']))
pca = pca_object.fit_transform(train_data_scaled)
pca_scaled = scaler2.fit_transform(pca)

n_components = pca_object.n_components_
train_data_reduced = pd.DataFrame(pca, columns=[f"PC{i+1}" for i in range(n_components)], index=train_data.index)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(train_data_reduced, train_scores_1c, train_size=0.8, random_state=42)

In [51]:
best = best_trial['params']

weights_list   = ['uniform', 'distance']
algorithm_list = ['auto', 'ball_tree', 'kd_tree', 'brute']

n_components = int(best['n_components'])
n_neighbors  = int(best['n_neighbors'])
leaf_size    = int(best['leaf_size'])
weights      = weights_list[int(best['weights'])]
algorithm    = algorithm_list[int(best['algorithm'])]
p            = best['p']


final_model = KNeighborsClassifier(
    n_neighbors = n_neighbors,
    weights     = weights,
    algorithm   = algorithm,
    leaf_size   = leaf_size,
    p           = p,
    n_jobs      = -1
)

final_model.fit(X_train, y_train)

In [52]:
y_pred_final = final_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy (test) : {final_accuracy * 100:.2f}%")

Accuracy (test) : 48.23%
