In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# !pip install shap
# !pip install hyperopt
# !pip install loguru

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Data Management
import json
import numpy as np
import pandas as pd

# Modelling
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import SVC

# Hyperparameter optimization
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *


[32m2025-04-18 19:05:56.876[0m | [1mINFO    [0m | [36mmatch_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/maichoun/QRT-Challenge-2024[0m


In [4]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [5]:
train_data.shape

(12303, 350)

In [6]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [8]:
space = {
    'n_components': hp.quniform('n_components', 10, 25, 5),
    'C':             hp.loguniform('C', np.log(1e-4), np.log(1e4)),
    'kernel':        hp.choice('kernel', ['linear', 'rbf']),
    'gamma':         hp.choice('gamma', [
                          'scale',
                          'auto',
                          hp.loguniform('gamma_real', np.log(1e-4), np.log(1e0))
                      ]),
}

In [9]:
n_splits = 3

def objective(params):
    # 2) Extract & cast
    n_components = int(params.pop('n_components'))
    C            = params['C']
    kernel       = params['kernel']
    gamma        = params['gamma']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, valid_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        # Scale → PCA → Scale
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr);   X_val_s = scaler.transform(X_val)
        pca    = PCA(n_components=n_components)
        X_tr_p = pca.fit_transform(X_tr_s);    X_val_p = pca.transform(X_val_s)
        scaler2= StandardScaler()
        X_tr_f = scaler2.fit_transform(X_tr_p);X_val_f = scaler2.transform(X_val_p)

        # Train & evaluate SVC
        model = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)
        model.fit(X_tr_f, y_tr)
        accuracies.append(accuracy_score(y_val, model.predict(X_val_f)))

    return {'loss': 1 - np.mean(accuracies), 'status': STATUS_OK}


In [10]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,  
            trials=trials)

print("Best hyperparameters:", best)

100%|██████████| 5/5 [00:49<00:00,  9.82s/trial, best loss: 0.5021332480926151]
Best hyperparameters: {'C': 0.2841603561910054, 'gamma': 2, 'gamma_real': 0.3262070517651283, 'kernel': 0, 'n_components': 20.0}


In [11]:
# Convert NumPy types to native Python types for JSON serialization
def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)  
    elif isinstance(obj, np.floating):
        return float(obj)  
    elif isinstance(obj, list):  
        return [convert_numpy(i) for i in obj]  
    elif isinstance(obj, dict):  
        return {key: convert_numpy(value) for key, value in obj.items()}  
    else:
        return obj

results_with_params = [
    {
        **convert_numpy(trial['result']),
        'params': {key: convert_numpy(value[0]) if value else None for key, value in trial['misc']['vals'].items()}  # Convert hyperparameters
    }
    for trial in trials.trials
]

with open('svm_model.json', 'w') as f:
    json.dump(results_with_params, f, indent=4)

print("Optimization results saved to 'svm_model.json'")

Optimization results saved to 'svm_model.json'


In [12]:
with open("svm_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'C': 0.2841603561910054, 'gamma': 2, 'gamma_real': 0.3262070517651283, 'kernel': 0, 'n_components': 20.0}
Best loss: 0.5021332480926151


In [17]:
best_params = best_trial['params'].copy()

best_params['n_components'] = int(best_params['n_components'])
best_params['C']            = float(best_params['C'])

kernel_options = ['linear', 'rbf']
if isinstance(best_params.get('kernel'), (int, np.integer, float)):
    best_params['kernel'] = kernel_options[int(best_params['kernel'])]

gamma_opt = best_params.get('gamma')
gamma_options = ['scale', 'auto']
if isinstance(gamma_opt, (int, np.integer, float)) and gamma_opt in (0, 1):
    best_params['gamma'] = gamma_options[int(gamma_opt)]
else:
    best_params['gamma'] = float(gamma_opt)

In [18]:
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)

pca_object = PCA(n_components=int(best_params['n_components']))
pca = pca_object.fit_transform(train_data_scaled)
pca_scaled = scaler.fit_transform(pca)

n_components = pca_object.n_components_
train_data_reduced = pd.DataFrame(pca, columns=[f"PC{i+1}" for i in range(n_components)], index=train_data.index)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(train_data_reduced, train_scores_1c, train_size=0.8, random_state=42)

In [20]:
final_model = SVC(
    C=best_params['C'],
    kernel=best_params['kernel'],
    gamma=best_params['gamma'],
    random_state=42,
    # if you’re using sklearnex patch you get n_jobs automatically
)

final_model.fit(X_train, y_train)

In [21]:
y_pred_final = final_model.predict(X_test)

final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy (test) : {final_accuracy * 100:.2f}%")

Accuracy (test) : 49.37%
