In [1]:
import sys
from pathlib import Path

# Locate the repository root from the kernel's working directory so that the
# local `match_forecast` package can be imported regardless of whether the
# notebook is launched from `notebooks/` or from a numbered sub-folder of it.
PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name.startswith("0"):
    # Numbered sub-folder: the root is two levels up.
    PROJECT_ROOT = PROJECT_ROOT.parent.parent
elif PROJECT_ROOT.name == "notebooks":
    # Directly inside `notebooks/`: the root is one level up.
    PROJECT_ROOT = PROJECT_ROOT.parent

# Register the root on the import path exactly once.
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data Management
import yaml
import numpy as np
import pandas as pd

# Modelling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Opti
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *
from match_forecast.formatters import *


[32m2025-04-23 19:11:06.925[0m | [1mINFO    [0m | [36mmatch_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/maichoun/QRT-Challenge-2024[0m


In [3]:
# Reuse the repository root detected in the first cell instead of deriving it
# again from the working directory: the previous expression simply resolved to
# "parent of the cwd", which is only correct when the kernel is started exactly
# one level below the repository root.
PROJ_ROOT = PROJECT_ROOT

# Project layout used throughout the notebook.
CONFIG_DIR = PROJ_ROOT / "config"
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

In [4]:
# Load the processed feature matrix and the raw targets, both indexed by their
# first CSV column.
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
except FileNotFoundError as e:
    # Every later cell needs these frames, so fail here with a clear message
    # instead of printing and continuing into a NameError (or, worse, silently
    # reusing stale variables left in the kernel by a previous run).
    raise FileNotFoundError(
        f"Could not load the training data ({e}). "
        f"Expected files under {PROCESSED_DATA_DIR} and {RAW_DATA_DIR}."
    ) from e

print("Files loaded")

Files loaded


In [5]:
# Sanity check: (rows, columns) of the processed training features just loaded.
train_data.shape

(12303, 275)

In [6]:
# Integer code assigned to each one-hot outcome column.
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}

# Keep only the targets whose ID is present in the feature matrix, in the
# same row order as the features.
train_scores = train_scores.loc[train_data.index]

# Collapse the one-hot outcome columns into a single label per match (the name
# of the column holding the row maximum), then encode that label as an integer.
train_scores_1c = (
    train_scores[list(label_mapping)]
    .idxmax(axis=1)
    .replace(label_mapping)
)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [7]:
# Hold out 20% of the matches for the final evaluation; the remaining 80% is
# what the hyper-parameter search and the final fit are allowed to see.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_scores_1c,
    train_size=0.8,
    random_state=42,
)

In [8]:
# Hyperopt search space for the PCA + LDA pipeline.
space = dict(
    # Number of PCA components: 5, 10, ..., 100 (returned by hyperopt as a float).
    n_components=hp.quniform('n_components', 5, 100, 5),
    # Only the LDA solvers that accept a shrinkage setting.
    solver=hp.choice('solver', ['lsqr', 'eigen']),
    # No shrinkage, automatic shrinkage, or an explicit coefficient.
    shrinkage=hp.choice(
        'shrinkage',
        [None, 'auto', hp.uniform('shrinkage_val', 1e-3, 1.0)],
    ),
)


In [9]:
n_splits = 10

def objective(params):
    """
    Hyperopt objective for an LDA classifier behind a PCA front-end.

    Each candidate is scored with stratified ``n_splits``-fold cross-validation
    (shuffled, seed 42) on the module-level ``X_train`` / ``y_train``. Inside
    every fold the full chain
    StandardScaler → PCA → StandardScaler → LinearDiscriminantAnalysis
    is fitted on the training part only, so no statistic of the validation
    part leaks into the preprocessing.

    Parameters
    ----------
    params : dict
        Sampled point of the search space with keys ``'n_components'``
        (cast to ``int`` here because hyperopt's ``quniform`` yields floats),
        ``'solver'`` and ``'shrinkage'``.

    Returns
    -------
    dict
        ``{'loss': 1 - mean fold accuracy, 'status': STATUS_OK}``.
    """
    n_comp = int(params['n_components'])
    solver = params['solver']
    shrinkage = params['shrinkage']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, valid_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        # Same four steps as the final model, expressed as a Pipeline so the
        # tuned and the deployed preprocessing cannot drift apart.
        fold_pipeline = Pipeline([
            ('scaler1', StandardScaler()),
            ('pca', PCA(n_components=n_comp, random_state=42)),
            ('scaler2', StandardScaler()),
            ('lda', LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)),
        ])
        fold_pipeline.fit(X_tr, y_tr)

        # For a classifier, Pipeline.score is the mean accuracy on (X, y).
        accuracies.append(fold_pipeline.score(X_val, y_val))

    # Hyperopt minimises, so report the error rate.
    return {'loss': 1 - np.mean(accuracies), 'status': STATUS_OK}


In [10]:
# Flip to True to re-run the search and overwrite config/lda_params.yaml.
train = False

if train:
    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=10,
                trials=trials)

    print("Best hyperparameters:", best)

    # `best` (like trial['misc']['vals']) stores every hp.choice parameter as
    # the *index* of the selected option, not the option itself. Persisting
    # those indices is how `shrinkage: 1` (index of 'auto') ended up being
    # used as a literal shrinkage coefficient. space_eval maps the indices
    # back to the real values of the search space.
    best_params = space_eval(space, best)

    shrinkage = best_params['shrinkage']
    if shrinkage is not None and not isinstance(shrinkage, str):
        # Numeric branch of the choice: store a plain float so the YAML stays
        # loadable with yaml.safe_load (no numpy scalar tags).
        shrinkage = float(shrinkage)

    formatted_params = {
        'n_components': int(best_params['n_components']),
        'solver': best_params['solver'],
        'shrinkage': shrinkage,
        # Kept so the file has the same keys as before.
        'shrinkage_val': shrinkage if isinstance(shrinkage, float) else None,
    }

    with open(CONFIG_DIR / "lda_params.yaml", "w") as f:
        yaml.dump(formatted_params, f)

In [11]:
# Read back the persisted LDA hyper-parameters (safe_load: plain YAML types only).
cfg = yaml.safe_load((CONFIG_DIR / "lda_params.yaml").read_text())

In [12]:
# Display the hyper-parameters loaded from config/lda_params.yaml.
cfg

{'n_components': 65, 'shrinkage': 1, 'shrinkage_val': None, 'solver': 'eigen'}

In [13]:
# Final classifier, configured from the tuned hyper-parameters.
# NOTE(review): the stored config shows `shrinkage: 1` with `shrinkage_val: None`,
# which looks like the hp.choice index for 'auto' rather than a real
# coefficient — confirm the YAML holds actual values before trusting the score.
lda_model = LinearDiscriminantAnalysis(solver=cfg['solver'], shrinkage=cfg['shrinkage'])

# Same preprocessing chain as the one cross-validated in `objective`.
# PCA is seeded with the same random_state (42) used during the search so the
# final fit is reproducible and consistent with the tuned configuration.
lda_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'], random_state=42)),
    ('scaler2', StandardScaler()),
    ('lda', lda_model)
])

In [14]:
# Fit every pipeline step (scaling, PCA, scaling, LDA) on the training split only.
lda_pipeline.fit(X_train, y_train)

In [15]:
# Predict on the split held out by train_test_split; the pipeline applies the
# transformations fitted on X_train before classifying.
y_pred_final = lda_pipeline.predict(X_test)

# Fraction of held-out matches whose predicted label matches y_test, shown as a percentage.
final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy (test) : {final_accuracy * 100:.2f}%")

Accuracy (test) : 49.57%
