In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# !pip install shap
# !pip install hyperopt
# !pip install loguru
# !pip install lightgbm
# !pip install pytorch-tabnet

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Data Management
import yaml
import numpy as np
import pandas as pd

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import gc

# Opti
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *
from match_forecast.formatters import *

[32m2025-04-23 19:21:54.045[0m | [1mINFO    [0m | [36mmatch_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/maichoun/QRT-Challenge-2024[0m


In [4]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
CONFIG_DIR = PROJ_ROOT / "config"
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

In [5]:
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [6]:
train_data.shape

(12303, 275)

In [7]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [9]:
space = {
    'n_d': hp.quniform('n_d', 8, 64, 4),  # Number of decision steps (8-64)
    'n_a': hp.quniform('n_a', 8, 64, 4),  # Number of attention steps (same as n_d)
    'n_steps': hp.quniform('n_steps', 3, 10, 1),  # Number of sequential decision steps
    'gamma': hp.uniform('gamma', 1.0, 2.0),  # Scaling factor for importance weights
    'lambda_sparse': hp.loguniform('lambda_sparse', -8, -2),  # L1 regularization
    'clip_value': hp.uniform('clip_value', 1.0, 5.0),  # Gradient clipping value
    'learning_rate': hp.loguniform('learning_rate', -5, -1),  # Learning rate (1e-5 to 1e-1)
    'n_shared': hp.quniform('n_shared', 1, 5, 1),  # Shared layers in feature transformer
    'n_independent': hp.quniform('n_independent', 1, 5, 1),  # Independent layers in feature transformer
}


In [10]:
n_splits = 3

def objective(params):
    """
    Hyperopt objective for TabNet with:
      - Single StandardScaler applied once
      - CUDA device
      - Verbose disabled
    """
    for key in ['n_d', 'n_a', 'n_steps', 'n_independent', 'n_shared']:
        params[key] = int(params[key])

    scaler = StandardScaler().fit(X_train.values)
    X_train_scaled = scaler.transform(X_train.values)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, valid_idx in skf.split(X_train_scaled, y_train):
        X_tr = X_train_scaled[train_idx]
        y_tr = y_train.values[train_idx]
        X_val = X_train_scaled[valid_idx]
        y_val = y_train.values[valid_idx]

        model = TabNetClassifier(
            n_d=params['n_d'],
            n_a=params['n_a'],
            n_steps=params['n_steps'],
            gamma=params['gamma'],
            lambda_sparse=params['lambda_sparse'],
            optimizer_fn=torch.optim.Adam,
            optimizer_params={'lr': params['learning_rate']},
            n_independent=params['n_independent'],
            n_shared=params['n_shared'],
            clip_value=params['clip_value'],
            verbose=0,
            device_name='cuda' if torch.cuda.is_available() else 'cpu'
        )

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            max_epochs=150,
            patience=30,
            batch_size=128,
            virtual_batch_size=32,
            num_workers=2,
            drop_last=False,
        )

        preds = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, preds))

        # Cleanup
        del model
        torch.cuda.empty_cache()
        gc.collect()

    mean_acc = np.mean(accuracies)
    return {'loss': 1 - mean_acc, 'status': STATUS_OK}


In [11]:
train = False

if train:
    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials
    )

    print("Best hyperparameters:", best_params)

    results = [
        {
            **convert_numpy(trial['result']),
            'params': {key: convert_numpy(value[0]) if value else None for key, value in trial['misc']['vals'].items()}  # Convert hyperparameters
        }
        for trial in trials.trials
    ]

    raw_params = min(results, key=lambda x: x["loss"])['params']
    formatted_params = format_rf(raw_params)

    with open(CONFIG_DIR / "tabnet_params.yaml", "w") as f:
        yaml.dump(formatted_params, f)

In [12]:
with open(CONFIG_DIR / "tabnet_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

In [13]:
cfg

{'clip_value': 2.039306564397546,
 'gamma': 1.0373107804827173,
 'lambda_sparse': 0.0007629251680237833,
 'learning_rate': 0.020892188021323258,
 'n_a': 16,
 'n_d': 40,
 'n_independent': 5,
 'n_shared': 2,
 'n_steps': 5}

In [14]:
tabnet_model = TabNetClassifier(
    **{k: v for k, v in cfg.items() if k != 'learning_rate'},
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': cfg['learning_rate']},
    verbose=0,
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)

scaler = StandardScaler().fit(X_train.values)
X_train_scaled = scaler.transform(X_train.values)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, 
    y_train.values, 
    test_size=0.20, 
    stratify=y_train, 
    random_state=42
)

tabnet_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    max_epochs=150,
    patience=15,
    batch_size=128,
    virtual_batch_size=32,
    num_workers=2,
    drop_last=False,
)

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core i


Early stopping occurred at epoch 25 with best_epoch = 10 and best_val_0_accuracy = 0.49264


In [15]:
y_pred_final = tabnet_model.predict(X_test.values)

final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy (test) : {final_accuracy * 100:.2f}%")

Accuracy (test) : 44.49%
