In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# !pip install shap
# !pip install hyperopt
# !pip install loguru
# !pip install lightgbm
# !pip install pytorch-tabnet


In [12]:
import warnings
warnings.filterwarnings("ignore")

# Data Management
import json
import numpy as np
import pandas as pd

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import gc

# Opti
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *

In [13]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

In [14]:
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [15]:
train_data.shape

(12303, 275)

In [16]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [18]:
space = {
    'n_d': hp.quniform('n_d', 8, 64, 4),  # Number of decision steps (8-64)
    'n_a': hp.quniform('n_a', 8, 64, 4),  # Number of attention steps (same as n_d)
    'n_steps': hp.quniform('n_steps', 3, 10, 1),  # Number of sequential decision steps
    'gamma': hp.uniform('gamma', 1.0, 2.0),  # Scaling factor for importance weights
    'lambda_sparse': hp.loguniform('lambda_sparse', -8, -2),  # L1 regularization
    'clip_value': hp.uniform('clip_value', 1.0, 5.0),  # Gradient clipping value
    'learning_rate': hp.loguniform('learning_rate', -5, -1),  # Learning rate (1e-5 to 1e-1)
    'n_shared': hp.quniform('n_shared', 1, 5, 1),  # Shared layers in feature transformer
    'n_independent': hp.quniform('n_independent', 1, 5, 1),  # Independent layers in feature transformer
}


In [None]:
n_splits = 3

def objective(params):
    """
    Hyperopt objective for TabNet with:
      - Single StandardScaler applied once
      - CUDA device
      - Verbose disabled
    """
    for key in ['n_d', 'n_a', 'n_steps', 'n_independent', 'n_shared']:
        params[key] = int(params[key])

    scaler = StandardScaler().fit(X_train.values)
    X_train_scaled = scaler.transform(X_train.values)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, valid_idx in skf.split(X_train_scaled, y_train):
        X_tr = X_train_scaled[train_idx]
        y_tr = y_train.values[train_idx]
        X_val = X_train_scaled[valid_idx]
        y_val = y_train.values[valid_idx]

        model = TabNetClassifier(
            n_d=params['n_d'],
            n_a=params['n_a'],
            n_steps=params['n_steps'],
            gamma=params['gamma'],
            lambda_sparse=params['lambda_sparse'],
            optimizer_fn=torch.optim.Adam,
            optimizer_params={'lr': params['learning_rate']},
            n_independent=params['n_independent'],
            n_shared=params['n_shared'],
            clip_value=params['clip_value'],
            verbose=0,
            device_name='cuda' if torch.cuda.is_available() else 'cpu'
        )

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            max_epochs=150,
            patience=30,
            batch_size=128,
            virtual_batch_size=32,
            num_workers=2,
            drop_last=False,
        )

        preds = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, preds))

        # Cleanup
        del model
        torch.cuda.empty_cache()
        gc.collect()

    mean_acc = np.mean(accuracies)
    return {'loss': 1 - mean_acc, 'status': STATUS_OK}


In [26]:
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

print("Best hyperparameters:", best_params)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

                                                      
Early stopping occurred at epoch 53 with best_epoch = 23 and best_val_0_accuracy = 0.4904
                                                      
Early stopping occurred at epoch 57 with best_epoch = 27 and best_val_0_accuracy = 0.48034
                                                      
Early stopping occurred at epoch 65 with best_epoch = 35 and best_val_0_accuracy = 0.48933
                                                                                     
Early stopping occurred at epoch 83 with best_epoch = 53 and best_val_0_accuracy = 0.49649
                                                                                     
Early stopping occurred at epoch 45 with best_epoch = 15 and best_val_0_accuracy = 0.48827
                                                                                     
Early stopping occurred at epoch 138 with best_epoch = 108 and best_val_0_accuracy = 0.5
                                  

In [27]:
# Convert NumPy types to native Python types for JSON serialization
def convert_numpy(obj):
    if isinstance(obj, np.integer):
        return int(obj)  
    elif isinstance(obj, np.floating):
        return float(obj)  
    elif isinstance(obj, list):  
        return [convert_numpy(i) for i in obj]  
    elif isinstance(obj, dict):  
        return {key: convert_numpy(value) for key, value in obj.items()}  
    else:
        return obj

results_with_params = [
    {
        **convert_numpy(trial['result']),
        'params': {key: convert_numpy(value[0]) if value else None for key, value in trial['misc']['vals'].items()}  # Convert hyperparameters
    }
    for trial in trials.trials
]

with open('tabnet_model.json', 'w') as f:
    json.dump(results_with_params, f, indent=4)

print("Optimization results saved to 'tabnet_model.json'")

Optimization results saved to 'tabnet_model.json'


In [28]:
with open("tabnet_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])

best_params = best_trial['params']
best_params['n_d'] = int(best_params['n_d'])
best_params['n_a'] = int(best_params['n_a'])
best_params['n_steps'] = int(best_params['n_steps'])
best_params['n_shared'] = int(best_params['n_shared'])
best_params['n_independent'] = int(best_params['n_independent'])

final_model = TabNetClassifier(
    n_d=best_params['n_d'],
    n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    gamma=best_params['gamma'],
    lambda_sparse=best_params['lambda_sparse'],
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_params['learning_rate']),
    n_independent=best_params['n_independent'],
    n_shared=best_params['n_shared'],
    clip_value=best_params['clip_value'],
    verbose=0,
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)


Best trial
Best hyperparameters: {'clip_value': 2.039306564397546, 'gamma': 1.0373107804827173, 'lambda_sparse': 0.0007629251680237833, 'learning_rate': 0.020892188021323258, 'n_a': 16.0, 'n_d': 40.0, 'n_independent': 5.0, 'n_shared': 2.0, 'n_steps': 5.0}
Best loss: 0.5050797521080972


In [29]:
scaler = StandardScaler().fit(X_train.values)
X_train_scaled = scaler.transform(X_train.values)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, 
    y_train.values, 
    test_size=0.20, 
    stratify=y_train, 
    random_state=42
)

final_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    max_epochs=150,
    patience=30,
    batch_size=128,
    virtual_batch_size=32,
    num_workers=2,
    drop_last=False,
)


Early stopping occurred at epoch 83 with best_epoch = 53 and best_val_0_accuracy = 0.50025


In [30]:
y_pred_final = final_model.predict(X_test.values)

final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Accuracy (test) : {final_accuracy * 100:.2f}%")

Accuracy (test) : 47.09%
