In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# !pip install shap
# !pip install hyperopt
# !pip install loguru
# !pip install lightgbm
# !pip install catboost
# !pip install pytorch_tabnet

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Core
import yaml
import numpy as np
import pandas as pd
from itertools import combinations

# Modelling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# from pytorch_tabnet.tab_model import TabNetClassifier
# import torch
# import gc

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *

[32m2025-04-23 13:57:13.967[0m | [1mINFO    [0m | [36mmatch_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/maichoun/QRT-Challenge-2024[0m


In [4]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
CONFIG_DIR = PROJ_ROOT / "config"
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

In [5]:
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [6]:
train_data.shape

(12303, 275)

In [7]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [9]:
with open(CONFIG_DIR / "xgb_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_jobs=-1,
    **cfg
)

In [10]:
with open(CONFIG_DIR / "xt_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

xt_model = ExtraTreesClassifier(
    **cfg,
    random_state=42,
    n_jobs=-1
)

In [11]:
with open(CONFIG_DIR / "lgb_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

lgb_model = lgb.LGBMClassifier(
    objective='multiclass', 
    boosting_type='gbdt', 
    num_class=3,
    n_jobs=-1, 
    verbose=-1,
    **cfg 
)

In [12]:
with open(CONFIG_DIR / "rf_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

rf_model = RandomForestClassifier(
    **cfg,
    n_jobs=-1,
    random_state=42
)

In [13]:
with open(CONFIG_DIR / "catboost_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    verbose=False,
    thread_count=-1,
    random_seed=42,
    **cfg
)    

In [14]:
with open(CONFIG_DIR / "logreg_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

logreg_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    C=cfg['C'],
    l1_ratio=cfg['l1_ratio'],
    max_iter=2000,
    random_state=42
)

logreg_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('lr', logreg_model)  
])

In [15]:
with open(CONFIG_DIR / "sgdc_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

sgdc_model = SGDClassifier(
    loss='modified_huber',   # hinge+log hybride, avec proba
    penalty='elasticnet',
    alpha=cfg['alpha'],
    l1_ratio=cfg['l1_ratio'],
    max_iter=1000,
    tol=1e-3,
    n_jobs=-1,
    random_state=42
)

sgdc_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('sgdc', sgdc_model)  
])

In [16]:
with open(CONFIG_DIR / "gnb_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

gnb_model = GaussianNB(var_smoothing=cfg['var_smoothing'], priors=cfg['priors'])

gnb_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('gnb', gnb_model)  
])

In [17]:
with open(CONFIG_DIR / "knn_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

knn_model = KNeighborsClassifier(
    n_neighbors = cfg['n_neighbors'],
    weights     = cfg['weights'],
    algorithm   = cfg['algorithm'],
    leaf_size   = cfg['leaf_size'],
    p           = cfg['p'],
    n_jobs      = -1
)

knn_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('knn', knn_model)  
])

In [18]:
with open(CONFIG_DIR / "lda_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)


lda_model = LDA(solver=cfg['solver'], shrinkage=cfg['shrinkage'])

lda_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('lda', lda_model)  
])

In [19]:
with open(CONFIG_DIR / "qda_params.yaml", "r") as f:
    cfg = yaml.safe_load(f)

qda_model = QDA(reg_param=cfg['reg_param'])

qda_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=cfg['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('qda', qda_model)  
])

In [20]:
# with open("tabnet_model.json", "r") as f:
#     results_with_params = json.load(f)

# tabnet_model = TabNetClassifier(
#     n_d=best_params['n_d'],
#     n_a=best_params['n_a'],
#     n_steps=best_params['n_steps'],
#     gamma=best_params['gamma'],
#     lambda_sparse=best_params['lambda_sparse'],
#     optimizer_fn=torch.optim.Adam,
#     optimizer_params=dict(lr=best_params['learning_rate']),
#     n_independent=best_params['n_independent'],
#     n_shared=best_params['n_shared'],
#     clip_value=best_params['clip_value'],
#     verbose=0,
#     device_name='cuda' if torch.cuda.is_available() else 'cpu'
# )

# scaler = StandardScaler().fit(X_train.values)
# X_train_scaled = scaler.transform(X_train.values)

# X_tr, X_val, y_tr, y_val = train_test_split(
#     X_train_scaled, 
#     y_train.values, 
#     test_size=0.20, 
#     stratify=y_train, 
#     random_state=42
# )

In [21]:
logreg_pipeline.fit(X_train, y_train)
xt_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)
sgdc_pipeline.fit(X_train, y_train)
gnb_pipeline.fit(X_train, y_train)
knn_pipeline.fit(X_train, y_train)
lda_pipeline.fit(X_train, y_train)
qda_pipeline.fit(X_train, y_train)

### Softvote

In [22]:
model_preds = {
    "logreg": logreg_pipeline.predict_proba(X_test),
    "xt": xt_model.predict_proba(X_test),
    "lgb": lgb_model.predict_proba(X_test),
    "xgb": xgb_model.predict_proba(X_test),
    "rf": rf_model.predict_proba(X_test),
    "catboost": catboost_model.predict_proba(X_test),
    "sgdc": sgdc_pipeline.predict_proba(X_test),
    "gnb": gnb_pipeline.predict_proba(X_test),
    #"tabnet": tabnet_model.predict_proba(X_test.values),
    "knn": knn_pipeline.predict_proba(X_test),
    "lda": lda_pipeline.predict_proba(X_test),
    "qda": qda_pipeline.predict_proba(X_test),
}

combi_size = 4
models = list(model_preds.keys())
results = []

# Iterate over all combinations of the given size
for combi in combinations(models, combi_size):
    combi_name = "_".join(combi)
    avg_pred = np.mean([model_preds[m] for m in combi], axis=0)
    y_pred_final = np.argmax(avg_pred, axis=1)
    acc = accuracy_score(y_test, y_pred_final)
    results.append({
        "combination": combi_name,
        "accuracy":    acc,
        "accuracy_pct": round(acc * 100, 2)
    })

# Build and sort DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values("accuracy_pct", ascending=False).reset_index(drop=True)

df_results.head(10)

Unnamed: 0,combination,accuracy,accuracy_pct
0,logreg_xt_sgdc_gnb,0.508736,50.87
1,lgb_rf_gnb_qda,0.507517,50.75
2,lgb_catboost_gnb_qda,0.507111,50.71
3,logreg_catboost_sgdc_gnb,0.507111,50.71
4,xt_catboost_sgdc_gnb,0.507111,50.71
5,lgb_rf_catboost_gnb,0.506705,50.67
6,xt_lgb_gnb_qda,0.506705,50.67
7,logreg_xt_catboost_gnb,0.506705,50.67
8,logreg_xt_catboost_lda,0.506298,50.63
9,logreg_xt_rf_catboost,0.506298,50.63


### STACKING

In [26]:
base_models = [
    ("logreg", logreg_pipeline),
    ("xt", xt_model),
    ("lgb", lgb_model),
    ("xgb", xgb_model),
    ("rf", rf_model),
    ("catboost", catboost_model),
    ("sgdc", sgdc_pipeline),
    ("gnb", gnb_pipeline),
    ("lda", lda_pipeline),
    ("qda", qda_pipeline)
]

In [27]:
# 2) Prepare OOF predictions container
n_samples = X_train.shape[0]
classes = np.unique(y_train)
K = len(base_models)
n_classes = len(classes)
meta_probas = np.zeros((n_samples, K * n_classes))

In [28]:
# 3) Out-of-fold generation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr = y_train.iloc[train_idx]
    for i, (_, model) in enumerate(base_models):
        model.fit(X_tr, y_tr)
        probas = model.predict_proba(X_val)
        meta_probas[valid_idx, i*n_classes:(i+1)*n_classes] = probas

In [29]:
meta_probas[:2, :]

array([[0.43955254, 0.27228951, 0.28815795, 0.41190642, 0.28927287,
        0.29882071, 0.40995258, 0.28191743, 0.30812999, 0.38399145,
        0.29485661, 0.321152  , 0.41145568, 0.2873312 , 0.30121312,
        0.38019004, 0.28569233, 0.33411763, 0.43155299, 0.25779576,
        0.31065124, 0.36443219, 0.27886544, 0.35670236, 0.38855467,
        0.29924796, 0.31219737, 0.40396761, 0.30438426, 0.29164813],
       [0.33314878, 0.30181122, 0.36504   , 0.34937776, 0.29783861,
        0.35278362, 0.36374138, 0.29547725, 0.34078137, 0.31676054,
        0.32172951, 0.36150992, 0.38856817, 0.28549546, 0.32593637,
        0.27570687, 0.33931183, 0.3849813 , 0.34885698, 0.28118875,
        0.36995428, 0.34250121, 0.30853619, 0.34896259, 0.31318806,
        0.32392733, 0.3628846 , 0.31651396, 0.31277776, 0.37070828]])

In [30]:
# 4) Fit meta model
cv_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

meta_lr = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    max_iter=2000,
    random_state=42,
    n_jobs=-1
)

param_dist = {
    'C':          np.logspace(-4, 4, 100),                       # inverse regularization strength
    'l1_ratio':   np.linspace(0, 1, 21),                         # only used when penalty='elasticnet'
}

rs_lr = RandomizedSearchCV(
    estimator           = meta_lr,
    param_distributions = param_dist,
    n_iter              = 100,
    cv                  = cv_splitter,
    scoring             = 'accuracy',
    n_jobs              = -1,
    random_state        = 42,
    verbose             = 0
)

#    meta_probas: array of shape (n_samples, n_models * n_classes)
#    y_train:     array of shape (n_samples,)
rs_lr.fit(meta_probas, y_train)

print("Best meta‑LR params:", rs_lr.best_params_)
print("Best CV accuracy   :", rs_lr.best_score_)

  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


Best meta‑LR params: {'l1_ratio': 0.55, 'C': 1.9179102616724888}
Best CV accuracy   : 0.5004066899096299


In [31]:
meta_model = rs_lr.best_estimator_
meta_model.fit(meta_probas, y_train)

In [32]:
# 5) Build test meta features
n_test = X_test.shape[0]
meta_test = np.zeros((n_test, K * n_classes))
for i, (_, model) in enumerate(base_models):
    model.fit(X_train, y_train)  # retrain on full train
    meta_test[:, i*n_classes:(i+1)*n_classes] = model.predict_proba(X_test)

In [33]:
# 6) Final predictions
y_pred = meta_model.predict(meta_test)
print(f"Stacked model accuracy on test: {accuracy_score(y_test, y_pred) * 100:.2f}%")

Stacked model accuracy on test: 49.98%


## Predictions

In [112]:
try:
    test_data = pd.read_csv(PROCESSED_DATA_DIR / "test_data.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [113]:
test_data.shape

(25368, 275)

In [114]:
y_pred_logreg = logreg_pipeline.predict_proba(test_data)
y_pred_sgdc = sgdc_pipeline.predict_proba(test_data)
y_pred_xgb = xgb_model.predict_proba(test_data)
y_pred_xt = xt_model.predict_proba(test_data)
y_pred_rf = rf_model.predict_proba(test_data)
y_pred_lgb = lgb_model.predict_proba(test_data)
y_pred_catboost = catboost_model.predict_proba(test_data)
y_pred_gnb = gnb_pipeline.predict_proba(test_data)

y_pred_avg = (y_pred_rf + y_pred_gnb + y_pred_logreg + y_pred_xgb + y_pred_lgb) / 5
y_sub = np.argmax(y_pred_avg, axis=1)

In [115]:
y_sub_df = pd.DataFrame(y_sub, columns=['PRED'])

# one-hot encoding
y_sub_df['HOME_WINS'] = (y_sub_df['PRED'] == 0).astype(int)
y_sub_df['DRAW'] = (y_sub_df['PRED'] == 1).astype(int)
y_sub_df['AWAY_WINS'] = (y_sub_df['PRED'] == 2).astype(int)

# processing
y_sub_df['ID'] = test_data.index
y_sub_df.drop('PRED', axis=1, inplace=True)
y_sub_df = y_sub_df[['ID', 'HOME_WINS', 'DRAW', 'AWAY_WINS']]
y_sub_df = y_sub_df.set_index('ID')

y_sub_df.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12303,0,0,1
12304,0,0,1
12305,1,0,0
12306,1,0,0
12307,1,0,0


In [117]:
# Store sub data

sub_data_path = PROCESSED_DATA_DIR / "y_sub.csv"
y_sub_df.to_csv(sub_data_path, index=True)