In [1]:
import sys
from pathlib import Path

# Work out the project root from the directory the kernel was started in:
#   - started in "notebooks"            -> one level up
#   - started in a folder named "0..."  -> two levels up
#   - anything else                      -> the start directory itself
PROJECT_ROOT = Path().resolve()
_start_dir_name = PROJECT_ROOT.name
_levels_up = 1 if _start_dir_name == "notebooks" else (2 if _start_dir_name.startswith("0") else 0)
for _hop in range(_levels_up):
    PROJECT_ROOT = PROJECT_ROOT.parent

# Make the project importable, without adding a duplicate entry on re-runs.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# Optional one-off environment setup, intentionally left commented out.
# Uncomment a line to install the package; `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
# %pip install shap
# %pip install hyperopt
# %pip install loguru
# %pip install lightgbm
# %pip install catboost

In [31]:
import warnings
warnings.filterwarnings("ignore")

# Core
import json
import numpy as np
import pandas as pd
from itertools import combinations

# Modelling
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Imported explicitly so the notebook does not rely on the star import below for it.
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *

In [4]:
# Data locations. The relative path "QRT-Challenge-2024" is resolved against the
# current working directory, so `.parents[1]` is the parent of that directory.
# NOTE(review): this is derived independently of PROJECT_ROOT from the first
# cell — confirm both point at the same folder for every launch location.
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

# Load the processed features and the raw targets (first CSV column is the index).
try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
except FileNotFoundError as e:
    # Fail fast: every later cell needs both frames, and continuing would either
    # raise a confusing NameError or silently reuse stale frames left in the kernel.
    print(e)
    raise
else:
    print("Files loaded")

Files loaded


In [5]:
# Sanity check: (n_rows, n_columns) of the loaded training features.
train_data.shape

(12303, 275)

In [6]:
# Keep only the targets whose index appears in train_data, in the same order.
train_scores = train_scores.loc[train_data.index]

# Collapse the three one-hot outcome columns into a single class code per match:
# idxmax picks the column name holding the row maximum, then the name is mapped
# to its integer code.
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = (
    train_scores[list(label_mapping)]
    .idxmax(axis=1)
    # `.map` yields integer codes directly; `.replace` on an object Series relies
    # on pandas' deprecated silent downcasting to end up with an integer dtype.
    .map(label_mapping)
)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [7]:
# Hold out 20% of the labelled matches for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_scores_1c,
    train_size=0.8,
    random_state=42,
)

In [8]:
# Read the recorded XGBoost trials and keep the entry with the smallest "loss".
with open("xgb_model.json", "r") as trials_file:
    results_with_params = json.loads(trials_file.read())

best_trial = min(results_with_params, key=lambda trial: trial["loss"])

print("\nBest trial")
print("xgb:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
xgb: {'colsample_bytree': 0.858637891200674, 'gamma': 3.811536040557793, 'learning_rate': 0.004181656803679312, 'max_depth': 16.0, 'min_child_weight': 2.0, 'n_estimators': 1900.0, 'reg_alpha': 0.7581185431863467, 'reg_lambda': 0.6869494596344811, 'subsample': 0.709156755556148}
Best loss: 0.5034541694853096


In [9]:
# The stored trial holds these hyperparameters as floats; cast them to int
# (in place, on the selected trial's params dict) before building the model.
xgb_params = best_trial["params"]
for int_key in ('max_depth', 'n_estimators', 'min_child_weight'):
    xgb_params[int_key] = int(xgb_params[int_key])

# 'multi:softprob' is the objective that outputs per-class probabilities, which
# is what the soft-voting and submission cells consume via predict_proba;
# 'multi:softmax' is the label-output variant of the same loss.
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    n_jobs=-1,
    **xgb_params
)

In [10]:
# Read the recorded ExtraTrees trials and keep the entry with the smallest "loss".
with open("xt_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
# Label fixed: these are the ExtraTrees ("xt") trials, not the random forest ones.
print("xt:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
rf: {'class_weight': 0, 'criterion': 0, 'max_depth': 21.0, 'max_features': 0.48300466724197205, 'max_leaf_nodes': 580.0, 'min_impurity_decrease': 0.0003126137488028501, 'min_samples_leaf': 17.0, 'min_samples_split': 16.0, 'n_estimators': 750.0}
Best loss: 0.5056895701538546


In [11]:
# Normalise the selected ExtraTrees trial in place: floats -> ints, and
# integer option indices -> the option they index.
xt_params = best_trial['params']

for int_key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'):
    xt_params[int_key] = int(xt_params[int_key])

# max_leaf_nodes may legitimately be None; only cast when a value is present.
if xt_params['max_leaf_nodes'] is not None:
    xt_params['max_leaf_nodes'] = int(xt_params['max_leaf_nodes'])

class_weight_options = [None, 'balanced', 'balanced_subsample']
criterion_options = ['gini', 'entropy']
for option_key, option_values in (
    ('class_weight', class_weight_options),
    ('criterion', criterion_options),
):
    stored_choice = xt_params[option_key]
    # Only translate when the stored value is an index; anything else is kept as is.
    if isinstance(stored_choice, (int, np.integer)):
        xt_params[option_key] = option_values[int(stored_choice)]

xt_model = ExtraTreesClassifier(random_state=42, n_jobs=-1, **xt_params)

In [12]:
# Read the recorded LightGBM trials and keep the entry with the smallest "loss".
with open("lgb_model.json", "r") as trials_file:
    results_with_params = json.loads(trials_file.read())

best_trial = min(results_with_params, key=lambda trial: trial["loss"])

print("\nBest trial")
print("lgb:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
lgb: {'bagging_fraction': 0.7595625938449946, 'feature_fraction': 0.5000159517931403, 'lambda_l1': 0.2249908344412504, 'lambda_l2': 0.17459959871378772, 'learning_rate': 0.0011878690517610393, 'max_depth': 7.0, 'min_child_samples': 90.0, 'n_estimators': 1500.0, 'num_leaves': 70.0}
Best loss: 0.5019296553450143


In [13]:
# Cast the float-valued entries of the selected LightGBM trial to int, in place.
lgb_params = best_trial["params"]
lgb_params.update(
    {
        int_key: int(lgb_params[int_key])
        for int_key in ('max_depth', 'num_leaves', 'n_estimators', 'min_child_samples')
    }
)

# NOTE(review): the trial sets bagging_fraction but no bagging_freq; LightGBM
# appears to apply row bagging only when bagging_freq > 0 — confirm whether
# bagging is actually active with these parameters.
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    verbose=-1,
    n_jobs=-1,
    **lgb_params
)

In [14]:
# Read the recorded random-forest trials and keep the entry with the smallest "loss".
with open("rf_model.json", "r") as trials_file:
    results_with_params = json.loads(trials_file.read())

best_trial = min(results_with_params, key=lambda trial: trial["loss"])

print("\nBest trial")
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'bootstrap': 0, 'class_weight': 0, 'criterion': 1, 'max_depth': 3.0, 'max_features': 0.5059860161196842, 'max_leaf_nodes': 10.0, 'min_impurity_decrease': 0.004748152188620212, 'min_samples_leaf': 12.0, 'min_samples_split': 2.0, 'n_estimators': 200.0}
Best loss: 0.5065021756206589


In [15]:
# Normalise the selected random-forest trial in place: floats -> ints,
# 0/1 -> bool, and integer option indices -> the option they index.
rf_params = best_trial['params']

for int_key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'):
    rf_params[int_key] = int(rf_params[int_key])

# max_leaf_nodes may legitimately be None; only cast when a value is present.
if rf_params['max_leaf_nodes'] is not None:
    rf_params['max_leaf_nodes'] = int(rf_params['max_leaf_nodes'])

rf_params['bootstrap'] = bool(rf_params['bootstrap'])

class_weight_options = [None, 'balanced', 'balanced_subsample']
criterion_options = ['gini', 'entropy']
for option_key, option_values in (
    ('class_weight', class_weight_options),
    ('criterion', criterion_options),
):
    stored_choice = rf_params[option_key]
    # Only translate when the stored value is an index; anything else is kept as is.
    if isinstance(stored_choice, (int, np.integer)):
        rf_params[option_key] = option_values[int(stored_choice)]

rf_model = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_params)

In [16]:
# Read the recorded CatBoost trials and keep the entry with the smallest "loss".
with open("catboost_model.json", "r") as trials_file:
    results_with_params = json.loads(trials_file.read())

best_trial = min(results_with_params, key=lambda trial: trial["loss"])

print("\nBest trial")
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'auto_class_weights': 0, 'bagging_temperature': 0.7869654255816461, 'border_count': 128.0, 'depth': 3.0, 'grow_policy': 0, 'iterations': 1750.0, 'l2_leaf_reg': 9.29983250494077, 'learning_rate': 0.015657706431599187, 'random_strength': 2.571043443663362, 'rsm': 0.6929088964409026}
Best loss: 0.5013203638589265


In [17]:
best_params = best_trial['params']

# Integer-only hyperparameters are stored as floats; cast the ones that are present.
best_params.update(
    {
        int_key: int(best_params[int_key])
        for int_key in ('iterations', 'depth', 'border_count')
        if int_key in best_params
    }
)

# Categorical choices are stored as an index into their option list;
# translate index -> option, leaving missing or non-integer values untouched.
grow_policy_options = ['SymmetricTree', 'Depthwise', 'Lossguide']
auto_weights_options = [None, 'Balanced']
for option_key, option_values in (
    ('grow_policy', grow_policy_options),
    ('auto_class_weights', auto_weights_options),
):
    stored_choice = best_params.get(option_key)
    if isinstance(stored_choice, (int, np.integer)):
        best_params[option_key] = option_values[int(stored_choice)]

catboost_model = CatBoostClassifier(
    random_seed=42,
    thread_count=-1,
    verbose=False,
    eval_metric='MultiClass',
    loss_function='MultiClass',
    **best_params
)

In [18]:
# Read the recorded logistic-regression trials and keep the entry with the smallest "loss".
with open("logreg_model.json", "r") as trials_file:
    results_with_params = json.loads(trials_file.read())

best_trial = min(results_with_params, key=lambda trial: trial["loss"])

print("\nBest trial")
print("lr:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
lr: {'C': 0.02193864835414044, 'l1_ratio': 0.9898893281337423, 'n_components': 50.0}
Best loss: 0.5002033552061409


In [19]:
# n_components is stored as a float; cast it in place because the pipeline
# cell below reads it back from best_trial["params"].
lr_params = best_trial['params']
lr_params['n_components'] = int(lr_params['n_components'])

# Elastic-net logistic regression using the selected C / l1_ratio.
logreg_model = LogisticRegression(
    C=lr_params['C'],
    l1_ratio=lr_params['l1_ratio'],
    penalty='elasticnet',
    solver='saga',
    max_iter=2000,
    random_state=42,
)

In [20]:
# scale -> PCA -> re-scale -> logistic regression.
# Reads n_components from `best_trial`, which must still be the logistic-regression
# trial here (the name is reassigned by the SGD cell further down).
logreg_steps = [
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=best_trial["params"]['n_components'])),
    ('scaler2', StandardScaler()),
    ('lr', logreg_model),
]
logreg_pipeline = Pipeline(steps=logreg_steps)

In [21]:
# Read the recorded SGD-classifier trials and keep the entry with the smallest "loss".
with open("sgdc_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
# Label fixed: these are the SGD classifier trials, not the logistic-regression ones.
print("sgdc:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
lr: {'alpha': 0.01986180308618242, 'l1_ratio': 0.29867734415849595, 'n_components': 50.0}
Best loss: 0.49989899302546337


In [22]:
# n_components is stored as a float; cast it in place because the pipeline
# cell below reads it back from best_trial["params"].
sgdc_params = best_trial['params']
sgdc_params['n_components'] = int(sgdc_params['n_components'])

sgdc_model = SGDClassifier(
    alpha=sgdc_params['alpha'],
    l1_ratio=sgdc_params['l1_ratio'],
    penalty='elasticnet',
    loss='modified_huber',   # hinge/log hybrid that still provides probabilities
    tol=1e-3,
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

In [23]:
# scale -> PCA -> re-scale -> SGD classifier.
# Reads n_components from `best_trial`, which is the SGD trial at this point.
sgdc_steps = [
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=best_trial["params"]['n_components'])),
    ('scaler2', StandardScaler()),
    ('sgdc', sgdc_model),
]
sgdc_pipeline = Pipeline(steps=sgdc_steps)

In [24]:
# Fit every candidate on the training split, in the same order as before.
for estimator in (lgb_model, xgb_model, xt_model, rf_model, catboost_model, logreg_pipeline):
    estimator.fit(X_train, y_train)

# Kept as the cell's final expression so the fitted pipeline is still displayed.
sgdc_pipeline.fit(X_train, y_train)

### Soft voting

In [25]:
# Class-probability predictions of every fitted model on the hold-out split.
model_preds = {
    model_name: estimator.predict_proba(X_test)
    for model_name, estimator in (
        ("logreg", logreg_pipeline),
        ("sgdc", sgdc_pipeline),
        ("xgb", xgb_model),
        ("lgb", lgb_model),
        ("xt", xt_model),
        ("rf", rf_model),
        ("catboost", catboost_model),
    )
}

models = list(model_preds.keys())


def _pair_accuracy(first, second):
    """Hold-out accuracy of the averaged probabilities of two models."""
    blended = np.mean([model_preds[first], model_preds[second]], axis=0)
    return accuracy_score(y_test, np.argmax(blended, axis=1))


# Score every unordered pair of models.
results = [
    {"combination": f"{first}_{second}", "accuracy": _pair_accuracy(first, second)}
    for first, second in combinations(models, 2)
]

# Rank the pairs by accuracy (as a rounded percentage), best first.
df_results = (
    pd.DataFrame(results)
    .assign(accuracy_pct=lambda frame: (frame["accuracy"] * 100).round(2))
    .sort_values("accuracy_pct", ascending=False)
    .reset_index(drop=True)
)

df_results

Unnamed: 0,combination,accuracy,accuracy_pct
0,sgdc_xt,0.505486,50.55
1,sgdc_lgb,0.505079,50.51
2,logreg_lgb,0.504267,50.43
3,logreg_xt,0.504267,50.43
4,logreg_xgb,0.50386,50.39
5,sgdc_rf,0.50386,50.39
6,sgdc_xgb,0.503454,50.35
7,xgb_xt,0.503048,50.3
8,sgdc_catboost,0.502641,50.26
9,xt_catboost,0.502235,50.22


### Stacking

In [26]:
# 1) Define base models
# (name, estimator) pairs; their order fixes the column layout of the meta features.
base_models = list(
    {
        'sgdc': sgdc_pipeline,
        'xt': xt_model,
    }.items()
)

In [27]:
# 2) Prepare OOF predictions container
# One block of n_classes probability columns per base model, one row per training sample.
classes = np.unique(y_train)
n_classes = len(classes)
K = len(base_models)
n_samples = X_train.shape[0]
meta_probas = np.zeros(shape=(n_samples, K * n_classes))

In [28]:
# 3) Out-of-fold generation
# For every fold, fit each base model on the other folds and store its
# probabilities for the held-out rows, so each row of meta_probas comes from a
# model that never saw that row.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr = y_train.iloc[train_idx]
    for i, (_name, model) in enumerate(base_models):
        # Fit an unfitted copy: fitting `model` itself would leave the shared
        # sgdc_pipeline / xt_model objects trained on the last fold only.
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)
        # Columns [i*n_classes, (i+1)*n_classes) belong to base model i.
        meta_probas[valid_idx, i*n_classes:(i+1)*n_classes] = fold_model.predict_proba(X_val)

In [29]:
# Peek at the first two rows; each row holds K * n_classes out-of-fold probabilities.
meta_probas[:2, :]

array([[0.48034364, 0.22925384, 0.29040252, 0.41190642, 0.28927287,
        0.29882071],
       [0.3245858 , 0.29344835, 0.38196585, 0.34937776, 0.29783861,
        0.35278362]])

In [32]:
# 4) Fit meta model
# Elastic-net logistic regression on the out-of-fold probabilities, tuned by a
# randomized search over C and l1_ratio with 3-fold stratified CV.
cv_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

meta_lr = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    random_state=42,
    max_iter=2000,
    n_jobs=-1,
)

param_dist = dict(
    C=np.logspace(-4, 4, 100),        # inverse regularization strength
    l1_ratio=np.linspace(0, 1, 21),   # only used when penalty='elasticnet'
)

rs_lr = RandomizedSearchCV(
    meta_lr,
    param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=cv_splitter,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

# Inputs to the search:
#    meta_probas: array of shape (n_samples, n_models * n_classes)
#    y_train:     array of shape (n_samples,)
rs_lr.fit(meta_probas, y_train)

print("Best meta‑LR params:", rs_lr.best_params_)
print("Best CV accuracy   :", rs_lr.best_score_)



Best meta‑LR params: {'l1_ratio': np.float64(1.0), 'C': np.float64(0.026560877829466867)}
Best CV accuracy   : 0.49705402254418757




In [33]:
# Take the best configuration found by the search and fit it on all OOF features.
# NOTE(review): rs_lr is built without a `refit` argument, so with scikit-learn's
# default (refit=True) best_estimator_ should already be fitted on this same
# data — this second fit looks redundant; confirm before removing.
meta_model = rs_lr.best_estimator_
meta_model.fit(meta_probas, y_train)

In [34]:
# 5) Build test meta features
# Same column layout as meta_probas: one block of n_classes columns per base model.
n_test = len(X_test)
meta_test = np.zeros(shape=(n_test, K * n_classes))
for i, (_, model) in enumerate(base_models):
    first_col = i * n_classes
    model.fit(X_train, y_train)  # retrain on full train
    meta_test[:, first_col:first_col + n_classes] = model.predict_proba(X_test)

In [35]:
# 6) Final predictions
# Predict classes from the stacked test features and report hold-out accuracy.
y_pred = meta_model.predict(meta_test)
# Message fixed: removed the stray ",” left after "test:".
print(f"Stacked model accuracy on test: {accuracy_score(y_test, y_pred) * 100:.2f}%")

Stacked model accuracy on test:, 50.10%


## Predictions

In [33]:
# Load the processed test features (first CSV column is the index).
try:
    test_data = pd.read_csv(PROCESSED_DATA_DIR / "test_data.csv", index_col=0)
except FileNotFoundError as e:
    # Fail fast: the submission cells below cannot run without this frame, and
    # continuing could silently reuse a stale `test_data` left in the kernel.
    print(e)
    raise
else:
    print("Files loaded")

Files loaded


In [34]:
# Sanity check: (n_rows, n_columns) of the loaded test features.
test_data.shape

(25368, 275)

In [None]:
# Class probabilities of every model on the competition test set.
# NOTE(review): when the notebook is run top to bottom, these estimators were
# last fitted on X_train (the 80% split), not on the full train_data — confirm
# that this is intended for the submission.
(
    y_pred_logreg,
    y_pred_sgdc,
    y_pred_xgb,
    y_pred_xt,
    y_pred_rf,
    y_pred_lgb,
    y_pred_catboost,
) = (
    estimator.predict_proba(test_data)
    for estimator in (
        logreg_pipeline,
        sgdc_pipeline,
        xgb_model,
        xt_model,
        rf_model,
        lgb_model,
        catboost_model,
    )
)

# Soft vote of the ExtraTrees and SGD models, then pick the most probable class.
y_pred_avg = (y_pred_xt + y_pred_sgdc) * 0.5
y_sub = y_pred_avg.argmax(axis=1)



In [36]:
# Turn the predicted class codes into the one-hot submission layout,
# indexed by the test set's IDs.
_outcome_codes = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
_predicted_codes = pd.Series(y_sub)

y_sub_df = (
    pd.DataFrame(
        {
            outcome: (_predicted_codes == code).astype(int)
            for outcome, code in _outcome_codes.items()
        }
    )
    .assign(ID=test_data.index)
    .set_index('ID')
)

y_sub_df.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12303,1,0,0
12304,0,0,1
12305,1,0,0
12306,1,0,0
12307,1,0,0


In [37]:
# Write the submission frame (ID index included) to the processed-data folder.
sub_data_path = PROCESSED_DATA_DIR.joinpath("y_sub.csv")
y_sub_df.to_csv(sub_data_path, index=True)