In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
elif PROJECT_ROOT.name.startswith("0"):
    PROJECT_ROOT = PROJECT_ROOT.parent.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

%load_ext autoreload
%autoreload 2

In [2]:
# !pip install shap
# !pip install hyperopt
# !pip install loguru
# !pip install lightgbm
# !pip install catboost

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Core
import json
import numpy as np
import pandas as pd
from itertools import combinations

# Modelling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from match_forecast.utils import *

[32m2025-04-19 19:04:53.319[0m | [1mINFO    [0m | [36mmatch_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/maichoun/QRT-Challenge-2024[0m


In [3]:
PROJ_ROOT = Path("QRT-Challenge-2024").resolve().parents[1]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

try:
    train_data = pd.read_csv(PROCESSED_DATA_DIR / "train_data.csv", index_col=0)
    train_scores = pd.read_csv(RAW_DATA_DIR / "Y_train.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [4]:
train_data.shape

(12303, 275)

In [5]:
train_scores = train_scores.loc[train_data.index]
train_scores_1c = train_scores[['HOME_WINS', 'DRAW', 'AWAY_WINS']].idxmax(axis=1)
label_mapping = {'HOME_WINS': 0, 'DRAW': 1, 'AWAY_WINS': 2}
train_scores_1c = train_scores_1c.replace(label_mapping)

train_scores_1c.head(5)

ID
0    2
1    1
2    2
3    0
4    1
dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_scores_1c, train_size=0.8, random_state=42)

In [8]:
with open("xgb_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("xgb:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
xgb: {'colsample_bytree': 0.8290759819373456, 'gamma': 3.3666118887037055, 'learning_rate': 0.003262474032926442, 'max_depth': 13.0, 'min_child_weight': 3.0, 'n_estimators': 2000.0, 'reg_alpha': 0.07920320479088216, 'reg_lambda': 0.6925627673577872, 'subsample': 0.6280952812896369}
Best loss: 0.5030479132130548


In [9]:
best_trial["params"]['max_depth'] = int(best_trial["params"]['max_depth'])
best_trial["params"]['n_estimators'] = int(best_trial["params"]['n_estimators'])
best_trial["params"]['min_child_weight'] = int(best_trial["params"]['min_child_weight'])

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_jobs=-1,
    **best_trial["params"]
)

In [10]:
with open("opt_xt.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("rf:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
rf: {'class_weight': 0, 'criterion': 1, 'max_depth': 23.0, 'max_features': 0.7058966888750476, 'max_leaf_nodes': 930.0, 'min_impurity_decrease': 0.0006137110829128878, 'min_samples_leaf': 9.0, 'min_samples_split': 34.0, 'n_estimators': 250.0}
Best loss: 0.5059944172285369


In [11]:
best_trial['params']['n_estimators'] = int(best_trial['params']['n_estimators'])
best_trial['params']['max_depth'] = int(best_trial['params']['max_depth'])
best_trial['params']['min_samples_split'] = int(best_trial['params']['min_samples_split'])
best_trial['params']['min_samples_leaf'] = int(best_trial['params']['min_samples_leaf'])
best_trial['params']['max_leaf_nodes'] = int(best_trial['params']['max_leaf_nodes']) if best_trial['params']['max_leaf_nodes'] is not None else None

class_weight_options = [None, 'balanced', 'balanced_subsample']
if isinstance(best_trial['params']['class_weight'], (int, np.integer)):  # Ensure it's an index
    best_trial['params']['class_weight'] = class_weight_options[int(best_trial['params']['class_weight'])]


criterion_options = ['gini', 'entropy']
if isinstance(best_trial['params']['criterion'], (int, np.integer)):
    best_trial['params']['criterion'] = criterion_options[int(best_trial['params']['criterion'])]

xt_model = ExtraTreesClassifier(
    **best_trial['params'],
    random_state=42,
    n_jobs=-1
)

In [12]:
with open("lgb_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("lgb:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
lgb: {'bagging_fraction': 0.5533103510496046, 'feature_fraction': 0.583732389168031, 'lambda_l1': 0.28917840747625867, 'lambda_l2': 0.4049000872532103, 'learning_rate': 0.0026180259797739494, 'max_depth': 5.0, 'min_child_samples': 70.0, 'n_estimators': 800.0, 'num_leaves': 20.0}
Best loss: 0.5061972975718785


In [13]:
best_trial["params"]['max_depth'] = int(best_trial["params"]['max_depth'])
best_trial["params"]['num_leaves'] = int(best_trial["params"]['num_leaves'])
best_trial["params"]['n_estimators'] = int(best_trial["params"]['n_estimators'])
best_trial["params"]['min_child_samples'] = int(best_trial["params"]['min_child_samples'])

lgb_model = lgb.LGBMClassifier(
    objective='multiclass', 
    boosting_type='gbdt', 
    num_class=3,
    n_jobs=-1, 
    **best_trial["params"] 
)

In [14]:
with open("rf_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'bootstrap': 0, 'class_weight': 0, 'criterion': 1, 'max_depth': 15.0, 'max_features': 0.35443519227617404, 'max_leaf_nodes': 230.0, 'min_impurity_decrease': 0.000528811145033164, 'min_samples_leaf': 4.0, 'min_samples_split': 36.0, 'n_estimators': 650.0}
Best loss: 0.5037588307154026


In [15]:
best_trial['params']['n_estimators'] = int(best_trial['params']['n_estimators'])
best_trial['params']['max_depth'] = int(best_trial['params']['max_depth'])
best_trial['params']['min_samples_split'] = int(best_trial['params']['min_samples_split'])
best_trial['params']['min_samples_leaf'] = int(best_trial['params']['min_samples_leaf'])
best_trial['params']['max_leaf_nodes'] = int(best_trial['params']['max_leaf_nodes']) if best_trial['params']['max_leaf_nodes'] is not None else None
best_trial['params']['bootstrap'] = bool(best_trial['params']['bootstrap'])

class_weight_options = [None, 'balanced', 'balanced_subsample']
if isinstance(best_trial['params']['class_weight'], (int, np.integer)):  # Ensure it's an index
    best_trial['params']['class_weight'] = class_weight_options[int(best_trial['params']['class_weight'])]

criterion_options = ['gini', 'entropy']
if isinstance(best_trial['params']['criterion'], (int, np.integer)):
    best_trial['params']['criterion'] = criterion_options[int(best_trial['params']['criterion'])]


rf_model = RandomForestClassifier(
    **best_trial['params'],
    n_jobs=-1,
    random_state=42
)

In [16]:
with open("catboost_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("Best hyperparameters:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
Best hyperparameters: {'auto_class_weights': 0, 'bagging_temperature': 0.6388058120072776, 'border_count': 32.0, 'depth': 3.0, 'grow_policy': 0, 'iterations': 1550.0, 'l2_leaf_reg': 3.846783294869394, 'learning_rate': 0.015867158272229342, 'random_strength': 8.153787461556025, 'rsm': 0.5851312537417908}
Best loss: 0.5009140146643771


In [17]:
best_params = best_trial['params']

# Cast integer‐only hyperparameters
for int_param in ('iterations', 'depth', 'border_count'):
    if int_param in best_params:
        best_params[int_param] = int(best_params[int_param])

# Map grow_policy index → string
grow_policy_options = ['SymmetricTree', 'Depthwise', 'Lossguide']
gp = best_params.get('grow_policy')
if isinstance(gp, (int, np.integer)):
    best_params['grow_policy'] = grow_policy_options[int(gp)]

# Map auto_class_weights index → None or 'Balanced'
auto_weights_options = [None, 'Balanced']
aw = best_params.get('auto_class_weights')
if isinstance(aw, (int, np.integer)):
    best_params['auto_class_weights'] = auto_weights_options[int(aw)]

catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='MultiClass',
    verbose=False,
    thread_count=-1,
    random_seed=42,
    **best_params
)    

In [18]:
with open("logreg_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("lr:", best_trial["params"])
print("Best loss:", best_trial["loss"])


Best trial
lr: {'C': 0.029863544880305414, 'l1_ratio': 0.34403149200938044, 'n_components': 30.0}
Best loss: 0.49817098977236596


In [19]:
best_trial['params']['n_components'] = int(best_trial['params']['n_components'])

logreg_model = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    C=best_trial["params"]['C'],
    l1_ratio=best_trial["params"]['l1_ratio'],
    max_iter=2000,
    random_state=42
)

In [20]:
logreg_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=best_trial["params"]['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('lr', logreg_model)  
])

In [None]:
with open("svm_model.json", "r") as f:
    results_with_params = json.load(f)

print("\nBest trial")
best_trial = min(results_with_params, key=lambda x: x["loss"])
print("lr:", best_trial["params"])
print("Best loss:", best_trial["loss"])

In [None]:
best_trial['params']['n_components'] = int(best_trial['params']['n_components'])

svm_model = LinearSVC(
    C=best_trial['params']['C'],
    max_iter=2000,
    tol=1e-4,
    random_state=42
)

In [None]:
svm_pipeline = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(n_components=best_trial["params"]['n_components'])),  
    ('scaler2', StandardScaler()),  
    ('svm', svm_model)  
])

In [21]:
lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
xt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)
logreg_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72820
[LightGBM] [Info] Number of data points in the train set: 9842, number of used features: 350
[LightGBM] [Info] Start training from score -0.834343
[LightGBM] [Info] Start training from score -1.344311
[LightGBM] [Info] Start training from score -1.187047


In [27]:
model_preds = {
    "logreg": logreg_pipeline.predict_proba(X_test),
    "svm": svm_pipeline.predict_proba(X_test),
    "xgb": xgb_model.predict_proba(X_test),
    "lgb": lgb_model.predict_proba(X_test),
    "xt": xt_model.predict_proba(X_test),
    "rf": rf_model.predict_proba(X_test),
    "catboost": catboost_model.predict_proba(X_test)
}

models = list(model_preds.keys())
combi_results = {}

for combi in combinations(models, 2):
    combi_name = "_".join(combi)
    avg_pred = np.mean([model_preds[m] for m in combi], axis=0)  
    y_pred_final = np.argmax(avg_pred, axis=1)
    final_accuracy = accuracy_score(y_test, y_pred_final)
    combi_results[combi_name] = final_accuracy
    print(f"Combination: {combi}, Accuracy (test) : {final_accuracy * 100:.2f}%")

best_combi = max(combi_results, key=combi_results.get)
print(f"\nBest Combination: {best_combi}, Accuracy: {combi_results[best_combi] * 100:.2f}%")


Combination: ('logreg', 'xgb'), Accuracy (test) : 50.14%
Combination: ('logreg', 'lgb'), Accuracy (test) : 50.14%
Combination: ('logreg', 'xt'), Accuracy (test) : 50.22%
Combination: ('logreg', 'rf'), Accuracy (test) : 50.10%
Combination: ('logreg', 'catboost'), Accuracy (test) : 50.02%
Combination: ('xgb', 'lgb'), Accuracy (test) : 50.30%
Combination: ('xgb', 'xt'), Accuracy (test) : 50.47%
Combination: ('xgb', 'rf'), Accuracy (test) : 50.30%
Combination: ('xgb', 'catboost'), Accuracy (test) : 50.35%
Combination: ('lgb', 'xt'), Accuracy (test) : 50.14%
Combination: ('lgb', 'rf'), Accuracy (test) : 49.78%
Combination: ('lgb', 'catboost'), Accuracy (test) : 50.39%
Combination: ('xt', 'rf'), Accuracy (test) : 49.65%
Combination: ('xt', 'catboost'), Accuracy (test) : 50.87%
Combination: ('rf', 'catboost'), Accuracy (test) : 49.98%

Best Combination: xt_catboost, Accuracy: 50.87%


## Predictions

In [30]:
try:
    test_data = pd.read_csv(PROCESSED_DATA_DIR / "test_data.csv", index_col=0)
    print("Files loaded")
    
except FileNotFoundError as e:
    print(e)

Files loaded


In [31]:
y_pred_logreg = logreg_pipeline.predict_proba(test_data)
y_pred_svm = svm_pipeline.predict_proba(test_data)
y_pred_xgb = xgb_model.predict_proba(test_data)
y_pred_xt = xt_model.predict_proba(test_data)
y_pred_rf = rf_model.predict_proba(test_data)
y_pred_lgb = lgb_model.predict_proba(test_data)
y_pred_catboost = catboost_model.predict_proba(test_data)

y_pred_avg = (y_pred_xt + y_pred_catboost) / 2
y_sub = np.argmax(y_pred_avg, axis=1)



In [32]:
y_sub_df = pd.DataFrame(y_sub, columns=['PRED'])

# one-hot encoding
y_sub_df['HOME_WINS'] = (y_sub_df['PRED'] == 0).astype(int)
y_sub_df['DRAW'] = (y_sub_df['PRED'] == 1).astype(int)
y_sub_df['AWAY_WINS'] = (y_sub_df['PRED'] == 2).astype(int)

# processing
y_sub_df['ID'] = test_data.index
y_sub_df.drop('PRED', axis=1, inplace=True)
y_sub_df = y_sub_df[['ID', 'HOME_WINS', 'DRAW', 'AWAY_WINS']]
y_sub_df = y_sub_df.set_index('ID')

y_sub_df.head()

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12303,1,0,0
12304,0,0,1
12305,1,0,0
12306,1,0,0
12307,1,0,0


In [33]:
# Store sub data

sub_data_path = PROCESSED_DATA_DIR / "y_sub.csv"
y_sub_df.to_csv(sub_data_path, index=True)