In [1]:
import pandas as pd 
import optuna
import os 
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [2]:
def objective(trial, X_train, X_test, y_train, y_test):
    """Optuna objective: fit a random forest with trial-sampled hyperparameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.
    X_train, y_train
        Features and target the forest is fitted on.
    X_test, y_test
        Features and target the fitted forest is scored on.

    Returns
    -------
    float
        Mean squared error of the predictions for ``X_test`` against ``y_test``
        (lower is better, so the study should use ``direction="minimize"``).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "max_depth": trial.suggest_int("max_depth", 3, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0.0, 0.5),
    }

    # `max_samples` is a conditional hyperparameter: it is only sampled when
    # bootstrapping is enabled and is left at None otherwise. The already-sampled
    # `bootstrap` value is reused here instead of asking the trial for it again.
    params["max_samples"] = (
        trial.suggest_float("max_samples", 0.5, 1.0) if params["bootstrap"] else None
    )
    params["criterion"] = trial.suggest_categorical("criterion", ["squared_error", "absolute_error"])

    # Fixed random_state so that score differences between trials come from the
    # hyperparameters, not from the forest's own randomness.
    model = RandomForestRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return mean_squared_error(y_test, y_pred)

In [3]:
leagues = ["cl", "bundesliga", "eredivise","seriea", "ligue1", "laliga", "premier_league"]

model_dir = "models"
os.makedirs(model_dir, exist_ok=True)


def _tune_and_fit(X_train, X_test, y_train, y_test, n_trials=20, seed=42):
    """Tune a random forest with Optuna, refit it with the best parameters, and score it.

    Returns a ``(model, mse)`` tuple where ``model`` is fitted on the training
    split and ``mse`` is its mean squared error on the test split.

    NOTE(review): the hyperparameters are selected on the same test split that
    the returned MSE is computed on, so that MSE is an optimistic estimate.
    A separate validation split (or cross-validation inside the objective)
    would be needed for an unbiased number.
    """
    # Seed the sampler so a Restart & Run All reproduces the same trials.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(
        lambda trial: objective(trial, X_train, X_test, y_train, y_test),
        n_trials=n_trials,
    )

    # best_params omits `max_samples` when bootstrap was off; the estimator's
    # default for it is used in that case.
    model = RandomForestRegressor(**study.best_params, random_state=42)
    model.fit(X_train, y_train)
    return model, mean_squared_error(y_test, model.predict(X_test))


for league in leagues:
    print(f"Training model for {league}")

    file_path = f"../train_data/data/{league}_train.csv"
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist")
        continue

    df = pd.read_csv(file_path)

    # Fit ONE encoder on the union of both team columns so a club gets the same
    # integer code whether it plays at home or away. Fitting the columns
    # separately yields diverging codes whenever the two sets of teams differ.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([df["Home Team"], df["Away Team"]], ignore_index=True))
    df["Home Team"] = label_encoder.transform(df["Home Team"])
    df["Away Team"] = label_encoder.transform(df["Away Team"])

    X = df.drop(columns=["Home Score", "Away Score"])
    y_home = df["Home Score"]
    y_away = df["Away Score"]

    # A single call splits the features and both targets on the same rows.
    (
        X_train, X_test,
        y_train_home, y_test_home,
        y_train_away, y_test_away,
    ) = train_test_split(X, y_home, y_away, test_size=0.2, random_state=42)

    model_home, mse_home = _tune_and_fit(X_train, X_test, y_train_home, y_test_home)
    model_away, mse_away = _tune_and_fit(X_train, X_test, y_train_away, y_test_away)

    print(f"MSE (Home): {mse_home:.2f}, MSE (Away): {mse_away:.2f}")

    joblib.dump(model_home, f"{model_dir}/{league}_home.pkl")
    joblib.dump(model_away, f"{model_dir}/{league}_away.pkl")
    # Persist the team-name -> code mapping next to the models; without it the
    # saved models cannot be fed team names at prediction time.
    joblib.dump(label_encoder, f"{model_dir}/{league}_team_encoder.pkl")

    print(f"Model Saved: {league}")

print("Training Completed")

[I 2025-02-27 22:08:01,857] A new study created in memory with name: no-name-084bae58-42ec-40f5-9fbe-9ebc71c7af2b


[I 2025-02-27 22:08:01,966] Trial 0 finished with value: 2.2887303824725893 and parameters: {'n_estimators': 100, 'max_depth': 33, 'min_samples_split': 17, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.14456529363845116, 'max_samples': 0.820005572091375, 'criterion': 'squared_error'}. Best is trial 0 with value: 2.2887303824725893.


Training model for cl


[I 2025-02-27 22:08:02,267] Trial 1 finished with value: 2.2063461409729657 and parameters: {'n_estimators': 450, 'max_depth': 30, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.030077850072427248, 'criterion': 'squared_error'}. Best is trial 1 with value: 2.2063461409729657.
[I 2025-02-27 22:08:02,333] Trial 2 finished with value: 1.9290516532911184 and parameters: {'n_estimators': 50, 'max_depth': 32, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': None, 'bootstrap': False, 'min_weight_fraction_leaf': 0.013484379183916262, 'criterion': 'squared_error'}. Best is trial 2 with value: 1.9290516532911184.
[I 2025-02-27 22:08:02,854] Trial 3 finished with value: 2.15625 and parameters: {'n_estimators': 250, 'max_depth': 47, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': None, 'bootstrap': False, 'min_weight_fraction_leaf': 0.42130940733255706, 'criterion': 'absolute_error'}. Best is tr

MSE (Home): 1.92, MSE (Away): 1.62
Model Saved: cl
Training model for bundesliga


[I 2025-02-27 22:08:16,232] Trial 0 finished with value: 1.4555665407897314 and parameters: {'n_estimators': 500, 'max_depth': 11, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.3388091654666965, 'max_samples': 0.7330184998658967, 'criterion': 'squared_error'}. Best is trial 0 with value: 1.4555665407897314.
[I 2025-02-27 22:08:16,366] Trial 1 finished with value: 1.6002730158730158 and parameters: {'n_estimators': 150, 'max_depth': 12, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.3828198856143009, 'criterion': 'absolute_error'}. Best is trial 0 with value: 1.4555665407897314.
[I 2025-02-27 22:08:16,607] Trial 2 finished with value: 1.5776246666666665 and parameters: {'n_estimators': 250, 'max_depth': 22, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'min_weight_fraction_leaf': 0.2682584171931

MSE (Home): 1.45, MSE (Away): 1.34
Model Saved: bundesliga
Training model for eredivise


[I 2025-02-27 22:08:31,105] Trial 0 finished with value: 1.6044158765946943 and parameters: {'n_estimators': 150, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_features': None, 'bootstrap': True, 'min_weight_fraction_leaf': 0.4954956408736817, 'max_samples': 0.6430720111730451, 'criterion': 'squared_error'}. Best is trial 0 with value: 1.6044158765946943.
[I 2025-02-27 22:08:31,422] Trial 1 finished with value: 1.5197244968650863 and parameters: {'n_estimators': 400, 'max_depth': 32, 'min_samples_split': 2, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.42578264892717504, 'max_samples': 0.5238742784515402, 'criterion': 'squared_error'}. Best is trial 1 with value: 1.5197244968650863.
[I 2025-02-27 22:08:31,599] Trial 2 finished with value: 1.5343883597883596 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_f

MSE (Home): 1.40, MSE (Away): 1.11
Model Saved: eredivise
Training model for seriea


[I 2025-02-27 22:08:46,514] Trial 0 finished with value: 0.8178307692307693 and parameters: {'n_estimators': 50, 'max_depth': 38, 'min_samples_split': 18, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.33176444573412334, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.8178307692307693.
[I 2025-02-27 22:08:46,600] Trial 1 finished with value: 0.7769846201076928 and parameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': True, 'min_weight_fraction_leaf': 0.36942111876098727, 'max_samples': 0.5152830263916837, 'criterion': 'squared_error'}. Best is trial 1 with value: 0.7769846201076928.
[I 2025-02-27 22:08:49,236] Trial 2 finished with value: 0.9039206153846154 and parameters: {'n_estimators': 500, 'max_depth': 46, 'min_samples_split': 19, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': False, 'min_weight_fraction_leaf': 0.3189515948671

MSE (Home): 0.70, MSE (Away): 1.20
Model Saved: seriea
Training model for ligue1


[I 2025-02-27 22:09:08,420] Trial 0 finished with value: 1.4400283095238096 and parameters: {'n_estimators': 500, 'max_depth': 34, 'min_samples_split': 3, 'min_samples_leaf': 10, 'max_features': 'log2', 'bootstrap': False, 'min_weight_fraction_leaf': 0.01912879889844582, 'criterion': 'absolute_error'}. Best is trial 0 with value: 1.4400283095238096.
[I 2025-02-27 22:09:08,519] Trial 1 finished with value: 1.4596500000000001 and parameters: {'n_estimators': 50, 'max_depth': 46, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.03856807327672884, 'criterion': 'absolute_error'}. Best is trial 0 with value: 1.4400283095238096.
[I 2025-02-27 22:09:09,523] Trial 2 finished with value: 1.4523809523809523 and parameters: {'n_estimators': 350, 'max_depth': 49, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': False, 'min_weight_fraction_leaf': 0.48210147156650013, 'criterion': 'absolute_error

MSE (Home): 1.38, MSE (Away): 0.93


[I 2025-02-27 22:09:24,885] A new study created in memory with name: no-name-aad640f0-8e2c-4355-8d4e-ca717d12d11e


Model Saved: ligue1
Training model for laliga


[I 2025-02-27 22:09:25,831] Trial 0 finished with value: 0.8962171111111111 and parameters: {'n_estimators': 450, 'max_depth': 41, 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.06382911405211672, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.8962171111111111.
[I 2025-02-27 22:09:26,164] Trial 1 finished with value: 0.8848535000000003 and parameters: {'n_estimators': 200, 'max_depth': 32, 'min_samples_split': 20, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.014906204513907118, 'max_samples': 0.6058353812124166, 'criterion': 'absolute_error'}. Best is trial 1 with value: 0.8848535000000003.
[I 2025-02-27 22:09:26,367] Trial 2 finished with value: 0.9115027303621767 and parameters: {'n_estimators': 300, 'max_depth': 35, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.2431066950

MSE (Home): 0.74, MSE (Away): 1.18
Model Saved: laliga
Training model for premier_league


[I 2025-02-27 22:09:36,748] Trial 0 finished with value: 1.9953536577859554 and parameters: {'n_estimators': 450, 'max_depth': 11, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False, 'min_weight_fraction_leaf': 0.42061986920166905, 'criterion': 'squared_error'}. Best is trial 0 with value: 1.9953536577859554.
[I 2025-02-27 22:09:36,793] Trial 1 finished with value: 1.9577754298996046 and parameters: {'n_estimators': 50, 'max_depth': 12, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.3986752270415705, 'max_samples': 0.9064348568904212, 'criterion': 'squared_error'}. Best is trial 1 with value: 1.9577754298996046.
[I 2025-02-27 22:09:36,886] Trial 2 finished with value: 2.2310679245283023 and parameters: {'n_estimators': 50, 'max_depth': 19, 'min_samples_split': 18, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'bootstrap': True, 'min_weight_fraction_leaf': 0.022330456716930

MSE (Home): 1.89, MSE (Away): 1.56
Model Saved: premier_league
Training Completed
