This notebook runs 5-fold cross-validation over every combination of classifier, base regressor and top regressor. It writes the results to a CSV file that is then used for model selection.

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from tqdm import tqdm
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.special import expit  # for stable sigmoid transformation
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
import datetime


In [2]:
# Folder holding the output of the preprocessing script. Run that script
# first, then point this path at wherever its CSVs were actually written.
preprocessed_folder = "../../data/preprocessed/"

df = pd.read_csv(f"{preprocessed_folder}training.csv")
# test = pd.read_csv(f"{preprocessed_folder}testing.csv")

# Hand-picked input columns. The order is kept fixed because it determines
# the column order of X. Entries that are commented out are candidates that
# are currently excluded.
manually_picked_features = [
    "release_clause_eur",
    "wage_eur",
    "log_wage_eur",
    "log_release_clause_eur",
    "international_reputation",
    "overall",
    "club_name_te",
    "potential",
    "age",
    "position_group_Goalkeeper",
    "position_group_Defender",
    "position_group_Midfielder",
    "position_group_Attacker",
    "height_cm",
    "weight_kg",
    # "bmi",
    # "priority",
    # "defense_work_rate", "attack_work_rate",
    # "contract_remaining",
    # "years_at_club",
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    "goalkeeping_diving",
]

# Rows whose value_eur is at or above this quantile are flagged as "top".
# Note that the cut-off is taken over the whole training frame, i.e. before
# any cross-validation split is made.
quantile = 0.95
top_threshold = df["value_eur"].quantile(quantile)
df["is_top"] = df["value_eur"].ge(top_threshold).astype(int)
features = manually_picked_features

X = df[features].copy()   # model inputs
y = df["value_eur"]       # regression target
z = df["is_top"]          # binary classification target (1 = top)

# Seed shared by every estimator and by the fold splitter.
random_state = 100


In [3]:
# Candidate models for the grid search. Keys are the labels that end up in
# the results table; values are unfitted prototypes that are clone()d for
# every fold, so the instances below are never fitted directly.
classifiers = {
    "lightgbm": LGBMClassifier(
        n_estimators=1500,
        max_depth=3,
        learning_rate=0.1,
        verbosity=-1,
        random_state=random_state,
    ),
    "catboost": CatBoostClassifier(
        iterations=1500,
        depth=3,
        learning_rate=0.1,
        verbose=0,
        random_state=random_state,
    ),
    "xgb": XGBClassifier(
        n_estimators=500,
        max_depth=3,
        learning_rate=0.1,
        eval_metric="logloss",
        random_state=random_state,
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        max_features=0.5,
        random_state=random_state,
    ),
    "gradient_boosting": GradientBoostingClassifier(
        n_estimators=200,
        max_depth=3,
        learning_rate=0.1,
        random_state=random_state,
    ),
    # --- Disabled candidates: uncomment to add them to the grid ---
    # "naive_bayes": GaussianNB(),
    # "ridge_classifier": RidgeClassifier(alpha=1.0),
    # "logistic_regression": LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000, random_state=random_state),
    # "knn": KNeighborsClassifier(n_neighbors=5)
}

# The same dict supplies both the "base" regressor (fitted on all rows) and
# the "top" regressor (fitted on is_top rows only).
regressors = {
    "lightgbm": LGBMRegressor(
        n_estimators=1500,
        max_depth=3,
        learning_rate=0.1,
        verbosity=-1,
        random_state=random_state,
    ),
    "catboost": CatBoostRegressor(
        iterations=1500,
        depth=3,
        learning_rate=0.1,
        verbose=0,
        random_state=random_state,
    ),
    "xgb": XGBRegressor(
        n_estimators=500,
        max_depth=3,
        learning_rate=0.1,
        random_state=random_state,
    ),
    "random_forest": RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        max_features=0.5,
        random_state=random_state,
    ),
    "gradient_boosting": GradientBoostingRegressor(
        n_estimators=200,
        max_depth=3,
        learning_rate=0.1,
        random_state=random_state,
    ),
    # --- Disabled candidates: uncomment to add them to the grid ---
    # "ridge": Ridge(alpha=1.0, random_state=random_state),
    # "lasso": Lasso(alpha=0.1, max_iter=1000, random_state=random_state),
    # "linear_regression": LinearRegression(),
    # "knn4": KNeighborsRegressor(n_neighbors=4),
    # "lgbm1": LGBMRegressor(n_estimators=1000, max_depth=3, learning_rate=0.05, num_leaves=15, min_child_samples=60, feature_fraction=0.6, verbosity=-1, random_state=random_state),
    # "lgbm2": LGBMRegressor(n_estimators=800, max_depth=2, learning_rate=0.05, num_leaves=7, min_child_samples=80, feature_fraction=0.7, bagging_fraction=0.7, bagging_freq=5, verbosity=-1, random_state=random_state),
    # "lgbm3": LGBMRegressor(n_estimators=1500, max_depth=3, learning_rate=0.03, num_leaves=10, min_child_samples=100, feature_fraction=0.5, bagging_fraction=0.5, bagging_freq=2, verbosity=-1, random_state=random_state),
    # "catboost1": CatBoostRegressor(iterations=1000, depth=3, learning_rate=0.05, l2_leaf_reg=10, bagging_temperature=1.5, random_strength=5, verbose=0, random_state=random_state),
    # "catboost2": CatBoostRegressor(iterations=800, depth=2, learning_rate=0.03, l2_leaf_reg=12, bagging_temperature=2.0, random_strength=7, verbose=0, random_state=random_state),
    # "catboost3": CatBoostRegressor(iterations=1200, depth=3, learning_rate=0.04, l2_leaf_reg=15, bagging_temperature=1.8, random_strength=6, verbose=0, random_state=random_state),
    # "xgb1": XGBRegressor(n_estimators=1000, max_depth=3, learning_rate=0.05, subsample=0.7, colsample_bytree=0.6, reg_lambda=5, reg_alpha=1, random_state=random_state),
    # "xgb2": XGBRegressor(n_estimators=800, max_depth=2, learning_rate=0.03, subsample=0.6, colsample_bytree=0.5, reg_lambda=7, reg_alpha=2, random_state=random_state),
    # "xgb3": XGBRegressor(n_estimators=1200, max_depth=3, learning_rate=0.04, subsample=0.65, colsample_bytree=0.55, reg_lambda=10, reg_alpha=3, random_state=random_state),
}

In [4]:
# === Cross-validation setup ===
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Materialise the folds once so that classifiers, base regressors and top
# regressors are all evaluated on exactly the same train/validation rows.
# Stratification is on the binary is_top flag (z), not on the raw target.
splits = list(skf.split(X, z))

# Per-model caches, keyed by model label.
#   *_preds       -> list with one (val_idx, predictions) tuple per fold
#   *_train_rmses -> mean training RMSE across folds
clf_preds, base_preds, top_preds = {}, {}, {}
base_train_rmses, top_train_rmses = {}, {}

# === Out-of-fold classifier scores ===
# Every classifier is re-fitted from a fresh clone on each training fold.
# What is cached is the score for class 1 (is_top) on the held-out rows.
for clf_name, clf_model in tqdm(classifiers.items(), desc="Training classifiers"):
    fold_preds = []
    for train_idx, val_idx in splits:
        X_val = X.iloc[val_idx]

        clf = clone(clf_model)
        clf.fit(X.iloc[train_idx], z.iloc[train_idx])

        if hasattr(clf, "predict_proba"):
            # z is 0/1, so column 1 holds the probability of is_top == 1.
            p_val = clf.predict_proba(X_val)[:, 1]
        elif hasattr(clf, "decision_function"):
            # Margin-only models: squash the raw score into (0, 1) with a
            # sigmoid so it can be used as a blending weight later on.
            p_val = expit(clf.decision_function(X_val))
        else:
            raise ValueError(f"Classifier {clf_name} does not support probability estimates")

        fold_preds.append((val_idx, p_val))

    clf_preds[clf_name] = fold_preds

# === Out-of-fold base regressor predictions + training RMSE ===
# The base regressor is fitted on ALL rows of each training fold.
for base_name, base_model in tqdm(regressors.items(), desc="Training base regressors"):
    fold_preds, train_rmses = [], []

    for train_idx, val_idx in splits:
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

        reg = clone(base_model)
        reg.fit(X_train, y_train)

        # Held-out predictions are what the aggregation step consumes.
        fold_preds.append((val_idx, reg.predict(X.iloc[val_idx])))

        # In-sample error is kept for comparison against the CV error.
        train_mse = mean_squared_error(y_train, reg.predict(X_train))
        train_rmses.append(np.sqrt(train_mse))

    base_preds[base_name] = fold_preds
    base_train_rmses[base_name] = np.mean(train_rmses)

# === Out-of-fold top regressor predictions + training RMSE ===
# The top regressor only ever sees training rows flagged is_top == 1, but it
# predicts for EVERY validation row; the classifier score decides later how
# much weight each of those predictions receives.
for top_name, top_model in tqdm(regressors.items(), desc="Training top regressors"):
    fold_preds, train_rmses = [], []

    for train_idx, val_idx in splits:
        # Boolean Series sharing its index with the training slices below.
        top_mask = z.iloc[train_idx] == 1
        X_top = X.iloc[train_idx][top_mask]
        y_top = y.iloc[train_idx][top_mask]

        reg = clone(top_model)
        reg.fit(X_top, y_top)

        fold_preds.append((val_idx, reg.predict(X.iloc[val_idx])))

        # Training error is measured on the top subset the model was fitted on.
        train_mse = mean_squared_error(y_top, reg.predict(X_top))
        train_rmses.append(np.sqrt(train_mse))

    top_preds[top_name] = fold_preds
    top_train_rmses[top_name] = np.mean(train_rmses)

# === Score every (classifier, base, top) triple from the cached folds ===
# Nothing is re-trained here: each triple is evaluated purely by combining
# the out-of-fold predictions stored above.
results = []

model_combinations = []
for clf_name in classifiers:
    for base_name in regressors:
        for top_name in regressors:
            model_combinations.append((clf_name, base_name, top_name))

for clf_name, base_name, top_name in tqdm(model_combinations, desc="Aggregating results"):
    rmse_base_list, rmse_soft_list = [], []

    for fold_idx in range(n_splits):
        # All three caches were built from the same `splits`, so the
        # validation indices stored with the classifier apply to all of them.
        val_idx, p_val = clf_preds[clf_name][fold_idx]
        y_base_pred = base_preds[base_name][fold_idx][1]
        y_top_pred = top_preds[top_name][fold_idx][1]
        y_val = y.iloc[val_idx]

        # Soft routing: weight the two regressors by the classifier score.
        y_final_pred = (1 - p_val) * y_base_pred + p_val * y_top_pred

        rmse_base_list.append(np.sqrt(mean_squared_error(y_val, y_base_pred)))
        rmse_soft_list.append(np.sqrt(mean_squared_error(y_val, y_final_pred)))

    # One summary row per triple: fold-mean and fold-std of the validation
    # RMSE for the base model alone and for the blended prediction, plus the
    # cached training RMSEs.
    summary_row = {
        "classifier": clf_name,
        "base": base_name,
        "top": top_name,
        "base_rmse": np.mean(rmse_base_list),
        "sr_rmse": np.mean(rmse_soft_list),
        "base_std": np.std(rmse_base_list),
        "sr_std": np.std(rmse_soft_list),
        "train_rmse_base": base_train_rmses[base_name],
        "train_rmse_top": top_train_rmses[top_name],
    }
    results.append(summary_row)

# === Save to CSV ===
# Rows are sorted by blended (soft-routing) RMSE, best combination first.
now = datetime.datetime.now()
now_str = now.strftime("%Y-%m-%d_%H-%M-%S")
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("sr_rmse")

# Build the output path once and reuse it for both the write and the log
# line. The message previously omitted the `_q{quantile}` part, so it
# reported a file name that was never written.
results_path = f"results_seed_{random_state}_q{quantile}_{now_str}.csv"
results_df.to_csv(results_path, index=False)
print(f"\n✅ Results saved to {results_path}")


Training classifiers: 100%|██████████| 5/5 [01:10<00:00, 14.20s/it]
Training base regressors: 100%|██████████| 5/5 [01:49<00:00, 21.89s/it]
Training top regressors: 100%|██████████| 5/5 [00:11<00:00,  2.25s/it]
Aggregating results: 100%|██████████| 125/125 [00:00<00:00, 589.11it/s]


✅ Results saved to results_seed_100_2025-05-16_12-18-14.csv



