This notebook produces the predictions for the test set, based on the results of the notebook "model_selection.ipynb".

In [1]:
# === Imports ===
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostRegressor

In [2]:
# === Load train and test sets ===

# The preprocessing script must have been run beforehand; point
# `preprocessed_folder` at the directory where it wrote its CSV output.
preprocessed_folder = "../../data/preprocessed/"

train = pd.read_csv(preprocessed_folder + "training.csv")
test = pd.read_csv(preprocessed_folder + "testing.csv")

# === Feature selection ===
# Features are grouped by theme and concatenated in a fixed order, so the
# resulting column order of the model inputs is always the same.
# Left out of the feature set: "bmi", "priority", "defense_work_rate",
# "attack_work_rate", "contract_remaining", "years_at_club".
financial_features = [
    "release_clause_eur",
    "wage_eur",
    "log_wage_eur",
    "log_release_clause_eur",
]
profile_features = [
    "international_reputation",
    "overall",
    "club_name_te",
    "potential",
    "age",
]
position_features = [
    "position_group_Goalkeeper",
    "position_group_Defender",
    "position_group_Midfielder",
    "position_group_Attacker",
]
body_features = ["height_cm", "weight_kg"]
skill_features = [
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    "goalkeeping_diving",
]

features = [
    *financial_features,
    *profile_features,
    *position_features,
    *body_features,
    *skill_features,
]

# === Define target and training features ===
# A row is flagged as "top" when its value_eur is at or above the chosen
# quantile of value_eur over the training set.
quantile = 0.95
top_value_cutoff = train["value_eur"].quantile(quantile)
train["is_top"] = train["value_eur"].ge(top_value_cutoff).astype(int)

X_train = train.loc[:, features].copy()  # model inputs
y_train = train["value_eur"]             # regression target
z_train = train["is_top"]                # binary target (1 = top row)

X_test = test.loc[:, features].copy()

In [None]:
# === Train models ===
# Three models are fitted:
#   clf      - classifier for the binary z_train flag, fitted on all rows
#   base_reg - regressor for y_train, fitted on all rows
#   top_reg  - regressor for y_train, fitted only on rows where z_train == 1
seed = 100
forest_params = dict(n_estimators=300, max_depth=None, max_features=0.5, random_state=seed)

clf = RandomForestClassifier(**forest_params).fit(X_train, z_train)
base_reg = RandomForestRegressor(**forest_params).fit(X_train, y_train)

top_mask = z_train.eq(1)
top_reg = CatBoostRegressor(
    iterations=1500,
    depth=3,
    learning_rate=0.1,
    verbose=0,
    random_state=seed,
)
top_reg.fit(X_train.loc[top_mask], y_train.loc[top_mask])

# === Predict on test set ===
# Column 1 of predict_proba is the probability of the positive class (z == 1).
probs_test = clf.predict_proba(X_test)[:, 1]
y_pred_base_test = base_reg.predict(X_test)
y_pred_top_test = top_reg.predict(X_test)

# Per-row blend: each regressor's prediction is weighted by the classifier's
# probability for the group of rows that regressor was fitted on.
weight_base = 1 - probs_test
weight_top = probs_test
y_pred_test_final = weight_base * y_pred_base_test + weight_top * y_pred_top_test

# === Format output ===
# The index is written as the first CSV column, under an empty header name.
submission = pd.Series(y_pred_test_final, name="value_eur").to_frame()
submission.index.name = ""
submission.to_csv("submission.csv")
print("✅ Test predictions saved to submission.csv")

✅ Test predictions saved to submission.csv
