In [1]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the processed player table from disk.
df = pd.read_csv(
    "../data/processed/players_full_2425_with_score.csv"
)

# Cell output: frame dimensions alongside the relative frequency of each
# value of the label column, to make the class imbalance visible up front.
(
    df.shape,
    df["good_perf"].value_counts(normalize=True),
)


((2854, 324),
 good_perf
 0    0.79993
 1    0.20007
 Name: proportion, dtype: float64)

In [3]:
# Input columns fed to every model. The order matters: it fixes the column
# order of the design matrix and is what gets written to the features JSON.
# NOTE(review): names look like per-90 football stat columns — confirm their
# exact definitions against the data source.
features = [
    # non-"_90" context columns
    "Age",
    "90s",
    # "_90" goal / assist columns
    "Gls_90",
    "Ast_90",
    "G+A_90",
    "G-PK_90",
    "G+A-PK_90",
    # "_90" expected (x*) columns
    "xG_90",
    "xAG_90",
    "xG+xAG_90",
    "npxG_90",
    "npxG+xAG_90",
    # progression (Prg*) columns
    "PrgC",
    "PrgP",
    "PrgR",
]

# Binary label column the classifiers are trained to predict.
target = "good_perf"


In [4]:
# Keep only rows where every model input and the label are present; work on
# a copy so the raw frame stays untouched.
df_clean = df.dropna(subset=[*features, target]).copy()

# Design matrix (columns in `features` order) and integer-coded label.
X = df_clean.loc[:, features]
y = df_clean.loc[:, target].astype(int)

# Hold out 25% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42,
)


In [5]:
# Candidate classifiers, keyed by display name (insertion order is the
# evaluation order). Only the logistic regression is wrapped with a scaler.
models = {
    "LogReg": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000, random_state=42)),
    ]),
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight="balanced",
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

# Fit each candidate on the training split and score it on the held-out
# split; one record per model.
results = []
for name, model in models.items():
    preds = model.fit(X_train, y_train).predict(X_test)
    results.append({
        "model": name,
        "accuracy": accuracy_score(y_test, preds),
        # zero_division=0 yields 0 instead of a warning when a model
        # predicts no positives.
        **{
            label: scorer(y_test, preds, zero_division=0)
            for label, scorer in (
                ("precision", precision_score),
                ("recall", recall_score),
                ("f1", f1_score),
            )
        },
    })

# Cell output: comparison table, best F1 first.
pd.DataFrame(results).sort_values("f1", ascending=False)


Unnamed: 0,model,accuracy,precision,recall,f1
2,GradientBoosting,0.929775,0.829787,0.818182,0.823944
1,RandomForest,0.926966,0.847328,0.776224,0.810219
0,LogReg,0.883427,0.75,0.629371,0.684411


In [6]:
# Pick the model to keep using 5-fold cross-validated F1 computed on the
# training split only. Ranking the candidates by their test-set score and
# then reporting that same score would overstate how well the winner
# generalises; keeping the test split out of the selection step means the
# report printed below is an honest held-out estimate.
cv_f1 = {
    name: cross_val_score(model, X_train, y_train, cv=5, scoring="f1").mean()
    for name, model in models.items()
}
best_name = max(cv_f1, key=cv_f1.get)
best_model = models[best_name]

# cross_val_score fits clones of each estimator, so (re)fit the chosen one on
# the full training split before evaluating it once on the test split.
best_model.fit(X_train, y_train)
preds = best_model.predict(X_test)

print("CV F1 on train:", ", ".join(f"{name}={score:.3f}" for name, score in cv_f1.items()))
print("Best model:", best_name)
print(classification_report(y_test, preds, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_test, preds))


Best model: GradientBoosting
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       569
           1       0.83      0.82      0.82       143

    accuracy                           0.93       712
   macro avg       0.89      0.89      0.89       712
weighted avg       0.93      0.93      0.93       712

Confusion matrix:
 [[545  24]
 [ 26 117]]


In [7]:
# Output locations for the fitted model and the feature list it expects.
MODEL_PATH = "../models/player_performance_model_B.pkl"
FEATURES_PATH = "../models/player_performance_features_B.json"

# Create the target directories if they are missing (e.g. on a fresh
# checkout); otherwise the writes below fail with FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
Path(FEATURES_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted estimator.
joblib.dump(best_model, MODEL_PATH)

# Persist the feature names in training order so that whoever loads the
# model can build the input columns in the same order.
with open(FEATURES_PATH, "w", encoding="utf-8") as f:
    json.dump(features, f, indent=2)

print("Saved:", MODEL_PATH)
print("Saved:", FEATURES_PATH)


Saved: ../models/player_performance_model_B.pkl
Saved: ../models/player_performance_features_B.json


The model learns patterns associated with strong season-level performance and uses them as a proxy for upcoming match performance.
Tree-based ensemble models (gradient boosting, followed closely by random forest) outperform logistic regression on the held-out test set, suggesting non-linear relationships between player statistics and performance labels.
Results vary by position, reflecting the role-specific nature of football performance.