In [1]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the processed player table; it carries the `good_perf` label column
# that the rest of the notebook models.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/players_full_2425_with_score.csv"
)

# Display the frame's (rows, columns) next to the label's class proportions.
(df.shape, df.loc[:, "good_perf"].value_counts(normalize=True))


((1987, 324),
 good_perf
 0    0.799698
 1    0.200302
 Name: proportion, dtype: float64)

In [3]:
# Binary label column the classifiers are trained to predict.
target = "good_perf"

# Input columns, concatenated group by group. The resulting order is the
# column order of the design matrix and of the exported feature list.
features = (
    ["Age", "90s"]                            # age / playing-time columns
    + ["Gls_90", "Ast_90", "G+A_90"]          # goal and assist `_90` columns
    + ["G-PK_90", "G+A-PK_90"]                # the "-PK" variants of the above
    + ["xG_90", "xAG_90", "xG+xAG_90"]        # xG / xAG `_90` columns
    + ["npxG_90", "npxG+xAG_90"]              # npxG `_90` columns
    + ["PrgC", "PrgP", "PrgR"]                # Prg* columns (no `_90` suffix)
)


## Feature Set & Interpretation

The feature set consists of **season-aggregated player statistics**: age, playing time (`90s`), per-90 goal and assist output, per-90 expected-goal metrics (`xG`, `xAG`, `npxG`), and progression counts (`PrgC`, `PrgP`, `PrgR`).

Because the target `good_perf` is derived from the same statistical space (via `performance_score`),
this ML model is used for:

- learning non-linear boundaries between performance tiers
- assessing separability of player profiles
- interpretability / feature importance analysis

It is **not** used as a future match performance predictor.


In [4]:
# Keep only the rows where every model input and the label are present.
df_clean = df.dropna(subset=[*features, target]).copy()

# Design matrix and integer-typed label.
X, y = df_clean[features], df_clean[target].astype(int)

# Hold out 25% of the rows for evaluation. Stratifying on y approximately
# preserves the class ratio in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)


## Target Definition (Season-Level Performance Tier)

The target variable `good_perf` represents a **season-level performance tier**
derived from the aggregated `performance_score`.

- `good_perf = 1` → player belongs to the top performance tier (top 20%)
- `good_perf = 0` → otherwise

**Important:**
This is **not a future performance prediction task**.
The model learns decision boundaries between **performance tiers defined by
the scoring framework**, using season-aggregated statistics.


In [5]:
# Candidate classifiers, all seeded for repeatable fits. Only the logistic
# regression is wrapped with a scaler; only the random forest re-weights
# classes.
models = {
    "LogReg": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000, random_state=42)),
    ]),
    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42,
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}

# One record per model: its name followed by four scores on the held-out split.
results = []
for name, model in models.items():
    # fit() returns the estimator itself, so training and inference chain.
    preds = model.fit(X_train, y_train).predict(X_test)

    results.append({
        "model": name,
        "accuracy": accuracy_score(y_test, preds),
        # These three share the zero_division=0 setting, so a model that
        # never predicts the positive class scores 0 instead of warning.
        **{
            label: scorer(y_test, preds, zero_division=0)
            for label, scorer in (
                ("precision", precision_score),
                ("recall", recall_score),
                ("f1", f1_score),
            )
        },
    })

# Summary table, highest F1 first.
pd.DataFrame(results).sort_values("f1", ascending=False)


Unnamed: 0,model,accuracy,precision,recall,f1
2,GradientBoosting,0.887097,0.741573,0.666667,0.702128
0,LogReg,0.866935,0.698795,0.585859,0.637363
1,RandomForest,0.868952,0.717949,0.565657,0.632768


## Model Evaluation (Context)

The evaluation metrics below reflect how consistently the model can separate players into **predefined season-level tiers**.

High performance here indicates internal consistency of the scoring + tiering framework, not real-world forecasting ability.


In [6]:
# Pick the entry with the highest F1 in `results`.
# NOTE(review): `results` holds metrics computed on X_test, so the model is
# chosen and then reported on the same held-out rows; the report below is
# therefore not an unbiased estimate for the selected model.
best_name = (
    pd.DataFrame(results)
    .sort_values("f1", ascending=False)["model"]
    .iloc[0]
)
best_model = models[best_name]

# Fit on the training split, then predict the held-out split.
preds = best_model.fit(X_train, y_train).predict(X_test)

# Per-class precision/recall/F1 plus the raw confusion counts.
print("Best model:", best_name)
print(classification_report(y_test, preds, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_test, preds))


Best model: GradientBoosting
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       397
           1       0.74      0.67      0.70        99

    accuracy                           0.89       496
   macro avg       0.83      0.80      0.82       496
weighted avg       0.88      0.89      0.88       496

Confusion matrix:
 [[374  23]
 [ 33  66]]


In [7]:
# Persist the selected estimator together with the feature list (in training
# column order) it was fitted on.
MODEL_PATH = "../models/player_performance_model_B.pkl"
FEATURES_PATH = "../models/player_performance_features_B.json"

# The output directory may not exist yet (e.g. on a fresh checkout); without
# this, both writes below raise FileNotFoundError.
for out_path in (MODEL_PATH, FEATURES_PATH):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, MODEL_PATH)

with open(FEATURES_PATH, "w") as f:
    json.dump(features, f, indent=2)

# Each path is named once above, so the messages cannot drift from the files
# actually written.
print("Saved:", MODEL_PATH)
print("Saved:", FEATURES_PATH)


Saved: ../models/player_performance_model_B.pkl
Saved: ../models/player_performance_features_B.json


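The model learns patterns associated with strong season-level performance; it separates the predefined performance tiers and is not a proxy for upcoming match performance.
Gradient boosting performs best on the held-out split (F1 ≈ 0.70 for the top tier, versus ≈ 0.64 for logistic regression and ≈ 0.63 for random forest), which is consistent with non-linear relationships between player statistics and the performance label.
Results are expected to vary by position, reflecting the role-specific nature of football performance; no per-position breakdown is shown in this section.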

## Limitations & Future Work

- Data is season-level (aggregated); no match-by-match logs are used.
- The model does not predict future match performance.
- The target label is derived from the same feature space (no causal inference).
- True predictive modeling would require:
  - match-level data
  - temporal train/test splits
  - external validation across seasons

This will be addressed in the Match Outcome module (future extension).
