# Modélisation des Résultats d'Arsenal en Premier League

Ce notebook présente un workflow data science complet et reproductible:
- Chargement des données scrapées (Understat, FBref)
- Préparation des données et Feature Engineering
- Entraînement de modèles (Random Forest, XGBoost)
- Évaluation et interprétation (métriques, courbes ROC, learning curves, importances)
- Visualisations claires pour la communication aux parties prenantes

Objectif: prédire l’issue d’un match (défaite, nul, victoire) et expliquer les facteurs clés associés aux performances d’Arsenal.


In [None]:
# Imports minimaux
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb

# The notebook normally runs from its own folder (notebooks_min); the CSV
# files were copied to notebooks_min/data, hence the relative "data" path.
DATA_DIR = Path('data')
if not DATA_DIR.exists() or not any(DATA_DIR.glob('*.csv')):
    # No CSV next to the notebook: use the path that is valid when the
    # notebook is executed from the repository root instead.
    DATA_DIR = Path('notebooks_min/data')

print('Dossier données utilisé:', DATA_DIR.resolve())
print('Fichiers disponibles:', list(DATA_DIR.glob('*.csv')))



In [None]:
# Full pipeline: load → prepare → features → split → train
# 1) Load the three CSV files from DATA_DIR (read in the order listed).
understat, fbref_team, fbref_match = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        'understat_arsenal_matches.csv',
        'fbref_arsenal_team_stats.csv',
        'fbref_arsenal_match_stats.csv',
    )
)

# 2) Column normalisation and join keys
# Lower-case every column name so the lookups below do not depend on case.
for frame in (understat, fbref_team, fbref_match):
    frame.columns = [name.lower() for name in frame.columns]

# Harmonise the date column: when 'date' is absent, the first alternative
# name that exists is renamed to 'date'.
for frame in (understat, fbref_match):
    if 'date' in frame.columns:
        continue
    date_alias = next(
        (a for a in ('match_date', 'game_date') if a in frame.columns), None
    )
    if date_alias is not None:
        frame.rename(columns={date_alias: 'date'}, inplace=True)

# Harmonise the opponent column with the same "first alias found" rule.
for frame in (understat, fbref_match):
    if 'opponent' in frame.columns:
        continue
    opponent_alias = next(
        (a for a in ('opponent_team', 'against', 'rival') if a in frame.columns), None
    )
    if opponent_alias is not None:
        frame.rename(columns={opponent_alias: 'opponent'}, inplace=True)

# is_home: built from 'home_away' only when it is not already provided.
# 'H' → 1, 'A' → 0; any other value ends up as 0 through fillna(0).
if 'home_away' in understat.columns and 'is_home' not in understat.columns:
    venue_code = understat['home_away'].astype(str).str.upper()
    understat['is_home'] = venue_code.map({'H': 1, 'A': 0}).fillna(0).astype(int)

# Merge keys: 'YYYY-MM-DD' date string and trimmed, lower-cased opponent name.
# A key is NaN for the whole frame when its source column does not exist.
for frame in (understat, fbref_match):
    if 'date' in frame.columns:
        # Unparseable dates become NaT instead of raising.
        frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
        frame['date_key'] = frame['date'].dt.strftime('%Y-%m-%d')
    else:
        frame['date_key'] = np.nan
    frame['opponent_key'] = (
        frame['opponent'].astype(str).str.strip().str.lower()
        if 'opponent' in frame.columns
        else np.nan
    )
# Work on copies so the frames prepared above are not modified by the merge.
u_sel = understat.copy()
fm_sel = fbref_match.copy()

# Merge
# Join keys are compared as strings on both sides.
merge_keys = ['date_key', 'opponent_key']
for frame in (u_sel, fm_sel):
    for key in merge_keys:
        frame[key] = frame[key].astype(str)

# Goals and xG are always taken from the Understat side, so these columns are
# removed from the FBref frame before joining.
score_cols = ['goals', 'goals_conceded', 'xg', 'xg_conceded']
merged = pd.merge(
    u_sel,
    fm_sel.drop(columns=score_cols, errors='ignore'),
    on=merge_keys, how='inner', suffixes=('', '_fb')
)
if merged.empty:
    # No row could be aligned: continue with the Understat data alone.
    merged = u_sel.copy()

# Guarantee that the columns used downstream exist. The NaN placeholders are
# added AFTER the merge on purpose: adding them to both frames beforehand made
# the all-NaN Understat placeholder keep the plain name (e.g. 'possession')
# while the real FBref values were pushed to the '_fb' suffixed column.
for col in score_cols + ['shots', 'shots_on_target', 'possession']:
    if col not in merged.columns:
        merged[col] = np.nan

# 3) Target and features
# Outcome label: 'W' when goals > goals_conceded, 'L' when lower, 'D' otherwise.
# A missing score compares False against everything, so it is checked
# explicitly: such rows get the default 'D' (the same fallback as the branch
# used when the columns are absent) instead of being counted as defeats.
if 'goals' in merged.columns and 'goals_conceded' in merged.columns:
    goals_for = merged['goals']
    goals_against = merged['goals_conceded']
    score_known = goals_for.notna() & goals_against.notna()
    merged['result'] = np.select(
        [score_known & (goals_for > goals_against),
         score_known & (goals_for < goals_against)],
        ['W', 'L'],
        default='D',
    )
else:
    merged['result'] = 'D'
points_map = {'W': 3, 'D': 1, 'L': 0}
merged['points'] = merged['result'].map(points_map).fillna(1).astype(int)
merged['goal_difference'] = merged.get('goals', 0) - merged.get('goals_conceded', 0)
merged['xg_difference'] = merged.get('xg', 0) - merged.get('xg_conceded', 0)
if 'date' in merged.columns:
    merged = merged.sort_values('date')
# Recent form over the 5 PREVIOUS matches: shift(1) removes the current match
# from the window, otherwise the feature contains the points being predicted.
# The first row has no history and is therefore NaN.
merged['points_rolling_5'] = (
    merged['points'].shift(1).rolling(window=5, min_periods=1).mean()
)
if 'is_home' not in merged.columns:
    merged['is_home'] = 1

# 4) Training dataset
tmp = merged.copy()
# Numeric encoding of the outcome, kept on `tmp` for analysis only.
# The fallback Series is aligned on tmp's index so the mapping stays row-wise.
tmp['result_encoded'] = tmp.get('result', pd.Series('D', index=tmp.index)).map({'L': 0, 'D': 1, 'W': 2})
# Columns that determine the target exactly (the label itself, its encoding,
# and the final score) must not be used as features: with them the models
# simply read the answer and the reported accuracy is meaningless.
leak_cols = ['points', 'result_encoded', 'goals', 'goals_conceded', 'goal_difference']
X = tmp.select_dtypes(include=[np.number]).drop(columns=leak_cols, errors='ignore')
y = tmp['points']

# 5) Split and training
# Remaining missing feature values are replaced by 0 before fitting.
X = X.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    # Stratify on the target only when it has more than one distinct value.
    stratify=None if y.nunique() <= 1 else y,
)

# Random forest (fit returns the fitted estimator).
rf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost is trained and scored on relabelled targets: the value 3 becomes 2.
# NOTE(review): presumably because XGBClassifier expects labels 0..n_classes-1 — confirm.
points_to_xgb_label = {3: 2}
y_train_xgb = y_train.replace(points_to_xgb_label)
y_test_xgb = y_test.replace(points_to_xgb_label)
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    reg_lambda=1.0,
)
xgb_model.fit(X_train, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test)

print('Taille merged:', merged.shape)
print('Accuracy RF:', round(accuracy_score(y_test, y_pred_rf), 3))
print('Accuracy XGB:', round(accuracy_score(y_test_xgb, y_pred_xgb), 3))


## Données et hypothèses
- Understat: xG, buts, lieu du match, adversaire, date.
- FBref (match-by-match): possession, tirs, passes, précision.
- Alignement par `date` + `opponent` (fallback si alignement impossible).

Hypothèses de modélisation:
- Cible: points (0,1,3) dérivés du score.
- Modèles: RandomForestClassifier, XGBClassifier.
- Validation: split train/test + cross-validation stratifiée.
- Indicateurs: accuracy, macro-F1, matrice de confusion, ROC multi-classes.
- Interprétation: importances globales + partial dependence plots (si pertinent).


In [None]:
# Raw load of the three source tables, then a two-row preview of each one.
understat, fbref_team, fbref_match = map(
    pd.read_csv,
    (
        DATA_DIR / 'understat_arsenal_matches.csv',
        DATA_DIR / 'fbref_arsenal_team_stats.csv',
        DATA_DIR / 'fbref_arsenal_match_stats.csv',
    ),
)

for preview in (understat, fbref_team, fbref_match):
    print(preview.head(2))


In [None]:
# Consistency guarantees (target and features)
# Recomputes 'result' and 'points' when 'points' is absent, fills in any
# missing derived feature, and builds X, y when they are not defined yet.
if 'points' not in merged.columns:
    if 'goals' in merged.columns and 'goals_conceded' in merged.columns:
        goals_for = merged['goals']
        goals_against = merged['goals_conceded']
        # A missing score compares False against everything; it is checked
        # explicitly so such rows get the default 'D' instead of 'L'.
        score_known = goals_for.notna() & goals_against.notna()
        merged['result'] = np.select(
            [score_known & (goals_for > goals_against),
             score_known & (goals_for < goals_against)],
            ['W', 'L'],
            default='D',
        )
    else:
        merged['result'] = 'D'
    points_map = {'W': 3, 'D': 1, 'L': 0}
    merged['points'] = merged['result'].map(points_map).fillna(1).astype(int)

# Additional features, created only when missing
if 'goal_difference' not in merged.columns:
    merged['goal_difference'] = merged.get('goals', 0) - merged.get('goals_conceded', 0)
if 'xg_difference' not in merged.columns:
    merged['xg_difference'] = merged.get('xg', 0) - merged.get('xg_conceded', 0)
if 'points_rolling_5' not in merged.columns:
    if 'date' in merged.columns:
        merged = merged.sort_values('date')
    # Form over the 5 PREVIOUS matches: shift(1) keeps the current match's
    # points (the target) out of the window. The first row is NaN.
    merged['points_rolling_5'] = (
        merged['points'].shift(1).rolling(window=5, min_periods=1).mean()
    )
if 'is_home' not in merged.columns:
    merged['is_home'] = 1

# Build X, y when they do not exist yet
globals_dict = globals()
if 'X' not in globals_dict or 'y' not in globals_dict:
    tmp = merged.copy()
    # Fallback Series aligned on tmp's index so the mapping stays row-wise.
    tmp['result_encoded'] = tmp.get('result', pd.Series('D', index=tmp.index)).map({'L': 0, 'D': 1, 'W': 2})
    # Columns that determine the target exactly (label, its encoding, final
    # score) are excluded from the features to avoid target leakage.
    leak_cols = ['points', 'result_encoded', 'goals', 'goals_conceded', 'goal_difference']
    X = tmp.select_dtypes(include=[np.number]).drop(columns=leak_cols, errors='ignore')
    y = tmp['points']

print('Vérif:', X.shape, y.shape)


In [None]:
# Quick cleaning + harmonisation of the key columns
# Goal: end up with compatible columns for the merge (date, opponent, home flag).

# Column names are normalised to lower case.
for frame in (understat, fbref_team, fbref_match):
    frame.columns = [name.lower() for name in frame.columns]

# Heuristics for the shared columns
# - date: keep 'date' when present, otherwise rename the first of
#   'match_date' / 'game_date' that exists.
for frame in (understat, fbref_match):
    if 'date' in frame.columns:
        continue
    date_alias = next(
        (a for a in ('match_date', 'game_date') if a in frame.columns), None
    )
    if date_alias is not None:
        frame.rename(columns={date_alias: 'date'}, inplace=True)

# - opponent: same rule with 'opponent_team' / 'against' / 'rival'.
for frame in (understat, fbref_match):
    if 'opponent' in frame.columns:
        continue
    opponent_alias = next(
        (a for a in ('opponent_team', 'against', 'rival') if a in frame.columns), None
    )
    if opponent_alias is not None:
        frame.rename(columns={opponent_alias: 'opponent'}, inplace=True)

# - home/away: use 'is_home' when provided, otherwise derive it from
#   'home_away' ('H' → 1, 'A' → 0, any other value → 0 through fillna(0)).
if 'home_away' in understat.columns and 'is_home' not in understat.columns:
    venue_code = understat['home_away'].astype(str).str.upper()
    understat['is_home'] = venue_code.map({'H': 1, 'A': 0}).fillna(0).astype(int)

# Merge keys: 'YYYY-MM-DD' date string and trimmed, lower-cased opponent name.
# A key is NaN for the whole frame when its source column does not exist.
for frame in (understat, fbref_match):
    if 'date' in frame.columns:
        # Unparseable dates become NaT instead of raising.
        frame['date'] = pd.to_datetime(frame['date'], errors='coerce')
        frame['date_key'] = frame['date'].dt.strftime('%Y-%m-%d')
    else:
        frame['date_key'] = np.nan
    frame['opponent_key'] = (
        frame['opponent'].astype(str).str.strip().str.lower()
        if 'opponent' in frame.columns
        else np.nan
    )

# Work on copies so the cleaned frames are not modified by the merge.
u_sel = understat.copy()
fm_sel = fbref_match.copy()

# Harmonise the key dtypes before merging: both sides are compared as strings.
merge_keys = ['date_key', 'opponent_key']
for frame in (u_sel, fm_sel):
    for key in merge_keys:
        frame[key] = frame[key].astype(str)

# Main merge: by date + opponent.
# Goals and xG are always taken from the Understat side, so these columns are
# removed from the FBref frame to avoid collisions.
score_cols = ['goals', 'goals_conceded', 'xg', 'xg_conceded']
merged = pd.merge(
    u_sel,
    fm_sel.drop(columns=score_cols, errors='ignore'),
    on=merge_keys, how='inner', suffixes=('', '_fb')
)

# Fallback: when nothing could be aligned, use Understat alone.
if merged.empty:
    print('Aucun alignement Understat/FBref trouvé → utilisation Understat seul')
    merged = u_sel.copy()

# Guarantee that the simple stats used downstream exist. The NaN placeholders
# are added AFTER the merge on purpose: adding them to both frames beforehand
# made the all-NaN Understat placeholder keep the plain name (e.g.
# 'possession') while the real FBref values were pushed to the '_fb' column.
for col in score_cols + ['shots', 'shots_on_target', 'possession']:
    if col not in merged.columns:
        merged[col] = np.nan

print('Taille merged:', merged.shape)
merged.head(3)


In [None]:
# Model training (RandomForest, XGBoost)
# Missing feature values are replaced by 0 before fitting.
X = X.fillna(0)

# Train/test split, stratified on the target when it has more than one value.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=None if y.nunique() <= 1 else y,
)

# Random Forest
rf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('RandomForest accuracy:', round(acc_rf, 3))
print(classification_report(y_test, y_pred_rf))

# XGBoost is trained and scored on relabelled targets (value 3 becomes 2).
# NOTE(review): presumably because XGBClassifier expects labels 0..n_classes-1 — confirm.
y_train_xgb, y_test_xgb = (
    labels.replace({3: 2}) for labels in (y_train, y_test)
)

xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    reg_lambda=1.0,
)
xgb_model.fit(X_train, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_xgb, y_pred_xgb)
print('XGBoost accuracy:', round(acc_xgb, 3))
print(classification_report(y_test_xgb, y_pred_xgb))


In [None]:
# Exploratory data analysis (EDA)
# 1) Target distribution
# The counts are reindexed on the three possible point values so the plot
# always has exactly three bars, in the order 0 / 1 / 3. Without this, an
# outcome absent from the data shifts the hard-coded tick labels onto the
# wrong bars (or makes set_xticklabels fail on a label-count mismatch).
outcome_labels = {0: '0 (Défaite)', 1: '1 (Nul)', 3: '3 (Victoire)'}
points_counts = merged['points'].value_counts().reindex(list(outcome_labels), fill_value=0)
ax = points_counts.plot(kind='bar', color=['#e74c3c', '#f1c40f', '#2ecc71'])
ax.set_xticklabels(list(outcome_labels.values()), rotation=0)
plt.title('Distribution des résultats (points)')
plt.ylabel('Nombre de matchs')
plt.tight_layout()
plt.show()

# 2) Correlation heatmap over the key variables that are actually present
#    (drawn only when at least two of them exist).
candidate_cols = ['goal_difference', 'xg', 'xg_conceded', 'points_rolling_5', 'is_home']
subset_cols = [c for c in candidate_cols if c in merged.columns]
if len(subset_cols) >= 2:
    corr = merged[subset_cols].corr()
    sns.heatmap(corr, annot=True, cmap='RdYlBu_r', vmin=-1, vmax=1)
    plt.title('Matrice de corrélation (sélection)')
    plt.tight_layout()
    plt.show()

# 3) Simple histograms, one per available column
for stat_col, stat_title, stat_color in (
    ('xg', 'Distribution du xG', '#3498db'),
    ('possession', 'Distribution de la possession (%)', '#9b59b6'),
):
    if stat_col not in merged.columns:
        continue
    sns.histplot(merged[stat_col], bins=20, kde=True, color=stat_color)
    plt.title(stat_title)
    plt.tight_layout()
    plt.show()

# 4) Mean points at home vs away (a missing is_home is grouped with home = 1;
#    a group absent from the data is plotted as 0).
home_away_stats = merged.groupby(merged['is_home'].fillna(1))['points'].mean()
venue_means = [home_away_stats.get(flag, 0) for flag in (0, 1)]
plt.bar(['Extérieur', 'Domicile'], venue_means, color=['#95a5a6', '#27ae60'])
plt.title('Points moyens: domicile vs extérieur')
plt.ylabel('Points moyens')
plt.tight_layout()
plt.show()


In [None]:
# Minimal feature engineering

# 1) Target: points (3 win, 1 draw, 0 defeat)
# 'result' is derived from goals and goals_conceded when both columns exist.
# A missing score compares False against everything, so it is checked
# explicitly: such rows get the default 'D' instead of being counted as 'L'.
if 'goals' in merged.columns and 'goals_conceded' in merged.columns:
    goals_for = merged['goals']
    goals_against = merged['goals_conceded']
    score_known = goals_for.notna() & goals_against.notna()
    merged['result'] = np.select(
        [score_known & (goals_for > goals_against),
         score_known & (goals_for < goals_against)],
        ['W', 'L'],
        default='D',
    )
else:
    merged['result'] = 'D'

points_map = {'W': 3, 'D': 1, 'L': 0}
merged['points'] = merged['result'].map(points_map).fillna(1).astype(int)

# 2) Simple differences
merged['goal_difference'] = merged.get('goals', 0) - merged.get('goals_conceded', 0)
merged['xg_difference'] = merged.get('xg', 0) - merged.get('xg_conceded', 0)

# 3) Recent form (rolling mean over 5 matches) in chronological order.
# Sorting is skipped when no 'date' column is available instead of raising.
if 'date' in merged.columns:
    merged = merged.sort_values('date')
for source_col in ('points', 'goals', 'xg'):
    if source_col in merged.columns:
        history = merged[source_col]
    else:
        # Fallback built on merged's own index so it stays aligned after sorting.
        history = pd.Series(0, index=merged.index)
    # shift(1) keeps the current match out of its own window: the feature then
    # describes the 5 PREVIOUS matches and cannot leak the result being
    # predicted. The first row has no history and is NaN.
    merged[f'{source_col}_rolling_5'] = (
        history.shift(1).rolling(window=5, min_periods=1).mean()
    )

# 4) Simple context variables
if 'is_home' in merged.columns:
    # A missing venue falls back to 1, the same default as when the column is
    # absent; without the fill, astype(int) raises on NaN.
    merged['is_home'] = merged['is_home'].fillna(1).astype(int)
else:
    merged['is_home'] = 1  # default

# 5) Final selection of model inputs
features = merged.copy()

# Numeric encoding of 'result', kept on `features` for analysis only.
features['result_encoded'] = features['result'].map({'L': 0, 'D': 1, 'W': 2})

# Numeric columns only, minus every column that determines the target exactly
# (the label, its encoding and the final score): keeping them lets the models
# read the answer directly.
leak_cols = ['points', 'result_encoded', 'goals', 'goals_conceded', 'goal_difference']
X = features.select_dtypes(include=[np.number]).drop(columns=leak_cols, errors='ignore')
y = features['points']

X.shape, y.shape


In [None]:
# Advanced metrics and cross-validation
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, RocCurveDisplay

# 5-fold stratified cross-validation of the random forest on the full dataset.
# X may have been rebuilt since the last fit and still contain NaN, so it is
# filled with 0 here, the same preprocessing applied before the train/test split.
X_cv = X.fillna(0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_acc = cross_val_score(rf, X_cv, y, cv=skf, scoring='accuracy')
cv_f1 = cross_val_score(rf, X_cv, y, cv=skf, scoring='f1_macro')
print(f'CV Accuracy (RF) mean±std: {cv_acc.mean():.3f} ± {cv_acc.std():.3f}')
print(f'CV Macro-F1 (RF) mean±std: {cv_f1.mean():.3f} ± {cv_f1.std():.3f}')

# Multi-class ROC curves (one-vs-rest) for the random forest.
# Best effort: any failure is reported and the notebook keeps running.
try:
    # predict_proba columns follow rf.classes_, so the class list is read from
    # the fitted model instead of being hard-coded; this keeps every curve
    # paired with the right probability column even when an outcome is absent
    # from the training data.
    classes = list(rf.classes_)
    # One 0/1 indicator column per class (always n_classes columns).
    y_test_bin = np.column_stack([(y_test == cls).astype(int) for cls in classes])
    y_proba_rf = rf.predict_proba(X_test)
    fig, ax = plt.subplots(figsize=(7,6))
    for i, cls in enumerate(classes):
        RocCurveDisplay.from_predictions(y_test_bin[:, i], y_proba_rf[:, i], name=f'Classe {cls}', ax=ax)
    plt.title('Courbes ROC One-vs-Rest (RandomForest)')
    plt.show()
except Exception as e:
    print('ROC multi-classes indisponible:', e)


In [None]:
# Courbes d'apprentissage (learning curves)
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title):
    """Plot train and validation accuracy against the training-set size.

    Scores come from ``learning_curve`` with 5-fold cross-validation at five
    training sizes spread evenly between 20% and 100%. Each curve shows the
    mean over the folds, with a band of one standard deviation around it.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, cv=5, scoring='accuracy', n_jobs=None,
        train_sizes=np.linspace(0.2, 1.0, 5),
        # random_state is only honoured by learning_curve when shuffle=True;
        # without it the seed below was silently ignored.
        shuffle=True, random_state=42,
    )
    # Aggregate over the cross-validation folds (axis 1).
    train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
    val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)
    plt.figure(figsize=(7,5))
    plt.plot(train_sizes, train_mean, 'o-', label='Train')
    plt.fill_between(train_sizes, train_mean-train_std, train_mean+train_std, alpha=0.2)
    plt.plot(train_sizes, val_mean, 'o-', label='Validation')
    plt.fill_between(train_sizes, val_mean-val_std, val_mean+val_std, alpha=0.2)
    plt.title(title)
    plt.xlabel('Taille d\'échantillon d\'entraînement')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Draw the learning curve of the random forest `rf` on the current X and y.
plot_learning_curve(rf, X, y, 'Courbe d\'apprentissage - RandomForest')


In [None]:
# Comparison of RF vs XGB feature importances
# Best effort: any failure is reported and the notebook keeps running.
try:
    # Importances are indexed with the columns both models were fitted on
    # (X_train). X itself may have been rebuilt with extra columns since the
    # fit, in which case indexing by X.columns fails on a length mismatch.
    trained_features = X_train.columns
    imp_rf = pd.Series(rf.feature_importances_, index=trained_features).sort_values(ascending=False)
    imp_xgb = pd.Series(xgb_model.feature_importances_, index=trained_features).sort_values(ascending=False)
    topk = 15
    # Top-k features according to the random forest; XGBoost values are
    # looked up for the same features so both bars describe the same row.
    top_rf = imp_rf.head(topk)
    comp = pd.DataFrame({
        'RF': top_rf,
        'XGB': imp_xgb.reindex(top_rf.index)
    })
    comp.plot(kind='barh', figsize=(8,6))
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.title('Comparaison des importances (Top 15)')
    plt.tight_layout()
    plt.show()
except Exception as e:
    print('Comparaison importances indisponible:', e)


## Conclusion et axes d’amélioration
- Le modèle capture des signaux forts liés à la différence de buts, au momentum et au contexte (domicile/extérieur).
- Les performances sont stables (CV) et les courbes d’apprentissage aident à détecter un éventuel sur- ou sous-apprentissage.
- Les importances comparées (RF vs XGB) offrent deux angles d’interprétation complémentaires.

Prochaines pistes:
- Enrichir l’alignement FBref (qualité des clés) et la granularité des features contextuelles.
- Ajouter des séquences temporelles (lags multi-horizons) et des features d’adversaire (forme adverse).
- Calibration des probabilités (Platt/Isotonic) et seuils dépendant du coût.
- Validation temporelle (walk-forward) pour se rapprocher d’un usage en production.


In [None]:
# Training of the simple models
# Missing feature values are replaced by 0 before fitting.
X = X.fillna(0)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Random Forest (simple); fit returns the fitted estimator.
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print('RandomForest accuracy:', round(acc_rf, 3))
print(classification_report(y_test, y_pred_rf))

# XGBoost (simple), trained and scored on relabelled targets (3 becomes 2).
# NOTE(review): presumably because XGBClassifier expects labels 0..n_classes-1 — confirm.
points_to_xgb_label = {3: 2}
y_train_xgb = y_train.replace(points_to_xgb_label)
y_test_xgb = y_test.replace(points_to_xgb_label)

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model.fit(X_train, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_xgb, y_pred_xgb)
print('XGBoost accuracy:', round(acc_xgb, 3))
print(classification_report(y_test_xgb, y_pred_xgb))


In [None]:
# Essential visualisations
sns.set_style('whitegrid')

# 1. Random forest confusion matrix (rows = actual, columns = predicted),
#    with the classes ordered 0, 1, 3.
outcome_names = ['Défaite', 'Nul', 'Victoire']
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0,1,3])
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=outcome_names,
    yticklabels=outcome_names,
)
plt.title('Matrice de confusion - RandomForest')
plt.xlabel('Prédit')
plt.ylabel('Réel')
plt.show()

# 2. Feature importances of the random forest (15 largest)
rf_importances = pd.Series(rf.feature_importances_, index=X.columns)
fi = rf_importances.sort_values(ascending=False).head(15)
fi.plot(kind='barh')
plt.gca().invert_yaxis()  # largest importance at the top
plt.title('Top 15 Features - RandomForest')
plt.tight_layout()
plt.show()

# 3. Simple relationship: xG vs points, coloured by venue
if 'xg' in merged.columns:
    sns.scatterplot(data=merged, x='xg', y='points', hue='is_home', palette='coolwarm')
    plt.title('xG vs Points')
    plt.show()

# 4. 5-match moving average of the points over time
if 'date' in merged.columns:
    points_by_date = merged.set_index('date')['points']
    points_by_date.rolling(5, min_periods=1).mean().plot()
    plt.title('Moyenne mobile des points (5 matchs)')
    plt.ylabel('Points Moyens')
    plt.show()


## Notes
- Ce notebook est volontairement minimal: il repose sur des heuristiques simples pour fusionner les données.
- Si certaines colonnes ne sont pas présentes, des valeurs par défaut sont utilisées.
- Pour plus de robustesse (alignement des correspondances, enrichissement des features), référez-vous au pipeline complet du projet, mais celui-ci suffit pour une démo rapide et reproductible.
