# 01 — Feature Engineering & Data Processing

Recalcule toutes les features à partir des CSV bruts et génère `data/all_leagues_combined.csv`.

**Durée estimée :** 15–30 min selon la machine.

---
### Google Colab : configuration du chemin Drive
Avant de lancer, modifie `DRIVE_PROJECT_PATH` ci-dessous avec le chemin de ton projet sur Drive.
Exemple : `'/content/drive/MyDrive/FootWork'`

In [None]:
# ------------------------------------------------------------
# CONFIGURATION -- edit these values if needed
# ------------------------------------------------------------
# Local path (project root).
LOCAL_PROJECT_PATH = ".."
# Path of the project on Google Drive.
DRIVE_PROJECT_PATH = "/content/drive/MyDrive/FootWork"

In [None]:
# ============================================================
# Environment detection + Google Drive mount
# ============================================================
import os
import sys

# Colab is detected by whether the google.colab package can be imported.
try:
    import google.colab  # noqa: F401
    ON_COLAB = True
except ImportError:
    ON_COLAB = False

# Remember the directory the kernel was in the first time this cell ran.
# The os.chdir() below changes the working directory, so resolving the
# relative LOCAL_PROJECT_PATH against the *current* directory would climb
# one level higher every time the cell is re-executed.
_INITIAL_CWD = globals().get('_INITIAL_CWD') or os.getcwd()

if ON_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = DRIVE_PROJECT_PATH
else:
    PROJECT_ROOT = os.path.abspath(os.path.join(_INITIAL_CWD, LOCAL_PROJECT_PATH))

os.chdir(PROJECT_ROOT)
# Put the project root on the import path, without stacking duplicate
# entries when the cell is executed several times.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
print(f'Environnement : {"Google Colab" if ON_COLAB else "Local"}')
print(f'Répertoire de travail : {os.getcwd()}')

In [None]:
# ============================================================
# Installation des dépendances
# ============================================================
if ON_COLAB:
    %pip install -q pyyaml xgboost lightgbm catboost scikit-learn pandas numpy tqdm
else:
    print('Local : vérifie que les dépendances de requirements.txt sont installées.')

In [None]:
# ============================================================
# Imports -- the processor is loaded through importlib
# (its file name contains a hyphen, which a regular `import` cannot handle)
# ============================================================
import importlib.util
import sys
from pathlib import Path
import pandas as pd
import numpy as np

from src.Config.Config_Manager import ConfigManager

# Dynamic loading of the processor module from its file path.
_PROCESSOR_PATH = Path('src/Data_Processing/Multi-Season_Match_Data_Processor.py')
_spec = importlib.util.spec_from_file_location('processor', _PROCESSOR_PATH)
if _spec is None or _spec.loader is None:
    # spec_from_file_location may return None when no loader fits the path.
    raise ImportError(f'Impossible de charger le module : {_PROCESSOR_PATH}')
_mod = importlib.util.module_from_spec(_spec)
# Register the module before executing it, as in the importlib recipe for
# importing a source file directly, so that code resolving the module by
# name through sys.modules can find it.
sys.modules[_spec.name] = _mod
try:
    _spec.loader.exec_module(_mod)
except BaseException:
    # Do not leave a half-initialised module behind if its code fails.
    sys.modules.pop(_spec.name, None)
    raise
process_league_data = _mod.process_league_data

# Load the processing configuration and list the keys found under
# data_paths -> leagues.
config_manager = ConfigManager('src/Config/data_processing_config.yaml')
_league_paths = config_manager.get_config_value('data_paths', 'leagues')
leagues = list(_league_paths.keys())
print('Ligues à traiter :', leagues)

---
## Traitement ligue par ligue

Chaque cellule traite une ligue indépendamment — pratique si le kernel plante en cours de route.

In [None]:
# Processed data of each league, keyed by league name; filled by the
# per-league cells below.
all_leagues_data: dict = {}

In [None]:
# Premier League
data = process_league_data('Premier_League', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ Premier League : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['Premier_League'] = data
    print(f'\n✓ Premier League : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# Ligue 1 (France)
data = process_league_data('France', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ France : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['France'] = data
    print(f'\n✓ France : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# Bundesliga (Germany)
data = process_league_data('Germany', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ Germany : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['Germany'] = data
    print(f'\n✓ Germany : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# Serie A (Italy)
data = process_league_data('Italy', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ Italy : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['Italy'] = data
    print(f'\n✓ Italy : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# La Liga (Spain)
data = process_league_data('Spain', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ Spain : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['Spain'] = data
    print(f'\n✓ Spain : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# Série A (Brazil)
data = process_league_data('Brazil', config_manager)
if data is None:
    # Nothing came back: report it instead of staying silent, otherwise the
    # league just goes missing from all_leagues_data without any notice.
    print('\n✗ Brazil : aucune donnée retournée, ligue ignorée')
else:
    all_leagues_data['Brazil'] = data
    print(f'\n✓ Brazil : {len(data)} matchs, {len(data.columns)} colonnes')

In [None]:
# ============================================================
# Combine every processed league -> all_leagues_combined.csv
# ============================================================
if not all_leagues_data:
    # pd.concat raises a generic "No objects to concatenate" ValueError on an
    # empty collection; fail with the same exception type but say what to do.
    raise ValueError(
        'Aucune ligue traitée : exécute les cellules de traitement avant de combiner.'
    )

combined = pd.concat(all_leagues_data.values(), ignore_index=True)
base_path = Path(config_manager.get_config_value('data_paths', 'base_path'))
# Make sure the target directory exists before writing the CSV.
base_path.mkdir(parents=True, exist_ok=True)
output_path = base_path / 'all_leagues_combined.csv'
combined.to_csv(output_path, index=False)

# Summary of what was written, followed by the class balance of the target.
print('\n=== Dataset combiné sauvegardé ===')
print(f'Chemin  : {output_path}')
print(f'Matchs  : {len(combined)}')
print(f'Colonnes: {len(combined.columns)}')
print('\nDistribution cible :')
print(combined['target_result'].value_counts())

---
## Sanity checks

In [None]:
# Check of the new columns: NaN percentage of every column whose name
# starts with one of the prefixes below.
print('=== Nouvelles features (cotes, diff, draw, H2H) ===')
_feature_prefixes = (
    'implied_prob', 'odds_ratio', 'diff_', 'combined_draw', 'match_comp',
    'elo_', 'home_elo', 'away_elo', 'h2h_',
)
# str.startswith accepts a tuple of prefixes, so no inner loop is needed.
new_cols = [c for c in combined.columns if c.startswith(_feature_prefixes)]
for col in new_cols:
    pct_nan = combined[col].isna().mean() * 100
    print(f'  {col:<35} NaN: {pct_nan:.1f}%')

# Implied probabilities check (the three should sum to ~1). The columns are
# guarded like the H2H column below, so that a missing feature group is
# reported and the remaining checks still run instead of stopping on a KeyError.
_prob_cols = ['implied_prob_home', 'implied_prob_draw', 'implied_prob_away']
_missing_prob_cols = [c for c in _prob_cols if c not in combined.columns]
if _missing_prob_cols:
    print(f'\nColonnes de probabilités implicites absentes : {_missing_prob_cols}')
else:
    # Plain addition on purpose: a NaN in any of the three columns stays NaN.
    prob_sum = (combined['implied_prob_home'] + combined['implied_prob_draw'] + combined['implied_prob_away'])
    print(f'\nSomme prob implicites — mean: {prob_sum.mean():.4f}, std: {prob_sum.std():.6f} (attendu: 1.0)')

# H2H check
if 'h2h_matches_count' in combined.columns:
    print(f'\nH2H matches count — mean: {combined["h2h_matches_count"].mean():.2f}, '
          f'max: {combined["h2h_matches_count"].max():.0f}')
    print(f'Matchs sans historique H2H (count=0) : {(combined["h2h_matches_count"] == 0).sum()}')

# Distribution per league
if 'league' in combined.columns:
    print('\nMatchs par ligue :')
    print(combined['league'].value_counts())
else:
    print("\nColonne 'league' absente : distribution par ligue indisponible")

In [None]:
# Preview of the final dataset: its first three rows.
combined.head(n=3)