# Exploration data

## Imports

In [1]:
# Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Baseball
import statsapi
from datetime import datetime, timedelta
import time

# Set Style of Viz
sns.set_style("darkgrid")
# sns.set_palette(palette='dark:#5A9_r')
# Magic lines
%matplotlib inline
%load_ext autoreload
%autoreload 2

## MLB API

## Extraction de la data

In [3]:
def get_team_stats_before_game(team_id, game_date):
    try:
        # Date de la veille pour avoir les stats pré-match
        date_obj = datetime.strptime(game_date, '%Y-%m-%d')
        previous_day = (date_obj - timedelta(days=1)).strftime('%Y-%m-%d')
        season = str(date_obj.year)  # Ajout de l'année

        # Stats d'équipe
        team_stats = statsapi.get('team_stats', {
            'teamId': team_id,
            'season': season,  # Ajout du paramètre season
            'stats': 'season',
            'group': 'hitting',
            'date': previous_day
        })

        team_pitching = statsapi.get('team_stats', {
            'teamId': team_id,
            'season': season,  # Ajout du paramètre season
            'stats': 'season',
            'group': 'pitching',
            'date': previous_day
        })

        # Classement
        standings = statsapi.get('standings', {
            'leagueId': '103,104',
            'date': previous_day
        })

        return {
            'stats': team_stats,
            'pitching': team_pitching,
            'standings': standings
        }
    except Exception as e:
        print(f"Erreur pour team {team_id} à la date {game_date}: {str(e)}")
        return None

def collect_matches_data(start_year, end_year):
    all_games_data = []

    for year in range(start_year, end_year + 1):
        print(f"Récupération des données pour {year}...")

        # Récupérer tous les matchs de la saison
        schedule = statsapi.schedule(
            start_date=f'{year}-03-28',
            end_date=f'{year}-10-01'
        )

        print(f"Nombre de matchs trouvés : {len(schedule)}")

        for i, game in enumerate(schedule):
            try:
                game_date = game['game_date']
                home_team = game['home_id']
                away_team = game['away_id']

                # Afficher la progression
                print(f"Traitement du match {i+1}/{len(schedule)} : {home_team} vs {away_team} le {game_date}")

                # Récupérer les stats des deux équipes avant le match
                home_stats = get_team_stats_before_game(home_team, game_date)
                time.sleep(1)  # Pause pour éviter de surcharger l'API
                away_stats = get_team_stats_before_game(away_team, game_date)
                time.sleep(1)

                if home_stats and away_stats:
                    game_data = {
                        'game_id': game['game_id'],
                        'date': game_date,
                        'home_team': home_team,
                        'away_team': away_team,
                        'home_score': game.get('home_score', None),
                        'away_score': game.get('away_score', None),
                        'home_win': game.get('home_score', 0) > game.get('away_score', 0) if 'home_score' in game else None,
                        'home_stats': home_stats,
                        'away_stats': away_stats
                    }
                    all_games_data.append(game_data)

                    # Sauvegarde intermédiaire tous les 50 matchs
                    if (i + 1) % 50 == 0:
                        temp_df = pd.DataFrame(all_games_data)
                        temp_df.to_csv(f'mlb_games_2024_temp_{i+1}.csv', index=False)

            except Exception as e:
                print(f"Erreur lors du traitement du match: {str(e)}")
                continue

    return all_games_data

# Exécuter la collecte pour 2024
data = collect_matches_data(2017,2023)

# Convertir en DataFrame
df = pd.DataFrame(data)

# Afficher un aperçu des données
print("\nAperçu des données collectées:")
print(df.head())
print("\nColonnes disponibles:")
print(df.columns.tolist())

# Sauvegarder les données
df.to_csv('mlb_games_2017_2023.csv', index=False)

Récupération des données pour 2017...
Nombre de matchs trouvés : 2538
Traitement du match 1/2538 : 142 vs 139 le 2017-03-28
Traitement du match 2/2538 : 144 vs 110 le 2017-03-28
Traitement du match 3/2538 : 146 vs 117 le 2017-03-28
Traitement du match 4/2538 : 121 vs 138 le 2017-03-28
Traitement du match 5/2538 : 115 vs 140 le 2017-03-28
Traitement du match 6/2538 : 113 vs 119 le 2017-03-28
Traitement du match 7/2538 : 119 vs 135 le 2017-03-28
Traitement du match 8/2538 : 112 vs 137 le 2017-03-28
Traitement du match 9/2538 : 118 vs 145 le 2017-03-28
Traitement du match 10/2538 : 158 vs 114 le 2017-03-28
Traitement du match 11/2538 : 108 vs 133 le 2017-03-28
Traitement du match 12/2538 : 136 vs 109 le 2017-03-28
Traitement du match 13/2538 : 134 vs 111 le 2017-03-28
Traitement du match 14/2538 : 143 vs 141 le 2017-03-28
Traitement du match 15/2538 : 120 vs 146 le 2017-03-28
Traitement du match 16/2538 : 147 vs 116 le 2017-03-28
Traitement du match 17/2538 : 138 vs 120 le 2017-03-29
Trai

In [4]:
df = pd.read_csv("mlb_games_2017_2023.csv")
df.head()

Unnamed: 0,game_id,date,home_team,away_team,home_score,away_score,home_win,home_stats,away_stats
0,509798,2017-03-28,142,139,1,0,True,{'stats': {'copyright': 'Copyright 2025 MLB Ad...,{'stats': {'copyright': 'Copyright 2025 MLB Ad...
1,509541,2017-03-28,144,110,4,5,False,{'stats': {'copyright': 'Copyright 2025 MLB Ad...,{'stats': {'copyright': 'Copyright 2025 MLB Ad...
2,509762,2017-03-28,146,117,3,7,False,{'stats': {'copyright': 'Copyright 2025 MLB Ad...,{'stats': {'copyright': 'Copyright 2025 MLB Ad...
3,509817,2017-03-28,121,138,3,3,False,{'stats': {'copyright': 'Copyright 2025 MLB Ad...,{'stats': {'copyright': 'Copyright 2025 MLB Ad...
4,509661,2017-03-28,115,140,3,4,False,{'stats': {'copyright': 'Copyright 2025 MLB Ad...,{'stats': {'copyright': 'Copyright 2025 MLB Ad...


In [5]:
df.shape

(16059, 9)

In [None]:
def extract_stats_features(df):
    # Conversion en datetime pour tri chronologique
    df['date'] = pd.to_datetime(df['date'])

    # Ajouter numéro de match dans la saison pour chaque équipe
    df['season'] = df['date'].dt.year

    # Créer numéro de match pour équipe domicile et extérieur
    home_game_number = df.groupby(['home_team', 'season']).cumcount() + 1
    away_game_number = df.groupby(['away_team', 'season']).cumcount() + 1

    def get_season_period(game_number):
        if game_number <= 20:
            return 'early'
        elif game_number <= 120:
            return 'mid'
        else:
            return 'late'

    df['home_game_number'] = home_game_number
    df['away_game_number'] = away_game_number
    df['home_season_period'] = home_game_number.apply(get_season_period)
    df['away_season_period'] = away_game_number.apply(get_season_period)

    # Extraction des stats comme avant
    stats_dict_list = []
    for _, row in df.iterrows():
        if isinstance(row['home_stats'], str):
            home_stats = eval(row['home_stats'])
            away_stats = eval(row['away_stats'])
        else:
            home_stats = row['home_stats']
            away_stats = row['away_stats']

        home_hitting_stats = home_stats['stats']['stats'][0]['splits'][0]['stat']
        home_pitching_stats = home_stats['pitching']['stats'][0]['splits'][0]['stat']
        away_hitting_stats = away_stats['stats']['stats'][0]['splits'][0]['stat']
        away_pitching_stats = away_stats['pitching']['stats'][0]['splits'][0]['stat']

        stats_dict = {
            'home_avg': home_hitting_stats['avg'],
            'home_obp': home_hitting_stats['obp'],
            'home_slg': home_hitting_stats['slg'],
            'home_ops': home_hitting_stats['ops'],
            'home_runs': home_hitting_stats['runs'],
            'home_hits': home_hitting_stats['hits'],
            'home_era': home_pitching_stats['era'],
            'home_whip': home_pitching_stats['whip'],
            'home_win_pct': home_pitching_stats['winPercentage'],
            'away_avg': away_hitting_stats['avg'],
            'away_obp': away_hitting_stats['obp'],
            'away_slg': away_hitting_stats['slg'],
            'away_ops': away_hitting_stats['ops'],
            'away_runs': away_hitting_stats['runs'],
            'away_hits': away_hitting_stats['hits'],
            'away_era': away_pitching_stats['era'],
            'away_whip': away_pitching_stats['whip'],
            'away_win_pct': away_pitching_stats['winPercentage'],
        }
        stats_dict_list.append(stats_dict)

    # Créer DataFrame des stats
    stats_df = pd.DataFrame(stats_dict_list, index=df.index)

    # Combiner avec les infos de base
    final_df = pd.concat([
        df[['game_id', 'date', 'home_team', 'away_team', 'home_win',
            'home_game_number', 'away_game_number',
            'home_season_period', 'away_season_period', 'season']],
        stats_df
    ], axis=1)

    return final_df

# Utilisation
final_df = extract_stats_features(df)
print("\nColonnes dans le DataFrame final:")
print(final_df.columns.tolist())


Colonnes dans le DataFrame final:
['game_id', 'date', 'home_team', 'away_team', 'home_win', 'home_game_number', 'away_game_number', 'home_season_period', 'away_season_period', 'season', 'home_avg', 'home_obp', 'home_slg', 'home_ops', 'home_runs', 'home_hits', 'home_era', 'home_whip', 'home_win_pct', 'away_avg', 'away_obp', 'away_slg', 'away_ops', 'away_runs', 'away_hits', 'away_era', 'away_whip', 'away_win_pct']


In [8]:
final_df.tail()

Unnamed: 0,game_id,date,home_team,away_team,home_win,home_game_number,away_game_number,home_season_period,away_season_period,season,...,home_win_pct,away_avg,away_obp,away_slg,away_ops,away_runs,away_hits,away_era,away_whip,away_win_pct
16054,716354,2023-10-01,144,120,False,81,83,mid,mid,2023,...,0.642,0.254,0.314,0.396,0.71,700,1401,5.02,1.47,0.438
16055,716356,2023-10-01,145,135,False,85,81,mid,mid,2023,...,0.377,0.244,0.329,0.413,0.742,752,1316,3.73,1.27,0.506
16056,716352,2023-10-01,118,147,True,82,87,mid,mid,2023,...,0.346,0.227,0.304,0.397,0.701,673,1207,3.97,1.24,0.506
16057,716364,2023-10-01,158,112,True,81,82,mid,mid,2023,...,0.568,0.254,0.33,0.421,0.751,819,1399,4.08,1.28,0.512
16058,716353,2023-10-01,138,113,True,83,84,mid,mid,2023,...,0.438,0.249,0.327,0.42,0.747,783,1371,4.83,1.42,0.506


In [9]:
final_df.to_csv('mlb_games_stats_20172023.csv', index=False)

In [12]:
# Lire tous les fichiers
df_2017_2023 = pd.read_csv('mlb_games_stats_20172023.csv')
df_2024 = pd.read_csv('mlb_games_stats_2024.csv')

# Vérifier que les colonnes sont identiques
print("Colonnes 2017-2023:", df_2017_2023.columns.tolist())
print("Colonnes 2024:", df_2024.columns.tolist())

# Concaténer
df_all = pd.concat([df_2017_2023, df_2024], axis=0, ignore_index=True)

df_all.to_csv('mlb_games_2017_2024_final.csv', index=False)

# Afficher quelques infos sur le dataset final
print("\nNombre total de matchs:", len(df_all))

Colonnes 2017-2023: ['game_id', 'date', 'home_team', 'away_team', 'home_win', 'home_game_number', 'away_game_number', 'home_season_period', 'away_season_period', 'season', 'home_avg', 'home_obp', 'home_slg', 'home_ops', 'home_runs', 'home_hits', 'home_era', 'home_whip', 'home_win_pct', 'away_avg', 'away_obp', 'away_slg', 'away_ops', 'away_runs', 'away_hits', 'away_era', 'away_whip', 'away_win_pct']
Colonnes 2024: ['game_id', 'date', 'home_team', 'away_team', 'home_win', 'home_game_number', 'away_game_number', 'home_season_period', 'away_season_period', 'season', 'home_avg', 'home_obp', 'home_slg', 'home_ops', 'home_runs', 'home_hits', 'home_era', 'home_whip', 'home_win_pct', 'away_avg', 'away_obp', 'away_slg', 'away_ops', 'away_runs', 'away_hits', 'away_era', 'away_whip', 'away_win_pct']

Nombre total de matchs: 18530


## Check Data

In [13]:
df_all.columns

Index(['game_id', 'date', 'home_team', 'away_team', 'home_win',
       'home_game_number', 'away_game_number', 'home_season_period',
       'away_season_period', 'season', 'home_avg', 'home_obp', 'home_slg',
       'home_ops', 'home_runs', 'home_hits', 'home_era', 'home_whip',
       'home_win_pct', 'away_avg', 'away_obp', 'away_slg', 'away_ops',
       'away_runs', 'away_hits', 'away_era', 'away_whip', 'away_win_pct'],
      dtype='object')

In [17]:
df_all.describe()

Unnamed: 0,game_id,home_team,away_team,home_game_number,away_game_number,season,home_avg,home_obp,home_slg,home_ops,...,home_win_pct,away_avg,away_obp,away_slg,away_ops,away_runs,away_hits,away_era,away_whip,away_win_pct
count,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,...,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0,18530.0
mean,621751.778845,128.707447,129.069887,40.851052,40.830977,2020.533729,0.247378,0.318029,0.412756,0.730785,...,0.499758,0.247366,0.318041,0.412768,0.730809,709.362062,1309.198003,4.24332,1.305649,0.500043
std,86138.501147,14.825786,41.709278,24.232905,24.206064,2.375299,0.011692,0.013183,0.028295,0.039993,...,0.084611,0.011705,0.013189,0.028319,0.040019,130.068306,214.68357,0.564384,0.09974,0.084687
min,490098.0,108.0,103.0,1.0,1.0,2017.0,0.212,0.278,0.34,0.618,...,0.253,0.212,0.278,0.34,0.618,219.0,390.0,2.8,1.05,0.253
25%,531457.25,115.0,115.0,20.0,20.0,2018.0,0.239,0.309,0.392,0.702,...,0.438,0.239,0.309,0.392,0.702,674.0,1293.0,3.83,1.24,0.438
50%,632979.5,133.0,134.0,40.0,40.0,2021.0,0.248,0.318,0.412,0.73,...,0.506,0.248,0.318,0.412,0.73,726.0,1347.0,4.15,1.3,0.506
75%,716664.75,141.0,141.0,62.0,62.0,2023.0,0.256,0.328,0.431,0.756,...,0.562,0.256,0.328,0.431,0.756,780.0,1410.0,4.66,1.38,0.562
max,775345.0,568.0,5434.0,92.0,91.0,2024.0,0.282,0.352,0.501,0.847,...,0.717,0.282,0.352,0.501,0.847,947.0,1581.0,5.84,1.65,0.717


# Preprocessing

In [18]:
df = df_all

## Vérification data en double et NA

In [20]:
df.duplicated().sum()/len(data)

np.float64(0.0)

In [21]:
df.isnull().sum()

game_id               0
date                  0
home_team             0
away_team             0
home_win              0
home_game_number      0
away_game_number      0
home_season_period    0
away_season_period    0
season                0
home_avg              0
home_obp              0
home_slg              0
home_ops              0
home_runs             0
home_hits             0
home_era              0
home_whip             0
home_win_pct          0
away_avg              0
away_obp              0
away_slg              0
away_ops              0
away_runs             0
away_hits             0
away_era              0
away_whip             0
away_win_pct          0
dtype: int64

## Features numériques ou catégoriques

In [22]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18530 entries, 0 to 18529
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   game_id             18530 non-null  int64  
 1   date                18530 non-null  object 
 2   home_team           18530 non-null  int64  
 3   away_team           18530 non-null  int64  
 4   home_win            18530 non-null  bool   
 5   home_game_number    18530 non-null  int64  
 6   away_game_number    18530 non-null  int64  
 7   home_season_period  18530 non-null  object 
 8   away_season_period  18530 non-null  object 
 9   season              18530 non-null  int64  
 10  home_avg            18530 non-null  float64
 11  home_obp            18530 non-null  float64
 12  home_slg            18530 non-null  float64
 13  home_ops            18530 non-null  float64
 14  home_runs           18530 non-null  int64  
 15  home_hits           18530 non-null  int64  
 16  home

### Data catégorique

In [25]:
df['home_season_period'].unique(), df['away_season_period'].unique()

(array(['early', 'mid'], dtype=object), array(['early', 'mid'], dtype=object))

In [26]:
df['home_season_period'] = df['home_season_period'].replace({'mid': 1, 'early': 0})
df['away_season_period'] = df['away_season_period'].replace({'mid': 1, 'early': 0})

  df['home_season_period'] = df['home_season_period'].replace({'mid': 1, 'early': 0})
  df['away_season_period'] = df['away_season_period'].replace({'mid': 1, 'early': 0})


In [27]:
df['home_season_period'].unique(), df['away_season_period'].unique()

(array([0, 1]), array([0, 1]))

### Target

In [28]:
df['home_win'].unique()

array([ True, False])

In [32]:
df['home_win'] = df['home_win'].replace({'True': 1, 'False': 0})

In [33]:
df['home_win'].unique()

array([ True, False])

### Date

In [35]:
df['date'] = pd.to_datetime(df['date'])

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18530 entries, 0 to 18529
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   game_id             18530 non-null  int64         
 1   date                18530 non-null  datetime64[ns]
 2   home_team           18530 non-null  int64         
 3   away_team           18530 non-null  int64         
 4   home_win            18530 non-null  bool          
 5   home_game_number    18530 non-null  int64         
 6   away_game_number    18530 non-null  int64         
 7   home_season_period  18530 non-null  int64         
 8   away_season_period  18530 non-null  int64         
 9   season              18530 non-null  int64         
 10  home_avg            18530 non-null  float64       
 11  home_obp            18530 non-null  float64       
 12  home_slg            18530 non-null  float64       
 13  home_ops            18530 non-null  float64   

In [37]:
df['home_win'] = df['home_win'].astype(int)

In [38]:
df['home_win'].unique()

array([1, 0])

In [42]:
df.to_csv('clean/mlb_games_2017_2024.csv', index=False)