Ce script python permet de préparer le dataset au machine learning

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import defaultdict

# Charger les données
df = pd.read_csv("atp_matches_2023.csv")

# Sélection des colonnes pertinentes
cols = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
        'tourney_date', 'best_of', 'winner_id', 'winner_seed', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'loser_id', 'loser_seed', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age',
        'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
        'w_ace', 'l_ace', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_bpSaved', 'w_bpFaced',
        'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_bpSaved', 'l_bpFaced']
data = df[cols]

# Convertir tourney_date en datetime
data['tourney_date'] = pd.to_datetime(data['tourney_date'], format='%Y%m%d')

# Randomisation des joueurs
np.random.seed(42)
randomized = np.random.rand(len(data)) > 0.5

data['player1_id'] = np.where(randomized, data['winner_id'], data['loser_id'])
data['player2_id'] = np.where(randomized, data['loser_id'], data['winner_id'])
data['player1_ace'] = np.where(randomized, data['w_ace'], data['l_ace'])
data['player2_ace'] = np.where(randomized, data['l_ace'], data['w_ace'])
data['winner'] = np.where(data['player1_id'] == data['winner_id'], 1, 0)

def compute_player_stats(df, player_id_col, ace_col, num_matches=10):
    """
    Calcule la moyenne des aces et autres stats sur les X derniers matchs.
    """
    stats = defaultdict(list)
    avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon = [], [], [], []
    
    for index, row in df.iterrows():
        player_id = row[player_id_col]
        player_stats = stats[player_id][-num_matches:]
        
        avg_aces.append(np.mean([s['aces'] for s in player_stats]) if player_stats else 0)
        avg_1stIn.append(np.mean([s['1stIn'] for s in player_stats]) if player_stats else 0)
        avg_1stWon.append(np.mean([s['1stWon'] for s in player_stats]) if player_stats else 0)
        avg_2ndWon.append(np.mean([s['2ndWon'] for s in player_stats]) if player_stats else 0)
        
        stats[player_id].append({
            'aces': row[ace_col],
            '1stIn': row.get(f'{ace_col[0]}_1stIn', 0),
            '1stWon': row.get(f'{ace_col[0]}_1stWon', 0),
            '2ndWon': row.get(f'{ace_col[0]}_2ndWon', 0)
        })
    
    return avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon

# Appliquer aux joueurs 1 et 2
data['player1_avg_aces'], data['player1_avg_1stIn'], data['player1_avg_1stWon'], data['player1_avg_2ndWon'] = compute_player_stats(data, 'player1_id', 'player1_ace')
data['player2_avg_aces'], data['player2_avg_1stIn'], data['player2_avg_1stWon'], data['player2_avg_2ndWon'] = compute_player_stats(data, 'player2_id', 'player2_ace')

# Encodage des variables catégorielles
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(data[['surface', 'tourney_level', 'winner_hand', 'loser_hand', 'winner_ioc', 'loser_ioc']])
data = pd.concat([data, pd.DataFrame(encoded_features)], axis=1)
data.drop(columns=['surface', 'tourney_level', 'winner_hand', 'loser_hand', 'winner_ioc', 'loser_ioc'], inplace=True)

# Normalisation des variables continues
scaler = StandardScaler()
continuous_features = ['winner_ht', 'loser_ht', 'winner_age', 'loser_age', 'winner_rank', 'loser_rank', 'winner_rank_points', 'loser_rank_points']
data[continuous_features] = scaler.fit_transform(data[continuous_features])

# Sélection des colonnes finales pour le modèle
train_data = data[['player1_avg_aces', 'player2_avg_aces', 'player1_avg_1stIn', 'player2_avg_1stIn',
                   'player1_avg_1stWon', 'player2_avg_1stWon', 'player1_avg_2ndWon', 'player2_avg_2ndWon',
                   'winner_ht', 'loser_ht', 'winner_age', 'loser_age', 'winner_rank', 'loser_rank',
                   'winner_rank_points', 'loser_rank_points', 'winner']]

# Sauvegarde du dataset préparé
train_data.to_csv("train_data.csv", index=False)
print("Dataset d'entraînement prêt et sauvegardé sous train_data.csv !")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Dataset d'entraînement prêt et sauvegardé sous train_data.csv !


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Deuxième version

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import defaultdict

# Charger les données
df = pd.read_csv("atp_matches_2023.csv")

# Sélection des colonnes pertinentes
cols = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
        'tourney_date', 'best_of', 'winner_id', 'winner_seed', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'loser_id', 'loser_seed', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age',
        'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
        'w_ace', 'l_ace', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_bpSaved', 'w_bpFaced',
        'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_bpSaved', 'l_bpFaced']
data = df[cols]

# Convertir tourney_date en datetime
data['tourney_date'] = pd.to_datetime(data['tourney_date'], format='%Y%m%d')

# Créer un dictionnaire associant les IDs aux noms des joueurs
player_dict = {}

# Ajouter les joueurs gagnants et perdants avec leurs noms
for _, row in df.iterrows():
    player_dict[row['winner_id']] = row['winner_name']
    player_dict[row['loser_id']] = row['loser_name']

# Randomisation des joueurs
np.random.seed(42)
randomized = np.random.rand(len(data)) > 0.5

data['player1_id'] = np.where(randomized, data['winner_id'], data['loser_id'])
data['player2_id'] = np.where(randomized, data['loser_id'], data['winner_id'])
data['player1_ace'] = np.where(randomized, data['w_ace'], data['l_ace'])
data['player2_ace'] = np.where(randomized, data['l_ace'], data['w_ace'])
data['player1_ht'] = np.where(randomized, data['winner_ht'], data['loser_ht'])
data['player2_ht'] = np.where(randomized, data['loser_ht'], data['winner_ht'])
data['player1_age'] = np.where(randomized, data['winner_age'], data['loser_age'])
data['player2_age'] = np.where(randomized, data['loser_age'], data['winner_age'])
data['player1_rank'] = np.where(randomized, data['winner_rank'], data['loser_rank'])
data['player2_rank'] = np.where(randomized, data['loser_rank'], data['winner_rank'])
data['player1_rank_points'] = np.where(randomized, data['winner_rank_points'], data['loser_rank_points'])
data['player2_rank_points'] = np.where(randomized, data['loser_rank_points'], data['winner_rank_points'])
data['winner'] = np.where(data['player1_id'] == data['winner_id'], 1, 0)

# Ajouter les noms des joueurs à partir du dictionnaire
data['player1_name'] = data['player1_id'].map(player_dict)
data['player2_name'] = data['player2_id'].map(player_dict)

# Encodage des variables catégorielles (surface et best_of)
encoder = OneHotEncoder(sparse=False)
encoded_surface_best_of = encoder.fit_transform(data[['surface', 'best_of']])
encoded_df = pd.DataFrame(encoded_surface_best_of, columns=encoder.get_feature_names(['surface', 'best_of']))

# Ajouter les colonnes encodées à votre DataFrame
data = pd.concat([data, encoded_df], axis=1)

# Exemple d'affichage pour vérifier
print(data[['player1_id', 'player1_name', 'player2_id', 'player2_name']].head())

def compute_player_stats(df, player_id_col, ace_col, num_matches=10):
    """
    Calcule la moyenne des aces et autres stats sur les X derniers matchs pour chaque joueur.
    """
    # Trier le DataFrame par date pour chaque joueur
    df = df.sort_values(by='tourney_date', ascending=True)
    
    stats = defaultdict(list)
    
    avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon = [], [], [], []

    # Itérer sur les lignes du DataFrame
    for index, row in df.iterrows():
        player_id = row[player_id_col]
        # On récupère les matchs précédents de ce joueur
        player_stats = stats[player_id]
        
        # Garder seulement les X derniers matchs
        player_stats = player_stats[-num_matches:]
        
        # Calcul des moyennes des stats pour ce joueur
        avg_aces.append(np.mean([s['aces'] for s in player_stats]) if player_stats else 0)
        avg_1stIn.append(np.mean([s['1stIn'] for s in player_stats]) if player_stats else 0)
        avg_1stWon.append(np.mean([s['1stWon'] for s in player_stats]) if player_stats else 0)
        avg_2ndWon.append(np.mean([s['2ndWon'] for s in player_stats]) if player_stats else 0)

        # Ajout des stats du match actuel
        stats[player_id].append({
            'aces': row[ace_col],
            '1stIn': row.get(f'{ace_col[0]}_1stIn', 0),
            '1stWon': row.get(f'{ace_col[0]}_1stWon', 0),
            '2ndWon': row.get(f'{ace_col[0]}_2ndWon', 0)
        })
    
    return avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon


# Appliquer aux joueurs 1 et 2 avec les 10 derniers matchs
data['player1_avg_aces'], data['player1_avg_1stIn'], data['player1_avg_1stWon'], data['player1_avg_2ndWon'] = compute_player_stats(data, 'player1_id', 'player1_ace', num_matches=10)
data['player2_avg_aces'], data['player2_avg_1stIn'], data['player2_avg_1stWon'], data['player2_avg_2ndWon'] = compute_player_stats(data, 'player2_id', 'player2_ace', num_matches=10)

# Encodage des variables catégorielles
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(data[['surface', 'tourney_level']])
data = pd.concat([data, pd.DataFrame(encoded_features)], axis=1)
data.drop(columns=['surface', 'tourney_level'], inplace=True)

# Normalisation des variables continues
scaler = StandardScaler()
continuous_features = ['player1_ht', 'player2_ht', 'player1_age', 'player2_age', 'player1_rank', 'player2_rank', 'player1_rank_points', 'player2_rank_points']
data[continuous_features] = scaler.fit_transform(data[continuous_features])

# Sélection des colonnes finales pour le modèle
train_data = data[['player1_avg_aces', 'player2_avg_aces', 'player1_avg_1stIn', 'player2_avg_1stIn',
                   'player1_avg_1stWon', 'player2_avg_1stWon', 'player1_avg_2ndWon', 'player2_avg_2ndWon',
                   'player1_ht', 'player2_ht', 'player1_age', 'player2_age', 'player1_rank', 'player2_rank',
                   'player1_rank_points', 'player2_rank_points', 'winner','encoded_surface_best_of']]

# Sauvegarde du dataset préparé
train_data.to_csv("train_data.csv", index=False)
print("Dataset d'entraînement prêt et sauvegardé sous train_data.csv !")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

   player1_id       player1_name  player2_id        player2_name
0      126610  Matteo Berrettini      126203        Taylor Fritz
1      126207     Frances Tiafoe      207518     Lorenzo Musetti
2      126203       Taylor Fritz      128034      Hubert Hurkacz
3      126207     Frances Tiafoe      200390          Kacper Zuk
4      126610  Matteo Berrettini      126774  Stefanos Tsitsipas


KeyError: "['encoded_surface_best_of'] not in index"

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import defaultdict

# Charger les données
df = pd.read_csv("atp_matches_2023.csv")

# Sélection des colonnes pertinentes
cols = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
        'tourney_date', 'best_of', 'winner_id', 'winner_seed', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
        'loser_id', 'loser_seed', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age',
        'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
        'w_ace', 'l_ace', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_bpSaved', 'w_bpFaced',
        'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_bpSaved', 'l_bpFaced']
data = df[cols]

# Convertir tourney_date en datetime
data['tourney_date'] = pd.to_datetime(data['tourney_date'], format='%Y%m%d')

# Créer un dictionnaire associant les IDs aux noms des joueurs
player_dict = {}

# Ajouter les joueurs gagnants et perdants avec leurs noms
for _, row in df.iterrows():
    player_dict[row['winner_id']] = row['winner_name']
    player_dict[row['loser_id']] = row['loser_name']

# Randomisation des joueurs
np.random.seed(42)
randomized = np.random.rand(len(data)) > 0.5

data['player1_id'] = np.where(randomized, data['winner_id'], data['loser_id'])
data['player2_id'] = np.where(randomized, data['loser_id'], data['winner_id'])
data['player1_ace'] = np.where(randomized, data['w_ace'], data['l_ace'])
data['player2_ace'] = np.where(randomized, data['l_ace'], data['w_ace'])
data['player1_ht'] = np.where(randomized, data['winner_ht'], data['loser_ht'])
data['player2_ht'] = np.where(randomized, data['loser_ht'], data['winner_ht'])
data['player1_age'] = np.where(randomized, data['winner_age'], data['loser_age'])
data['player2_age'] = np.where(randomized, data['loser_age'], data['winner_age'])
data['player1_rank'] = np.where(randomized, data['winner_rank'], data['loser_rank'])
data['player2_rank'] = np.where(randomized, data['loser_rank'], data['winner_rank'])
data['player1_rank_points'] = np.where(randomized, data['winner_rank_points'], data['loser_rank_points'])
data['player2_rank_points'] = np.where(randomized, data['loser_rank_points'], data['winner_rank_points'])
data['winner'] = np.where(data['player1_id'] == data['winner_id'], 1, 0)

# Ajouter les noms des joueurs à partir du dictionnaire
data['player1_name'] = data['player1_id'].map(player_dict)
data['player2_name'] = data['player2_id'].map(player_dict)

# Exemple d'affichage pour vérifier
print(data[['player1_id', 'player1_name', 'player2_id', 'player2_name']].head())

def compute_player_stats(df, player_id_col, ace_col, num_matches=10):
    """
    Calcule la moyenne des aces et autres stats sur les X derniers matchs pour chaque joueur.
    """
    # Trier le DataFrame par date pour chaque joueur
    df = df.sort_values(by='tourney_date', ascending=True)
    
    stats = defaultdict(list)
    
    avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon = [], [], [], []

    # Itérer sur les lignes du DataFrame
    for index, row in df.iterrows():
        player_id = row[player_id_col]
        # On récupère les matchs précédents de ce joueur
        player_stats = stats[player_id]
        
        # Garder seulement les X derniers matchs
        player_stats = player_stats[-num_matches:]
        
        # Calcul des moyennes des stats pour ce joueur
        avg_aces.append(np.mean([s['aces'] for s in player_stats]) if player_stats else 0)
        avg_1stIn.append(np.mean([s['1stIn'] for s in player_stats]) if player_stats else 0)
        avg_1stWon.append(np.mean([s['1stWon'] for s in player_stats]) if player_stats else 0)
        avg_2ndWon.append(np.mean([s['2ndWon'] for s in player_stats]) if player_stats else 0)

        # Ajout des stats du match actuel
        stats[player_id].append({
            'aces': row[ace_col],
            '1stIn': row.get(f'{ace_col[0]}_1stIn', 0),
            '1stWon': row.get(f'{ace_col[0]}_1stWon', 0),
            '2ndWon': row.get(f'{ace_col[0]}_2ndWon', 0)
        })
    
    return avg_aces, avg_1stIn, avg_1stWon, avg_2ndWon


# Appliquer aux joueurs 1 et 2 avec les 10 derniers matchs
data['player1_avg_aces'], data['player1_avg_1stIn'], data['player1_avg_1stWon'], data['player1_avg_2ndWon'] = compute_player_stats(data, 'player1_id', 'player1_ace', num_matches=10)
data['player2_avg_aces'], data['player2_avg_1stIn'], data['player2_avg_1stWon'], data['player2_avg_2ndWon'] = compute_player_stats(data, 'player2_id', 'player2_ace', num_matches=10)

# Encodage des variables catégorielles
encoder = OneHotEncoder(sparse=False)
encoded_features = ['surface', 'best_of']

#encoded_features = encoder.fit_transform(data[['surface', 'best_of']])
#data = pd.concat([data, pd.DataFrame(encoded_features)], axis=1)
data[encoded_features] = encoder.fit_transform(data[encoded_features])
data = pd.concat([data, pd.DataFrame(encoded_features)], axis=1)


#data.drop(columns=['surface', 'tourney_level'], inplace=True)

# Normalisation des variables continues
scaler = StandardScaler()
continuous_features = ['player1_ht', 'player2_ht', 'player1_age', 'player2_age', 'player1_rank', 'player2_rank', 'player1_rank_points', 'player2_rank_points']
data[continuous_features] = scaler.fit_transform(data[continuous_features])

# Sélection des colonnes finales pour le modèle
train_data = data[['surface','best_of','player1_avg_aces', 'player2_avg_aces', 'player1_avg_1stIn', 'player2_avg_1stIn',
                   'player1_avg_1stWon', 'player2_avg_1stWon', 'player1_avg_2ndWon', 'player2_avg_2ndWon',
                   'player1_ht', 'player2_ht', 'player1_age', 'player2_age', 'player1_rank', 'player2_rank',
                   'player1_rank_points', 'player2_rank_points', 'winner']]

# Sauvegarde du dataset préparé
train_data.to_csv("train_data.csv", index=False)
print("Dataset d'entraînement prêt et sauvegardé sous train_data.csv !")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

   player1_id       player1_name  player2_id        player2_name
0      126610  Matteo Berrettini      126203        Taylor Fritz
1      126207     Frances Tiafoe      207518     Lorenzo Musetti
2      126203       Taylor Fritz      128034      Hubert Hurkacz
3      126207     Frances Tiafoe      200390          Kacper Zuk
4      126610  Matteo Berrettini      126774  Stefanos Tsitsipas


ValueError: Columns must be same length as key

In [31]:
print(data.columns)


Index([         'tourney_id',        'tourney_name',           'draw_size',
              'tourney_date',             'best_of',           'winner_id',
               'winner_seed',         'winner_hand',           'winner_ht',
                'winner_ioc',          'winner_age',            'loser_id',
                'loser_seed',          'loser_hand',            'loser_ht',
                 'loser_ioc',           'loser_age',         'winner_rank',
        'winner_rank_points',          'loser_rank',   'loser_rank_points',
                     'w_ace',               'l_ace',             'w_1stIn',
                  'w_1stWon',            'w_2ndWon',           'w_bpSaved',
                 'w_bpFaced',             'l_1stIn',            'l_1stWon',
                  'l_2ndWon',           'l_bpSaved',           'l_bpFaced',
                'player1_id',          'player2_id',         'player1_ace',
               'player2_ace',          'player1_ht',          'player2_ht',
            