# 1- Load and DATA preparation :

## Import des bibliothéques Pandas et numpy: 

In [None]:
import pandas as pd
import numpy as np

In [None]:
NBA_2022='https://www.basketball-reference.com/playoffs/NBA_2023.html'

## l'objectif du projet est de prédire le vainqueur des playoffs 2023 en utilisant une approche d'apprentissage supervisé basée sur les données historiques. Pour ce faire, nous prendrons en compte les performances des équipes lors des 7 matchs auxquels elles participent pour designer un vinceur.

* Nous examinerons les statistiques de chaque match disputé par les équipes 1 et 2 pendant les playoffs, en les associant à l'issue du match (victoire ou défaite) pour créer une base de données complète. Cette base de données comprendra différents facteurs et leur impact sur le score final.

* Il convient également de noter l'importance des joueurs vedettes, car une équipe sans sa star aura généralement un rendement inférieur. Cependant, compte tenu des contraintes de temps, nous ne pourrons pas tenir compte de tous les facteurs pertinents.

* Ainsi, l'accent sera principalement mis sur les statistiques du jeu collectif de chaque équipe, afin de pouvoir réaliser une analyse approfondie et fournir des prédictions sur les performances des équipes lors des playoffs 2023.

In [None]:
# Cette fonction prend en paramétre un lien vers le site en question et revoie une tables de tout les matchs des palyoff de chaque années précise puis contient aussi un liens vers les stats avancée de chaque match 

################# Voir les explication en bas du resultat de la fonction #################


def load_data_game(path,annee):
  df = pd.read_html(path)

  # selectionner la tables des matchs palyoff pour une season definie 
  df = df[0]

  indices_nan = np.where(df.isna())
  indices = list(set(indices_nan[0]))
  indices_f = [indices[i] for i in range(len(indices)-1) if indices[i+1] - indices[i] > 4]
  indices_f.append(indices[-1])
  indices_f[0]=0
  indices_f.append(len(df))
  valeurs = ['Finals', 'Eastern conference finals', 'Western conference finals', 'Eastern conference semifinals','Eastern conference semifinals',
           'Western conference semifinals','Western conference semifinals', 'Eastern Conference First Round', 'Eastern Conference First Round',
           'Eastern Conference First Round', 'Eastern Conference First Round', 'Western Conference First Round',
           'Western Conference First Round', 'Western Conference First Round', 'Western Conference First Round']


  liste_valeurs = np.repeat(valeurs, np.diff(indices_f)).tolist()
  df['level'] = liste_valeurs
  df = df.dropna()
  df[1] = pd.to_datetime(df[1], format='%a, %B %d')
  df['NewDate'] = df[1].dt.strftime('%m%d')
  df[4] = df[4].str.replace('@','')
  df=df.drop(columns=[1])

  # Recuperation de la bases des données des achronymes des equipes

  d = pd.read_html('https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations',header=0)
  d = d[0]
  d.at[23,'Abbreviation/ Acronym'] = 'PHO'
  d.at[2,'Abbreviation/ Acronym'] = 'NJN'

  # merger les données 

  merged_df = pd.merge(df, d, left_on= 2, right_on='Franchise', how= 'left')
  merged_df[4]= merged_df[4].str.strip()
  merged_df = pd.merge(merged_df, d , left_on= 4, right_on='Franchise', how= 'left')

  # labelisation des données 
  merged_df['Winner'] = merged_df.apply(lambda row: 1 if float(row[3]) > float(row[5]) else 0, axis=1)
  # Créer le lien qui redirige directement vers une autre page web qui contient les stats du match en question 
  merged_df['lien_game1'] = merged_df.apply(lambda row: 'https://www.basketball-reference.com/boxscores/20'+str(annee)+ str(row['NewDate']) + '0' +str(row['Abbreviation/ Acronym_x'])+ '.html', axis=1)
  merged_df['lien_game2'] = merged_df.apply(lambda row: 'https://www.basketball-reference.com/boxscores/20'+str(annee) + str(row['NewDate']) + '0' +str(row['Abbreviation/ Acronym_y'])+ '.html', axis=1)
  merged_df = merged_df.drop(['NewDate','Franchise_x','Franchise_y'],axis=1)
  merged_df.columns =['NumberGames','Team1','PointTeam1','Team2','PointTeam2','Level','AbbreviationTeam1','AbbreviationTeam2','Winner','lien1','lien2']

  return merged_df, list(merged_df['lien1']),list(merged_df['lien2'])








* Voici une boucle qui utilise la fonction précédente pour récupérer tous les matchs des playoffs 18-22


In [None]:
# charger les données de chaque match de la saison 2018 jusqua 2022 et les essayer sur les préstation de l'equipe en 2023
df_Data= pd.DataFrame()
m=[]
n=[]


for i in range (18,22):
  k = 'https://www.basketball-reference.com/playoffs/NBA_20'+str(i)+'.html'
  m+=load_data_game(k,i)[1]
  n+=load_data_game(k,i)[2]
  Data = load_data_game(k,i)[0]
  df_Data = pd.concat([df_Data, Data], ignore_index=True)
df_Data

Unnamed: 0,NumberGames,Team1,PointTeam1,Team2,PointTeam2,Level,AbbreviationTeam1,AbbreviationTeam2,Winner,lien1,lien2
0,Game 1,Cleveland Cavaliers,114.0,Golden State Warriors,124.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
1,Game 2,Cleveland Cavaliers,103.0,Golden State Warriors,122.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
2,Game 3,Golden State Warriors,110.0,Cleveland Cavaliers,102.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
3,Game 4,Golden State Warriors,108.0,Cleveland Cavaliers,85.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
4,Game 1,Cleveland Cavaliers,83.0,Boston Celtics,108.0,Eastern conference finals,CLE,BOS,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
...,...,...,...,...,...,...,...,...,...,...,...
327,Game 1,Memphis Grizzlies,112.0,Utah Jazz,109.0,Western Conference First Round,MEM,UTA,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
328,Game 2,Memphis Grizzlies,129.0,Utah Jazz,141.0,Western Conference First Round,MEM,UTA,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
329,Game 3,Utah Jazz,121.0,Memphis Grizzlies,111.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
330,Game 4,Utah Jazz,120.0,Memphis Grizzlies,113.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...



* Chaque match des playoffs de 2018 à 2022 est répertorié, avec l'équipe 1 à droite et l'équipe 2 à gauche. Une colonne "winner" est présente pour indiquer le gagnant, et les liens 1 et 2 contiennent les statistiques individuelles de chaque équipe pendant ce match.
* Ici on peut voir qu'il ya u 332 match en playoff durant la periode 2018-2022 
* La prochaine étape consistera à ajouter ces facteurs à notre tableau de données et à les intégrer dans un modèle adapté à l'apprentissage supervisé. Une fois cela fait, nous serons en mesure de former le modèle directement.
* Cela nous permettra de réaliser des prédictions sur les résultats des playoffs en utilisant ces facteurs et d'entraîner le modèle pour qu'il puisse généraliser à n'importe quel autre ensemble de données.


In [None]:
# cette fonction permet de recuperer tout les advanced stats des equipe pour chaque game et les mettre dans un data frame 
import time
import urllib.error
def get_stats_games(data1, data2):
    df_team2 = pd.DataFrame()
    df_team1 = pd.DataFrame()
    f = 0
    r =7
    for i, j in zip(data1, data2):
        time.sleep(4)
        try:
              print(i)
              df = pd.read_html(i)            
              m= df[((len(df) - 16) // 2 + 7)].iloc[[-1],:]
              indexes =pd.Index([column[1] for column in m.keys()])
              m.columns = indexes
              n= df[len(df)-1].iloc[[-1],:]
              indexes =pd.Index([column[1] for column in n.keys()])
              n.columns = indexes
              df_team1 = pd.concat([df_team1, m], ignore_index=True)
              df_team1.loc[f,'id']=i
              df_team2 = pd.concat([df_team2, n], ignore_index=True)
              df_team2.loc[f, 'id']=i
              print(df_team2)

          
        except (urllib.error.HTTPError, ValueError) as e:
            print("Erreur :", e)
            try:
              print(j)
              df = pd.read_html(j)
              m= df[((len(df) - 16) // 2 + 7)].iloc[[-1],:]
              indexes =pd.Index([column[1] for column in m.keys()])
              m.columns = indexes
              n= df[len(df)-1].iloc[[-1],:]
              indexes =pd.Index([column[1] for column in n.keys()])
              n.columns = indexes
              df_team1 = pd.concat([df_team1, m], ignore_index=True)
              df_team1.loc[f,'id']=j
              df_team2 = pd.concat([df_team2, n], ignore_index=True)
              df_team2.loc[f, 'id']=j
              print(df_team2)
            except (urllib.error.HTTPError, ValueError) as e:
                 print("Erreur :", e)
              



        f+=1
        print(f)
    df = df_team1.merge(df_team2,on='id')
    return df

In [None]:
df = get_stats_games(n,m)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
        Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST%  STL%  \
0    Team Totals  265  .628  .583  .400  .222  10.5  64.2  41.8  67.4  10.1   
1    Team Totals  240  .669  .665  .439  .256  21.2  68.0  49.4  59.6   3.2   
2    Team Totals  240  .513  .484  .337  .185  32.6  84.2  56.0  50.0   6.5   
3    Team Totals  240  .434  .391  .310  .287  33.3  73.0  50.0  70.0   5.7   
4    Team Totals  240  .602  .577  .357  .155  16.2  82.4  54.5  62.8   6.5   
..           ...  ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
201  Team Totals  240  .643  .639  .325  .217  25.0  80.0  53.9  63.8  11.5   
202  Team Totals  240  .585  .543  .370  .173  15.0  72.7  41.1  60.5   9.0   
203  Team Totals  240  .609  .538  .508  .600   2.4  67.6  33.3  85.7  10.5   
204  Team Totals  240  .686  .655  .500  .392  21.4  83.0  61.7  64.1   5.1   
205  Team Totals  240  .537  .478  .344

In [None]:
df

Unnamed: 0,Starters_x,MP_x,TS%_x,eFG%_x,3PAr_x,FTr_x,ORB%_x,DRB%_x,TRB%_x,AST%_x,...,DRB%_y,TRB%_y,AST%_y,STL%_y,BLK%_y,TOV%_y,USG%_y,ORtg_y,DRtg_y,BPM_y
0,Team Totals,265,.524,.495,.374,.222,35.8,89.5,58.2,40.9,...,64.2,41.8,67.4,10.1,9.7,6.6,100.0,125.4,115.3,
1,Team Totals,240,.508,.461,.300,.289,32.0,78.8,50.6,67.6,...,68.0,49.4,59.6,3.2,12.7,11.6,100.0,130.7,110.3,
2,Team Totals,240,.615,.574,.321,.235,15.8,67.4,44.0,64.3,...,84.2,56.0,50.0,6.5,7.3,11.6,100.0,109.9,118.5,
3,Team Totals,240,.580,.535,.442,.186,27.0,66.7,50.0,64.1,...,73.0,50.0,70.0,5.7,10.4,10.1,100.0,97.4,123.8,
4,Team Totals,240,.438,.384,.302,.233,17.6,83.8,45.5,58.1,...,82.4,54.5,62.8,6.5,6.7,9.1,100.0,117.6,90.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Team Totals,240,.513,.485,.200,.210,29.6,68.4,45.7,40.0,...,70.4,54.3,58.8,3.1,13.8,12.8,100.0,113.4,116.5,
438,Team Totals,240,.634,.588,.271,.447,23.1,66.7,44.0,43.5,...,76.9,56.0,57.1,2.0,6.5,10.4,100.0,140.1,128.1,
439,Team Totals,240,.651,.611,.531,.333,17.1,69.2,46.2,42.5,...,82.9,53.8,53.5,7.2,2.6,7.8,100.0,114.9,125.3,
440,Team Totals,240,.617,.559,.400,.329,16.7,83.3,50.0,59.0,...,83.3,50.0,60.5,8.0,7.8,8.2,100.0,113.4,120.4,


Ci dessus la base de données des facteurs de chaque equipes pendant chaque match precis 
* prochaine etape et de merger ces information avec notre premiére base de données 
* Faire du modeling car la plupart des modéle de machine learning n'accpte que des valeurs numirique  

In [None]:
# Ceci est notre Table principale sur la quelle on va se baser :
df_Data

Unnamed: 0,NumberGames,Team1,PointTeam1,Team2,PointTeam2,Level,AbbreviationTeam1,AbbreviationTeam2,Winner,lien1,lien2
0,Game 1,Cleveland Cavaliers,114.0,Golden State Warriors,124.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
1,Game 2,Cleveland Cavaliers,103.0,Golden State Warriors,122.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
2,Game 3,Golden State Warriors,110.0,Cleveland Cavaliers,102.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
3,Game 4,Golden State Warriors,108.0,Cleveland Cavaliers,85.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
4,Game 1,Cleveland Cavaliers,83.0,Boston Celtics,108.0,Eastern conference finals,CLE,BOS,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
...,...,...,...,...,...,...,...,...,...,...,...
327,Game 1,Memphis Grizzlies,112.0,Utah Jazz,109.0,Western Conference First Round,MEM,UTA,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
328,Game 2,Memphis Grizzlies,129.0,Utah Jazz,141.0,Western Conference First Round,MEM,UTA,0,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
329,Game 3,Utah Jazz,121.0,Memphis Grizzlies,111.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...
330,Game 4,Utah Jazz,120.0,Memphis Grizzlies,113.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,https://www.basketball-reference.com/boxscores...


In [None]:
# ceci et notre bases de données des stats de chaque match ci dessus
df

Unnamed: 0,Starters_x,MP_x,TS%_x,eFG%_x,3PAr_x,FTr_x,ORB%_x,DRB%_x,TRB%_x,AST%_x,...,DRB%_y,TRB%_y,AST%_y,STL%_y,BLK%_y,TOV%_y,USG%_y,ORtg_y,DRtg_y,BPM_y
0,Team Totals,265,.524,.495,.374,.222,35.8,89.5,58.2,40.9,...,64.2,41.8,67.4,10.1,9.7,6.6,100.0,125.4,115.3,
1,Team Totals,240,.508,.461,.300,.289,32.0,78.8,50.6,67.6,...,68.0,49.4,59.6,3.2,12.7,11.6,100.0,130.7,110.3,
2,Team Totals,240,.615,.574,.321,.235,15.8,67.4,44.0,64.3,...,84.2,56.0,50.0,6.5,7.3,11.6,100.0,109.9,118.5,
3,Team Totals,240,.580,.535,.442,.186,27.0,66.7,50.0,64.1,...,73.0,50.0,70.0,5.7,10.4,10.1,100.0,97.4,123.8,
4,Team Totals,240,.438,.384,.302,.233,17.6,83.8,45.5,58.1,...,82.4,54.5,62.8,6.5,6.7,9.1,100.0,117.6,90.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,Team Totals,240,.513,.485,.200,.210,29.6,68.4,45.7,40.0,...,70.4,54.3,58.8,3.1,13.8,12.8,100.0,113.4,116.5,
438,Team Totals,240,.634,.588,.271,.447,23.1,66.7,44.0,43.5,...,76.9,56.0,57.1,2.0,6.5,10.4,100.0,140.1,128.1,
439,Team Totals,240,.651,.611,.531,.333,17.1,69.2,46.2,42.5,...,82.9,53.8,53.5,7.2,2.6,7.8,100.0,114.9,125.3,
440,Team Totals,240,.617,.559,.400,.329,16.7,83.3,50.0,59.0,...,83.3,50.0,60.5,8.0,7.8,8.2,100.0,113.4,120.4,


*  Pour fusionner les données, nous utilisons la colonne "lien1" et "lien2", qui servent d'identifiants dans la première base de données, et la colonne "id" qui contient les liens correspondants

In [None]:
df_final = df_Data.merge(df,left_on='lien2',right_on='id', how= 'left')
df_final

Unnamed: 0,NumberGames,Team1,PointTeam1,Team2,PointTeam2,Level,AbbreviationTeam1,AbbreviationTeam2,Winner,lien1,...,DRB%_y,TRB%_y,AST%_y,STL%_y,BLK%_y,TOV%_y,USG%_y,ORtg_y,DRtg_y,BPM_y
0,Game 1,Cleveland Cavaliers,114.0,Golden State Warriors,124.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,...,64.2,41.8,67.4,10.1,9.7,6.6,100.0,125.4,115.3,
1,Game 2,Cleveland Cavaliers,103.0,Golden State Warriors,122.0,Finals,CLE,GSW,0,https://www.basketball-reference.com/boxscores...,...,68.0,49.4,59.6,3.2,12.7,11.6,100.0,130.7,110.3,
2,Game 3,Golden State Warriors,110.0,Cleveland Cavaliers,102.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,...,84.2,56.0,50.0,6.5,7.3,11.6,100.0,109.9,118.5,
3,Game 4,Golden State Warriors,108.0,Cleveland Cavaliers,85.0,Finals,GSW,CLE,1,https://www.basketball-reference.com/boxscores...,...,73.0,50.0,70.0,5.7,10.4,10.1,100.0,97.4,123.8,
4,Game 1,Cleveland Cavaliers,83.0,Boston Celtics,108.0,Eastern conference finals,CLE,BOS,0,https://www.basketball-reference.com/boxscores...,...,82.4,54.5,62.8,6.5,6.7,9.1,100.0,117.6,90.4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,Game 1,Memphis Grizzlies,112.0,Utah Jazz,109.0,Western Conference First Round,MEM,UTA,1,https://www.basketball-reference.com/boxscores...,...,70.4,54.3,58.8,3.1,13.8,12.8,100.0,113.4,116.5,
328,Game 2,Memphis Grizzlies,129.0,Utah Jazz,141.0,Western Conference First Round,MEM,UTA,0,https://www.basketball-reference.com/boxscores...,...,76.9,56.0,57.1,2.0,6.5,10.4,100.0,140.1,128.1,
329,Game 3,Utah Jazz,121.0,Memphis Grizzlies,111.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,...,82.9,53.8,53.5,7.2,2.6,7.8,100.0,114.9,125.3,
330,Game 4,Utah Jazz,120.0,Memphis Grizzlies,113.0,Western Conference First Round,UTA,MEM,1,https://www.basketball-reference.com/boxscores...,...,83.3,50.0,60.5,8.0,7.8,8.2,100.0,113.4,120.4,


In [None]:
df_final = df_final[['NumberGames', 'Team1', 'Team2', 'Level', 'Winner', 'MP_x', 'TS%_x', 'eFG%_x', '3PAr_x', 'FTr_x', 'ORB%_x','DRB%_x', 'TRB%_x', 'AST%_x', 'STL%_x', 'BLK%_x', 'TOV%_x', 'USG%_x',
       'ORtg_x', 'DRtg_x', 'MP_y', 'TS%_y','eFG%_y', '3PAr_y', 'FTr_y', 'ORB%_y', 'DRB%_y', 'TRB%_y', 'AST%_y','STL%_y', 'BLK%_y', 'TOV%_y', 'USG%_y', 'ORtg_y', 'DRtg_y']]

# Pandas a une fonction itégrer pour transformer les données repetitif en données numirique 
df_final = pd.get_dummies(df_final, columns=['NumberGames'])
df_final = pd.get_dummies(df_final, columns=['Level'])
df_final

Unnamed: 0,Team1,Team2,Winner,MP_x,TS%_x,eFG%_x,3PAr_x,FTr_x,ORB%_x,DRB%_x,...,NumberGames_Game 5,NumberGames_Game 6,NumberGames_Game 7,Level_Eastern Conference First Round,Level_Eastern conference finals,Level_Eastern conference semifinals,Level_Finals,Level_Western Conference First Round,Level_Western conference finals,Level_Western conference semifinals
0,Cleveland Cavaliers,Golden State Warriors,0,265,.524,.495,.374,.222,35.8,89.5,...,0,0,0,0,0,0,1,0,0,0
1,Cleveland Cavaliers,Golden State Warriors,0,240,.508,.461,.300,.289,32.0,78.8,...,0,0,0,0,0,0,1,0,0,0
2,Golden State Warriors,Cleveland Cavaliers,1,240,.615,.574,.321,.235,15.8,67.4,...,0,0,0,0,0,0,1,0,0,0
3,Golden State Warriors,Cleveland Cavaliers,1,240,.580,.535,.442,.186,27.0,66.7,...,0,0,0,0,0,0,1,0,0,0
4,Cleveland Cavaliers,Boston Celtics,0,240,.438,.384,.302,.233,17.6,83.8,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,Memphis Grizzlies,Utah Jazz,1,240,.513,.485,.200,.210,29.6,68.4,...,0,0,0,0,0,0,0,1,0,0
328,Memphis Grizzlies,Utah Jazz,0,240,.634,.588,.271,.447,23.1,66.7,...,0,0,0,0,0,0,0,1,0,0
329,Utah Jazz,Memphis Grizzlies,1,240,.651,.611,.531,.333,17.1,69.2,...,0,0,0,0,0,0,0,1,0,0
330,Utah Jazz,Memphis Grizzlies,1,240,.617,.559,.400,.329,16.7,83.3,...,0,0,0,0,0,0,0,1,0,0


In [None]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 0 to 331
Data columns (total 47 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Team1                                 332 non-null    object
 1   Team2                                 332 non-null    object
 2   Winner                                332 non-null    int64 
 3   MP_x                                  310 non-null    object
 4   TS%_x                                 310 non-null    object
 5   eFG%_x                                310 non-null    object
 6   3PAr_x                                310 non-null    object
 7   FTr_x                                 310 non-null    object
 8   ORB%_x                                310 non-null    object
 9   DRB%_x                                310 non-null    object
 10  TRB%_x                                310 non-null    object
 11  AST%_x                          

* Dans notre base de données, nous constatons que la plupart des colonnes sont de type "objet". Afin de pouvoir effectuer des calculs et des analyses appropriées, nous devons convertir ces colonnes en type "float". De plus, nous avons également remarqué la présence de certaines valeurs nulles dans nos données, ce qui peut affecter la qualité de nos analyses. Par conséquent, il est nécessaire de supprimer ces valeurs nulles pour garantir l'intégrité de notre base de données.

In [None]:
# conversion au float :
l=['MP_x','TS%_x','eFG%_x','3PAr_x','FTr_x','ORB%_x','DRB%_x','TRB%_x','AST%_x','STL%_x','BLK%_x','TOV%_x','USG%_x','ORtg_x','DRtg_x','MP_y','TS%_y','eFG%_y','3PAr_y','FTr_y','ORB%_y','DRB%_y','TRB%_y','AST%_y','STL%_y','BLK%_y','TOV%_y','USG%_y','ORtg_y','DRtg_y']                               
for i in l:
  df_final[i] = df_final[i].astype(float)


df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 0 to 331
Data columns (total 47 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Team1                                 332 non-null    object 
 1   Team2                                 332 non-null    object 
 2   Winner                                332 non-null    int64  
 3   MP_x                                  310 non-null    float64
 4   TS%_x                                 310 non-null    float64
 5   eFG%_x                                310 non-null    float64
 6   3PAr_x                                310 non-null    float64
 7   FTr_x                                 310 non-null    float64
 8   ORB%_x                                310 non-null    float64
 9   DRB%_x                                310 non-null    float64
 10  TRB%_x                                310 non-null    float64
 11  AST%_x             

* Maintenant que les données sont transformer on doit convertir team1 et team2 en entier 
* puis suprimer les valeurs vides

In [None]:

d = pd.read_html('https://en.wikipedia.org/wiki/Wikipedia:WikiProject_National_Basketball_Association/National_Basketball_Association_team_abbreviations',header=0)
d = d[0]
d.at[23,'Abbreviation/ Acronym'] = 'PHO'
d.at[2,'Abbreviation/ Acronym'] = 'NJN'
d
dic={}
for i in range(30):
  dic.setdefault(d.loc[i,'Franchise'], i)

dic
  

{'Atlanta Hawks': 0,
 'Boston Celtics': 1,
 'Brooklyn Nets': 2,
 'Charlotte Hornets': 3,
 'Chicago Bulls': 4,
 'Cleveland Cavaliers': 5,
 'Dallas Mavericks': 6,
 'Denver Nuggets': 7,
 'Detroit Pistons': 8,
 'Golden State Warriors': 9,
 'Houston Rockets': 10,
 'Indiana Pacers': 11,
 'Los Angeles Clippers': 12,
 'Los Angeles Lakers': 13,
 'Memphis Grizzlies': 14,
 'Miami Heat': 15,
 'Milwaukee Bucks': 16,
 'Minnesota Timberwolves': 17,
 'New Orleans Pelicans': 18,
 'New York Knicks': 19,
 'Oklahoma City Thunder': 20,
 'Orlando Magic': 21,
 'Philadelphia 76ers': 22,
 'Phoenix Suns': 23,
 'Portland Trail Blazers': 24,
 'Sacramento Kings': 25,
 'San Antonio Spurs': 26,
 'Toronto Raptors': 27,
 'Utah Jazz': 28,
 'Washington Wizards': 29}

* Ceci est notre dictionnaire des franchises 

In [None]:
# remplacer par des entier 
df_final['Team1'] = df_final['Team1'].replace(dic)
df_final['Team2'] = df_final['Team2'].replace(dic)

df_not_null = df_final.dropna(how='any')



In [None]:
df_not_null.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 310 entries, 0 to 331
Data columns (total 47 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Team1                                 310 non-null    int64  
 1   Team2                                 310 non-null    int64  
 2   Winner                                310 non-null    int64  
 3   MP_x                                  310 non-null    float64
 4   TS%_x                                 310 non-null    float64
 5   eFG%_x                                310 non-null    float64
 6   3PAr_x                                310 non-null    float64
 7   FTr_x                                 310 non-null    float64
 8   ORB%_x                                310 non-null    float64
 9   DRB%_x                                310 non-null    float64
 10  TRB%_x                                310 non-null    float64
 11  AST%_x             

* Maintenant que notre base de données est prête, nous pouvons l'utiliser pour entraîner un modèle de machine learning. Les données sont nettoyées, les valeurs manquantes ont été traitées et les colonnes ont été converties au bon type de données.

* Nous pouvons maintenant sélectionner les caractéristiques pertinentes pour notre modèle, diviser les données en ensembles d'entraînement et de test, puis appliquer l'algorithme de machine learning de notre choix pour entraîner le modèle.

In [None]:
x = df_not_null.drop(columns='Winner')
y = df_not_null['Winner']

print(x.shape)
print(y.shape)

(310, 46)
(310,)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

accuracy = []

for fold in range(0, 10):
    # Instantiate algorithm
    model = SVC()
    scaler = StandardScaler()

    # Create training and test samples
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.65, random_state=42)

    # Scale X data
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Train model
    trained_model = model.fit(X_train, y_train)

    # Generate predictions on test sample
    y_pred = trained_model.predict(X_test)

    # Compute accuracy
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    accuracy.append(acc)
    print(f"Fold {fold + 1}: Accuracy = {acc:.3f}")

    # Get the score of the trained model
    score = trained_model.score(X_test, y_test)
    print(f"Fold {fold + 1}: Model Score = {score:.3f}")

print(f"Average Accuracy: {(sum(accuracy) / len(accuracy)):.2f}")

Fold 1: Accuracy = 0.908
Fold 1: Model Score = 0.908
Fold 2: Accuracy = 0.908
Fold 2: Model Score = 0.908
Fold 3: Accuracy = 0.908
Fold 3: Model Score = 0.908
Fold 4: Accuracy = 0.908
Fold 4: Model Score = 0.908
Fold 5: Accuracy = 0.908
Fold 5: Model Score = 0.908
Fold 6: Accuracy = 0.908
Fold 6: Model Score = 0.908
Fold 7: Accuracy = 0.908
Fold 7: Model Score = 0.908
Fold 8: Accuracy = 0.908
Fold 8: Model Score = 0.908
Fold 9: Accuracy = 0.908
Fold 9: Model Score = 0.908
Fold 10: Accuracy = 0.908
Fold 10: Model Score = 0.908
Average Accuracy: 0.91


* Finalement, j'ai décidé d'utiliser les SVM (Support Vector Machines) pour la classification binaire. Les SVM sont des modèles puissants qui cherchent à trouver un hyperplan optimal pour séparer les deux classes dans un espace de plus grande dimension. J'ai choisi les SVM en raison de leur capacité éprouvée en matière de classification, bien que les réseaux neuronaux soient également une option viable et ont fitleur preuve dans la classification.

* En utilisant les SVM sur ce jeu de données, j'ai obtenu des résultats satisfaisants. Le modèle s'est révélé robuste, avec une précision de prédiction de 90% lorsque les données étaient réparties à 65% pour l'ensemble d'entraînement et à 35% pour l'ensemble de test. Cela démontre l'efficacité du modèle pour cette tâche de classification.

# Application sur les données de 2023: 
* Pour déterminer le vainqueur des finales, nous utilisons les statistiques moyennes de chaque équipe finaliste et tentons de prédire les résultats de chaque match. Le processus consiste à comparer les performances des deux équipes lors de chaque affrontement et à accumuler les victoires. L'équipe qui remporte quatre victoires est couronnée championne de la NBA 2023. Cette approche permet de prendre en compte les performances globales des équipes tout au long des matchs et de déterminer le vainqueur en fonction de leur performance collective.

In [None]:
# prestations de l'année 2023 de 	Boston Celtics 

Boston = pd.read_html('https://www.basketball-reference.com/boxscores/202305170BOS.html')[15].iloc[[-1],:]

indexes =pd.Index([column[1] for column in Boston.keys()])
Boston.columns = indexes
print(Boston)


# prestations de l'année 2023 de Miami Heat
Miami = pd.read_html('https://www.basketball-reference.com/boxscores/202305170BOS.html')[7].iloc[[-1],:]

indexes =pd.Index([column[1] for column in Miami.keys()])
Miami.columns = indexes
print(Miami)


# prestations de l'année 2023 de	Denver Nuggets 
Denver = pd.read_html('https://www.basketball-reference.com/boxscores/202305160DEN.html')[15].iloc[[-1],:]

indexes =pd.Index([column[1] for column in Denver.keys()])
Denver.columns = indexes
print(Denver)

# prestations de l'année 2023 de Los Angeles Lakers 


Lakers = pd.read_html('https://www.basketball-reference.com/boxscores/202305160DEN.html')[7].iloc[[-1],:]

indexes =pd.Index([column[1] for column in Lakers.keys()])
Lakers.columns = indexes
print(Lakers)





       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL%  \
15  Team Totals  240  .619  .580  .358  .358  24.3  78.1  49.3  52.4  6.2   

   BLK%  TOV%   USG%   ORtg   DRtg  BPM  
15  5.6  13.8  100.0  120.3  127.5  NaN  
       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST%  STL%  \
14  Team Totals  240  .659  .635  .365  .224  21.9  75.7  50.7  43.5  12.4   

   BLK%  TOV%   USG%   ORtg   DRtg  BPM  
14  5.8  11.4  100.0  127.5  120.3  NaN  
       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL%  \
16  Team Totals  240  .656  .632  .352  .242  37.5  86.5  61.0  58.0  5.3   

    BLK% TOV%   USG%   ORtg   DRtg  BPM  
16  11.7  9.8  100.0  138.8  132.5  NaN  
       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL%  \
15  Team Totals  240  .660  .613  .286  .310  13.5  62.5  39.0  65.2  6.3   

   BLK% TOV%   USG%   ORtg   DRtg  BPM  
15  6.8  6.8  100.0  132.5  138.8  NaN  


* Ci dessus les states moyennes des derniéres rencontres

In [None]:
df_2023 = df_not_null.copy()
df_2023.keys()

Index(['Team1', 'Team2', 'Winner', 'MP_x', 'TS%_x', 'eFG%_x', '3PAr_x',
       'FTr_x', 'ORB%_x', 'DRB%_x', 'TRB%_x', 'AST%_x', 'STL%_x', 'BLK%_x',
       'TOV%_x', 'USG%_x', 'ORtg_x', 'DRtg_x', 'MP_y', 'TS%_y', 'eFG%_y',
       '3PAr_y', 'FTr_y', 'ORB%_y', 'DRB%_y', 'TRB%_y', 'AST%_y', 'STL%_y',
       'BLK%_y', 'TOV%_y', 'USG%_y', 'ORtg_y', 'DRtg_y', 'NumberGames_Game 1',
       'NumberGames_Game 2', 'NumberGames_Game 3', 'NumberGames_Game 4',
       'NumberGames_Game 5', 'NumberGames_Game 6', 'NumberGames_Game 7',
       'Level_Eastern Conference First Round',
       'Level_Eastern conference finals',
       'Level_Eastern conference semifinals', 'Level_Finals',
       'Level_Western Conference First Round',
       'Level_Western conference finals',
       'Level_Western conference semifinals'],
      dtype='object')

In [None]:
def Match(team1, team2, n_ieme_match, level, df1, df2):
    M = {'Team1': 0, 'Team2': 0, 'NumberGames_Game 1': 0,
         'NumberGames_Game 2': 0, 'NumberGames_Game 3': 0, 'NumberGames_Game 4': 0,
         'NumberGames_Game 5': 0, 'NumberGames_Game 6': 0, 'NumberGames_Game 7': 0,
         'Level_Eastern Conference First Round': 0,
         'Level_Eastern conference finals': 0,
         'Level_Eastern conference semifinals': 0, 'Level_Finals': 0,
         'Level_Western Conference First Round': 0,
         'Level_Western conference finals': 0,
         'Level_Western conference semifinals': 0}

    M['Team1'] = team1
    M['Team2'] = team2
    M[n_ieme_match] = 1
    M[level] = 1

    df = pd.DataFrame.from_records([M], columns=['Team1', 'Team2', 'NumberGames_Game 1',
                                                 'NumberGames_Game 2', 'NumberGames_Game 3', 'NumberGames_Game 4',
                                                 'NumberGames_Game 5', 'NumberGames_Game 6', 'NumberGames_Game 7',
                                                 'Level_Eastern Conference First Round',
                                                 'Level_Eastern conference finals',
                                                 'Level_Eastern conference semifinals', 'Level_Finals',
                                                 'Level_Western Conference First Round',
                                                 'Level_Western conference finals',
                                                 'Level_Western conference semifinals'])
    print(df)
    print(df1)
    print(df2)
    df_concatenated = pd.merge(df1, df2, on = 'Starters', how = 'outer',suffixes=('_x','_y'))
    print(df_concatenated)
    df_concatenated = pd.merge(df_concatenated, df, left_index=True, right_index=True)
    df_concatenated = df_concatenated[['Team1', 'Team2', 'MP_x', 'TS%_x', 'eFG%_x', '3PAr_x',
       'FTr_x', 'ORB%_x', 'DRB%_x', 'TRB%_x', 'AST%_x', 'STL%_x', 'BLK%_x',
       'TOV%_x', 'USG%_x', 'ORtg_x', 'DRtg_x', 'MP_y', 'TS%_y', 'eFG%_y',
       '3PAr_y', 'FTr_y', 'ORB%_y', 'DRB%_y', 'TRB%_y', 'AST%_y', 'STL%_y',
       'BLK%_y', 'TOV%_y', 'USG%_y', 'ORtg_y', 'DRtg_y', 'NumberGames_Game 1',
       'NumberGames_Game 2', 'NumberGames_Game 3', 'NumberGames_Game 4',
       'NumberGames_Game 5', 'NumberGames_Game 6', 'NumberGames_Game 7',
       'Level_Eastern Conference First Round',
       'Level_Eastern conference finals',
       'Level_Eastern conference semifinals', 'Level_Finals',
       'Level_Western Conference First Round',
       'Level_Western conference finals',
       'Level_Western conference semifinals']]
    return df_concatenated

# Cette fonction crée le match ci dessus qu'on veut jouer  

In [None]:
# Match  Eastern Conference Finals :
################## Boston Celtics xxx Miami Heat#############
# 4 premier match 
BxM_1 = Match(1,15,'NumberGames_Game 1','Level_Eastern conference finals',Boston,Miami)
BxM_2 = Match(1,15,'NumberGames_Game 2','Level_Eastern conference finals',Boston,Miami)
BxM_3 = Match(1,15,'NumberGames_Game 3','Level_Eastern conference finals',Boston,Miami)
BxM_4 = Match(1,15,'NumberGames_Game 4','Level_Eastern conference finals',Boston,Miami)

# Match  Western Conference Finals :     

################## 	Denver Nuggets xxx  Los Angeles Lakers#############
# 4 premier match 
NxL_1 = Match(7,13,'NumberGames_Game 1','Level_Western conference finals',Denver,Lakers)
NxL_2 = Match(7,13,'NumberGames_Game 2','Level_Western conference finals',Denver,Lakers)
NxL_3 = Match(7,13,'NumberGames_Game 3','Level_Western conference finals',Denver,Lakers)
NxL_4 = Match(7,13,'NumberGames_Game 4','Level_Western conference finals',Denver,Lakers)



   Team1  Team2  NumberGames_Game 1  NumberGames_Game 2  NumberGames_Game 3  \
0      1     15                   1                   0                   0   

   NumberGames_Game 4  NumberGames_Game 5  NumberGames_Game 6  \
0                   0                   0                   0   

   NumberGames_Game 7  Level_Eastern Conference First Round  \
0                   0                                     0   

   Level_Eastern conference finals  Level_Eastern conference semifinals  \
0                                1                                    0   

   Level_Finals  Level_Western Conference First Round  \
0             0                                     0   

   Level_Western conference finals  Level_Western conference semifinals  
0                                0                                    0  
       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST% STL%  \
15  Team Totals  240  .619  .580  .358  .358  24.3  78.1  49.3  52.4  6.2   

   BLK%  TOV% 

In [None]:
X_new_scaled = scaler.transform(BxM_1)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Miami gagne, sinon Boston gagne :", predictions)

X_new_scaled = scaler.transform(BxM_2)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Miami gagne, sinon Boston gagne :", predictions)

X_new_scaled = scaler.transform(BxM_3)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Miami gagne, sinon Boston gagne :", predictions)

X_new_scaled = scaler.transform(BxM_4)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Miami gagne, sinon Boston gagne :", predictions)

X_new_scaled = scaler.transform(NxL_1)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Lakers gagne, sinon Denver gagne :", predictions)

X_new_scaled = scaler.transform(NxL_2)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Lakers gagne, sinon Denver gagne :", predictions)


X_new_scaled = scaler.transform(NxL_3)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Lakers gagne, sinon Denver gagne :", predictions)


X_new_scaled = scaler.transform(NxL_4)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Lakers gagne, sinon Denver gagne :", predictions)



Si 0 Miami gagne, sinon Boston gagne : [0]
Si 0 Miami gagne, sinon Boston gagne : [0]
Si 0 Miami gagne, sinon Boston gagne : [0]
Si 0 Miami gagne, sinon Boston gagne : [0]
Si 0 Lakers gagne, sinon Denver gagne : [1]
Si 0 Lakers gagne, sinon Denver gagne : [1]
Si 0 Lakers gagne, sinon Denver gagne : [1]
Si 0 Lakers gagne, sinon Denver gagne : [1]


* Note : Il est remarquable que notre modèle a correctement prédit les trois matchs déjà joués. Par conséquent, selon les prédictions, la finale opposera Miami à Denver.

# Qui est le grand gagnant de la NBA 2023 ... 

In [None]:
# Match Finals :     

################## 	Denver Nuggets xxx  Miami Heat #############
# 4 premier matchs 
NxD_1 = Match(15,7,'NumberGames_Game 1','Level_finals',Miami,Denver)
NxD_2 = Match(15,7,'NumberGames_Game 2','Level_finals',Miami,Denver)
NxD_3 = Match(15,7,'NumberGames_Game 3','Level_finals',Miami,Denver)
NxD_4 = Match(15,7,'NumberGames_Game 4','Level_finals',Miami,Denver)
NxD_5 = Match(15,7,'NumberGames_Game 5','Level_finals',Miami,Denver)

   Team1  Team2  NumberGames_Game 1  NumberGames_Game 2  NumberGames_Game 3  \
0     15      7                   1                   0                   0   

   NumberGames_Game 4  NumberGames_Game 5  NumberGames_Game 6  \
0                   0                   0                   0   

   NumberGames_Game 7  Level_Eastern Conference First Round  \
0                   0                                     0   

   Level_Eastern conference finals  Level_Eastern conference semifinals  \
0                                0                                    0   

   Level_Finals  Level_Western Conference First Round  \
0             0                                     0   

   Level_Western conference finals  Level_Western conference semifinals  
0                                0                                    0  
       Starters   MP   TS%  eFG%  3PAr   FTr  ORB%  DRB%  TRB%  AST%  STL%  \
14  Team Totals  240  .659  .635  .365  .224  21.9  75.7  50.7  43.5  12.4   

   BLK%  TOV

In [None]:

X_new_scaled = scaler.transform(NxD_1)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Denver gagne, sinon Miami gagne :", predictions)

X_new_scaled = scaler.transform(NxD_2)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Denver gagne, sinon Miami gagne :", predictions)


X_new_scaled = scaler.transform(NxD_3)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Denver gagne, sinon Miami gagne :", predictions)


X_new_scaled = scaler.transform(NxD_4)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Denver gagne, sinon Miami gagne :", predictions)


X_new_scaled = scaler.transform(NxD_5)
# Make predictions on the new data
predictions = trained_model.predict(X_new_scaled)
print("Si 0 Denver gagne, sinon Miami gagne :", predictions)


Si 0 Denver gagne, sinon Miami gagne : [1]
Si 0 Denver gagne, sinon Miami gagne : [0]
Si 0 Denver gagne, sinon Miami gagne : [1]
Si 0 Denver gagne, sinon Miami gagne : [1]
Si 0 Denver gagne, sinon Miami gagne : [1]


## **Bravo** : Après avoir disputé cinq matchs, les Miami Heat ont été sacrés champions de la NBA 2023 avec un score de (4-1).