In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

In [3]:
# sklearn preproc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import make_scorer

# Chargement des Datasets

In [7]:
# Load the cleaned feature table (per-player college stats and body measurements).
data_X = pd.read_csv(
    '../nba_forecast/data/dataset_X_cleaned.csv',
)

In [8]:
# Load the target table (per-player offensive/defensive scores and ratios).
data_y = pd.read_csv(
    '../nba_forecast/data/player_ratio_scores.csv',
)

# Vérification de la concordance des deux Datasets

In [9]:
# (rows, columns) of the feature table, to be compared with data_y's row count.
data_X.shape

(335, 41)

In [10]:
# (rows, columns) of the target table; a row count different from data_X means
# some players do not line up between the two files.
data_y.shape

(334, 9)

In [11]:
# Preview the first rows of the target table to see its columns and name format.
data_y.head()

Unnamed: 0,player_id,player_name,pos,off_score,def_score,uni_off_score,uni_def_score,ratio_off,ratio_def
0,anthony-davis-5,Anthony Davis,PF,3.96,3.34,3.68,3.79,1.08,0.88
1,michael-kidd-gilchrist-1,Michael Kidd-Gilchrist,SF,2.68,1.89,2.73,1.96,0.98,0.96
2,bradley-beal-1,Bradley Beal,SG,2.73,1.71,2.93,1.99,0.93,0.86
3,dion-waiters-1,Dion Waiters,SG,2.36,1.49,2.97,2.04,0.79,0.73
4,thomas-robinson-2,Thomas Robinson,PF,2.68,2.17,3.03,2.67,0.88,0.81


In [13]:
# Feature rows whose player_name has no exact match in the target table.
data_X.loc[~data_X['player_name'].isin(data_y['player_name'])]

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,"('Alex Len', 'weight')","('Markelle Fultz', 'weight')"
29,Jeff Taylor,2011-12,Vanderbilt,SEC,36,36,1157,23.7,0.58,0.57,...,4.2,210,248,1981,2013,2578,212.8,1988,0,0
30,Bernard James,2011-12,Florida State,ACC,35,35,979,23.3,0.602,0.606,...,5.0,229,235,2051,2083,2756,229.8,2210,0,0
77,Glen Rice Jr,2011-12,Georgia Tech,ACC,21,10,628,23.0,0.539,0.525,...,,0,0,0,0,0,,0,0,0
122,Joe Harris,2013-14,Virginia,ACC,37,37,1066,18.7,0.565,0.547,...,6.2,216,248,1949,1988,2540,215.0,1981,0,0
185,Sir 'Dominic Pointer,2014-15,St. John's (NY),Big East,33,32,1143,24.7,0.566,0.527,...,,0,0,0,0,0,,0,0,0
240,Dennis Smith Jr,2016-17,NC State,ACC,32,32,1114,23.1,0.563,0.52,...,,0,0,0,0,0,,0,0,0
246,Justin Jackson (UNC),2016-17,UNC,ACC,40,39,1281,21.3,0.555,0.531,...,8.1,222,235,2007,2038,2654,200.8,2108,0,0
304,Chandler Hutchinson,2017-18,Boise State,MWC,31,31,961,25.9,0.575,0.528,...,,0,0,0,0,0,,0,0,0
313,Devonte Graham,2017-18,Kansas,Big 12,39,39,1474,21.2,0.568,0.51,...,4.8,216,241,1835,1867,2438,186.4,1988,0,0
319,Justin Jackson (UMD),2017-18,Maryland,Big Ten,11,10,321,14.2,0.47,0.416,...,7.1,229,241,1975,2007,2718,219.4,2216,0,0


In [36]:
# Look up how the target table spells the player whose name contains 'Spalding'.
data_y.loc[data_y['player_name'].str.contains('Spalding')]

Unnamed: 0,player_id,player_name,pos,off_score,def_score,uni_off_score,uni_def_score,ratio_off,ratio_def
330,raymond-spalding-1,Raymond Spalding,,,,2.71,2.56,,


**On prend <u>data_X</u> comme référence concernant les noms de joueurs.**

In [41]:
# Align data_y's player names with data_X, which is taken as the reference spelling.
# Keys are row labels of data_y; values are the names to write into player_name.
name_fixes = {
    26: 'Jeff Taylor',
    27: 'Bernard James',
    257: 'Glen Rice Jr',
    94: 'Joe Harris',
    292: "Sir 'Dominic Pointer",
    175: 'Dennis Smith Jr',
    326: 'Justin Jackson (UNC)',
    225: 'Chandler Hutchinson',
    232: 'Devonte Graham',
    238: 'DeAnthony Melton',
    330: 'Ray Spalding',
}

# Assigning through .loc with an unknown label would silently append a new,
# mostly-empty row, so fail loudly if data_y's index no longer has these labels.
unknown_labels = [label for label in name_fixes if label not in data_y.index]
if unknown_labels:
    raise KeyError(f"Row labels missing from data_y: {unknown_labels}")

for row_label, reference_name in name_fixes.items():
    data_y.loc[row_label, 'player_name'] = reference_name

# Row 319 of data_X ('Justin Jackson (UMD)' in the unmatched listing) is dropped
# rather than renamed -- presumably it has no target row; confirm against data_y.
# errors='ignore' keeps this cell re-runnable once the row is already gone.
data_X = data_X.drop(index=319, errors='ignore')

# Merge du dataset X et Y sur le nom des joueurs

## Drop des colonnes aberrantes de *data_X*

In [43]:
# Preview the feature table to spot columns that should be removed before merging.
data_X.head()

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,"('Alex Len', 'weight')","('Markelle Fultz', 'weight')"
0,Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,...,7.9,229,216,2064,2096,2743,221.8,2273,0,0
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,...,7.0,229,260,1975,2019,2654,232.8,2134,0,0
2,Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,...,6.0,216,229,1911,1949,2540,201.8,2032,0,0
3,Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,...,8.5,216,241,1892,1930,2489,221.0,2013,0,0
4,Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,...,5.0,248,267,2026,2051,2692,244.2,2216,0,0


In [46]:
# Remove the two stray columns whose names look like (player, 'weight') tuples;
# they hold 0 in every row displayed above. errors='ignore' makes re-running this
# cell a no-op instead of a KeyError once the columns are already gone.
data_X = data_X.drop(
    columns=["('Alex Len', 'weight')", "('Markelle Fultz', 'weight')"],
    errors='ignore',
)

## Merge des deux Datasets

In [47]:
# Index both tables by player name, then left-join the targets onto the features
# so every row of data_X is kept (unmatched players get NaN target columns).
features_by_name = data_X.set_index('player_name')
targets_by_name = data_y.set_index('player_name')
dataset_complet = features_by_name.join(targets_by_name, how='left')

In [48]:
# Preview the merged table: feature columns followed by the target columns,
# indexed by player_name.
dataset_complet.head()

Unnamed: 0_level_0,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,fg3a_per_fga_pct,...,weight,wingspan,player_id,pos,off_score,def_score,uni_off_score,uni_def_score,ratio_off,ratio_def
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,0.059,...,221.8,2273,anthony-davis-5,PF,3.96,3.34,3.68,3.79,1.08,0.88
Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,0.156,...,232.8,2134,michael-kidd-gilchrist-1,SF,2.68,1.89,2.73,1.96,0.98,0.96
Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,0.473,...,201.8,2032,bradley-beal-1,SG,2.73,1.71,2.93,1.99,0.93,0.86
Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,0.317,...,221.0,2013,dion-waiters-1,SG,2.36,1.49,2.97,2.04,0.79,0.73
Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,0.027,...,244.2,2216,thomas-robinson-2,PF,2.68,2.17,3.03,2.67,0.88,0.81


In [49]:
# player_name is the index of dataset_complet after the join, so it must be
# written explicitly; index=False would silently drop the player names from
# the saved file. The index is written as a leading 'player_name' column.
dataset_complet.to_csv(
    '../nba_forecast/data/dataset_complet_X_y.csv',
    index=True,
)