In [13]:
import pandas as pd
from pathlib import Path

raw = Path("nba_rating/data/raw")
phys_files = sorted(raw.glob("player_phys_*.parquet"))

for path in phys_files:
    df = pd.read_parquet(path)
    # repère les joueurs dont EXP est "0" (string) et pas "R"
    mask_str0 = df["EXP"] == "0"
    if mask_str0.any():
        print(f"{path.name} → {mask_str0.sum()} joueurs avec EXP=='0'")


In [14]:
import pandas as pd
df = pd.read_parquet("../data/curated/all_seasons_scores.parquet")
df

Unnamed: 0,PLAYER_ID,season,score_100
0,3,1999-00,45.628462
1,15,1999-00,45.466705
2,21,1999-00,50.659263
3,22,1999-00,62.031509
4,23,1999-00,39.266827
...,...,...,...
12094,1641926,2023-24,31.717632
12095,1641931,2023-24,25.017670
12096,1641970,2023-24,47.228638
12097,1641998,2023-24,39.596597


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12099 entries, 0 to 12098
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   PLAYER_ID  12099 non-null  int64  
 1   season     12099 non-null  object 
 2   score_100  12099 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 283.7+ KB


In [16]:
df[df["score_100"] >= 100]

Unnamed: 0,PLAYER_ID,season,score_100
139,406,1999-00,100.0
560,406,2000-01,100.0
974,406,2001-02,100.0
1533,1717,2002-03,100.0
2505,2405,2004-05,100.0
3174,1495,2006-07,100.0
3611,977,2007-08,100.0
4195,2544,2008-09,100.0
4614,2544,2009-10,100.0
5028,2544,2010-11,100.0


In [17]:
import pandas as pd
from pathlib import Path

# 1) Chargement du dataset final
CURATED = Path("../data/curated")
path    = CURATED / "dataset_ml.parquet"
df      = pd.read_parquet(path)

print(f"✔️ Nombre de lignes : {len(df)}")
print(f"✔️ Nombre de colonnes : {len(df.columns)}")
print("\nColonnes présentes :")
print(df.columns.tolist())

# 2) Vérifier que toutes les features attendues sont bien là
features_used = [
    "pts_mean", "reb_mean", "ast_mean", "plus_minus_mean",
    "efg_pct", "ts_pct",
    "stl_mean", "blk_mean", "tov_mean",
    "pts36", "reb36", "ast36", "stl36", "blk36", "tov36", "pm36",
    "min_per_game", "avail",
    "esv_mean", "pace",
    "height_cm", "bmi", "age", "exp",
    "delta_score"
]
expected = set(features_used + ["target_note_n1"])
missing = expected - set(df.columns)
if missing:
    print(f"❌ Il manque ces colonnes : {sorted(missing)}")
else:
    print("✔️ Toutes les colonnes attendues sont présentes.")

# 3) Vérifier qu’il n’y a pas de valeurs manquantes sur les features et la cible
na_counts = df[ list(expected) ].isna().sum()
print("\nValeurs manquantes par colonne :")
print(na_counts[na_counts>0] if na_counts.any() else "✔️ Aucune valeur manquante.")

# 4) Afficher un aperçu
print("\nAperçu des premières lignes :")
df.head()


✔️ Nombre de lignes : 9659
✔️ Nombre de colonnes : 29

Colonnes présentes :
['PLAYER_ID', 'season', 'player_name', 'pts_mean', 'reb_mean', 'ast_mean', 'plus_minus_mean', 'efg_pct', 'ts_pct', 'stl_mean', 'blk_mean', 'tov_mean', 'pts36', 'reb36', 'ast36', 'stl36', 'blk36', 'tov36', 'pm36', 'min_per_game', 'avail', 'esv_mean', 'pace', 'height_cm', 'bmi', 'age', 'exp', 'delta_score', 'target_note_n1']
✔️ Toutes les colonnes attendues sont présentes.

Valeurs manquantes par colonne :
delta_score    1899
dtype: int64

Aperçu des premières lignes :


Unnamed: 0,PLAYER_ID,season,player_name,pts_mean,reb_mean,ast_mean,plus_minus_mean,efg_pct,ts_pct,stl_mean,...,min_per_game,avail,esv_mean,pace,height_cm,bmi,age,exp,delta_score,target_note_n1
0,3,1999-00,,4.833333,5.571429,1.02381,-1.666667,0.43881,0.512479,1.071429,...,21.928571,0.512195,3.52381,93.182927,200.933994,24.956995,28.067055,5,,42.905841
1,15,1999-00,,8.72,2.96,1.08,-6.04,0.470132,0.494466,0.586667,...,22.893333,0.914634,7.586667,95.821951,200.66,24.216326,29.0,5,,50.819439
2,21,1999-00,,6.268293,1.621951,2.536585,2.52439,0.478188,0.520484,0.719512,...,18.914634,1.0,5.195122,91.690244,185.42,23.743829,32.0,8,,40.188669
3,26,1999-00,,6.277778,4.486111,1.069444,-0.027778,0.460773,0.50923,0.305556,...,19.680556,0.878049,5.166667,96.226829,218.44,24.711575,31.0,8,,31.616836
4,28,1999-00,,4.0,1.868421,1.131579,-0.210526,0.43047,0.480733,0.315789,...,15.342105,0.463415,3.105263,93.182927,198.12,25.418941,33.0,10,,23.956305


In [18]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1) Charger le dataset ML
df = pd.read_parquet(Path("../data/curated") / "dataset_ml.parquet")

# 2) Split temporel
train = df[df["season"] <= "2021-22"]
test  = df[df["season"] >  "2021-22"]

X_train, y_train = train.drop(columns=["target_note_n1","season"]), train["target_note_n1"]
X_test,  y_test  = test.drop(columns=["target_note_n1","season"]), test["target_note_n1"]

# 3) Entraîner un modèle simple
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 4) Prédire & mesurer
y_pred = model.predict(X_test)
print("Test RMSE :", mean_squared_error(y_test, y_pred, squared=False))
print("Test R²   :", r2_score(y_test, y_pred))


ValueError: could not convert string to float: 'LeBron James'