In [4]:
import pandas as pd

# Dictionnaire des fichiers à vérifier et des colonnes attendues
FILES = {
    "dataset_ml"            : {
        "path": "../data/curated/dataset_ml.parquet",
        "expected_cols": [
            "PLAYER_ID", "season", "score_100", "target_note_n1"
        ]
    },
    "all_seasons_scores"    : {
        "path": "../data/curated/all_seasons_scores.parquet",
        "expected_cols": [
            "PLAYER_ID", "season", "score_100"
        ]
    },
    "wins_shares_vorp"      : {
        "path": "../data/curated/wins_shares_vorp.parquet",
        "expected_cols": [
            "PLAYER_ID", "season", "win_shares", "vorp"
        ]
    },
    "player_clusters"       : {
        "path": "../data/curated/player_clusters.parquet",
        "expected_cols": [
            "PLAYER_ID", "cluster"
        ]
    },
    "all_player_gamelogs"   : {
        "path": "../data/curated/all_player_gamelogs.parquet",
        "expected_cols": [
            "PLAYER_ID", "game_id", "pts", "reb", "ast"
            # ajuster selon vos colonnes essentielles
        ]
    }
}


In [None]:
def validate_file(name, info):
    print(f"---\n🔍 Vérification de `{name}`")
    path = info["path"]
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        print(f"❌ Impossible de charger {path} : {e}")
        return
    
    cols = set(df.columns)
    expected = set(info["expected_cols"])
    missing = expected - cols
    extra   = cols - expected
    
    if missing:
        print(f"⚠️ Colonnes manquantes dans `{name}` : {sorted(missing)}")
    else:
        print("✅ Toutes les colonnes attendues sont présentes.")
    
    # Optionnel : lister quelques colonnes supplémentaires
    if extra:
        print(f"ℹ️ Colonnes en plus (aperçu) : {sorted(list(extra))[:10]}")
    
    print(f"  • Nombre de lignes : {len(df):,}")
    print(f"  • Aperçu des types de colonnes :")
    print(df.dtypes.apply(lambda dt: dt.name).value_counts())
    print("  • Extrait:")
    print(df.head(3))


In [6]:
for name, info in FILES.items():
    validate_file(name, info)


---
🔍 Vérification de `dataset_ml`
⚠️ Colonnes manquantes dans `dataset_ml` : ['score_100']
ℹ️ Colonnes en plus (aperçu) : ['age', 'ast36', 'ast_mean', 'avail', 'blk36', 'blk_mean', 'bmi', 'delta_score', 'efg_pct', 'esv_mean']
  • Nombre de lignes : 9,659
  • Aperçu des types de colonnes :
float64    25
int64       2
object      2
Name: count, dtype: int64
  • Extrait:


Unnamed: 0,PLAYER_ID,season,player_name,pts_mean,reb_mean,ast_mean,plus_minus_mean,efg_pct,ts_pct,stl_mean,...,min_per_game,avail,esv_mean,pace,height_cm,bmi,age,exp,delta_score,target_note_n1
0,3,1999-00,,4.833333,5.571429,1.02381,-1.666667,0.43881,0.512479,1.071429,...,21.928571,0.512195,3.52381,93.182927,200.933994,24.956995,28.067055,5,,42.905841
1,15,1999-00,,8.72,2.96,1.08,-6.04,0.470132,0.494466,0.586667,...,22.893333,0.914634,7.586667,95.821951,200.66,24.216326,29.0,5,,50.819439
2,21,1999-00,,6.268293,1.621951,2.536585,2.52439,0.478188,0.520484,0.719512,...,18.914634,1.0,5.195122,91.690244,185.42,23.743829,32.0,8,,40.188669


---
🔍 Vérification de `all_seasons_scores`
✅ Toutes les colonnes attendues sont présentes.
  • Nombre de lignes : 12,099
  • Aperçu des types de colonnes :
int64      1
object     1
float64    1
Name: count, dtype: int64
  • Extrait:


Unnamed: 0,PLAYER_ID,season,score_100
0,3,1999-00,45.628462
1,15,1999-00,45.466705
2,21,1999-00,50.659263


---
🔍 Vérification de `wins_shares_vorp`
⚠️ Colonnes manquantes dans `wins_shares_vorp` : ['vorp', 'win_shares']
ℹ️ Colonnes en plus (aperçu) : ['VORP', 'Win_Shares']
  • Nombre de lignes : 15,827
  • Aperçu des types de colonnes :
float64    3
object     1
Name: count, dtype: int64
  • Extrait:


Unnamed: 0,PLAYER_ID,season,Win_Shares,VORP
0,714.0,1999-00,8.3,3.6
1,56.0,1999-00,13.9,7.3
2,56.0,1999-00,13.9,7.3


---
🔍 Vérification de `player_clusters`
⚠️ Colonnes manquantes dans `player_clusters` : ['cluster']
ℹ️ Colonnes en plus (aperçu) : ['player_cluster', 'profile_name', 'season']
  • Nombre de lignes : 12,099
  • Aperçu des types de colonnes :
object    2
int64     1
int32     1
Name: count, dtype: int64
  • Extrait:


Unnamed: 0,PLAYER_ID,season,player_cluster,profile_name
0,3,1999-00,0,Playmaker
1,15,1999-00,3,All-Around
2,21,1999-00,3,All-Around


---
🔍 Vérification de `all_player_gamelogs`
⚠️ Colonnes manquantes dans `all_player_gamelogs` : ['ast', 'game_id', 'pts', 'reb']
ℹ️ Colonnes en plus (aperçu) : ['AST', 'BLK', 'DREB', 'FANTASY_PTS', 'FG3A', 'FG3M', 'FG3_PCT', 'FGA', 'FGM', 'FG_PCT']
  • Nombre de lignes : 620,776
  • Aperçu des types de colonnes :
int64      20
object      8
float64     4
Name: count, dtype: int64
  • Extrait:


Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE
0,21999,1502,Adonal Foyle,1610612744,GSW,Golden State Warriors,29900007,1999-11-02,GSW @ DAL,L,...,3,0,0,1,4,2,2,-5,4.6,0
1,21999,1505,Tariq Abdul-Wahad,1610612753,ORL,Orlando Magic,29900005,1999-11-02,ORL @ CHH,L,...,4,0,0,0,2,1,4,-15,6.8,0
2,21999,763,Tony Massenburg,1610612745,HOU,Houston Rockets,29900008,1999-11-02,HOU vs. MIL,L,...,1,1,0,0,0,0,0,6,2.7,0


In [11]:
import pandas as pd
from pathlib import Path

# 1) Chemins
DATA_DIR    = Path("../data/curated")
PATH_DF     = DATA_DIR / "dataset_ml.parquet"
PATH_WS     = DATA_DIR / "wins_shares_vorp.parquet"
PATH_CL     = DATA_DIR / "player_clusters.parquet"

# 2) Chargement
print("🔄 Chargement des fichiers…")
df       = pd.read_parquet(PATH_DF)
ws       = pd.read_parquet(PATH_WS)
cl_exists = PATH_CL.exists()
cl       = pd.read_parquet(PATH_CL) if cl_exists else None

# 3) Colonnes & dimensions
print("\n--- dataset_ml ---")
print("Colonnes :", df.columns.tolist())
print("Shape   :", df.shape)

print("\n--- wins_shares_vorp ---")
print("Colonnes :", ws.columns.tolist())
print("Shape   :", ws.shape)

if cl_exists:
    print("\n--- player_clusters ---")
    print("Colonnes :", cl.columns.tolist())
    print("Shape   :", cl.shape)
else:
    print("\n⚠️  Aucun fichier `player_clusters.parquet` trouvé.")

# 4) Aperçu rapide
print("\n--- Aperçu dataset_ml.head() ---")
print(df.head(5))
print("\n--- Aperçu wins_shares_vorp.head() ---")
print(ws.head(5))
if cl_exists:
    print("\n--- Aperçu player_clusters.head() ---")
    print(cl.head(5))

# 5) Statistiques sommaires
print("\n--- Stats numériques — dataset_ml.describe() ---")
print(df.describe(include="number"))

# 6) Comptages d’ID & saisons
print("\n— Nombre de joueurs uniques :", df["PLAYER_ID"].nunique())
print("— Nombre de saisons        :", df["season"].nunique())
print("  Saisons disponibles       :", sorted(df["season"].unique()))

# 7) Détection “souple” de la colonne cluster
if cl_exists:
    cluster_cols = [c for c in cl.columns if "cluster" in c.lower()]
    if cluster_cols:
        cluster_col = cluster_cols[0]
        print(f"\n— Colonne de cluster détectée : `{cluster_col}`")
        print("  Nombre de clusters :", cl[cluster_col].nunique())
        print("  Valeurs :", sorted(cl[cluster_col].unique()))
    else:
        print("\n⚠️  Aucune colonne contenant 'cluster' trouvée dans player_clusters.parquet")

# 8) Valeurs manquantes
print("\n--- % de valeurs manquantes dataset_ml ---")
mis = df.isna().mean().mul(100).round(2)
print(mis[mis > 0])

print("\n--- % de valeurs manquantes wins_shares_vorp ---")
mis_ws = ws.isna().mean().mul(100).round(2)
print(mis_ws[mis_ws > 0])

if cl_exists and cluster_cols:
    print("\n--- % de valeurs manquantes player_clusters ---")
    mis_cl = cl.isna().mean().mul(100).round(2)
    print(mis_cl[mis_cl > 0])


🔄 Chargement des fichiers…

--- dataset_ml ---
Colonnes : ['PLAYER_ID', 'season', 'player_name', 'pts_mean', 'reb_mean', 'ast_mean', 'plus_minus_mean', 'efg_pct', 'ts_pct', 'stl_mean', 'blk_mean', 'tov_mean', 'pts36', 'reb36', 'ast36', 'stl36', 'blk36', 'tov36', 'pm36', 'min_per_game', 'avail', 'esv_mean', 'pace', 'height_cm', 'bmi', 'age', 'exp', 'delta_score', 'target_note_n1']
Shape   : (9659, 29)

--- wins_shares_vorp ---
Colonnes : ['PLAYER_ID', 'season', 'Win_Shares', 'VORP']
Shape   : (15827, 4)

--- player_clusters ---
Colonnes : ['PLAYER_ID', 'season', 'player_cluster', 'profile_name']
Shape   : (12099, 4)

--- Aperçu dataset_ml.head() ---
   PLAYER_ID   season player_name  pts_mean  reb_mean  ast_mean  \
0          3  1999-00        None  4.833333  5.571429  1.023810   
1         15  1999-00        None  8.720000  2.960000  1.080000   
2         21  1999-00        None  6.268293  1.621951  2.536585   
3         26  1999-00        None  6.277778  4.486111  1.069444   
4       

In [37]:
import pandas as pd
from pathlib import Path

# Dossier où sont stockés les player_season_*.parquet
CURATED = Path("../data/curated")

# 1) Lister tous les fichiers player_season_YYYY-YY.parquet
season_files = sorted(CURATED.glob("player_season_*.parquet"))
print(f"→ {len(season_files)} fichiers détectés")

# 2) Boucler, charger et extraire les colonnes d’intérêt
records = []
for path in season_files:
    # on déduit la saison depuis le nom de fichier
    season = path.stem.split("_")[-1]
    df = pd.read_parquet(path, columns=["PLAYER_ID","player_name"])
    df = df.dropna(subset=["player_name"]).drop_duplicates(["PLAYER_ID"])
    df["season"] = season
    records.append(df)

# 3) Concaténer et dédupliquer
names_df = pd.concat(records, ignore_index=True)
names_df = names_df.drop_duplicates(["PLAYER_ID","season"]).reset_index(drop=True)

print("\nAperçu des noms extraits :")
display(names_df.head(10))

# 4) Statistiques de couverture
total = len(names_df)
unique_ids = names_df["PLAYER_ID"].nunique()
unique_seasons = names_df["season"].nunique()
print(f"\n→ {total} lignes au total, {unique_ids} joueurs uniques sur {unique_seasons} saisons")

# 5) Merge d’exemple avec votre dataset_ml
ds = pd.read_parquet(CURATED/"dataset_ml.parquet")
merged = ds.merge(names_df, on=["PLAYER_ID","season"], how="left")
print(f"\nAprès merge : {len(merged)} lignes × {merged.shape[1]} colonnes")
print("Colonnes disponibles :", merged.columns.tolist())
display(merged.head(5))


→ 25 fichiers détectés


ArrowInvalid: No match for FieldRef.Name(player_name) in PLAYER_ID: int64
pts_mean: double
reb_mean: double
ast_mean: double
plus_minus_mean: double
gp: int64
min_per_game: double
POSITION: string
height_cm: double
weight_kg: double
bmi: double
age: double
exp: double
esv_mean: double
TEAM_ID: int64
pace: double
efg_pct: double
ts_pct: double
fg2_pct: double
fg3_pct: double
ft_pct: double
stl_mean: double
blk_mean: double
tov_mean: double
ast_tov_ratio: double
usage_rate: double
pts36: double
reb36: double
ast36: double
stl36: double
blk36: double
tov36: double
pm36: double
avail: double
score_100: double
season: string
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string

In [14]:
df_all = pd.read_parquet(CURATED/"all_seasons_scores.parquet")
df_all

Unnamed: 0,PLAYER_ID,season,score_100
0,3,1999-00,45.628462
1,15,1999-00,45.466705
2,21,1999-00,50.659263
3,22,1999-00,62.031509
4,23,1999-00,39.266827
...,...,...,...
12094,1641926,2023-24,31.717632
12095,1641931,2023-24,25.017670
12096,1641970,2023-24,47.228638
12097,1641998,2023-24,39.596597


In [16]:
df_all = pd.read_parquet(CURATED/"all_player_gamelogs.parquet")
print(df_all.columns.tolist())
df_all

['SEASON_ID', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS', 'FANTASY_PTS', 'VIDEO_AVAILABLE']


Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE
0,21999,1502,Adonal Foyle,1610612744,GSW,Golden State Warriors,0029900007,1999-11-02,GSW @ DAL,L,...,3,0,0,1,4,2,2,-5,4.6,0
1,21999,1505,Tariq Abdul-Wahad,1610612753,ORL,Orlando Magic,0029900005,1999-11-02,ORL @ CHH,L,...,4,0,0,0,2,1,4,-15,6.8,0
2,21999,763,Tony Massenburg,1610612745,HOU,Houston Rockets,0029900008,1999-11-02,HOU vs. MIL,L,...,1,1,0,0,0,0,0,6,2.7,0
3,21999,345,Terry Porter,1610612759,SAS,San Antonio Spurs,0029900009,1999-11-02,SAS vs. PHI,W,...,3,5,0,0,0,2,15,18,26.1,0
4,21999,228,Adam Keefe,1610612762,UTA,Utah Jazz,0029900011,1999-11-02,UTA vs. LAL,L,...,2,1,1,0,1,2,2,-8,7.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620771,22023,1629647,Darius Bazley,1610612762,UTA,Utah Jazz,0022301198,2024-04-14,UTA @ GSW,L,...,0,0,0,1,0,1,2,-9,5.0,1
620772,22023,1629010,Jerome Robinson,1610612744,GSW,Golden State Warriors,0022301198,2024-04-14,GSW vs. UTA,W,...,0,0,1,0,0,0,5,7,8.0,1
620773,22023,1626172,Kevon Looney,1610612744,GSW,Golden State Warriors,0022301198,2024-04-14,GSW vs. UTA,W,...,2,2,0,1,0,1,4,11,12.4,1
620774,22023,203952,Andrew Wiggins,1610612744,GSW,Golden State Warriors,0022301198,2024-04-14,GSW vs. UTA,W,...,4,3,2,0,1,2,19,-12,33.3,1


In [60]:
df_dash = pd.read_parquet(CURATED/"dashboard_data.parquet")
df_dash['player_name'].unique()
# Count how many players have missing names
total_players = len(df_dash)
missing_names = df_dash['player_name'].isna().sum()
with_names = total_players - missing_names

print(f"Nombre de joueurs total: {total_players}")
print(f"Joueurs avec nom: {with_names} ({with_names/total_players:.1%})")
print(f"Joueurs sans nom: {missing_names} ({missing_names/total_players:.1%})")

Nombre de joueurs total: 9655
Joueurs avec nom: 9655 (100.0%)
Joueurs sans nom: 0 (0.0%)


In [35]:
player = pd.read_parquet(CURATED/"player_season_2023-24.parquet")
player

Unnamed: 0,PLAYER_ID,pts_mean,reb_mean,ast_mean,plus_minus_mean,gp,min_per_game,POSITION,height_cm,weight_kg,...,pts36,reb36,ast36,stl36,blk36,tov36,pm36,avail,score_100,season
0,2544,25.661972,7.295775,8.295775,3.112676,71,35.323944,F,205.740000,113.378685,...,26.153110,7.435407,8.454545,1.277512,0.545455,3.516746,3.172249,0.865854,86.025158,2023-24
1,101108,9.189655,3.879310,6.775862,1.896552,58,26.431034,G,182.880000,79.365079,...,12.516634,5.283757,9.228963,1.643836,0.140900,1.784736,2.583170,0.707317,56.736848,2023-24
2,200768,8.116667,3.233333,4.216667,0.033333,60,28.183333,G,182.880000,88.888889,...,10.367830,4.130101,5.386162,1.256062,0.447073,1.809580,0.042578,0.731707,45.072855,2023-24
3,200782,1.677419,2.741935,0.516129,-0.741935,31,15.709677,F,195.580000,111.111111,...,3.843943,6.283368,1.182752,1.182752,0.517454,0.591376,-1.700205,0.378049,23.486656,2023-24
4,201142,27.093333,6.600000,5.040000,3.866667,75,37.200000,F,210.820000,108.843537,...,26.219355,6.387097,4.877419,0.890323,1.174194,3.148387,3.741935,0.914634,82.122491,2023-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,1641926,5.500000,2.250000,1.000000,-2.250000,4,7.500000,,199.409464,,...,26.400000,10.800000,4.800000,0.000000,1.200000,3.600000,-10.800000,0.048780,31.717632,2023-24
568,1641931,3.478261,1.391304,0.565217,-1.956522,23,11.652174,F,198.120000,97.505669,...,10.746269,4.298507,1.746269,0.402985,0.268657,1.343284,-6.044776,0.280488,25.017670,2023-24
569,1641970,6.857143,5.285714,0.285714,1.285714,7,17.285714,,199.409464,,...,14.280992,11.008264,0.595041,1.785124,1.190083,1.487603,2.677686,0.085366,47.228638,2023-24
570,1641998,6.840000,5.360000,1.080000,-4.400000,25,22.880000,C,208.280000,117.913832,...,10.762238,8.433566,1.699301,0.755245,1.762238,2.139860,-6.923077,0.304878,39.596597,2023-24


In [39]:
win = pd.read_parquet(CURATED/"wins_shares_vorp.parquet")
win

Unnamed: 0,PLAYER_ID,season,Win_Shares,VORP
0,714.0,1999-00,8.3,3.6
1,56.0,1999-00,13.9,7.3
2,56.0,1999-00,13.9,7.3
3,56.0,1999-00,13.9,7.3
4,56.0,1999-00,13.9,7.3
...,...,...,...,...
15822,1630622.0,2023-24,0.0,0.0
15823,1631376.0,2023-24,0.0,0.0
15824,1628382.0,2023-24,0.0,0.0
15825,1630606.0,2023-24,0.0,0.0


In [50]:
ml = pd.read_parquet(CURATED/"dataset_ml.parquet")
ml[ml["player_name"] == 'LeBron James']

Unnamed: 0,PLAYER_ID,season,player_name,score_100,pts_mean,reb_mean,ast_mean,plus_minus_mean,efg_pct,ts_pct,...,min_per_game,avail,esv_mean,pace,height_cm,bmi,age,exp,delta_score,target_note_n1
1817,2544,2003-04,LeBron James,71.52682,20.936709,5.468354,5.886076,-1.797468,0.426239,0.476645,...,39.506329,0.963415,16.544304,93.390244,203.2,26.360597,19.0,5,22.950625,94.477445
2145,2544,2004-05,LeBron James,94.477445,27.1875,7.35,7.2125,1.9,0.503492,0.553494,...,42.35,0.97561,21.225,92.446341,203.2,26.360597,20.0,1,3.643756,98.121201
2480,2544,2005-06,LeBron James,98.121201,31.367089,7.037975,6.594937,3.481013,0.515646,0.569464,...,42.518987,0.963415,23.759494,91.673171,203.2,26.360597,21.0,2,-0.41819,97.703011
2834,2544,2006-07,LeBron James,97.703011,27.333333,6.74359,6.025641,4.820513,0.503526,0.550257,...,40.910256,0.95122,21.064103,93.282927,203.2,26.360597,22.0,3,-0.399858,97.303153
3194,2544,2007-08,LeBron James,97.303153,30.0,7.893333,7.186667,1.826667,0.513796,0.566508,...,40.373333,0.914634,22.68,92.370732,203.2,27.458955,23.0,4,2.696847,100.0
3543,2544,2008-09,LeBron James,100.0,28.444444,7.567901,7.246914,10.753086,0.532607,0.594611,...,37.728395,0.987805,21.111111,90.460976,203.2,27.458955,24.0,5,0.0,100.0
3906,2544,2009-10,LeBron James,100.0,29.710526,7.289474,8.565789,8.552632,0.541833,0.603986,...,39.052632,0.926829,21.907895,92.792683,203.2,27.458955,25.0,6,0.0,100.0
4268,2544,2010-11,LeBron James,100.0,26.721519,7.468354,7.012658,7.835443,0.537379,0.593224,...,38.772152,0.963415,20.35443,92.295122,203.2,27.458955,26.0,7,0.0,100.0
4643,2544,2011-12,LeBron James,100.0,27.145161,7.935484,6.241935,7.645161,0.555959,0.608145,...,37.516129,0.756098,20.903226,93.442424,203.2,27.458955,27.0,8,0.0,100.0
5017,2544,2012-13,LeBron James,100.0,26.789474,8.026316,7.25,9.473684,0.610773,0.648383,...,37.907895,0.926829,21.486842,92.314634,203.2,27.458955,28.0,9,-8.300234,91.699766


In [42]:
seasons = pd.read_parquet(CURATED/"all_seasons_scores.parquet")
seasons

Unnamed: 0,PLAYER_ID,season,score_100
0,3,1999-00,45.628462
1,15,1999-00,45.466705
2,21,1999-00,50.659263
3,22,1999-00,62.031509
4,23,1999-00,39.266827
...,...,...,...
12094,1641926,2023-24,31.717632
12095,1641931,2023-24,25.017670
12096,1641970,2023-24,47.228638
12097,1641998,2023-24,39.596597


In [64]:
df_dash = pd.read_parquet(CURATED/"dashboard_data.parquet")

In [65]:
df_dash.columns

Index(['PLAYER_ID', 'season', 'pts_mean', 'reb_mean', 'ast_mean',
       'plus_minus_mean', 'efg_pct', 'ts_pct', 'stl_mean', 'blk_mean',
       'tov_mean', 'pts36', 'reb36', 'ast36', 'stl36', 'blk36', 'tov36',
       'pm36', 'min_per_game', 'avail', 'esv_mean', 'pace', 'height_cm', 'bmi',
       'age', 'exp', 'delta_score', 'target_note_n1', 'score_100',
       'player_name', 'Win_Shares', 'VORP', 'cluster'],
      dtype='object')

In [67]:
import pandas as pd

# 1) Charger le dataset de dashboard
df = pd.read_parquet("../data/curated/dashboard_data.parquet")

# 2) Lister toutes les colonnes
print("Colonnes disponibles :")
print(df.columns.tolist())

# 3) Déterminer les colonnes candidates
possible_position_cols = [c for c in df.columns if "position" in c.lower() or c.lower() in ["pos"]]
possible_team_cols     = [c for c in df.columns if "team" in c.lower() or "club" in c.lower()]

print("\nColonnes candidates pour le poste :", possible_position_cols)
for col in possible_position_cols:
    print(f"  → '{col}':", df[col].dropna().unique()[:5])

print("\nColonnes candidates pour l'équipe :", possible_team_cols)
for col in possible_team_cols:
    print(f"  → '{col}':", df[col].dropna().unique()[:5])

# 4) Exemples pour un joueur
first_player = df[df.columns[0]].iloc[0]
print(f"\nExemple pour le joueur ID={first_player} :")
for col in possible_position_cols + possible_team_cols:
    if col in df.columns:
        print(f"  • {col} =", df[df[df.columns[0]] == first_player][col].iloc[0])

Colonnes disponibles :
['PLAYER_ID', 'season', 'pts_mean', 'reb_mean', 'ast_mean', 'plus_minus_mean', 'efg_pct', 'ts_pct', 'stl_mean', 'blk_mean', 'tov_mean', 'pts36', 'reb36', 'ast36', 'stl36', 'blk36', 'tov36', 'pm36', 'min_per_game', 'avail', 'esv_mean', 'pace', 'height_cm', 'bmi', 'age', 'exp', 'delta_score', 'target_note_n1', 'score_100', 'player_name', 'photo_url', 'Win_Shares', 'VORP', 'cluster']

Colonnes candidates pour le poste : []

Colonnes candidates pour l'équipe : []

Exemple pour le joueur ID=3 :


In [68]:
proj = pd.read_parquet("../data/curated/projections.parquet")
print(proj.columns)
proj

Index(['PLAYER_NAME', 'horizon', 'pred_score'], dtype='object')


Unnamed: 0,PLAYER_NAME,horizon,pred_score
0,A.C. Green,1,43.748883
1,A.J. Guyton,1,45.641678
2,A.J. Lawson,1,40.866929
3,AJ Green,1,43.576164
4,AJ Griffin,1,57.591771
...,...,...,...
1889,Ziaire Williams,1,45.086468
1890,Zion Williamson,1,86.354922
1891,Zoran Planinic,1,45.872981
1892,Zydrunas Ilgauskas,1,49.888737
