In [33]:
from pathlib import Path
import pandas as pd
import numpy as np

DATA = Path("../data/processed")
OUT = Path("../data/processed")

pd.set_option("display.max_columns", 200)


In [34]:
regular_team_season = pd.read_csv(DATA / "regular_team_season_features.csv")
tourney_seeds = pd.read_csv(DATA / "tourney_seeds.csv")
tourney = pd.read_csv(Path("../data/raw/MNCAATourneyCompactResults.csv"))

print(regular_team_season.shape, tourney_seeds.shape, tourney.shape)


(13388, 9) (2626, 4) (2518, 8)


In [35]:
# Make a copy so we don't mutate originals
tourney_games = tourney.copy()

# Rename for clarity
tourney_games = tourney_games.rename(columns={
    "WTeamID": "TeamA",
    "LTeamID": "TeamB",
    "WScore": "ScoreA",
    "LScore": "ScoreB"
})

# Label: Team A wins
tourney_games["label"] = 1

tourney_games.head()


Unnamed: 0,Season,DayNum,TeamA,ScoreA,TeamB,ScoreB,WLoc,NumOT,label
0,1985,136,1116,63,1234,54,N,0,1
1,1985,136,1120,59,1345,58,N,0,1
2,1985,136,1207,68,1250,43,N,0,1
3,1985,136,1229,58,1425,55,N,0,1
4,1985,136,1242,49,1325,38,N,0,1


In [36]:
team_feats = regular_team_season.copy()

team_feats_A = team_feats.add_prefix("A_")
team_feats_B = team_feats.add_prefix("B_")

# Join Team A features
train = tourney_games.merge(
    team_feats_A,
    left_on=["Season", "TeamA"],
    right_on=["A_Season", "A_TeamID"],
    how="left"
)

print("After joining Team A features:", train.shape)


After joining Team A features: (2518, 18)


In [37]:
train = train.merge(
    team_feats_B,
    left_on=["Season", "TeamB"],
    right_on=["B_Season", "B_TeamID"],
    how="left"
)

print("After joining Team B features:", train.shape)


After joining Team B features: (2518, 27)


In [38]:
if 'tourney_seeds' in globals():
    seeds_A = tourney_seeds.add_prefix("A_")
    seeds_B = tourney_seeds.add_prefix("B_")

    train = train.merge(
        seeds_A,
        left_on=["Season", "TeamA"],
        right_on=["A_Season", "A_TeamID"],
        how="left"
    )

    train = train.merge(
        seeds_B,
        left_on=["Season", "TeamB"],
        right_on=["B_Season", "B_TeamID"],
        how="left"
    )

print("After seeds join:", train.shape)


After seeds join: (2518, 35)


In [39]:
# Check missing features
missing_A = train.filter(like="A_win_pct").isna().mean()[0]
missing_B = train.filter(like="B_win_pct").isna().mean()[0]

print("Missing A win_pct %:", round(missing_A * 100, 2))
print("Missing B win_pct %:", round(missing_B * 100, 2))


Missing A win_pct %: 0.0
Missing B win_pct %: 0.0


  missing_A = train.filter(like="A_win_pct").isna().mean()[0]
  missing_B = train.filter(like="B_win_pct").isna().mean()[0]


In [40]:
feature_pairs = [
    ("win_pct", "win_pct"),
    ("avg_margin", "avg_margin"),
    ("avg_pf", "avg_pf"),
    ("avg_pa", "avg_pa"),
    ("std_margin", "std_margin"),
]

for fA, fB in feature_pairs:
    train[f"{fA}_diff"] = train[f"A_{fA}"] - train[f"B_{fB}"]

# Seed difference (lower seed is better)
if "A_SeedNum" in train.columns:
    train["seed_diff"] = train["B_SeedNum"] - train["A_SeedNum"]

train.head()


Unnamed: 0,Season,DayNum,TeamA,ScoreA,TeamB,ScoreB,WLoc,NumOT,label,A_Season_x,A_TeamID_x,A_games,A_wins,A_avg_pf,A_avg_pa,A_avg_margin,A_std_margin,A_win_pct,B_Season_x,B_TeamID_x,B_games,B_wins,B_avg_pf,B_avg_pa,B_avg_margin,B_std_margin,B_win_pct,A_Season_y,A_TeamID_y,A_Seed,A_SeedNum,B_Season_y,B_TeamID_y,B_Seed,B_SeedNum,win_pct_diff,avg_margin_diff,avg_pf_diff,avg_pa_diff,std_margin_diff,seed_diff
0,1985,136,1116,63,1234,54,N,0,1,1985,1116,33,21,65.333333,61.69697,3.636364,11.661417,0.636364,1985,1234,30,20,69.733333,59.266667,10.466667,16.074682,0.666667,1985,1116,X09,9,1985,1234,X08,8,-0.030303,-6.830303,-4.4,2.430303,-4.413265,-1
1,1985,136,1120,59,1345,58,N,0,1,1985,1120,29,18,70.344828,66.655172,3.689655,14.757815,0.62069,1985,1345,25,17,69.12,65.32,3.8,16.140012,0.68,1985,1120,Z11,11,1985,1345,Z06,6,-0.05931,-0.110345,1.224828,1.335172,-1.382197,-5
2,1985,136,1207,68,1250,43,N,0,1,1985,1207,27,25,75.740741,60.074074,15.666667,10.845063,0.925926,1985,1250,29,11,65.758621,70.206897,-4.448276,12.069993,0.37931,1985,1207,W01,1,1985,1250,W16,16,0.546616,20.114943,9.98212,-10.132822,-1.22493,15
3,1985,136,1229,58,1425,55,N,0,1,1985,1229,27,20,71.592593,65.62963,5.962963,11.85321,0.740741,1985,1425,28,19,68.392857,64.607143,3.785714,11.631239,0.678571,1985,1229,Y09,9,1985,1425,Y08,8,0.062169,2.177249,3.199735,1.022487,0.221971,-1
4,1985,136,1242,49,1325,38,N,0,1,1985,1242,30,23,76.033333,70.4,5.633333,10.60736,0.766667,1985,1325,27,20,67.555556,63.0,4.555556,9.32463,0.740741,1985,1242,Z03,3,1985,1325,Z14,14,0.025926,1.077778,8.477778,7.4,1.282731,11


In [41]:
diff_cols = [c for c in train.columns if c.endswith("_diff")]

before = train.shape[0]
train = train.dropna(subset=diff_cols)
after = train.shape[0]

print(f"Dropped {before - after} rows due to missing features")


Dropped 0 rows due to missing features


In [42]:
# Copy and flip A/B
train_flipped = train.copy()

for col in diff_cols:
    train_flipped[col] = -train_flipped[col]

if "seed_diff" in train.columns:
    train_flipped["seed_diff"] = -train_flipped["seed_diff"]

train_flipped["label"] = 0

train_full = pd.concat([train, train_flipped], ignore_index=True)

print(train_full["label"].value_counts())


label
1    2518
0    2518
Name: count, dtype: int64


In [55]:
X_cols = diff_cols + (["seed_diff"] if "seed_diff" in train_full.columns else [])

final_train = train_full[["Season"] + X_cols + ["label"]]

final_train.to_csv(OUT / "train.csv", index=False)

print("Saved training set:", final_train.shape)
final_train.head()


Saved training set: (5036, 9)


Unnamed: 0,Season,win_pct_diff,avg_margin_diff,avg_pf_diff,avg_pa_diff,std_margin_diff,seed_diff,seed_diff.1,label
0,1985,-0.030303,-6.830303,-4.4,2.430303,-4.413265,-1,-1,1
1,1985,-0.05931,-0.110345,1.224828,1.335172,-1.382197,-5,-5,1
2,1985,0.546616,20.114943,9.98212,-10.132822,-1.22493,15,15,1
3,1985,0.062169,2.177249,3.199735,1.022487,0.221971,-1,-1,1
4,1985,0.025926,1.077778,8.477778,7.4,1.282731,11,11,1
