## Import Required Packages and Set File Path

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import brier_score_loss, accuracy_score, log_loss
from sklearn.model_selection import GroupKFold
import xgboost as xgb


# Directory that holds the competition CSV files.
# Set the MARCH_MADNESS_DATA_PATH environment variable to point the notebook at
# another copy of the data; when it is unset the original local folder is used,
# so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "MARCH_MADNESS_DATA_PATH",
    "/Users/nicholasrichards/Desktop/march_madness_prediction_project/march-machine-learning-mania-2025/",
)


## Preparing Dataframes

In [3]:
def _read_competition_csv(file_name):
    """Read one CSV that sits directly under DATA_PATH."""
    return pd.read_csv(os.path.join(DATA_PATH, file_name))


def _read_league_pair(mens_file, womens_file):
    """Read the men's and women's versions of a table and stack them.

    Each frame gets a League column ("M" / "W") before stacking.

    Returns:
        (mens_frame, womens_frame, stacked_frame); the stacked frame has a
        fresh 0..n-1 index.
    """
    mens = _read_competition_csv(mens_file).assign(League="M")
    womens = _read_competition_csv(womens_file).assign(League="W")
    return mens, womens, pd.concat([mens, womens], ignore_index=True)


# Tournament seeds
m_tourney_seeds, w_tourney_seeds, df_seeds = _read_league_pair(
    "MNCAATourneySeeds.csv", "WNCAATourneySeeds.csv"
)

# Regular-season detailed (box score) results
m_reg_season, w_reg_season, df_season_results = _read_league_pair(
    "MRegularSeasonDetailedResults.csv", "WRegularSeasonDetailedResults.csv"
)

# NCAA tournament detailed results
m_tourney_results, w_tourney_results, df_tourney_results = _read_league_pair(
    "MNCAATourneyDetailedResults.csv", "WNCAATourneyDetailedResults.csv"
)

# Team-to-conference assignments
m_team_conferences, w_team_conferences, df_team_conferences = _read_league_pair(
    "MTeamConferences.csv", "WTeamConferences.csv"
)

# Team lookup tables; unlike the tables above these are stacked without a League column
m_teams = _read_competition_csv("MTeams.csv")
w_teams = _read_competition_csv("WTeams.csv")
df_teams = pd.concat([m_teams, w_teams], ignore_index=True)

# Conference lookup table (a single file, not split by league)
df_conferences = _read_competition_csv("Conferences.csv")

# Coaches table; only the men's file is read here
df_team_coaches = _read_competition_csv("MTeamCoaches.csv")

# Conference tournament games
m_conf_tourney_games, w_conf_tourney_games, df_conf_tourney_games = _read_league_pair(
    "MConferenceTourneyGames.csv", "WConferenceTourneyGames.csv"
)

# Stage 2 sample submission
df_sample_submission = _read_competition_csv("SampleSubmissionStage2.csv")

df_season_results.head()


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,League
0,2003,10,1104,68,1328,62,N,0,27,58,...,16,22,10,22,8,18,9,2,20,M
1,2003,10,1272,70,1393,63,N,0,26,62,...,9,20,20,25,7,12,8,6,16,M
2,2003,11,1266,73,1437,61,N,0,24,58,...,14,23,31,22,9,12,2,5,23,M
3,2003,11,1296,56,1457,50,N,0,18,38,...,8,15,17,20,9,19,4,3,23,M
4,2003,11,1400,77,1208,71,N,0,30,61,...,17,27,21,15,12,10,7,1,14,M


## Creating Matrix Completion Dataframe

### Adapting Regular Season Results for Matrix Completion Model

In [4]:
# Box-score stat suffixes shared by the winner ("W...") and loser ("L...") columns.
_BOX_SCORE_STATS = [
    "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA",
    "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF",
]


def _perspective_rename(team_prefix, opp_prefix):
    """Build the rename map that treats the `team_prefix` side as "the team".

    Columns with `team_prefix` map to neutral names (TeamID, TeamScore, FGM, ...);
    columns with `opp_prefix` map to Opp-prefixed names (OppScore, OppFGM, ...).
    Key order is: team id, team score, opponent score, team stats, opponent stats.
    """
    renames = {
        f"{team_prefix}TeamID": "TeamID",
        f"{team_prefix}Score": "TeamScore",
        f"{opp_prefix}Score": "OppScore",
    }
    renames.update({f"{team_prefix}{stat}": stat for stat in _BOX_SCORE_STATS})
    renames.update({f"{opp_prefix}{stat}": f"Opp{stat}" for stat in _BOX_SCORE_STATS})
    return renames


# Rename map to use when the team of interest won the game ...
winner_rename = _perspective_rename("W", "L")

# ... and the mirrored map to use when the team of interest lost.
loser_rename = _perspective_rename("L", "W")

# Leave df_season_results untouched; all reshaping happens on a copy.
df_temp = df_season_results.copy()

# Orient every game around the lower TeamID: that team becomes TeamID, the other
# becomes OppTeamID, and Outcome is 1 when the lower-ID team was the winner.
winner_ids, loser_ids = df_temp['WTeamID'], df_temp['LTeamID']
df_temp['TeamID'] = np.minimum(winner_ids, loser_ids)
df_temp['OppTeamID'] = np.maximum(winner_ids, loser_ids)
df_temp['Outcome'] = (df_temp['WTeamID'] == df_temp['TeamID']).astype(int)

# For each neutral column name record [column to read if the team won, column to read if it lost].
source_columns = {new_name: [orig, None] for orig, new_name in winner_rename.items()}
for orig, new_name in loser_rename.items():
    source_columns.setdefault(new_name, [None, None])[1] = orig

# Fill each neutral column row by row from the winner- or loser-side source.
lower_id_won = df_temp['Outcome'] == 1
for new_col, (from_winner, from_loser) in source_columns.items():
    if from_winner is None:
        df_temp[new_col] = df_temp[from_loser]
    elif from_loser is None:
        df_temp[new_col] = df_temp[from_winner]
    else:
        df_temp[new_col] = np.where(lower_id_won, df_temp[from_winner], df_temp[from_loser])

# Remove the winner/loser-prefixed originals now that neutral copies exist.
# NOTE: WLoc is neither remapped nor dropped, so it still describes the winner's
# location rather than TeamID's.
prefixed_cols = set(winner_rename) | set(loser_rename)
df_cf = df_temp.drop(columns=['WTeamID', 'LTeamID', *prefixed_cols], errors='ignore')

# Move the identifying columns to the front; everything else keeps its relative order.
leading_cols = ['Season', 'DayNum', 'TeamID', 'OppTeamID', 'Outcome']
df_cf = df_cf[leading_cols + [col for col in df_cf.columns if col not in leading_cols]]

# Game length in minutes: 40 plus 5 per overtime period, placed right after NumOT.
if 'NumOT' in df_cf.columns:
    df_cf['Min'] = 40 + 5 * df_cf['NumOT']
    ordered_cols = [col for col in df_cf.columns if col != 'Min']
    ordered_cols.insert(ordered_cols.index('NumOT') + 1, 'Min')
    df_cf = df_cf[ordered_cols]

df_cf.head()


Unnamed: 0,Season,DayNum,TeamID,OppTeamID,Outcome,WLoc,NumOT,Min,League,TeamScore,...,OppFGA3,OppFTM,OppFTA,OppOR,OppDR,OppAst,OppTO,OppStl,OppBlk,OppPF
0,2003,10,1104,1328,1,N,0,40,M,68,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,1393,1,N,0,40,M,70,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,1437,1,N,0,40,M,73,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,1457,1,N,0,40,M,56,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1208,1400,0,N,0,40,M,71,...,14,11,13,17,22,12,14,4,4,20


### Creating Shooting Statistics for CF Dataframe

In [5]:
def _ratio_or_zero(numerator, denominator):
    """Element-wise numerator / denominator, with 0 wherever denominator is not positive."""
    return np.where(denominator > 0, numerator / denominator, 0)


# (output column, makes column, attempts column): team side first, then opponent side.
_SHOOTING_PCT_SPECS = [
    ("FG_pct", "FGM", "FGA"),
    ("FG3_pct", "FGM3", "FGA3"),
    ("FT_pct", "FTM", "FTA"),
    ("Opp_FG_pct", "OppFGM", "OppFGA"),
    ("Opp_FG3_pct", "OppFGM3", "OppFGA3"),
    ("Opp_FT_pct", "OppFTM", "OppFTA"),
]
for pct_col, made_col, attempted_col in _SHOOTING_PCT_SPECS:
    df_cf[pct_col] = _ratio_or_zero(df_cf[made_col], df_cf[attempted_col])

# Effective field goal percentage: each made three counts as 1.5 made field goals.
df_cf["eFG_pct"] = _ratio_or_zero(df_cf["FGM"] + 0.5 * df_cf["FGM3"], df_cf["FGA"])
df_cf["Opp_eFG_pct"] = _ratio_or_zero(df_cf["OppFGM"] + 0.5 * df_cf["OppFGM3"], df_cf["OppFGA"])

df_cf.sample(10, random_state=349)


Unnamed: 0,Season,DayNum,TeamID,OppTeamID,Outcome,WLoc,NumOT,Min,League,TeamScore,...,OppBlk,OppPF,FG_pct,FG3_pct,FT_pct,Opp_FG_pct,Opp_FG3_pct,Opp_FT_pct,eFG_pct,Opp_eFG_pct
174255,2020,96,3184,3188,1,A,0,40,W,68,...,1,19,0.418182,0.444444,0.666667,0.254237,0.133333,0.5,0.490909,0.271186
70398,2016,111,1371,1385,1,A,0,40,M,62,...,10,26,0.365385,0.4,0.6,0.442308,0.263158,0.416667,0.423077,0.490385
81460,2018,117,1323,1448,1,A,0,40,M,76,...,7,15,0.438596,0.347826,0.9,0.440678,0.346154,0.666667,0.508772,0.516949
129599,2012,23,3119,3264,1,H,0,40,W,58,...,3,12,0.456522,0.6,0.684211,0.372093,0.176471,0.571429,0.48913,0.406977
99681,2022,84,1314,1439,1,H,0,40,M,78,...,2,21,0.365079,0.4,0.846154,0.465517,0.388889,0.636364,0.444444,0.525862
68436,2016,71,1123,1444,1,A,0,40,M,74,...,1,16,0.518519,0.454545,0.615385,0.365385,0.277778,0.724138,0.611111,0.413462
55435,2014,9,1337,1415,1,A,0,40,M,69,...,4,19,0.416667,0.5,0.769231,0.448276,0.1875,0.647059,0.510417,0.474138
129539,2012,21,3265,3343,0,H,0,40,W,51,...,3,9,0.372549,0.380952,0.555556,0.4,0.357143,0.833333,0.45098,0.483333
32096,2009,87,1204,1359,0,H,0,40,M,46,...,0,11,0.351852,0.0,0.727273,0.480769,0.391304,0.909091,0.351852,0.567308
89555,2020,56,1260,1434,1,A,0,40,M,66,...,9,23,0.42,0.357143,0.633333,0.431034,0.333333,0.666667,0.47,0.508621


### Creating Possession and Pace Statistics for CF Dataframe

In [6]:
def _estimate_possessions(fga, fta, off_reb, turnovers):
    """Possession estimate: FGA + 0.44 * FTA - offensive rebounds + turnovers."""
    return fga + 0.44 * fta - off_reb + turnovers


# Possession estimates for the team and, with the same formula, for its opponent
df_cf["Team_Possessions"] = _estimate_possessions(
    df_cf["FGA"], df_cf["FTA"], df_cf["OR"], df_cf["TO"]
)
df_cf["Opp_Possessions"] = _estimate_possessions(
    df_cf["OppFGA"], df_cf["OppFTA"], df_cf["OppOR"], df_cf["OppTO"]
)

# Positive margin means the team had more estimated possessions than its opponent
df_cf["PossessionMargin"] = df_cf["Team_Possessions"] - df_cf["Opp_Possessions"]

# Tempo: possessions divided by game length in units of 40 minutes,
# so overtime games are scaled back to a 40-minute pace
game_length_in_regulations = df_cf["Min"] / 40
df_cf["Team_Tempo"] = df_cf["Team_Possessions"] / game_length_in_regulations
df_cf["Opp_Tempo"] = df_cf["Opp_Possessions"] / game_length_in_regulations

df_cf.sample(10, random_state=529)

Unnamed: 0,Season,DayNum,TeamID,OppTeamID,Outcome,WLoc,NumOT,Min,League,TeamScore,...,Opp_FG_pct,Opp_FG3_pct,Opp_FT_pct,eFG_pct,Opp_eFG_pct,Team_Possessions,Opp_Possessions,PossessionMargin,Team_Tempo,Opp_Tempo
168194,2019,76,3321,3458,1,H,0,40,W,72,...,0.246154,0.125,0.611111,0.439655,0.269231,62.88,64.92,-2.04,62.88,64.92
136497,2013,68,3183,3240,1,H,0,40,W,76,...,0.289855,0.130435,0.769231,0.4,0.311594,77.4,77.72,-0.32,77.4,77.72
131681,2012,75,3372,3427,0,H,0,40,W,46,...,0.309524,0.222222,0.709677,0.33,0.333333,68.56,67.64,0.92,68.56,67.64
145900,2015,41,3294,3429,0,H,0,40,W,61,...,0.431818,0.2,0.666667,0.419355,0.443182,66.6,69.16,-2.56,66.6,69.16
170614,2020,1,3200,3323,0,A,0,40,W,55,...,0.365385,0.2,0.606061,0.375,0.384615,72.88,75.52,-2.64,72.88,75.52
36015,2010,58,1159,1306,0,H,0,40,M,55,...,0.391304,0.142857,0.615385,0.358333,0.423913,70.24,70.16,0.08,70.24,70.16
50031,2013,7,1401,1407,1,H,0,40,M,83,...,0.436364,0.40625,1.0,0.590909,0.554545,58.8,57.76,1.04,58.8,57.76
164700,2018,117,3111,3114,0,H,0,40,W,35,...,0.421875,0.4,0.428571,0.282609,0.453125,61.48,63.08,-1.6,61.48,63.08
167919,2019,71,3244,3251,0,A,0,40,W,61,...,0.55102,0.454545,0.571429,0.445455,0.602041,67.04,70.16,-3.12,67.04,70.16
150004,2016,16,3306,3357,1,A,0,40,W,81,...,0.4,0.416667,0.611111,0.535088,0.483333,66.44,68.92,-2.48,66.44,68.92


### Creating Efficiency Statistics for CF Dataframe

In [7]:
# Points scored per 100 estimated possessions, for each side of the game
team_points_per_100 = (df_cf["TeamScore"] / df_cf["Team_Possessions"]) * 100
opp_points_per_100 = (df_cf["OppScore"] / df_cf["Opp_Possessions"]) * 100

# One side's defensive efficiency is by construction the other side's offensive
# efficiency, so the four columns hold only two distinct series.
df_cf["Team_OffEff"] = team_points_per_100
df_cf["Team_DefEff"] = opp_points_per_100
df_cf["Opp_OffEff"] = opp_points_per_100
df_cf["Opp_DefEff"] = team_points_per_100

df_cf.sample(10, random_state=529)


Unnamed: 0,Season,DayNum,TeamID,OppTeamID,Outcome,WLoc,NumOT,Min,League,TeamScore,...,Opp_eFG_pct,Team_Possessions,Opp_Possessions,PossessionMargin,Team_Tempo,Opp_Tempo,Team_OffEff,Team_DefEff,Opp_OffEff,Opp_DefEff
168194,2019,76,3321,3458,1,H,0,40,W,72,...,0.269231,62.88,64.92,-2.04,62.88,64.92,114.503817,70.856439,70.856439,114.503817
136497,2013,68,3183,3240,1,H,0,40,W,76,...,0.311594,77.4,77.72,-0.32,77.4,77.72,98.191214,68.193515,68.193515,98.191214
131681,2012,75,3372,3427,0,H,0,40,W,46,...,0.333333,68.56,67.64,0.92,68.56,67.64,67.094516,73.920757,73.920757,67.094516
145900,2015,41,3294,3429,0,H,0,40,W,61,...,0.443182,66.6,69.16,-2.56,66.6,69.16,91.591592,93.984962,93.984962,91.591592
170614,2020,1,3200,3323,0,A,0,40,W,55,...,0.384615,72.88,75.52,-2.64,72.88,75.52,75.46652,79.449153,79.449153,75.46652
36015,2010,58,1159,1306,0,H,0,40,M,55,...,0.423913,70.24,70.16,0.08,70.24,70.16,78.302961,89.794755,89.794755,78.302961
50031,2013,7,1401,1407,1,H,0,40,M,83,...,0.554545,58.8,57.76,1.04,58.8,57.76,141.156463,112.534626,112.534626,141.156463
164700,2018,117,3111,3114,0,H,0,40,W,35,...,0.453125,61.48,63.08,-1.6,61.48,63.08,56.929083,96.7026,96.7026,56.929083
167919,2019,71,3244,3251,0,A,0,40,W,61,...,0.602041,67.04,70.16,-3.12,67.04,70.16,90.990453,95.496009,95.496009,90.990453
150004,2016,16,3306,3357,1,A,0,40,W,81,...,0.483333,66.44,68.92,-2.48,66.44,68.92,121.914509,100.116077,100.116077,121.914509


## Matrix Completion and Neural Network Model

In [8]:
# Show the engineered per-game frame; in the recorded output pandas elides the middle rows and columns.
df_cf

Unnamed: 0,Season,DayNum,TeamID,OppTeamID,Outcome,WLoc,NumOT,Min,League,TeamScore,...,Opp_eFG_pct,Team_Possessions,Opp_Possessions,PossessionMargin,Team_Tempo,Opp_Tempo,Team_OffEff,Team_DefEff,Opp_OffEff,Opp_DefEff
0,2003,10,1104,1328,1,N,0,40,M,68,...,0.433962,74.92,70.68,4.24,74.92,70.68,90.763481,87.719298,87.719298,90.763481
1,2003,10,1272,1393,1,N,0,40,M,70,...,0.402985,68.36,67.80,0.56,68.36,67.80,102.399064,92.920354,92.920354,102.399064
2,2003,11,1266,1437,1,N,0,40,M,73,...,0.321918,63.76,64.12,-0.36,63.76,64.12,114.491844,95.134124,95.134124,114.491844
3,2003,11,1296,1457,1,N,0,40,M,56,...,0.428571,57.64,57.60,0.04,57.64,57.60,97.154754,86.805556,86.805556,97.154754
4,2003,11,1208,1400,0,N,0,40,M,71,...,0.540984,62.88,63.72,-0.84,62.88,63.72,112.913486,120.841180,120.841180,112.913486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200585,2025,131,3413,3471,0,A,0,40,W,66,...,0.451613,74.16,77.32,-3.16,74.16,77.32,88.996764,96.999483,96.999483,88.996764
200586,2025,132,3192,3476,1,H,0,40,W,66,...,0.403509,61.92,61.76,0.16,61.92,61.76,106.589147,79.339378,79.339378,106.589147
200587,2025,132,3119,3250,0,H,0,40,W,62,...,0.655556,62.40,62.48,-0.08,62.40,62.48,99.358974,118.437900,118.437900,99.358974
200588,2025,132,3125,3293,0,N,0,40,W,62,...,0.648148,69.16,68.60,0.56,69.16,68.60,89.647195,120.991254,120.991254,89.647195


### Creating Game Matrix

In [16]:
import pandas as pd
import numpy as np

def _league_pair_matrix(league_games, league_teams, season, league, stat_cols):
    """Build one league's matchup grid with observed stat means attached.

    Args:
        league_games: game rows of this league (TeamID / OppTeamID plus stat_cols).
        league_teams: team ids of this league; every pair with first id < second id
            is emitted once, so there are no self-matchups and no mirrored duplicates.
        season: value written to the Season column of every pair.
        league: value written to the League column of every pair.
        stat_cols: columns averaged over all games of the same (TeamID, OppTeamID) pair.

    Returns:
        One row per pair; stat columns are NaN for pairs with no game in league_games.
    """
    pairs = pd.DataFrame(
        [(season, low, high) for low in league_teams for high in league_teams if low < high],
        columns=['Season', 'TeamID', 'OppTeamID'],
    )
    pair_means = (
        league_games
        .groupby(['TeamID', 'OppTeamID'])[stat_cols]
        .mean()
        .reset_index()
    )
    return (
        pairs
        .merge(pair_means, how='left', on=['TeamID', 'OppTeamID'])
        .assign(League=league)
    )


# From here on df_cf is rebound to the 2025 rows only; earlier seasons are dropped from it.
df_cf = df_cf[df_cf['Season'] == 2025].copy()

# Game parameters the matrix completion model will be asked to predict.
target_parameters = ["FG_pct", "FG3_pct", "FT_pct", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]

# Only one season is processed.
season = 2025
df_season = df_cf.copy()

# Every team seen on either side of a 2025 game, tagged with one league each.
team_league_map = pd.concat([
    df_season[['TeamID', 'League']].drop_duplicates(),
    df_season[['OppTeamID', 'League']].drop_duplicates().rename(columns={'OppTeamID': 'TeamID'}),
]).drop_duplicates(subset='TeamID')

# One matchup grid per league, so teams are only paired within their own league.
season_game_matrices = [
    _league_pair_matrix(
        df_season[df_season['League'] == league],
        team_league_map.loc[team_league_map['League'] == league, 'TeamID'].unique(),
        season,
        league,
        target_parameters,
    )
    for league in team_league_map['League'].unique()
]

# Stack the per-league grids into a single frame.
complete_game_matrix = pd.concat(season_game_matrices, ignore_index=True)

# Quick look at the first rows.
print("Complete Game Matrix for Season 2025:")
print(complete_game_matrix.head())



Complete Game Matrix for Season 2025:
   Season  TeamID  OppTeamID  FG_pct  FG3_pct  FT_pct  OR  DR  Ast  TO  Stl  \
0    2025    1104       1112     NaN      NaN     NaN NaN NaN  NaN NaN  NaN   
1    2025    1104       1107     NaN      NaN     NaN NaN NaN  NaN NaN  NaN   
2    2025    1104       1130     NaN      NaN     NaN NaN NaN  NaN NaN  NaN   
3    2025    1104       1133     NaN      NaN     NaN NaN NaN  NaN NaN  NaN   
4    2025    1104       1137     NaN      NaN     NaN NaN NaN  NaN NaN  NaN   

   Blk  PF League  
0  NaN NaN      M  
1  NaN NaN      M  
2  NaN NaN      M  
3  NaN NaN      M  
4  NaN NaN      M  


### Matrix Completion Model

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Training data preparation.
# Relies on complete_game_matrix from the previous cell, i.e. the columns
# ['Season', 'TeamID', 'OppTeamID'] plus one column per target parameter.
target_parameters = ["FG_pct", "FG3_pct", "FT_pct", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]

# Keep only the matchups for which every target parameter was observed.
fully_observed = complete_game_matrix[target_parameters].notna().all(axis=1)
train_matrix = complete_game_matrix.loc[fully_observed].reset_index(drop=True)
print("Training samples:", len(train_matrix))

class GameParameterDataset(Dataset):
    """Torch dataset yielding (team id, opponent id, target vector) per row of a frame.

    Args:
        df: frame with 'TeamID' and 'OppTeamID' columns plus the target columns.
        target_cols: names of the columns stacked into the float32 target vector.
    """

    def __init__(self, df, target_cols):
        def id_tensor(column):
            # Ids are stored as int64 so they can index embedding tables.
            return torch.tensor(df[column].values, dtype=torch.long)

        self.team_ids = id_tensor('TeamID')
        self.opp_ids = id_tensor('OppTeamID')
        self.targets = torch.tensor(df[target_cols].values, dtype=torch.float32)

    def __len__(self):
        return self.team_ids.shape[0]

    def __getitem__(self, idx):
        sample = (self.team_ids[idx], self.opp_ids[idx], self.targets[idx])
        return sample

train_dataset = GameParameterDataset(df=train_matrix, target_cols=target_parameters)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Size the embedding tables by the largest team id seen on either side of a
# training row (+1 because raw ids are used directly as row indices).
all_teams = np.union1d(train_matrix['TeamID'], train_matrix['OppTeamID'])
num_teams = int(all_teams.max()) + 1
latent_dim = 32  # width of each team's latent vector
num_targets = len(target_parameters)

# Matrix completion model
class GameParameterCompletionModel(nn.Module):
    """Predict a vector of game parameters from a (team id, opponent id) pair.

    Each id is looked up in a latent table and in a second "bias" table of the
    same width. Four feature blocks of width latent_dim are concatenated and fed
    to a one-hidden-layer MLP:
      * element-wise product of the two latent vectors
      * absolute element-wise difference of the two latent vectors
      * team latent vector plus team bias vector
      * opponent latent vector plus opponent bias vector

    Args:
        num_teams: number of rows in every embedding table (ids index rows directly).
        latent_dim: width of each embedding row.
        num_targets: size of the output vector.
    """

    def __init__(self, num_teams, latent_dim, num_targets):
        super().__init__()
        # Latent tables, kept separate for the "team" and "opponent" roles.
        self.team_embedding = nn.Embedding(num_teams, latent_dim)
        self.opp_embedding = nn.Embedding(num_teams, latent_dim)
        # Bias tables (full latent_dim-wide vectors, not scalars).
        self.team_bias = nn.Embedding(num_teams, latent_dim)
        self.opp_bias = nn.Embedding(num_teams, latent_dim)

        hidden_dim = 64
        self.mlp = nn.Sequential(
            nn.Linear(4 * latent_dim, hidden_dim),  # four concatenated feature blocks
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(hidden_dim, num_targets),
        )

        # Latent tables start as small Gaussian noise; bias tables start at zero.
        for table in (self.team_embedding, self.opp_embedding):
            nn.init.normal_(table.weight, std=0.01)
        for table in (self.team_bias, self.opp_bias):
            nn.init.zeros_(table.weight)

    def forward(self, team_ids, opp_ids):
        """Return a (batch, num_targets) tensor of predictions for the given id pairs."""
        team_vec = self.team_embedding(team_ids)
        opp_vec = self.opp_embedding(opp_ids)

        features = torch.cat(
            (
                team_vec * opp_vec,
                (team_vec - opp_vec).abs(),
                team_vec + self.team_bias(team_ids),
                opp_vec + self.opp_bias(opp_ids),
            ),
            dim=1,
        )
        return self.mlp(features)

# Seed torch's global generator before any random draw in this cell so the
# weight initialisation, dropout masks and DataLoader shuffle order repeat
# from run to run.
torch.manual_seed(42)

model = GameParameterCompletionModel(num_teams=num_teams, latent_dim=latent_dim, num_targets=num_targets)

# Train the Matrix Completion Model
# NOTE(review): MSELoss averages the squared error over all targets on their raw
# scales, so count-valued targets (e.g. DR, Ast) weigh far more than the 0-1
# shooting percentages. Consider standardising targets if the percentages matter.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 20

for epoch in range(num_epochs):
    model.train()  # enable dropout
    epoch_loss = 0.0
    for team_batch, opp_batch, target_batch in train_loader:
        optimizer.zero_grad()
        preds = model(team_batch, opp_batch)
        loss = criterion(preds, target_batch)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by batch size so the epoch figure is a per-sample mean.
        epoch_loss += loss.item() * team_batch.size(0)
    epoch_loss /= len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Predict parameters for every matchup in the grid, observed or not.
model.eval()  # disable dropout for inference
all_indices = torch.tensor(complete_game_matrix[['TeamID', 'OppTeamID']].values, dtype=torch.long)
all_team_ids, all_opp_ids = all_indices[:, 0], all_indices[:, 1]

batch_size = 512
with torch.no_grad():
    predictions = np.concatenate(
        [
            model(
                all_team_ids[start:start + batch_size],
                all_opp_ids[start:start + batch_size],
            ).cpu().numpy()
            for start in range(0, len(all_team_ids), batch_size)
        ],
        axis=0,
    )

# Predictions as a frame, one pred_<name> column per target parameter.
pred_df = pd.DataFrame(predictions, columns=[f"pred_{col}" for col in target_parameters])

# Observed values are kept as they are; only the NaN cells take the model's prediction.
completed_matrix = complete_game_matrix.reset_index(drop=True)
for col in target_parameters:
    completed_matrix[col] = completed_matrix[col].fillna(pred_df[f"pred_{col}"])

print("Completed game parameter matrix (sample):")
print(completed_matrix.head())


Training samples: 8000
Epoch 1/20, Loss: 77.0908
Epoch 2/20, Loss: 11.9872
Epoch 3/20, Loss: 11.4753
Epoch 4/20, Loss: 11.1584
Epoch 5/20, Loss: 11.0521
Epoch 6/20, Loss: 10.8404
Epoch 7/20, Loss: 10.7700
Epoch 8/20, Loss: 10.6320
Epoch 9/20, Loss: 10.4593
Epoch 10/20, Loss: 10.3124
Epoch 11/20, Loss: 10.1790
Epoch 12/20, Loss: 9.9714
Epoch 13/20, Loss: 9.8210
Epoch 14/20, Loss: 9.4720
Epoch 15/20, Loss: 9.0256
Epoch 16/20, Loss: 8.7339
Epoch 17/20, Loss: 8.5606
Epoch 18/20, Loss: 8.3646
Epoch 19/20, Loss: 8.2303
Epoch 20/20, Loss: 8.0716
Completed game parameter matrix (sample):
   Season  TeamID  OppTeamID    FG_pct   FG3_pct    FT_pct         OR  \
0    2025    1104       1112  0.471806  0.361607  0.700068   9.080230   
1    2025    1104       1107  0.470908  0.360912  0.699365   9.010074   
2    2025    1104       1130  0.457620  0.355791  0.706522   9.348111   
3    2025    1104       1133  0.491897  0.376854  0.713725  10.342007   
4    2025    1104       1137  0.483941  0.371917

## Neural Network Model

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

target_parameters = ["FG_pct", "FG3_pct", "FT_pct", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]

# Mean Outcome per (Season, TeamID, OppTeamID); pairs that met more than once
# can therefore carry a fractional label.
matchup_keys = ['Season', 'TeamID', 'OppTeamID']
observed_outcomes = df_cf.groupby(matchup_keys)['Outcome'].mean().reset_index()

# Attach the labels to the completed matrix; matchups never played keep Outcome = NaN.
completed_matrix_with_outcome = completed_matrix.merge(
    observed_outcomes,
    how='left',
    on=matchup_keys,
)
print("Completed matrix with outcome labels (sample):")
print(completed_matrix_with_outcome.head())

# Inputs of the outcome network are exactly the completed game parameters.
feature_cols = target_parameters

# Training rows: Season 2025 matchups that have an observed outcome.
is_labelled_2025 = (
    (completed_matrix_with_outcome["Season"] == 2025)
    & completed_matrix_with_outcome["Outcome"].notna()
)
train_df = completed_matrix_with_outcome.loc[is_labelled_2025].copy()
print("Training samples (2025 only):", len(train_df))

# Torch dataset for the outcome model
class OutcomeDataset(Dataset):
    """Yield (feature vector, label) pairs from a frame, both as float32 tensors.

    Args:
        df: source frame.
        feature_cols: columns stacked into the feature vector.
        outcome_col: column holding the label (default "Outcome").
    """

    def __init__(self, df, feature_cols, outcome_col="Outcome"):
        features = df[feature_cols].values
        labels = df[outcome_col].values
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

# Rebinds train_dataset / train_loader (previously the matrix-completion data) to the outcome data.
train_dataset = OutcomeDataset(df=train_df, feature_cols=feature_cols)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Define a simple feedforward neural network for binary outcome prediction
class OutcomePredictionNN(nn.Module):
    """Feedforward network mapping a feature vector to a win probability.

    Every hidden layer is Linear -> ReLU -> Dropout(0.2); a final Linear layer
    produces a single logit that is passed through a sigmoid.

    Args:
        input_dim: number of input features.
        hidden_dims: iterable of hidden layer widths, applied in order.
            The default is a tuple so no mutable object is shared between instances.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32)):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers += [nn.Linear(prev_dim, h_dim), nn.ReLU(), nn.Dropout(0.2)]
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1))  # final logit
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return probabilities of shape (batch,) for input of shape (batch, input_dim)."""
        logits = self.net(x)
        prob = torch.sigmoid(logits)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # remove the batch axis of a one-row batch, leaving a 0-d tensor that no
        # longer matches a (1,)-shaped target in the loss.
        return prob.squeeze(-1)

# Seed torch's global generator before any random draw in this cell so the
# weight initialisation, dropout masks and DataLoader shuffle order repeat
# from run to run.
torch.manual_seed(42)

input_dim = len(feature_cols)
outcome_model = OutcomePredictionNN(input_dim=input_dim)

# The network already outputs probabilities, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(outcome_model.parameters(), lr=0.001)

# Trains the Outcome Prediction Model on 2025 Data
num_epochs = 20
for epoch in range(num_epochs):
    outcome_model.train()  # enable dropout
    epoch_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        preds = outcome_model(batch_X)
        loss = criterion(preds, batch_y)
        loss.backward()
        optimizer.step()
        # Weight the batch mean by batch size so the epoch figure is a per-sample mean.
        epoch_loss += loss.item() * batch_X.size(0)
    epoch_loss /= len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Score every 2025 matchup with the trained outcome model.
# Inputs are the rows of completed_matrix (the matrix completion output) for Season 2025.
prediction_df = completed_matrix[completed_matrix["Season"] == 2025].copy()
print("Total matchups for 2025 before filtering:", len(prediction_df))

# Keep only rows listed with the lower id first (which also rules out
# self-matchups), then keep one row per matchup.
lower_id_first = prediction_df["TeamID"] < prediction_df["OppTeamID"]
prediction_df = (
    prediction_df
    .loc[lower_id_first]
    .drop_duplicates(subset=["Season", "TeamID", "OppTeamID"])
)
print("Number of unique matchups for 2025 after filtering:", len(prediction_df))


all_features = torch.tensor(prediction_df[feature_cols].values, dtype=torch.float32)
outcome_model.eval()  # disable dropout for inference
with torch.no_grad():
    all_preds = outcome_model(all_features).cpu().numpy()

prediction_df["Predicted_Outcome"] = all_preds


# Formats Submission File for 2025
def create_submission_row(row):
    """Turn one prediction row into a submission record.

    Args:
        row: mapping/Series with 'Season', 'TeamID', 'OppTeamID' and
            'Predicted_Outcome' entries.

    Returns:
        pd.Series with "ID" ("<season>_<team>_<opp>", team ids left-padded with
        zeros to four characters) and "Pred" (the predicted probability).
    """
    # Row-wise DataFrame.apply hands over a single-dtype Series, so in an
    # all-numeric frame the ids arrive as floats; cast back to int so the ID
    # never reads like "2025.0_1104.0_1112.0".
    season = int(row['Season'])
    team1 = int(row['TeamID'])
    team2 = int(row['OppTeamID'])
    prob = row["Predicted_Outcome"]
    submission_id = f"{season}_{str(team1).zfill(4)}_{str(team2).zfill(4)}"
    return pd.Series({"ID": submission_id, "Pred": prob})

# One submission record per matchup, de-duplicated on ID and sorted by ID.
submission_rows = prediction_df.apply(create_submission_row, axis=1)
submission_df = (
    submission_rows
    .drop_duplicates(subset="ID")
    .sort_values("ID")
    .reset_index(drop=True)
)

# Write the submission file into the current working directory.
submission_df.to_csv("matrix_completion_2025_v2.csv", index=False)
print("Submission file for 2025 created with", submission_df.shape[0], "rows.")
print(submission_df.head())



Completed matrix with outcome labels (sample):
   Season  TeamID  OppTeamID    FG_pct   FG3_pct    FT_pct         OR  \
0    2025    1104       1112  0.471806  0.361607  0.700068   9.080230   
1    2025    1104       1107  0.470908  0.360912  0.699365   9.010074   
2    2025    1104       1130  0.457620  0.355791  0.706522   9.348111   
3    2025    1104       1133  0.491897  0.376854  0.713725  10.342007   
4    2025    1104       1137  0.483941  0.371917  0.714909  10.170638   

          DR        Ast         TO       Stl       Blk         PF League  \
0  25.311821  15.665292  10.672794  6.811394  4.042196  15.615582      M   
1  25.134733  15.561280  10.588608  6.761246  4.016811  15.499048      M   
2  25.103640  14.998397  14.036436  7.420934  3.971437  17.762068      M   
3  28.580099  17.791071  12.125739  7.750404  4.576554  17.567400      M   
4  27.810369  17.175924  13.059697  7.784810  4.457716  17.905924      M   

   Outcome  
0      NaN  
1      NaN  
2      NaN  
3    