Data processing: Collecting features for training XGBoost model. Features trained on: 

Team Performance:

    Point differential
    Win percentage
    Offensive/defensive efficiency
    Effective field goal % (eFG%)
    True shooting % (TS%)
    Offensive/defensive rebounding %
    Turnover ratio %
    Free throw rate (FTR)
    Vegas odds
    Strength of Schedule
    Coach Experience
    3p%
    Conference strength

Game Context:

    SeasonId
    Regular season vs. tournament
    Tournament round/seed (for tournament games)

In [1]:
# make needed installs

In [2]:
import pandas as pd
import os

In [3]:
data_dir = "../data"

# Load men's data
m_teams = pd.read_csv(os.path.join(data_dir, "MTeams.csv"))
m_team_conferences = pd.read_csv(os.path.join(data_dir, "MTeamConferences.csv"))
m_coaches = pd.read_csv(os.path.join(data_dir, "MTeamCoaches.csv"))
m_regular_season = pd.read_csv(os.path.join(data_dir, "MRegularSeasonDetailedResults.csv"))
m_tourney_results = pd.read_csv(os.path.join(data_dir, "MNCAATourneyDetailedResults.csv"))
m_tourney_seeds = pd.read_csv(os.path.join(data_dir, "MNCAATourneySeeds.csv"))

# Load women's data (without coaches file)
w_teams = pd.read_csv(os.path.join(data_dir, "WTeams.csv"))
w_team_conferences = pd.read_csv(os.path.join(data_dir, "WTeamConferences.csv"))
w_regular_season = pd.read_csv(os.path.join(data_dir, "WRegularSeasonDetailedResults.csv"))
w_tourney_results = pd.read_csv(os.path.join(data_dir, "WNCAATourneyDetailedResults.csv"))
w_tourney_seeds = pd.read_csv(os.path.join(data_dir, "WNCAATourneySeeds.csv"))

# Load shared data
conferences = pd.read_csv(os.path.join(data_dir, "Conferences.csv"))
seasons = pd.read_csv(os.path.join(data_dir, "MSeasons.csv"))  # M and W seasons should be the same

# Print summary
print(f"Loaded {len(m_teams)} men's teams and {len(w_teams)} women's teams")
print(f"Men's regular season games: {len(m_regular_season)}")
print(f"Women's regular season games: {len(w_regular_season)}")
print(f"Men's tournament games: {len(m_tourney_results)}")
print(f"Women's tournament games: {len(w_tourney_results)}")

Loaded 380 men's teams and 378 women's teams
Men's regular season games: 118449
Women's regular season games: 81308
Men's tournament games: 1382
Women's tournament games: 894


In [4]:
def calculate_team_stats(games_df, season, gender='M'):
    """
    Calculate season statistics for each team
    
    Parameters:
    -----------
    games_df : DataFrame
        DataFrame containing game results with detailed statistics
    season : int
        The season to calculate stats for
    gender : str
        'M' for men's games, 'W' for women's games
    
    Returns:
    --------
    DataFrame containing team statistics and advanced metrics
    """
    season_games = games_df[games_df['Season'] == season].copy()
    
    teams_stats = {}
    
    # Process each game
    for _, game in season_games.iterrows():
        w_team = game['WTeamID']
        l_team = game['LTeamID']
        
        # Initialize team stats if not already present
        for team_id in [w_team, l_team]:
            if team_id not in teams_stats:
                teams_stats[team_id] = {
                    'TeamID': team_id,
                    'Season': season,
                    'Gender': gender,
                    'Wins': 0,
                    'Losses': 0,
                    'ScoredPoints': 0,
                    'AllowedPoints': 0,
                    'Games': 0,
                    'FGM': 0,
                    'FGA': 0,
                    'FGM3': 0,
                    'FGA3': 0,
                    'FTM': 0,
                    'FTA': 0,
                    'OR': 0,
                    'DR': 0,
                    'Ast': 0,
                    'TO': 0,
                    'Stl': 0,
                    'Blk': 0,
                    'PF': 0,
                    'OppFGM': 0,
                    'OppFGA': 0,
                    'OppFGM3': 0,
                    'OppFGA3': 0,
                    'OppFTM': 0,
                    'OppFTA': 0,
                    'OppOR': 0,
                    'OppDR': 0,
                    'OppAst': 0,
                    'OppTO': 0,
                    'OppStl': 0,
                    'OppBlk': 0,
                    'OppPF': 0
                }
        
        # Update winner stats
        teams_stats[w_team]['Wins'] += 1
        teams_stats[w_team]['ScoredPoints'] += game['WScore']
        teams_stats[w_team]['AllowedPoints'] += game['LScore']
        teams_stats[w_team]['Games'] += 1
        
        # Add detailed stats for winner
        teams_stats[w_team]['FGM'] += game['WFGM']
        teams_stats[w_team]['FGA'] += game['WFGA']
        teams_stats[w_team]['FGM3'] += game['WFGM3']
        teams_stats[w_team]['FGA3'] += game['WFGA3']
        teams_stats[w_team]['FTM'] += game['WFTM']
        teams_stats[w_team]['FTA'] += game['WFTA']
        teams_stats[w_team]['OR'] += game['WOR']
        teams_stats[w_team]['DR'] += game['WDR']
        teams_stats[w_team]['Ast'] += game['WAst']
        teams_stats[w_team]['TO'] += game['WTO']
        teams_stats[w_team]['Stl'] += game['WStl']
        teams_stats[w_team]['Blk'] += game['WBlk']
        teams_stats[w_team]['PF'] += game['WPF']
        
        # Add opponent stats for winner
        teams_stats[w_team]['OppFGM'] += game['LFGM']
        teams_stats[w_team]['OppFGA'] += game['LFGA']
        teams_stats[w_team]['OppFGM3'] += game['LFGM3']
        teams_stats[w_team]['OppFGA3'] += game['LFGA3']
        teams_stats[w_team]['OppFTM'] += game['LFTM']
        teams_stats[w_team]['OppFTA'] += game['LFTA']
        teams_stats[w_team]['OppOR'] += game['LOR']
        teams_stats[w_team]['OppDR'] += game['LDR']
        teams_stats[w_team]['OppAst'] += game['LAst']
        teams_stats[w_team]['OppTO'] += game['LTO']
        teams_stats[w_team]['OppStl'] += game['LStl']
        teams_stats[w_team]['OppBlk'] += game['LBlk']
        teams_stats[w_team]['OppPF'] += game['LPF']
        
        # Update loser stats
        teams_stats[l_team]['Losses'] += 1
        teams_stats[l_team]['ScoredPoints'] += game['LScore']
        teams_stats[l_team]['AllowedPoints'] += game['WScore']
        teams_stats[l_team]['Games'] += 1
        
        # Add detailed stats for loser
        teams_stats[l_team]['FGM'] += game['LFGM']
        teams_stats[l_team]['FGA'] += game['LFGA']
        teams_stats[l_team]['FGM3'] += game['LFGM3']
        teams_stats[l_team]['FGA3'] += game['LFGA3']
        teams_stats[l_team]['FTM'] += game['LFTM']
        teams_stats[l_team]['FTA'] += game['LFTA']
        teams_stats[l_team]['OR'] += game['LOR']
        teams_stats[l_team]['DR'] += game['LDR']
        teams_stats[l_team]['Ast'] += game['LAst']
        teams_stats[l_team]['TO'] += game['LTO']
        teams_stats[l_team]['Stl'] += game['LStl']
        teams_stats[l_team]['Blk'] += game['LBlk']
        teams_stats[l_team]['PF'] += game['LPF']
        
        # Add opponent stats for loser
        teams_stats[l_team]['OppFGM'] += game['WFGM']
        teams_stats[l_team]['OppFGA'] += game['WFGA']
        teams_stats[l_team]['OppFGM3'] += game['WFGM3']
        teams_stats[l_team]['OppFGA3'] += game['WFGA3']
        teams_stats[l_team]['OppFTM'] += game['WFTM']
        teams_stats[l_team]['OppFTA'] += game['WFTA']
        teams_stats[l_team]['OppOR'] += game['WOR']
        teams_stats[l_team]['OppDR'] += game['WDR']
        teams_stats[l_team]['OppAst'] += game['WAst']
        teams_stats[l_team]['OppTO'] += game['WTO']
        teams_stats[l_team]['OppStl'] += game['WStl']
        teams_stats[l_team]['OppBlk'] += game['WBlk']
        teams_stats[l_team]['OppPF'] += game['WPF']
    
    # Convert to DataFrame
    stats_df = pd.DataFrame(list(teams_stats.values()))
    
    # Calculate advanced metrics
    if not stats_df.empty:
        # Win percentage
        stats_df['WinPct'] = stats_df['Wins'] / stats_df['Games']
        
        # Point differential
        stats_df['PointDiff'] = (stats_df['ScoredPoints'] - stats_df['AllowedPoints']) / stats_df['Games']
        
        # Effective Field Goal Percentage
        stats_df['EFG'] = (stats_df['FGM'] + 0.5 * stats_df['FGM3']) / stats_df['FGA']
        stats_df['OppEFG'] = (stats_df['OppFGM'] + 0.5 * stats_df['OppFGM3']) / stats_df['OppFGA']
        
        # True Shooting Percentage
        stats_df['TS'] = stats_df['ScoredPoints'] / (2 * (stats_df['FGA'] + 0.44 * stats_df['FTA']))
        
        # Offensive and Defensive Rebounding Percentages
        stats_df['ORebPct'] = stats_df['OR'] / (stats_df['OR'] + stats_df['OppDR'])
        stats_df['DRebPct'] = stats_df['DR'] / (stats_df['DR'] + stats_df['OppOR'])
        
        # Turnover Ratio
        stats_df['TORatio'] = stats_df['TO'] / (stats_df['FGA'] + 0.44 * stats_df['FTA'] + stats_df['TO'])
        stats_df['OppTORatio'] = stats_df['OppTO'] / (stats_df['OppFGA'] + 0.44 * stats_df['OppFTA'] + stats_df['OppTO'])
        
        # Free Throw Rate
        stats_df['FTR'] = stats_df['FTA'] / stats_df['FGA']
        
        # Three Point Rate
        stats_df['ThreePAR'] = stats_df['FGA3'] / stats_df['FGA']
        
        # Three Point Percentage
        stats_df['ThreePPct'] = stats_df['FGM3'] / stats_df['FGA3']
        
        # Offensive and Defensive Efficiency (points per 100 possessions)
        # Approximate possessions formula
        stats_df['Poss'] = stats_df['FGA'] - stats_df['OR'] + stats_df['TO'] + 0.44 * stats_df['FTA']
        stats_df['OppPoss'] = stats_df['OppFGA'] - stats_df['OppOR'] + stats_df['OppTO'] + 0.44 * stats_df['OppFTA']
        
        stats_df['OffEff'] = 100 * stats_df['ScoredPoints'] / stats_df['Poss']
        stats_df['DefEff'] = 100 * stats_df['AllowedPoints'] / stats_df['OppPoss']
        stats_df['NetEff'] = stats_df['OffEff'] - stats_df['DefEff']
    
    return stats_df

In [5]:
# Calculate conference strength for a given season
def calculate_conference_strength(team_stats_df, team_conferences_df, season):
    """Calculate conference strength metrics for a given season"""
    # Get team conferences for the season
    season_conferences = team_conferences_df[team_conferences_df['Season'] == season]
    
    # Merge team stats with conferences
    merged = pd.merge(
        team_stats_df, 
        season_conferences, 
        on=['TeamID', 'Season'], 
        how='inner'
    )
    
    # Group by conference and calculate metrics
    conf_strength = merged.groupby('ConfAbbrev').agg({
        'WinPct': 'mean',
        'PointDiff': 'mean',
        'OffEff': 'mean',
        'DefEff': 'mean',
        'NetEff': 'mean',
        'TeamID': 'count'  # Number of teams in conference
    }).reset_index()
    
    conf_strength.rename(columns={'TeamID': 'TeamCount'}, inplace=True)
    
    return conf_strength

In [6]:
def calculate_strength_of_schedule(games_df, team_stats_df, season):
    """Calculate strength of schedule for each team in a season"""
    season_games = games_df[games_df['Season'] == season].copy()
    teams_sos = {}
    
    # Get win percentages
    win_pct = team_stats_df[['TeamID', 'WinPct']].set_index('TeamID')['WinPct'].to_dict()
    
    # Process each game
    for _, game in season_games.iterrows():
        w_team = game['WTeamID']
        l_team = game['LTeamID']
        
        # Initialize teams if not already present
        for team_id in [w_team, l_team]:
            if team_id not in teams_sos:
                teams_sos[team_id] = {
                    'TeamID': team_id,
                    'Season': season,
                    'OpponentWinPct': [],
                    'Games': 0
                }
        
        # Update SOS for winner (opponent = loser)
        if l_team in win_pct:
            teams_sos[w_team]['OpponentWinPct'].append(win_pct[l_team])
            teams_sos[w_team]['Games'] += 1
        
        # Update SOS for loser (opponent = winner)
        if w_team in win_pct:
            teams_sos[l_team]['OpponentWinPct'].append(win_pct[w_team])
            teams_sos[l_team]['Games'] += 1
    
    # Calculate average opponent win percentage (SOS)
    sos_list = []
    for team_id, data in teams_sos.items():
        if data['Games'] > 0:
            avg_opp_win_pct = sum(data['OpponentWinPct']) / data['Games']
            sos_list.append({
                'TeamID': team_id,
                'Season': season,
                'SOS': avg_opp_win_pct,
                'Games': data['Games']
            })
    
    return pd.DataFrame(sos_list)


In [7]:
# Calculate coach experience
def calculate_coach_experience(coaches_df, tourney_results_df, season):
    """Calculate tournament experience for coaches up to the given season"""
    # Filter tournament results up to this season
    past_tourney = tourney_results_df[tourney_results_df['Season'] < season].copy()
    
    # Count wins for each team
    w_counts = past_tourney.groupby('WTeamID').size().reset_index(name='TourneyWins')
    
    # Count appearances (unique team-season combinations)
    all_teams = pd.concat([
        past_tourney[['Season', 'WTeamID']].rename(columns={'WTeamID': 'TeamID'}),
        past_tourney[['Season', 'LTeamID']].rename(columns={'LTeamID': 'TeamID'})
    ])
    
    appearances = all_teams.drop_duplicates().groupby('TeamID').size().reset_index(name='TourneyAppearances')
    
    # Get current coaches
    current_coaches = coaches_df[coaches_df['Season'] == season].copy()
    
    # Merge tournament experience with coaches
    coach_exp = pd.merge(current_coaches, w_counts, left_on='TeamID', right_on='WTeamID', how='left')
    coach_exp = pd.merge(coach_exp, appearances, on='TeamID', how='left')
    
    # Fill NaN values with 0 (coaches/teams with no tournament experience)
    coach_exp['TourneyWins'] = coach_exp['TourneyWins'].fillna(0)
    coach_exp['TourneyAppearances'] = coach_exp['TourneyAppearances'].fillna(0)
    
    return coach_exp[['TeamID', 'Season', 'CoachName', 'TourneyWins', 'TourneyAppearances']]

In [8]:
# Function to prepare features for a matchup
def create_matchup_features(team1_id, team2_id, team_stats, tourney=False, round_num=None):
    """Create features for a matchup between two teams"""
    team1 = team_stats[team_stats['TeamID'] == team1_id].iloc[0]
    team2 = team_stats[team_stats['TeamID'] == team2_id].iloc[0]
    
    features = {
        # Basic stats differentials
        'WinPct_diff': team1['WinPct'] - team2['WinPct'],
        'PointDiff_diff': team1['PointDiff'] - team2['PointDiff'],
        
        # Shooting differentials
        'EFG_diff': team1['EFG'] - team2['EFG'],
        'TS_diff': team1['TS'] - team2['TS'],
        'ThreePPct_diff': team1['ThreePPct'] - team2['ThreePPct'],
        
        # Rebounding differentials
        'ORebPct_diff': team1['ORebPct'] - team2['ORebPct'],
        'DRebPct_diff': team1['DRebPct'] - team2['DRebPct'],
        
        # Efficiency differentials
        'OffEff_diff': team1['OffEff'] - team2['OffEff'],
        'DefEff_diff': team1['DefEff'] - team2['DefEff'],
        'NetEff_diff': team1['NetEff'] - team2['NetEff'],
        
        # Turnover differentials
        'TORatio_diff': team1['TORatio'] - team2['TORatio'],
        
        # Free throw differentials
        'FTR_diff': team1['FTR'] - team2['FTR'],
        
        # Context features
        'IsTournament': 1 if tourney else 0,
    }
    
    # Add round number if tournament game
    if tourney and round_num is not None:
        features['TournamentRound'] = round_num
    
    # Add SOS if available
    if 'SOS' in team1 and 'SOS' in team2:
        features['SOS_diff'] = team1['SOS'] - team2['SOS']
    
    # Add conference strength if available
    if 'ConfStrength' in team1 and 'ConfStrength' in team2:
        features['ConfStrength_diff'] = team1['ConfStrength'] - team2['ConfStrength']
    
    # Add coach experience if available
    if 'CoachTourneyWins' in team1 and 'CoachTourneyWins' in team2:
        features['CoachExp_diff'] = team1['CoachTourneyWins'] - team2['CoachTourneyWins']
    
    return features

In [9]:
# Function to prepare dataset for a specific season
def prepare_season_dataset(reg_season_df, tourney_df, tourney_seeds_df, season, include_tourney=True):
    """Prepare training dataset for a given season"""
    # Calculate team statistics
    team_stats = calculate_team_stats(reg_season_df, season)
    
    # Calculate strength of schedule
    sos = calculate_strength_of_schedule(reg_season_df, team_stats, season)
    team_stats = pd.merge(team_stats, sos, on=['TeamID', 'Season'], how='left')
    
    # Calculate conference strength
    conf_strength = calculate_conference_strength(team_stats, team_conferences, season)
    
    # Add conference strength to team stats
    season_confs = team_conferences[team_conferences['Season'] == season]
    team_stats = pd.merge(
        team_stats, 
        season_confs[['TeamID', 'ConfAbbrev']], 
        on='TeamID', 
        how='left'
    )
    team_stats = pd.merge(
        team_stats,
        conf_strength[['ConfAbbrev', 'NetEff']],
        on='ConfAbbrev',
        how='left'
    )
    team_stats.rename(columns={'NetEff_y': 'ConfStrength', 'NetEff_x': 'NetEff'}, inplace=True)
    
    # Add coach experience
    coach_exp = calculate_coach_experience(coaches, tourney_results, season)
    team_stats = pd.merge(team_stats, coach_exp[['TeamID', 'TourneyWins']], on='TeamID', how='left')
    team_stats.rename(columns={'TourneyWins': 'CoachTourneyWins'}, inplace=True)
    team_stats['CoachTourneyWins'] = team_stats['CoachTourneyWins'].fillna(0)
    
    # Prepare features for regular season games
    regular_features = []
    season_games = reg_season_df[reg_season_df['Season'] == season]
    
    for _, game in season_games.iterrows():
        # Create matchup features for both ways (for training)
        w_features = create_matchup_features(game['WTeamID'], game['LTeamID'], team_stats)
        l_features = create_matchup_features(game['LTeamID'], game['WTeamID'], team_stats)
        
        # Add result (1 for win, 0 for loss)
        w_features['Result'] = 1
        l_features['Result'] = 0
        
        # Add season and teams
        for features in [w_features, l_features]:
            features['Season'] = season
            features['Team1ID'] = game['WTeamID'] if features['Result'] == 1 else game['LTeamID']
            features['Team2ID'] = game['LTeamID'] if features['Result'] == 1 else game['WTeamID']
        
        regular_features.append(w_features)
        regular_features.append(l_features)
    
    # Convert to DataFrame
    regular_df = pd.DataFrame(regular_features)
    
    # Add tournament games if requested
    if include_tourney and not tourney_df[tourney_df['Season'] == season].empty:
        tourney_features = []
        season_tourney = tourney_df[tourney_df['Season'] == season]
        
        # Extract round information from seeds dataframe
        # For simplicity, we'll use a placeholder approach
        # In practice, would need to calculate rounds based on seed paths
        round_map = {1: 64, 2: 32, 3: 16, 4: 8, 5: 4, 6: 2}
        
        for _, game in season_tourney.iterrows():
            # Determine round based on day of tournament
            # This is an approximation
            round_num = max(1, min(6, game['DayNum'] - min(season_tourney['DayNum']) + 1))
            
            # Create matchup features for both ways (for training)
            w_features = create_matchup_features(game['WTeamID'], game['LTeamID'], team_stats, True, round_num)
            l_features = create_matchup_features(game['LTeamID'], game['WTeamID'], team_stats, True, round_num)
            
            # Add result (1 for win, 0 for loss)
            w_features['Result'] = 1
            l_features['Result'] = 0
            
            # Add season and teams
            for features in [w_features, l_features]:
                features['Season'] = season
                features['Team1ID'] = game['WTeamID'] if features['Result'] == 1 else game['LTeamID']
                features['Team2ID'] = game['LTeamID'] if features['Result'] == 1 else game['WTeamID']
            
            tourney_features.append(w_features)
            tourney_features.append(l_features)
        
        # Convert to DataFrame and combine with regular season
        if tourney_features:
            tourney_df = pd.DataFrame(tourney_features)
            return pd.concat([regular_df, tourney_df], ignore_index=True)
    
    return regular_df

In [10]:
def process_season(season, reg_season_df, tourney_df, coaches_df, conf_df):
    """Process a season and return team features"""
    print(f"Processing season {season}...")
    
    # Filter data for this season
    season_games = reg_season_df[reg_season_df['Season'] == season]
    
    # Calculate team statistics
    team_stats = calculate_team_stats(season_games, season)
    
    # Calculate strength of schedule - ADDED THIS
    sos = calculate_strength_of_schedule(season_games, team_stats, season)
    team_stats = pd.merge(team_stats, sos, on=['TeamID', 'Season'], how='left')
    
    # Calculate conference strength
    conf_strength = calculate_conference_strength(team_stats, conf_df, season)
    
    # Add conference strength to team stats
    season_confs = conf_df[conf_df['Season'] == season]
    team_stats = pd.merge(
        team_stats,
        season_confs[['TeamID', 'ConfAbbrev']],
        on='TeamID',
        how='left'
    )
    team_stats = pd.merge(
        team_stats,
        conf_strength[['ConfAbbrev', 'NetEff']],
        on='ConfAbbrev',
        how='left'
    )
    team_stats.rename(columns={'NetEff_y': 'ConfStrength', 'NetEff_x': 'NetEff'}, inplace=True)
    
    # Calculate coach experience
    coach_exp = calculate_coach_experience(coaches_df, tourney_df, season)
    
    # Add coach experience to team stats
    team_stats = pd.merge(
        team_stats,
        coach_exp[['TeamID', 'TourneyWins', 'TourneyAppearances']],
        on='TeamID',
        how='left'
    )
    
    # Fill NA values
    team_stats['TourneyWins'] = team_stats['TourneyWins'].fillna(0)
    team_stats['TourneyAppearances'] = team_stats['TourneyAppearances'].fillna(0)
    team_stats['SOS'] = team_stats['SOS'].fillna(team_stats['SOS'].mean())  # Added this
    
    return team_stats

In [11]:
# Build training dataset across multiple seasons
def build_training_dataset(reg_season_df, tourney_df, tourney_seeds_df, start_season=2003, end_season=2023):
    """Build training dataset across multiple seasons"""
    all_data = []
    
    for season in range(start_season, end_season + 1):
        print(f"Processing season {season}...")
        season_data = prepare_season_dataset(reg_season_df, tourney_df, tourney_seeds_df, season)
        all_data.append(season_data)
    
    return pd.concat(all_data, ignore_index=True)

In [12]:
# Process multiple seasons
def collect_features(start_season=2003, end_season=2025):
    """Collect features for multiple seasons"""
    all_seasons = []
    
    for season in range(start_season, end_season + 1):
        season_stats = process_season(
            season,
            regular_season,
            tourney_results,
            coaches,
            team_conferences
        )
        all_seasons.append(season_stats)
    
    # Combine all seasons
    team_features = pd.concat(all_seasons, ignore_index=True)
    
    # Add team names
    team_features = pd.merge(
        team_features,
        teams[['TeamID', 'TeamName']],
        on='TeamID',
        how='left'
    )
    
    return team_features

In [13]:
# Process and save data
all_team_features = collect_features(2003, 2025)
print(f"\nCollected features for {len(all_team_features)} team-seasons")

# Save to CSV
all_team_features.to_csv('team_features.csv', index=False)
print("Saved features to team_features.csv")

# Display feature columns
print("\nFeature columns:")
for col in sorted(all_team_features.columns):
    print(f"- {col}")

# Show sample of final dataset
print("\nSample of final dataset:")
print(all_team_features[['Season', 'TeamName', 'WinPct', 'OffEff', 'DefEff', 'NetEff', 'EFG', 'TS', 'ORebPct', 'DRebPct', 'TORatio', 'FTR', 'SOS', 'ConfStrength', 'TourneyWins']].head())

NameError: name 'regular_season' is not defined

In [14]:
all_team_features.head()

NameError: name 'all_team_features' is not defined

In [15]:
def process_all_team_stats(mens_games_df, womens_games_df, seasons):
    """
    Process both men's and women's basketball statistics for multiple seasons
    
    Parameters:
    -----------
    mens_games_df : DataFrame
        DataFrame containing men's game results
    womens_games_df : DataFrame
        DataFrame containing women's game results
    seasons : list
        List of seasons to process
    
    Returns:
    --------
    DataFrame containing combined team statistics for all seasons and genders
    """
    all_stats = []
    
    # Process men's games
    for season in seasons:
        print(f"Processing men's games for season {season}...")
        mens_stats = calculate_team_stats(mens_games_df, season, gender='M')
        all_stats.append(mens_stats)
    
    # Process women's games
    for season in seasons:
        print(f"Processing women's games for season {season}...")
        womens_stats = calculate_team_stats(womens_games_df, season, gender='W')
        all_stats.append(womens_stats)
    
    # Combine all stats
    combined_stats = pd.concat(all_stats, ignore_index=True)
    
    return combined_stats

In [None]:
# Load data
mens_games = pd.read_csv('../data/MRegularSeasonDetailedResults.csv')
womens_games = pd.read_csv('../data/WRegularSeasonDetailedResults.csv')

# Get unique seasons
all_seasons = sorted(set(mens_games['Season'].unique()).union(set(womens_games['Season'].unique())))
print(f"Processing {len(all_seasons)} seasons: {all_seasons}")

# Process all team stats
team_stats = process_all_team_stats(mens_games, womens_games, all_seasons)

# Save results
team_stats.to_csv('../data/processed/all_team_stats.csv', index=False)

print(f"Processed stats for {team_stats['Gender'].value_counts()['M']} men's teams and {team_stats['Gender'].value_counts()['W']} women's teams")
print(f"Total rows: {len(team_stats)}")

# Display sample of results
display(team_stats.head())

Processing 23 seasons: [np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
Processing men's games for season 2003...
Processing men's games for season 2004...
Processing men's games for season 2005...
Processing men's games for season 2006...
Processing men's games for season 2007...
Processing men's games for season 2008...
Processing men's games for season 2009...
Processing men's games for season 2010...
Processing men's games for season 2011...
Processing men's games for season 2012...
Processing men's games for season 2013...
Processing men's games for season 2014...
Processing men's games for season 2015...
Processing men's games for season 2016...
Processing men's gam

Unnamed: 0,TeamID,Season,Gender,Wins,Losses,ScoredPoints,AllowedPoints,Games,FGM,FGA,...,TORatio,OppTORatio,FTR,ThreePAR,ThreePPct,Poss,OppPoss,OffEff,DefEff,NetEff
0,1104,2003,M,17,11,1940,1820,28,673,1601,...,0.166753,0.180197,0.366021,0.347283,0.320144,1850.84,1848.2,104.817272,98.474191,6.343081
1,1328,2003,M,24,6,2135,1805,30,758,1696,...,0.154213,0.182023,0.329009,0.335495,0.393673,1931.52,1945.96,110.534708,92.756275,17.778434
2,1272,2003,M,23,6,2161,1909,29,762,1740,...,0.164463,0.183657,0.381609,0.334483,0.348797,2024.16,2021.44,106.760335,94.437629,12.322707
3,1393,2003,M,24,5,2323,2027,29,848,1804,...,0.157975,0.165343,0.379712,0.254989,0.330435,2085.4,2076.12,111.393498,97.634048,13.75945
4,1266,2003,M,23,5,2195,1895,28,762,1575,...,0.169202,0.157032,0.419683,0.271111,0.379391,1878.84,1873.84,116.827404,101.129232,15.698172


In [17]:
import pandas as pd
import numpy as np
import os

# Create directory for output files
os.makedirs('../data/model', exist_ok=True)

# Load the team statistics you've collected
team_stats = pd.read_csv('../data/processed/all_team_stats.csv')

# Load game results for creating historical matchups
mens_games = pd.read_csv('../data/MRegularSeasonDetailedResults.csv')
womens_games = pd.read_csv('../data/WRegularSeasonDetailedResults.csv')

# Load tournament results if available (for additional training data)
try:
    mens_tourney = pd.read_csv('../data/MNCAATourneyDetailedResults.csv')
    womens_tourney = pd.read_csv('../data/WNCAATourneyDetailedResults.csv')
    
    # Combine regular season and tournament games
    mens_all_games = pd.concat([mens_games, mens_tourney], ignore_index=True)
    womens_all_games = pd.concat([womens_games, womens_tourney], ignore_index=True)
    print("Tournament data loaded and combined with regular season data")
except FileNotFoundError:
    # If tournament data not available, just use regular season
    mens_all_games = mens_games.copy()
    womens_all_games = womens_games.copy()
    print("Tournament data not found, using only regular season data")

# Load the sample submission file for Stage 2
sample_submission = pd.read_csv('../data/SampleSubmissionStage2.csv', names=['ID', 'Pred'], header=0)

# Parse sample submission
sample_submission['Season'] = sample_submission['ID'].str.split('_').str[0].astype(int)
sample_submission['Team1ID'] = sample_submission['ID'].str.split('_').str[1].astype(int)
sample_submission['Team2ID'] = sample_submission['ID'].str.split('_').str[2].astype(int)

print(f"Loaded {len(sample_submission)} matchups for prediction")

# Function to create feature matchups
def create_matchup_features(games_df, team_stats_df, gender='M', include_outcomes=True):
    """
    Create matchup features by combining statistics of both teams
    
    Parameters:
    -----------
    games_df : DataFrame
        Games data with matchup information
    team_stats_df : DataFrame
        Team statistics data
    gender : str
        'M' for men's games, 'W' for women's games
    include_outcomes : bool
        Whether to include the game outcome (for training data)
        
    Returns:
    --------
    DataFrame with matchup features
    """
    matchups = []
    
    # Filter team stats for the appropriate gender
    gender_stats = team_stats_df[team_stats_df['Gender'] == gender]
    
    # Process each game
    for _, game in games_df.iterrows():
        season = game['Season']
        team1_id = game['WTeamID'] if include_outcomes else game['Team1ID']
        team2_id = game['LTeamID'] if include_outcomes else game['Team2ID']
        
        # Get team stats for this season
        team1_stats = gender_stats[(gender_stats['TeamID'] == team1_id) & 
                                  (gender_stats['Season'] == season)]
        team2_stats = gender_stats[(gender_stats['TeamID'] == team2_id) & 
                                  (gender_stats['Season'] == season)]
        
        # Skip if stats for either team are missing
        if team1_stats.empty or team2_stats.empty:
            continue
            
        # Get the first (and should be only) row for each team
        team1_stats = team1_stats.iloc[0]
        team2_stats = team2_stats.iloc[0]
        
        # Create matchup features
        matchup = {
            'Season': season,
            'Team1ID': team1_id,
            'Team2ID': team2_id,
            'Gender': gender
        }
        
        # Add team1 stats with prefix
        for col in team1_stats.index:
            if col not in ['TeamID', 'Season', 'Gender', 'Wins', 'Losses', 'Games']:
                matchup[f'Team1_{col}'] = team1_stats[col]
                
        # Add team2 stats with prefix
        for col in team2_stats.index:
            if col not in ['TeamID', 'Season', 'Gender', 'Wins', 'Losses', 'Games']:
                matchup[f'Team2_{col}'] = team2_stats[col]
                
        # Add comparative features
        for col in team1_stats.index:
            if col not in ['TeamID', 'Season', 'Gender', 'Wins', 'Losses', 'Games']:
                matchup[f'Diff_{col}'] = team1_stats[col] - team2_stats[col]
                matchup[f'Ratio_{col}'] = team1_stats[col] / team2_stats[col] if team2_stats[col] != 0 else 0
                
        # Add outcome (Team1 wins) if this is for training data
        if include_outcomes:
            # For historical data, Team1 is always the winner, so outcome is 1
            matchup['Outcome'] = 1
            
            # Create the reverse matchup too (with Team2 as winner) with outcome 0
            reverse_matchup = matchup.copy()
            reverse_matchup['Team1ID'] = team2_id
            reverse_matchup['Team2ID'] = team1_id
            reverse_matchup['Outcome'] = 0
            
            # Swap team1 and team2 features
            for col in team1_stats.index:
                if col not in ['TeamID', 'Season', 'Gender', 'Wins', 'Losses', 'Games']:
                    reverse_matchup[f'Team1_{col}'] = matchup[f'Team2_{col}']
                    reverse_matchup[f'Team2_{col}'] = matchup[f'Team1_{col}']
                    reverse_matchup[f'Diff_{col}'] = -matchup[f'Diff_{col}']
                    reverse_matchup[f'Ratio_{col}'] = 1/matchup[f'Ratio_{col}'] if matchup[f'Ratio_{col}'] != 0 else 0
            
            matchups.append(reverse_matchup)
            
        matchups.append(matchup)
    
    return pd.DataFrame(matchups)

# Create training data from historical matchups (2024 and prior)
print("Creating training data from historical matchups...")

# Filter games for seasons 2024 and prior
mens_historical = mens_all_games[mens_all_games['Season'] <= 2024]
womens_historical = womens_all_games[womens_all_games['Season'] <= 2024]

# Create matchup features for men's games
mens_training = create_matchup_features(mens_historical, team_stats, gender='M')
print(f"Created {len(mens_training)} men's training examples")

# Create matchup features for women's games
womens_training = create_matchup_features(womens_historical, team_stats, gender='W')
print(f"Created {len(womens_training)} women's training examples")

# Combine training data
training_data = pd.concat([mens_training, womens_training], ignore_index=True)

# Create test data from 2025 matchups in sample submission
print("Creating test data from 2025 matchups...")

# Prepare sample submission data for feature creation
mens_matchups = sample_submission[sample_submission['Team1ID'] < 3000].copy()  # Men's teams are below 3000
womens_matchups = sample_submission[sample_submission['Team1ID'] >= 3000].copy()  # Women's teams are 3000+

# Create feature DataFrame for test matchups
test_mens = create_matchup_features(mens_matchups, team_stats, gender='M', include_outcomes=False)
test_womens = create_matchup_features(womens_matchups, team_stats, gender='W', include_outcomes=False)

# Combine test data
test_data = pd.concat([test_mens, test_womens], ignore_index=True)

# Add matchup ID for later reference
test_data['ID'] = test_data.apply(lambda x: f"{x['Season']}_{x['Team1ID']}_{x['Team2ID']}", axis=1)

# Save to files
training_data.to_csv('../data/model/training_data.csv', index=False)
test_data.to_csv('../data/model/test_data.csv', index=False)

print(f"Saved {len(training_data)} training examples and {len(test_data)} test examples")

# Optional: Check training data distribution
print("\nTraining data distribution by gender:")
print(training_data['Gender'].value_counts())

print("\nTraining data distribution by outcome:")
print(training_data['Outcome'].value_counts())

# Display sample of training data
print("\nSample of training data:")
display(training_data.head())

# Display sample of test data
print("\nSample of test data:")
display(test_data.head())

Tournament data loaded and combined with regular season data
Loaded 131407 matchups for prediction
Creating training data from historical matchups...
Created 229246 men's training examples
Created 154316 women's training examples
Creating test data from 2025 matchups...
Saved 383562 training examples and 131407 test examples

Training data distribution by gender:
Gender
M    229246
W    154316
Name: count, dtype: int64

Training data distribution by outcome:
Outcome
0    191781
1    191781
Name: count, dtype: int64

Sample of training data:


Unnamed: 0,Season,Team1ID,Team2ID,Gender,Team1_ScoredPoints,Team1_AllowedPoints,Team1_FGM,Team1_FGA,Team1_FGM3,Team1_FGA3,...,Ratio_Poss,Diff_OppPoss,Ratio_OppPoss,Diff_OffEff,Ratio_OffEff,Diff_DefEff,Ratio_DefEff,Diff_NetEff,Ratio_NetEff,Outcome
0,2003,1328,1104,M,2135,1805,758,1696,224,569,...,1.043591,97.76,1.052895,5.717436,1.054547,-5.717917,0.941935,11.435353,2.802807,0
1,2003,1104,1328,M,1940,1820,673,1601,178,556,...,0.95823,-97.76,0.949763,-5.717436,0.948275,5.717917,1.061645,-11.435353,0.356785,1
2,2003,1393,1272,M,2323,2027,848,1804,152,460,...,1.030255,54.68,1.02705,4.633162,1.043398,3.19642,1.033847,1.436743,1.116593,0
3,2003,1272,1393,M,2161,1909,762,1740,203,582,...,0.970634,-54.68,0.973662,-4.633162,0.958407,-3.19642,0.967261,-1.436743,0.895581,1
4,2003,1437,1266,M,2166,2103,745,1772,200,573,...,1.120862,234.04,1.124899,-13.974494,0.880383,-1.360744,0.986545,-12.61375,0.196483,0



Sample of test data:


Unnamed: 0,Season,Team1ID,Team2ID,Gender,Team1_ScoredPoints,Team1_AllowedPoints,Team1_FGM,Team1_FGA,Team1_FGM3,Team1_FGA3,...,Ratio_Poss,Diff_OppPoss,Ratio_OppPoss,Diff_OffEff,Ratio_OffEff,Diff_DefEff,Ratio_DefEff,Diff_NetEff,Ratio_NetEff,ID
0,2025,1101,1102,M,1754,1836,624,1430,109,369,...,0.936755,-162.16,0.917427,-0.184156,0.998092,-10.222372,0.908832,10.038216,0.356663,2025_1101_1102
1,2025,1101,1103,M,1754,1836,624,1430,109,369,...,0.877637,-270.48,0.86947,-15.83294,0.858852,-0.789884,0.992308,-15.043057,-0.587169,2025_1101_1103
2,2025,1101,1104,M,1754,1836,624,1430,109,369,...,0.82144,-449.2,0.800434,-22.727207,0.809122,-1.299191,0.987411,-21.428015,-0.350828,2025_1101_1104
3,2025,1101,1105,M,1754,1836,624,1430,109,369,...,0.93393,-117.48,0.938786,1.645888,1.017381,-7.830621,0.928641,9.476508,0.369982,2025_1101_1105
4,2025,1101,1106,M,1754,1836,624,1430,109,369,...,0.916257,-157.28,0.919713,-6.778457,0.934265,-4.427042,0.958366,-2.351415,1.731679,2025_1101_1106


In [15]:
# First, let's examine the tournament results data
print(f"Tournament results data shape: {tourney_results.shape}")
print(f"Earliest season in tournament data: {tourney_results['Season'].min()}")
print(f"Latest season in tournament data: {tourney_results['Season'].max()}")
print(f"Number of unique winning teams: {tourney_results['WTeamID'].nunique()}")

# Now, let's debug the calculate_coach_experience function for a specific season
debug_season = 2023  # Choose a recent season
past_tourney = tourney_results[tourney_results['Season'] < debug_season].copy()
print(f"\nPast tournament games (before {debug_season}): {len(past_tourney)}")

# Count wins for each team
w_counts = past_tourney.groupby('WTeamID').size().reset_index(name='TourneyWins')
print(f"Teams with at least one tournament win: {len(w_counts)}")
print("\nTop 5 teams by tournament wins:")
print(w_counts.sort_values('TourneyWins', ascending=False).head())

# Count appearances
all_teams = pd.concat([
    past_tourney[['Season', 'WTeamID']].rename(columns={'WTeamID': 'TeamID'}),
    past_tourney[['Season', 'LTeamID']].rename(columns={'LTeamID': 'TeamID'})
])
appearances = all_teams.drop_duplicates().groupby('TeamID').size().reset_index(name='TourneyAppearances')
print(f"\nTeams with at least one tournament appearance: {len(appearances)}")
print("\nTop 5 teams by tournament appearances:")
print(appearances.sort_values('TourneyAppearances', ascending=False).head())

# Get current coaches for the season
current_coaches = coaches[coaches['Season'] == debug_season].copy()
print(f"\nCoaches in season {debug_season}: {len(current_coaches)}")

# Generate complete coach experience data
coach_exp = calculate_coach_experience(coaches, tourney_results, debug_season)
print(f"\nCoach experience records generated: {len(coach_exp)}")
print("\nSample coach experience data:")
print(coach_exp[coach_exp['TourneyWins'] > 0].sort_values('TourneyWins', ascending=False).head())

# Save coach experience to CSV
coach_exp.to_csv('coach_experience.csv', index=False)
print("\nSaved coach experience data to coach_experience.csv")

# Let's also examine a sample processed season's data
sample_team_stats = process_season(debug_season, regular_season, tourney_results, coaches, team_conferences)
print(f"\nProcessed team stats for season {debug_season}: {len(sample_team_stats)}")

# Check tournament data in the processed stats
tournament_stats = sample_team_stats[['TeamID', 'TourneyWins', 'TourneyAppearances']]
print("\nSample tournament stats in processed data:")
print(tournament_stats.sort_values('TourneyWins', ascending=False).head(10))

# Save the processed season data
sample_team_stats.to_csv(f'team_stats_{debug_season}.csv', index=False)
print(f"\nSaved team stats for {debug_season} to team_stats_{debug_season}.csv")

Tournament results data shape: (1382, 34)
Earliest season in tournament data: 2003
Latest season in tournament data: 2024
Number of unique winning teams: 177

Past tournament games (before 2023): 1248
Teams with at least one tournament win: 167

Top 5 teams by tournament wins:
    WTeamID  TourneyWins
54     1242           50
90     1314           50
32     1181           43
56     1246           40
71     1277           37

Teams with at least one tournament appearance: 258

Top 5 teams by tournament appearances:
     TeamID  TourneyAppearances
93     1242                  19
74     1211                  19
120    1277                  19
252    1458                  18
57     1181                  18

Coaches in season 2023: 368

Coach experience records generated: 368

Sample coach experience data:
     TeamID  Season      CoachName  TourneyWins  TourneyAppearances
134    1242    2023      bill_self         50.0                19.0
205    1314    2023   hubert_davis         50.0    

In [16]:
# Make sure this function is defined properly
all_team_features = collect_features(2003, 2025)

# Now save to CSV
all_team_features.to_csv('collected_team_features.csv', index=False)
print(f"Saved {len(all_team_features)} team records to collected_team_features.csv")

Processing season 2003...
Processing season 2004...
Processing season 2005...
Processing season 2006...
Processing season 2007...
Processing season 2008...
Processing season 2009...
Processing season 2010...
Processing season 2011...
Processing season 2012...
Processing season 2013...
Processing season 2014...
Processing season 2015...
Processing season 2016...
Processing season 2017...
Processing season 2018...
Processing season 2019...
Processing season 2020...
Processing season 2021...
Processing season 2022...
Processing season 2023...
Processing season 2024...
Processing season 2025...
Saved 8063 team records to collected_team_features.csv


In [22]:
# Generate matchup features for training
recent_seasons = range(2003, 2026)
all_matchups = []

for season in recent_seasons:
    print(f"Creating matchups for season {season}...")
    # Filter team features for this season
    season_team_stats = all_team_features[all_team_features['Season'] == season]
    
    # Get games for this season
    season_games = regular_season[regular_season['Season'] == season]
    season_tourney = tourney_results[tourney_results['Season'] == season]
    
    # Process games to create matchup features
    for _, game in season_games.iterrows():
        # Create features for both matchup directions
        w_features = create_matchup_features(game['WTeamID'], game['LTeamID'], season_team_stats)
        w_features['Result'] = 1
        w_features['Season'] = season
        all_matchups.append(w_features)
        
        l_features = create_matchup_features(game['LTeamID'], game['WTeamID'], season_team_stats)
        l_features['Result'] = 0
        l_features['Season'] = season
        all_matchups.append(l_features)
    
    # Add tournament games
    for _, game in season_tourney.iterrows():
        w_features = create_matchup_features(game['WTeamID'], game['LTeamID'], season_team_stats, tourney=True)
        w_features['Result'] = 1
        w_features['Season'] = season
        all_matchups.append(w_features)
        
        l_features = create_matchup_features(game['LTeamID'], game['WTeamID'], season_team_stats, tourney=True)
        l_features['Result'] = 0
        l_features['Season'] = season
        all_matchups.append(l_features)

# Convert to DataFrame and save
matchups_df = pd.DataFrame(all_matchups)
matchups_df.to_csv('matchup_features.csv', index=False)
print(f"Saved {len(matchups_df)} matchups to matchup_features.csv")

Creating matchups for season 2003...
Creating matchups for season 2004...
Creating matchups for season 2005...
Creating matchups for season 2006...
Creating matchups for season 2007...
Creating matchups for season 2008...
Creating matchups for season 2009...
Creating matchups for season 2010...
Creating matchups for season 2011...
Creating matchups for season 2012...
Creating matchups for season 2013...
Creating matchups for season 2014...
Creating matchups for season 2015...
Creating matchups for season 2016...
Creating matchups for season 2017...
Creating matchups for season 2018...
Creating matchups for season 2019...
Creating matchups for season 2020...
Creating matchups for season 2021...
Creating matchups for season 2022...
Creating matchups for season 2023...
Creating matchups for season 2024...
Creating matchups for season 2025...
Saved 239662 matchups to matchup_features.csv


In [23]:
%pip install pandas
%pip install xgboost
import pandas as pd
import xgboost as xgb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.5_1/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.3.5_1/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [24]:
# Create test data for Kaggle submission
def create_submission_test_data(season=2025):
    """Generate test data based on Kaggle sample submission format"""
    print(f"Creating test data for season {season}...")
    
    # Load sample submission to get required matchups
    data_dir = "../data"
    sample_submission = pd.read_csv(os.path.join(data_dir, "SampleSubmissionStage1.csv"))
    
    # Extract matchup IDs
    submission_pairs = sample_submission['ID'].str.split('_', expand=True)
    submission_pairs.columns = ['Season', 'Team1ID', 'Team2ID']
    
    # Filter for current season
    current_matchups = submission_pairs[submission_pairs['Season'] == str(season)]
    print(f"Found {len(current_matchups)} matchups to predict")
    
    # Load team features for current season
    team_features = all_team_features.copy()
    
    # Check if we have data for the current season
    if season not in team_features['Season'].unique():
        print(f"Warning: No team features for season {season}. Using most recent season data.")
        latest_season = team_features['Season'].max()
        # Keep the season column but use stats from latest season
        team_features.loc[team_features['Season'] == latest_season, 'Season'] = season
    
    # Filter for current season
    current_team_stats = team_features[team_features['Season'] == season]
    print(f"Using stats for {len(current_team_stats)} teams")
    
    # Create test data
    test_matchups = []
    for i, row in current_matchups.iterrows():
        if i % 10000 == 0:
            print(f"Processing matchup {i}/{len(current_matchups)}...")
        
        # Get team IDs
        team1_id = int(row['Team1ID'])
        team2_id = int(row['Team2ID'])
        matchup_id = f"{season}_{team1_id}_{team2_id}"
        
        try:
            # Create features for this matchup
            features = create_matchup_features(
                team1_id, team2_id,
                current_team_stats,
                tourney=True
            )
            
            # Add matchup ID
            features['ID'] = matchup_id
            test_matchups.append(features)
            
        except Exception as e:
            print(f"Error creating features for matchup {matchup_id}: {str(e)[:100]}")
            # Add empty row with just ID
            test_matchups.append({'ID': matchup_id})
    
    # Convert to DataFrame
    test_df = pd.DataFrame(test_matchups)
    
    # Ensure all required IDs are present
    missing_ids = set(sample_submission['ID']) - set(test_df['ID'])
    if missing_ids:
        print(f"Adding {len(missing_ids)} missing matchups")
        for missing_id in missing_ids:
            test_df = pd.concat([test_df, pd.DataFrame([{'ID': missing_id}])], ignore_index=True)
    
    # Save test data
    test_file = f"test_data_{season}.csv"
    test_df.to_csv(test_file, index=False)
    print(f"Saved {len(test_df)} matchups to {test_file}")
    
    return test_df

In [None]:
# Run the function
test_data = create_submission_test_data(season=2025)

Creating test data for season 2025...
Found 0 matchups to predict
Using stats for 366 teams


KeyError: 'ID'