In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss

folder = 'march-machine-learning-mania-2024/'

Seeds = pd.read_csv(folder+'WNCAATourneySeeds.csv')
Conferences = pd.read_csv(folder + 'Conferences.csv')

Reg_Deatil = pd.read_csv(folder+'WRegularSeasonDetailedResults.csv')
Tourney_Compact = pd.read_csv(folder+'WNCAATourneyCompactResults.csv')

MTeams = pd.read_csv(folder+'WTeams.csv')

WTeams = pd.DataFrame()
LTeams = pd.DataFrame()

columns = ['Season', 'TeamID', 'Points', 'OppPoints', 'NumOT', 
       'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
       'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'OppFGM', 'OppFGA',
       'OppFGM3', 'OppFGA3', 'OppFTM', 'OppFTA', 'OppOR', 'OppDR', 'OppAst', 'OppTO',
       'OppStl', 'OppBlk', 'OppPF']

WTeams[columns] = Reg_Deatil[['Season', 'WTeamID', 'WScore', 'LScore',
    'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA',
       'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl',
       'LBlk', 'LPF']]
WTeams['Wins'] = 1
WTeams['Losses'] = 0

LTeams[columns] = Reg_Deatil[['Season', 'LTeamID', 'LScore', 'WScore',
      'NumOT', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WFGM', 'WFGA',
       'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO',
       'WStl', 'WBlk', 'WPF']]

LTeams['Wins'] = 0
LTeams['Losses'] = 1

conf_abbrev_dict = {}
for _, row in Conferences.iterrows():
    conf_abbrev = row['ConfAbbrev']
    conf_abbrev_dict[conf_abbrev] = []

for _, row in MTeams.iterrows():
    team_id = row['TeamID']
    for conf_abbrev, team_id_range in conf_abbrev_dict.items():
        if not team_id_range or team_id_range[-1] == team_id - 1:
            team_id_range.append(team_id)
            break

# Manually add the 'ConfAbbrev' column to the MTeams DataFrame
def get_conf_abbrev(team_id):
    for conf_abbrev, team_id_range in conf_abbrev_dict.items():
        if team_id_range[0] <= team_id <= team_id_range[-1]:
            return conf_abbrev
    return 'ind'

MTeams['ConfAbbrev'] = MTeams['TeamID'].apply(get_conf_abbrev)

WinLoseTeams = pd.concat([WTeams, LTeams])
WinLoseTeams = WinLoseTeams.merge(MTeams[['TeamID', 'ConfAbbrev']], on='TeamID')

combinedTeams = WinLoseTeams.groupby(['Season', 'TeamID', 'ConfAbbrev']).sum()
combinedTeams['NumGames'] = combinedTeams['Wins'] + combinedTeams['Losses']

combinedTeams = combinedTeams.reset_index().merge(MTeams[['TeamID', 'TeamName']], on='TeamID').set_index(['Season', 'TeamID', 'ConfAbbrev'])

RegularSeasonData = combinedTeams[['Wins', 'Losses', 'NumGames', 'Points', 'OppPoints', 'FGM', 'FGA', 'OppFGM', 'OppFGA', 'DR', 'OR', 'OppDR', 'OppOR', 'Ast', 'OppAst', 'TO', 'OppTO', 'Blk', 'FTA', 'OppFTA', 'PF', 'OppPF']]

RegularSeasonData['WinRatio'] = combinedTeams['Wins'] / combinedTeams['NumGames']
RegularSeasonData['PPG'] = combinedTeams['Points'] / combinedTeams ['NumGames']
RegularSeasonData['PAPG'] = combinedTeams['OppPoints'] / combinedTeams ['NumGames']
RegularSeasonData['PointDifferential'] = (combinedTeams['Points'] - combinedTeams['OppPoints']) / combinedTeams['NumGames']
RegularSeasonData['FGP'] = combinedTeams['FGM'] / combinedTeams ['FGA']
RegularSeasonData['AllowedFGP'] = combinedTeams['OppFGM'] / combinedTeams ['OppFGA']
RegularSeasonData['ReboundDifferential'] = ((combinedTeams['DR'] + combinedTeams['OR']) - (combinedTeams['OppDR'] + combinedTeams['OppOR'])) / combinedTeams['NumGames']
RegularSeasonData['DRDifferential'] = (combinedTeams['DR']  - combinedTeams['OppDR']) / combinedTeams['NumGames']
RegularSeasonData['ORDifferential'] = (combinedTeams['OR']  - combinedTeams['OppOR']) / combinedTeams['NumGames']
RegularSeasonData['AssistsPG'] = combinedTeams['Ast'] / combinedTeams ['NumGames']
RegularSeasonData['AssistDifferential'] = (combinedTeams['Ast']  - combinedTeams['OppAst']) / combinedTeams['NumGames']
RegularSeasonData['TOMargain'] = (combinedTeams['TO']  - combinedTeams['OppTO']) / combinedTeams['NumGames']
RegularSeasonData['AstTORatio'] = (combinedTeams['Ast']  - combinedTeams['TO']) / combinedTeams['NumGames']
RegularSeasonData['BlkPG'] = combinedTeams['Blk'] / combinedTeams ['NumGames']
RegularSeasonData['FTAMargain'] = (combinedTeams['FTA'] -  combinedTeams['OppFTA']) / combinedTeams ['NumGames']
RegularSeasonData['FoulsPG'] = combinedTeams['PF'] / combinedTeams ['NumGames']
RegularSeasonData['FoulsMargain'] = (combinedTeams['PF'] -  combinedTeams['OppPF']) / combinedTeams ['NumGames']

# Calculate strength of schedule (SOS) for each team
RegularSeasonData['SOS'] = RegularSeasonData.groupby(['Season', 'TeamID'])['WinRatio'].transform('mean')

# Calculate opponents' strength of schedule (OSOS) for each team
RegularSeasonData['OSOS'] = RegularSeasonData.groupby(['Season', 'TeamID'])['SOS'].transform('mean')

# Compute conferenceRPI
RegularSeasonData['Schedule'] =  (RegularSeasonData['SOS'] + RegularSeasonData['OSOS']) 

SeedDictionary = Seeds.set_index(['Season', 'TeamID'])

TourneyInput = pd.DataFrame()

winIds = Tourney_Compact['WTeamID']
loseIds = Tourney_Compact['LTeamID']
season = Tourney_Compact['Season']

winners = pd.DataFrame()
winners[['Season', 'Team1', 'Team2']] = Tourney_Compact[['Season', 'WTeamID', 'LTeamID']]
winners['Result'] = 1

losers = pd.DataFrame()
losers[['Season', 'Team1', 'Team2']] = Tourney_Compact[['Season', 'LTeamID', 'WTeamID']]
losers['Result'] = 0

TourneyInput = pd.concat([winners, losers])

# Filter TourneyInput to include only seasons from 2010 onwards
TourneyInput = TourneyInput[TourneyInput['Season'] >= 2010]

Team1seeds = []
Team2seeds = []

for x in range(len(TourneyInput)):
    try:
        index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team1'])
        seed = SeedDictionary.loc[index].values[0]
        if len(seed) == 4:
            seed = int(seed[1:-1])
        else:
            seed = int(seed[1:])
        Team1seeds.append(seed)
    except KeyError:
        Team1seeds.append(0)

    try:
        index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team2'])
        seed = SeedDictionary.loc[index].values[0]
        if len(seed) == 4:
            seed = int(seed[1:-1])
        else:
            seed = int(seed[1:])
        Team2seeds.append(seed)
    except KeyError:
        Team2seeds.append(0)

TourneyInput['Team1Seed'] = Team1seeds
TourneyInput['Team2Seed'] = Team2seeds

TourneyInput['SeedDiff'] = TourneyInput['Team2Seed'] - TourneyInput['Team1Seed']
TourneyInput['SeedDiffSquared'] = TourneyInput['SeedDiff'] ** 2
TourneyInput['ScheduleDiff'] = 0  # Initialize 'ScheduleDiff' column with 0

for x in range(len(TourneyInput)):
    index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team1'])
    Team1Schedule = RegularSeasonData.loc[index]['Schedule']

    index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team2'])
    Team2Schedule = RegularSeasonData.loc[index]['Schedule']

    if Team1Schedule.iloc[0] > Team2Schedule.iloc[0]:
        TourneyInput.at[x, 'ScheduleDiff'] = 1
    elif Team1Schedule.iloc[0] < Team2Schedule.iloc[0]:
        TourneyInput.at[x, 'ScheduleDiff'] = -1

stats = []
results = []

for x in range(len(TourneyInput)):
    try:
        index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team1'])
        Team1Stats = RegularSeasonData.loc[index]
        Team1Stats['Seed'] = TourneyInput.iloc[x]['Team1Seed']
    except KeyError:
        Team1Stats = pd.Series(0, index=RegularSeasonData.columns)
        Team1Stats['Seed'] = TourneyInput.iloc[x]['Team1Seed']

    try:
        index = (TourneyInput.iloc[x]['Season'], TourneyInput.iloc[x]['Team2'])
        Team2Stats = RegularSeasonData.loc[index]
        Team2Stats['Seed'] = TourneyInput.iloc[x]['Team2Seed']
    except KeyError:
        Team2Stats = pd.Series(0, index=RegularSeasonData.columns)
        Team2Stats['Seed'] = TourneyInput.iloc[x]['Team2Seed']

    game_stats = Team1Stats - Team2Stats
    game_stats['SeedDiffSquared'] = TourneyInput.iloc[x]['SeedDiffSquared']
    game_stats['ScheduleDiff'] = TourneyInput.iloc[x]['ScheduleDiff']
    stats.append(game_stats.values.flatten())
    results.append(TourneyInput.iloc[x]['Result'])

stats = pd.DataFrame(stats)
results = pd.Series(results)
  
x = stats.values
y = results.values
np.random.seed(1)
index = np.random.permutation(len(x))
TrainIndex = index[:int(-.2*len(x))]
TestIndex = index[int(-.2*len(x)):]

XTrain = x[TrainIndex]
XTest = x[TestIndex]
YTrain = y[TrainIndex]
YTest = y[TestIndex]

# Remove rows with NaN values from XTrain, YTrain, XTest, and YTest
nan_rows_train = np.isnan(YTrain)
XTrain = XTrain[~nan_rows_train]
YTrain = YTrain[~nan_rows_train]

nan_rows_test = np.isnan(YTest)
XTest = XTest[~nan_rows_test]
YTest = YTest[~nan_rows_test]

mins = XTrain.min(axis=0)
maxs = XTrain.max(axis=0)

XTrain = (XTrain - mins) / (maxs - mins)
XTest = (XTest - mins) / (maxs - mins)

model = RandomForestClassifier(random_state=1)
model = model.fit(XTrain, YTrain)

# Predict probabilities for the test set
y_pred_prob = model.predict_proba(XTest)[:, 1]

# Calculate the Brier Score
brier_score = brier_score_loss(YTest, y_pred_prob)

print("Brier Score:", brier_score)

# Load the sample dataframe
womens_df = pd.read_csv('2024_potential_matchups.csv')

# Prepare the input features for prediction
Team1seeds = []
Team2seeds = []

for x in range(len(womens_df)):
    try:
        index = (2024, womens_df.iloc[x]['T1_ID'])
        seed = SeedDictionary.loc[index].values[0]
        if len(seed) == 4:
            seed = int(seed[1:-1])
        else:
            seed = int(seed[1:])
        Team1seeds.append(seed)
    except KeyError:
        Team1seeds.append(0)

    try:
        index = (2024, womens_df.iloc[x]['T2_ID'])
        seed = SeedDictionary.loc[index].values[0]
        if len(seed) == 4:
            seed = int(seed[1:-1])
        else:
            seed = int(seed[1:])
        Team2seeds.append(seed)
    except KeyError:
        Team2seeds.append(0)

womens_df['Team1Seed'] = Team1seeds
womens_df['Team2Seed'] = Team2seeds
womens_df['SeedDiff'] = womens_df['Team2Seed'] - womens_df['Team1Seed']
womens_df['SeedDiffSquared'] = womens_df['SeedDiff'] ** 2
womens_df['ScheduleDiff'] = 0  # Initialize 'ScheduleDiff' column with 0

stats = []

for x in range(len(womens_df)):
    try:
        index = (2024, womens_df.iloc[x]['T1_ID'])
        Team1Schedule = RegularSeasonData.loc[index]['Schedule']
    except KeyError:
        Team1Schedule = None

    try:
        index = (2024, womens_df.iloc[x]['T2_ID'])
        Team2Schedule = RegularSeasonData.loc[index]['Schedule']
    except KeyError:
        Team2Schedule = None

    if Team1Schedule is None or Team2Schedule is None:
        continue

    if Team1Schedule.iloc[0] > Team2Schedule.iloc[0]:
        womens_df.at[x, 'ScheduleDiff'] = 1
    elif Team1Schedule.iloc[0] < Team2Schedule.iloc[0]:
        womens_df.at[x, 'ScheduleDiff'] = -1

stats = []

for x in range(len(womens_df)):
    try:
        index = (2024, womens_df.iloc[x]['T1_ID'])
        Team1Stats = RegularSeasonData.loc[index]
        Team1Stats['Seed'] = womens_df.iloc[x]['Team1Seed']
    except KeyError:
        Team1Stats = pd.Series(0, index=RegularSeasonData.columns)
        Team1Stats['Seed'] = womens.iloc[x]['Team1Seed']

    try:
        index = (2024, womens_df.iloc[x]['T2_ID'])
        Team2Stats = RegularSeasonData.loc[index]
        Team2Stats['Seed'] = womens_df.iloc[x]['Team2Seed']
    except KeyError:
        Team2Stats = pd.Series(0, index=RegularSeasonData.columns)
        Team2Stats['Seed'] = womens_df.iloc[x]['Team2Seed']

    game_stats = Team1Stats - Team2Stats
    game_stats['SeedDiffSquared'] = womens_df.iloc[x]['SeedDiffSquared']
    game_stats['ScheduleDiff'] = womens_df.iloc[x]['ScheduleDiff']
    stats.append(game_stats.values.flatten())

stats = pd.DataFrame(stats)

# Normalize the input features
mins = XTrain.min(axis=0)
maxs = XTrain.max(axis=0)
stats = (stats - mins) / (maxs - mins)


y_pred_prob = model.predict_proba(stats)[:, 1]

# Add the predicted probabilities to the sample dataframe
womens_df['T1_Win_Prob'] = y_pred_prob

# Add a new column indicating the predicted winner (1 for T1, 0 for T2)
womens_df['T1_Win'] = (womens_df['T1_Win_Prob'] > 0.5).astype(int)

# Apply the stipulation for cases where the win probability is exactly 0.5
def determine_winner(row):
    if row['T1_Win_Prob'] == 0.5:
        if row['Team1Seed'] < row['Team2Seed']:
            return 1
        elif row['Team1Seed'] > row['Team2Seed']:
            return 0
        else:
            return 1  # If both teams have the same seed, give the win to Team1
    else:
        return row['T1_Win']

womens_df['T1_Win'] = womens_df.apply(determine_winner, axis=1)

# Print the matchups and the predicted winners
for index, row in womens_df.iterrows():
    print(f"{row['T1_Spelling']} (Seed: {row['T1_Seed']}) vs {row['T2_Spelling']} (Seed: {row['T2_Seed']})")
    if row['T1_Win'] == 1:
        print(f"Predicted winner: {row['T1_Spelling']}")
    else:
        print(f"Predicted winner: {row['T2_Spelling']}")
    print()

womens_df = womens_df[['T1_Spelling', 'T1_ID', 'T1_Seed', 'T2_Spelling', 'T2_ID', 'T2_Seed', 'T1_Win']]


# Save the updated sample dataframe to a new CSV file
womens_df.to_csv('2024_potential_matchups_with_predictions_womens.csv', index=False)

      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RegularSeasonData['WinRatio'] = combinedTeams['Wins'] / combinedTeams['NumGames']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RegularSeasonData['PPG'] = combinedTeams['Points'] / combinedTeams ['NumGames']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RegularSeasonData['PAPG'] = combinedTeams['

Brier Score: 0.2639883883653257


KeyError: 1163