In [1]:
import numpy as np
import pandas as pd
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib as mpl
from matplotlib.patches import Circle, Rectangle, Arc
import seaborn as sns

from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb
from sklearn.model_selection import GroupKFold

In [2]:
DATA_PATH = "/kaggle/input/march-machine-learning-mania-2024/"

In [3]:
dfTourneySeeds = pd.concat([
        pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv").assign(League="M"),
        pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv").assign(League="W")]).reset_index(drop=True)

dfSeasonCompactResults = pd.concat([
        pd.read_csv(DATA_PATH + "MRegularSeasonCompactResults.csv").assign(League="M"),
        pd.read_csv(DATA_PATH + "WRegularSeasonCompactResults.csv").assign(League="W")  ]).reset_index(drop=True)

dfTourneyCompactResults = pd.concat([
        pd.read_csv(DATA_PATH + "MNCAATourneyCompactResults.csv").assign(League="M"),
        pd.read_csv(DATA_PATH + "WNCAATourneyCompactResults.csv").assign(League="W")]
).reset_index(drop=True)

In [4]:
print(dfTourneySeeds.head())
print(dfSeasonCompactResults.head())
print(dfTourneyCompactResults.head())

   Season Seed  TeamID League
0    1985  W01    1207      M
1    1985  W02    1210      M
2    1985  W03    1228      M
3    1985  W04    1260      M
4    1985  W05    1374      M
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT League
0    1985      20     1228      81     1328      64    N      0      M
1    1985      25     1106      77     1354      70    H      0      M
2    1985      25     1112      63     1223      56    H      0      M
3    1985      25     1165      70     1432      54    H      0      M
4    1985      25     1192      86     1447      74    H      0      M
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT League
0    1985     136     1116      63     1234      54    N      0      M
1    1985     136     1120      59     1345      58    N      0      M
2    1985     136     1207      68     1250      43    N      0      M
3    1985     136     1229      58     1425      55    N      0      M
4    1985     136     1242      49     

In [5]:
dfTeamSeasonResults = pd.concat(
    [
        dfSeasonCompactResults[['Season', 'League', 'WTeamID', 'DayNum', 'WScore', 'LScore']]
        .assign(GameResult='W')
        .rename(
            columns={"WTeamID":"TeamID", "WScore": "teamScore", "LScore": "oppositeTeamScore"}
        ),
        
        dfSeasonCompactResults[['Season', 'League', 'LTeamID', 'DayNum', 'WScore', 'LScore']]
        .assign(GameResult='L')
        .rename(
            columns={"LTeamID":"TeamID", "LScore": "teamScore", "WScore": "oppositeTeamScore"}
        )
    ]
).reset_index(drop=True)

In [6]:
dfTeamSeasonResults

Unnamed: 0,Season,League,TeamID,DayNum,teamScore,oppositeTeamScore,GameResult
0,1985,M,1228,20,81,64,W
1,1985,M,1106,25,77,70,W
2,1985,M,1112,25,63,56,W
3,1985,M,1165,25,70,54,W
4,1985,M,1192,25,86,74,W
...,...,...,...,...,...,...,...
637747,2024,W,3372,131,74,75,L
637748,2024,W,3283,132,75,76,L
637749,2024,W,3392,132,60,68,L
637750,2024,W,3131,132,55,61,L


In [7]:
dfTeamSeasonResults["scoreDifference"] = (dfTeamSeasonResults["teamScore"] -dfTeamSeasonResults["oppositeTeamScore"])
dfTeamSeasonResults["Win"] = (dfTeamSeasonResults["GameResult"] == "W").astype("int")

In [8]:
dfTeamSeasonResults[(dfTeamSeasonResults["TeamID"] == 3456) & (dfTeamSeasonResults["DayNum"] == 17)]

Unnamed: 0,Season,League,TeamID,DayNum,teamScore,oppositeTeamScore,GameResult,scoreDifference,Win
227458,2007,W,3456,17,67,54,W,13,1
237617,2009,W,3456,17,98,38,W,60,1
561595,2010,W,3456,17,66,71,L,-5,0


In [9]:
teamSeasonAggregate = (
    dfTeamSeasonResults.groupby(['Season', 'TeamID', 'League'])
        .agg(
            AvgScoreDiff=("scoreDifference", "mean"),
            MedianScoreDiff=("scoreDifference", "median"),
            MinScoreDiff=("scoreDifference", "min"),
            MaxScoreDiff=("scoreDifference", "max"),
            Wins=("Win", "sum"),
            Losses=("GameResult", lambda x: (x == "L").sum()),
            WinPercentage=("Win", "mean"), 
        ).reset_index()
)

In [10]:
teamSeasonAggregate.head()

Unnamed: 0,Season,TeamID,League,AvgScoreDiff,MedianScoreDiff,MinScoreDiff,MaxScoreDiff,Wins,Losses,WinPercentage
0,1985,1102,M,-5.791667,-5.5,-41,29,5,19,0.208333
1,1985,1103,M,-3.043478,-2.0,-22,16,9,14,0.391304
2,1985,1104,M,7.8,6.5,-12,25,21,9,0.7
3,1985,1106,M,-3.791667,-1.5,-35,28,10,14,0.416667
4,1985,1108,M,7.96,4.0,-15,35,19,6,0.76


In [11]:
dfTourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID,League
0,1985,W01,1207,M
1,1985,W02,1210,M
2,1985,W03,1228,M
3,1985,W04,1260,M
4,1985,W05,1374,M


In [12]:
dfTourneySeeds["chalkSeed"] = (dfTourneySeeds["Seed"].str.replace("a", "") 
                         .str.replace("b", "").str[1:].astype("int"))

In [13]:
dfTourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID,League,chalkSeed
0,1985,W01,1207,M,1
1,1985,W02,1210,M,2
2,1985,W03,1228,M,3
3,1985,W04,1260,M,4
4,1985,W05,1374,M,5


In [14]:
teamSeasonAggregateSeeds = teamSeasonAggregate.merge(
    dfTourneySeeds, on=["Season", "TeamID", "League"], how="left")

In [15]:
teamSeasonAggregateSeeds.head(2)

Unnamed: 0,Season,TeamID,League,AvgScoreDiff,MedianScoreDiff,MinScoreDiff,MaxScoreDiff,Wins,Losses,WinPercentage,Seed,chalkSeed
0,1985,1102,M,-5.791667,-5.5,-41,29,5,19,0.208333,,
1,1985,1103,M,-3.043478,-2.0,-22,16,9,14,0.391304,,


In [16]:
dfTeamTourneyResults = pd.concat(
    [
        dfTourneyCompactResults[['Season', 'League', 'WTeamID', 'LTeamID', 'WScore', 'LScore']]
        .assign(GameResult='W')
        .rename(
            columns={"WTeamID":"TeamID", "LTeamID": "oppositeTeamID", "WScore": "teamScore", "LScore": "oppositeTeamScore"}
        ),
        
        dfTourneyCompactResults[['Season', 'League', 'LTeamID', 'WTeamID', 'LScore', 'WScore']]
        .assign(GameResult='L')
        .rename(
            columns={"LTeamID":"TeamID", "WTeamID":"oppositeTeamID", "LScore": "teamScore", "WScore": "oppositeTeamScore"}
        )
    ]
).reset_index(drop=True)

dfTeamTourneyResults["Win"] = (dfTeamTourneyResults["GameResult"] == "W").astype(
    "int"
)

In [17]:
dfTeamTourneyResults.head(2)

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win
0,1985,M,1116,1234,63,54,W,1
1,1985,M,1120,1345,59,58,W,1


In [18]:
print(len(dfTeamTourneyResults))
print(len(teamSeasonAggregateSeeds))

8068
22150


In [19]:
dfTeamTourneyResults.merge(teamSeasonAggregateSeeds[
        ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff", "chalkSeed"]
        ],
    on = ["Season", "League", "TeamID"],
    how = "left")

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0
1,1985,M,1120,1345,59,58,W,1,0.620690,2.0,11.0
2,1985,M,1207,1250,68,43,W,1,0.925926,14.0,1.0
3,1985,M,1229,1425,58,55,W,1,0.740741,6.0,9.0
4,1985,M,1242,1325,49,38,W,1,0.766667,5.5,3.0
...,...,...,...,...,...,...,...,...,...,...,...
8063,2023,W,3268,3376,75,86,L,0,0.806452,11.0,2.0
8064,2023,W,3326,3439,74,84,L,0,0.781250,12.0,3.0
8065,2023,W,3376,3234,73,77,L,0,1.000000,28.0,1.0
8066,2023,W,3439,3261,72,79,L,0,0.870968,13.0,1.0


In [20]:
len(dfTeamTourneyResults[dfTeamTourneyResults["TeamID"] == 1116])

60

In [21]:
dfHistoricTourneyFeatures = dfTeamTourneyResults.merge(
    teamSeasonAggregateSeeds[
        ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff", "chalkSeed"]
        ],
    on = ["Season", "League", "TeamID"],
    how = "left"
).merge(
    teamSeasonAggregateSeeds[
        ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff", "chalkSeed"]
    ].rename(
        columns={
            "TeamID": "oppositeTeamID",
            "WinPercentage": "OppWinPercentage",
            "MedianScoreDiff": "OppMedianScoreDiff",
            "chalkSeed": "oppositeChalkSeed",
        }
    ),
    on=["Season", "League", "oppositeTeamID"]
)

In [22]:
dfHistoricTourneyFeatures.head(2)

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0
1,1985,M,1120,1345,59,58,W,1,0.62069,2.0,11.0,0.68,9.0,6.0


In [23]:
len(dfHistoricTourneyFeatures)

8068

In [24]:
dfHistoricTourneyFeatures["winPercentageDiff"] = (dfHistoricTourneyFeatures["WinPercentage"]- dfHistoricTourneyFeatures["OppWinPercentage"])

dfHistoricTourneyFeatures["chalkSeedDiff"] = (dfHistoricTourneyFeatures["chalkSeed"]- dfHistoricTourneyFeatures["oppositeChalkSeed"])

dfHistoricTourneyFeatures["MedianScoreDiffDiff"] = (
    dfHistoricTourneyFeatures["MedianScoreDiff"]
    - dfHistoricTourneyFeatures["OppMedianScoreDiff"]
)

In [25]:
dfHistoricTourneyFeatures.sample(5, random_state=532)

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed,winPercentageDiff,chalkSeedDiff,MedianScoreDiffDiff
808,1997,M,1112,1242,85,82,W,1,0.678571,8.5,4.0,0.96875,17.5,1.0,-0.290179,3.0,-9.0
5901,2014,M,1454,1437,53,73,L,0,0.606061,4.0,15.0,0.875,16.0,2.0,-0.268939,13.0,-12.0
3017,2006,W,3268,3181,78,75,W,1,0.875,16.0,2.0,0.896552,23.0,1.0,-0.021552,1.0,-7.0
5382,2006,M,1107,1163,59,72,L,0,0.677419,9.0,16.0,0.9,12.0,1.0,-0.222581,15.0,-3.0
4231,1988,M,1124,1272,60,75,L,0,0.6875,2.5,8.0,0.633333,5.0,9.0,0.054167,-1.0,-2.5


In [26]:
dfHistoricTourneyFeatures["BaselinePred"] = (dfHistoricTourneyFeatures["chalkSeed"] < dfHistoricTourneyFeatures["oppositeChalkSeed"])

dfHistoricTourneyFeatures.loc[dfHistoricTourneyFeatures["chalkSeed"]== dfHistoricTourneyFeatures["oppositeChalkSeed"],"BaselinePred",] = (dfHistoricTourneyFeatures["WinPercentage"]> dfHistoricTourneyFeatures["OppWinPercentage"])

In [27]:
dfHistoricTourneyFeatures.head()

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed,winPercentageDiff,chalkSeedDiff,MedianScoreDiffDiff,BaselinePred
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0,-0.030303,1.0,-4.5,False
1,1985,M,1120,1345,59,58,W,1,0.62069,2.0,11.0,0.68,9.0,6.0,-0.05931,5.0,-7.0,False
2,1985,M,1207,1250,68,43,W,1,0.925926,14.0,1.0,0.37931,-3.0,16.0,0.546616,-15.0,17.0,True
3,1985,M,1229,1425,58,55,W,1,0.740741,6.0,9.0,0.678571,2.5,8.0,0.062169,1.0,3.5,False
4,1985,M,1242,1325,49,38,W,1,0.766667,5.5,3.0,0.740741,6.0,14.0,0.025926,-11.0,-0.5,True


In [28]:
CV_SCORES_BASELINE = []

for season in dfHistoricTourneyFeatures["Season"].unique():
    pred = dfHistoricTourneyFeatures.query("Season == @season")["BaselinePred"].astype("int")
    
    y = dfHistoricTourneyFeatures.query("Season == @season")["Win"]
    
    score = accuracy_score(y, pred)
    score_ll = log_loss(y, pred)
    CV_SCORES_BASELINE.append(score)
    print(f"Holdout season {season} - Accuracy {score:0.4f} Log Loss {score_ll:0.4f}")

print(f"Baseline accuracy {np.mean(CV_SCORES_BASELINE):0.4f}")

Holdout season 1985 - Accuracy 0.7143 Log Loss 10.2982
Holdout season 1986 - Accuracy 0.7143 Log Loss 10.2982
Holdout season 1987 - Accuracy 0.6984 Log Loss 10.8703
Holdout season 1988 - Accuracy 0.7143 Log Loss 10.2982
Holdout season 1989 - Accuracy 0.6667 Log Loss 12.0146
Holdout season 1990 - Accuracy 0.6825 Log Loss 11.4424
Holdout season 1991 - Accuracy 0.7460 Log Loss 9.1539
Holdout season 1992 - Accuracy 0.7619 Log Loss 8.5818
Holdout season 1993 - Accuracy 0.7937 Log Loss 7.4376
Holdout season 1994 - Accuracy 0.7143 Log Loss 10.2982
Holdout season 1995 - Accuracy 0.7619 Log Loss 8.5818
Holdout season 1996 - Accuracy 0.7460 Log Loss 9.1539
Holdout season 1997 - Accuracy 0.7302 Log Loss 9.7261
Holdout season 1998 - Accuracy 0.7143 Log Loss 10.2982
Holdout season 1999 - Accuracy 0.7222 Log Loss 10.0121
Holdout season 2000 - Accuracy 0.7302 Log Loss 9.7261
Holdout season 2001 - Accuracy 0.7047 Log Loss 10.6428
Holdout season 2002 - Accuracy 0.7480 Log Loss 9.0819
Holdout season 200

In [29]:
dfHistoricTourneyFeatures.head()

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed,winPercentageDiff,chalkSeedDiff,MedianScoreDiffDiff,BaselinePred
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0,-0.030303,1.0,-4.5,False
1,1985,M,1120,1345,59,58,W,1,0.62069,2.0,11.0,0.68,9.0,6.0,-0.05931,5.0,-7.0,False
2,1985,M,1207,1250,68,43,W,1,0.925926,14.0,1.0,0.37931,-3.0,16.0,0.546616,-15.0,17.0,True
3,1985,M,1229,1425,58,55,W,1,0.740741,6.0,9.0,0.678571,2.5,8.0,0.062169,1.0,3.5,False
4,1985,M,1242,1325,49,38,W,1,0.766667,5.5,3.0,0.740741,6.0,14.0,0.025926,-11.0,-0.5,True


In [30]:
dfHistoricTourneyFeatures.columns

Index(['Season', 'League', 'TeamID', 'oppositeTeamID', 'teamScore',
       'oppositeTeamScore', 'GameResult', 'Win', 'WinPercentage',
       'MedianScoreDiff', 'chalkSeed', 'OppWinPercentage',
       'OppMedianScoreDiff', 'oppositeChalkSeed', 'winPercentageDiff',
       'chalkSeedDiff', 'MedianScoreDiffDiff', 'BaselinePred'],
      dtype='object')

In [31]:
FEATURES = ["winPercentageDiff","chalkSeedDiff"]

TARGET = "Win"

In [32]:
# Create subsets of dfHistoricTourneyFeatures, with X containing the feature columns and y containing the target column.
X = dfHistoricTourneyFeatures[FEATURES]
y = dfHistoricTourneyFeatures[TARGET]

# X = pd.DataFrame(df_scaled)
# y = dfHistoricTourneyFeatures[TARGET]

groups = dfHistoricTourneyFeatures["Season"]
seasons = dfHistoricTourneyFeatures["Season"].unique()

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

gkf = GroupKFold(n_splits=dfHistoricTourneyFeatures["Season"].nunique())
cv_results = []
models = []

season_idx = 0
for train_index, test_index in gkf.split(X, y, groups): 
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Prepare the model
    model = RandomForestClassifier(n_estimators=200, random_state=56)

    holdout_season = seasons[season_idx]
    print(f"Holdout Season: {holdout_season}")

    # Train the model
    model.fit(X_train, y_train)

    # Predict probabilities on the test set
    y_pred_proba = model.predict_proba(X_test)

    # Predict labels for accuracy calculation
    y_pred = model.predict(X_test)

    # Calculate log loss
    score_ll = log_loss(y_test, y_pred_proba)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    models.append(model)
    cv_results.append((accuracy, score_ll))
    
    season_idx += 1
    print(f"Season {holdout_season} Accuracy: {accuracy}, Log Loss: {score_ll}")

# Calculate average accuracy and log loss across all folds
avg_accuracy = np.mean([result[0] for result in cv_results])
avg_log_loss = np.mean([result[1] for result in cv_results])

# Print the average accuracy and log loss across all folds
print("Average CV Accuracy:", avg_accuracy)
print("Average CV Log Loss:", avg_log_loss)

Holdout Season: 1985
Season 1985 Accuracy: 0.5447761194029851, Log Loss: 1.8025058062861201
Holdout Season: 1986
Season 1986 Accuracy: 0.6604477611940298, Log Loss: 1.1134684409254576
Holdout Season: 1987
Season 1987 Accuracy: 0.6576923076923077, Log Loss: 1.0441864203025928
Holdout Season: 1988
Season 1988 Accuracy: 0.6076923076923076, Log Loss: 1.2595057214132859
Holdout Season: 1989
Season 1989 Accuracy: 0.7076923076923077, Log Loss: 0.7703165208512471
Holdout Season: 1990
Season 1990 Accuracy: 0.6346153846153846, Log Loss: 1.7162030397189103
Holdout Season: 1991
Season 1991 Accuracy: 0.6923076923076923, Log Loss: 1.1408860531856584
Holdout Season: 1992
Season 1992 Accuracy: 0.6038461538461538, Log Loss: 1.2654100633802474
Holdout Season: 1993
Season 1993 Accuracy: 0.7115384615384616, Log Loss: 1.1431912480956
Holdout Season: 1994
Season 1994 Accuracy: 0.6576923076923077, Log Loss: 1.2265119193248792
Holdout Season: 1995
Season 1995 Accuracy: 0.7, Log Loss: 1.1831102227716739
Holdou

In [34]:
# from sklearn.model_selection import GroupKFold
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error
# import numpy as np

# gkf = GroupKFold(n_splits=dfHistoricTourneyFeatures["Season"].nunique())
# cv_results = []

# season_idx = 0
# for train_index, test_index in gkf.split(X, y, groups): 
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#     # Prepare the model
#     model = SVR(kernel='rbf')

#     holdout_season = seasons[season_idx]
#     print(f"Holdout Season: {holdout_season}")

#     # Train the model
#     model.fit(X_train, y_train)

#     # Predict on the test set
#     y_pred = model.predict(X_test)

#     # Calculate mean squared error
#     mse = mean_squared_error(y_test, y_pred)

#     cv_results.append(mse)
    
#     season_idx += 1
#     print(f"Season {holdout_season} MSE: {mse}")

# # Calculate average MSE across all folds
# avg_mse = np.mean(cv_results)

# # Print the average MSE across all folds
# print("Average CV MSE:", avg_mse)


In [35]:
TEST_SEASON = 2023

seeds_2024 = pd.read_csv(DATA_PATH + "2024_tourney_seeds.csv")

seeds_2024["chalkSeed"] = (seeds_2024["Seed"].str.replace("a", "").str.replace("b", "").str[1:].astype("int"))

In [36]:
print(len(seeds_2024))
seeds_2024.head()

128


Unnamed: 0,Tournament,Seed,TeamID,chalkSeed
0,M,W01,1163,1
1,M,W02,1235,2
2,M,W03,1228,3
3,M,W04,1120,4
4,M,W05,1361,5


In [37]:
tourneyPairs = (
    seeds_2024.merge(seeds_2024, on=["Tournament"], suffixes=("", "Opp")) 
    .assign(Season=TEST_SEASON)
    .query("TeamID != TeamIDOpp")
    .rename(columns={"Tournament": "League"})
)

In [38]:
dfHistoricTourneyFeatures.head()

Unnamed: 0,Season,League,TeamID,oppositeTeamID,teamScore,oppositeTeamScore,GameResult,Win,WinPercentage,MedianScoreDiff,chalkSeed,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed,winPercentageDiff,chalkSeedDiff,MedianScoreDiffDiff,BaselinePred
0,1985,M,1116,1234,63,54,W,1,0.636364,5.0,9.0,0.666667,9.5,8.0,-0.030303,1.0,-4.5,False
1,1985,M,1120,1345,59,58,W,1,0.62069,2.0,11.0,0.68,9.0,6.0,-0.05931,5.0,-7.0,False
2,1985,M,1207,1250,68,43,W,1,0.925926,14.0,1.0,0.37931,-3.0,16.0,0.546616,-15.0,17.0,True
3,1985,M,1229,1425,58,55,W,1,0.740741,6.0,9.0,0.678571,2.5,8.0,0.062169,1.0,3.5,False
4,1985,M,1242,1325,49,38,W,1,0.766667,5.5,3.0,0.740741,6.0,14.0,0.025926,-11.0,-0.5,True


In [39]:
tourneyPairs.head()

Unnamed: 0,League,Seed,TeamID,chalkSeed,SeedOpp,TeamIDOpp,chalkSeedOpp,Season
1,M,W01,1163,1,W02,1235,2,2023
2,M,W01,1163,1,W03,1228,3,2023
3,M,W01,1163,1,W04,1120,4,2023
4,M,W01,1163,1,W05,1361,5,2023
5,M,W01,1163,1,W06,1140,6,2023


In [40]:
# Merging 'tourneyPairs' and 'teamSeasonAggregate' dataframes

tourneyPairs = (
    tourneyPairs.merge(
        teamSeasonAggregate[
            ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ],
        on=["Season", "League", "TeamID"],
        how="left",
    )
    .merge(
        teamSeasonAggregate[
            ["Season", "League", "TeamID", "WinPercentage", "MedianScoreDiff"]
        ].rename(
            columns={
                "TeamID": "TeamIDOpp",
                "WinPercentage": "OppWinPercentage",
                "MedianScoreDiff": "OppMedianScoreDiff",
            }
        ),
        on=["Season", "League", "TeamIDOpp"],
    )
    .reset_index(drop=True)
)

tourneyPairs["oppositeChalkSeed"] = (
    tourneyPairs["SeedOpp"]
    .str.replace("a", "")
    .str.replace("b", "")
    .str[1:]
    .astype("int")
)

In [41]:
tourneyPairs.head()

Unnamed: 0,League,Seed,TeamID,chalkSeed,SeedOpp,TeamIDOpp,chalkSeedOpp,Season,WinPercentage,MedianScoreDiff,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed
0,M,W01,1163,1,W02,1235,2,2023,0.757576,12.0,0.59375,3.5,2
1,M,W01,1163,1,W03,1228,3,2023,0.757576,12.0,0.625,9.0,3
2,M,W01,1163,1,W04,1120,4,2023,0.757576,12.0,0.625,5.5,4
3,M,W01,1163,1,W05,1361,5,2023,0.757576,12.0,0.8125,8.0,5
4,M,W01,1163,1,W06,1140,6,2023,0.757576,12.0,0.53125,2.5,6


In [42]:
len(tourneyPairs)

8064

In [43]:
tourneyPairs["BaselinePred"] = (tourneyPairs["chalkSeed"] < tourneyPairs["oppositeChalkSeed"])

tourneyPairs.loc[tourneyPairs["chalkSeed"] == tourneyPairs["oppositeChalkSeed"],"BaselinePred",] = (tourneyPairs["WinPercentage"] > tourneyPairs["OppWinPercentage"])

tourneyPairs["winPercentageDiff"] = (tourneyPairs["WinPercentage"] - tourneyPairs["OppWinPercentage"])

tourneyPairs["chalkSeedDiff"] = (tourneyPairs["chalkSeed"] - tourneyPairs["oppositeChalkSeed"])

tourneyPairs["MedianScoreDiffDiff"] = (tourneyPairs["MedianScoreDiff"] - tourneyPairs["OppMedianScoreDiff"])

In [44]:
tourneyPairs.head(2)

Unnamed: 0,League,Seed,TeamID,chalkSeed,SeedOpp,TeamIDOpp,chalkSeedOpp,Season,WinPercentage,MedianScoreDiff,OppWinPercentage,OppMedianScoreDiff,oppositeChalkSeed,BaselinePred,winPercentageDiff,chalkSeedDiff,MedianScoreDiffDiff
0,M,W01,1163,1,W02,1235,2,2023,0.757576,12.0,0.59375,3.5,2,True,0.163826,-1,8.5
1,M,W01,1163,1,W03,1228,3,2023,0.757576,12.0,0.625,9.0,3,True,0.132576,-2,3.0


In [45]:
for i, model in enumerate(models):
    tourneyPairs[f"model{i}"] = model.predict(tourneyPairs[FEATURES])

In [46]:
tourneyPairs.columns

Index(['League', 'Seed', 'TeamID', 'chalkSeed', 'SeedOpp', 'TeamIDOpp',
       'chalkSeedOpp', 'Season', 'WinPercentage', 'MedianScoreDiff',
       'OppWinPercentage', 'OppMedianScoreDiff', 'oppositeChalkSeed',
       'BaselinePred', 'winPercentageDiff', 'chalkSeedDiff',
       'MedianScoreDiffDiff', 'model0', 'model1', 'model2', 'model3', 'model4',
       'model5', 'model6', 'model7', 'model8', 'model9', 'model10', 'model11',
       'model12', 'model13', 'model14', 'model15', 'model16', 'model17',
       'model18', 'model19', 'model20', 'model21', 'model22', 'model23',
       'model24', 'model25', 'model26', 'model27', 'model28', 'model29',
       'model30', 'model31', 'model32', 'model33', 'model34', 'model35',
       'model36', 'model37'],
      dtype='object')

In [47]:
tourneyPairs["Pred"] = tourneyPairs[[f for f in tourneyPairs.columns if "model" in f]].mean(axis=1)

tourneyPairs["ID"] = (tourneyPairs["Season"].astype("str") + "_" + tourneyPairs["TeamID"].astype("str") + "_" + tourneyPairs["TeamIDOpp"].astype("str"))

preds = tourneyPairs.copy()

In [48]:
preds

Unnamed: 0,League,Seed,TeamID,chalkSeed,SeedOpp,TeamIDOpp,chalkSeedOpp,Season,WinPercentage,MedianScoreDiff,...,model30,model31,model32,model33,model34,model35,model36,model37,Pred,ID
0,M,W01,1163,1,W02,1235,2,2023,0.757576,12.0,...,1,1,1,1,1,1,1,1,1.000000,2023_1163_1235
1,M,W01,1163,1,W03,1228,3,2023,0.757576,12.0,...,1,1,1,1,1,1,1,1,0.973684,2023_1163_1228
2,M,W01,1163,1,W04,1120,4,2023,0.757576,12.0,...,0,0,0,0,0,0,0,0,0.078947,2023_1163_1120
3,M,W01,1163,1,W05,1361,5,2023,0.757576,12.0,...,0,0,0,0,0,0,0,0,0.000000,2023_1163_1361
4,M,W01,1163,1,W06,1140,6,2023,0.757576,12.0,...,0,0,0,0,0,0,0,0,0.052632,2023_1163_1140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,W,Z16,3394,16,Z11,3112,11,2023,0.592593,7.0,...,1,1,0,1,1,1,1,1,0.973684,2023_3394_3112
8060,W,Z16,3394,16,Z12,3162,12,2023,0.592593,7.0,...,0,0,0,0,0,0,0,0,0.000000,2023_3394_3162
8061,W,Z16,3394,16,Z13,3267,13,2023,0.592593,7.0,...,1,1,1,1,1,1,1,1,1.000000,2023_3394_3267
8062,W,Z16,3394,16,Z14,3238,14,2023,0.592593,7.0,...,1,1,1,1,1,1,0,1,0.921053,2023_3394_3238


In [49]:
!pip install tqdm



In [50]:
from tqdm import tqdm 

# Load and filter data
roundSlots = pd.read_csv(f"{DATA_PATH}/MNCAATourneySlots.csv")
roundSlots = roundSlots[roundSlots["Season"] == 2023]
roundSlots = roundSlots[roundSlots["Slot"].str.contains("R")  ]                                          
seeds = pd.read_csv(f"{DATA_PATH}/2024_tourney_seeds.csv")

seeds_m = seeds[seeds["Tournament"] == "M"]
seeds_w = seeds[seeds["Tournament"] == "W"]

preds["ID"] = preds["ID"].str.split("_")

In [51]:
roundSlots.head(2)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2385,2023,R1W1,W01,W16
2386,2023,R1W2,W02,W15


In [52]:
seeds.head(2)

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1163
1,M,W02,1235


In [53]:
preds.head(2)

Unnamed: 0,League,Seed,TeamID,chalkSeed,SeedOpp,TeamIDOpp,chalkSeedOpp,Season,WinPercentage,MedianScoreDiff,...,model30,model31,model32,model33,model34,model35,model36,model37,Pred,ID
0,M,W01,1163,1,W02,1235,2,2023,0.757576,12.0,...,1,1,1,1,1,1,1,1,1.0,"[2023, 1163, 1235]"
1,M,W01,1163,1,W03,1228,3,2023,0.757576,12.0,...,1,1,1,1,1,1,1,1,0.973684,"[2023, 1163, 1228]"


In [54]:
preds.columns

Index(['League', 'Seed', 'TeamID', 'chalkSeed', 'SeedOpp', 'TeamIDOpp',
       'chalkSeedOpp', 'Season', 'WinPercentage', 'MedianScoreDiff',
       'OppWinPercentage', 'OppMedianScoreDiff', 'oppositeChalkSeed',
       'BaselinePred', 'winPercentageDiff', 'chalkSeedDiff',
       'MedianScoreDiffDiff', 'model0', 'model1', 'model2', 'model3', 'model4',
       'model5', 'model6', 'model7', 'model8', 'model9', 'model10', 'model11',
       'model12', 'model13', 'model14', 'model15', 'model16', 'model17',
       'model18', 'model19', 'model20', 'model21', 'model22', 'model23',
       'model24', 'model25', 'model26', 'model27', 'model28', 'model29',
       'model30', 'model31', 'model32', 'model33', 'model34', 'model35',
       'model36', 'model37', 'Pred', 'ID'],
      dtype='object')

In [55]:
def prepareData(seeds, preds):
    seedDict = seeds.set_index("Seed")["TeamID"].to_dict()
    inverted_seed_dict = {value: key for key, value in seedDict.items()}
    probabilitiesDict = {}

    for teams, proba in zip(preds["ID"], preds["Pred"]):
        team1, team2 = teams[1], teams[2]

        probabilitiesDict.setdefault(team1, {})[team2] = proba
        probabilitiesDict.setdefault(team2, {})[team1] = 1 - proba

    return seedDict, inverted_seed_dict, probabilitiesDict

def simulate(roundSlots, seeds, inverted_seeds, probas, sim=True):
    """
    Simulates each round of the tournament.

    Parameters:
    - roundSlots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    """
    winners = []
    slots = []

    for slot, strong, weak in zip(
        roundSlots.Slot, roundSlots.StrongSeed, roundSlots.WeakSeed
    ):
        team_1, team_2 = seeds[strong], seeds[weak]

        # Get the probability of team_1 winning
        proba = probas[str(team_1)][str(team_2)]

        if sim:
            # Randomly determine the winner based on the probability
            winner = np.random.choice([team_1, team_2], p=[proba, 1 - proba])
        else:
            # Determine the winner based on the higher probability
            winner = [team_1, team_2][np.argmax([proba, 1 - proba])]

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots

def run_simulation(brackets=1, seeds=None, preds=None, roundSlots=None, sim=True):
    """
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - roundSlots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    """
    
    # Get relevant data for the simulation
    seedDict, inverted_seed_dict, probabilitiesDict = prepareData(seeds, preds)
    
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(roundSlots, seedDict, inverted_seed_dict, probabilitiesDict, sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({"Bracket": bracket, "Slot": slots, "Team": results})

    return result_df

n_brackets = 3
result_m = run_simulation(
    brackets=n_brackets, seeds=seeds_m, preds=preds, roundSlots=roundSlots, sim=False
)
result_m["Tournament"] = "M"
result_w = run_simulation(brackets=n_brackets, seeds=seeds_w, preds=preds, roundSlots=roundSlots, sim=False)
result_w["Tournament"] = "W"
submission = pd.concat([result_m, result_w])
submission = submission.reset_index(drop=True)
submission.index.names = ["RowId"]
submission = submission.reset_index()

100%|██████████| 3/3 [00:00<00:00, 1358.55it/s]
100%|██████████| 3/3 [00:00<00:00, 868.51it/s]


In [56]:
ss = pd.read_csv(DATA_PATH + "sample_submission.csv")
submission[ss.columns] = submission[ss.columns]
submission[ss.columns].to_csv("submission.csv", index=False)