<a href="https://www.kaggle.com/code/miltiadesgeneral/marchmadness2023-lrmodel?scriptVersionId=120967190" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import glob
import pandas as pd
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import re

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression, ElasticNet
import seaborn as sns

%matplotlib inline

In [2]:
class Config:
    paths = glob.glob("/kaggle/input/warmup-round-march-machine-learning-mania-2023/*.csv")
    paths_dct = {}
    for path in paths:
        paths_dct[path.split("/")[-1][:-4]] = path
    
    paths_keys = list(paths_dct.keys())

In [3]:
paths = Config.paths_dct

## Constructing the Dataframe
- Get the team seeds
- Get the team rankings
- Engineer regular season results for a team during a season
- Engineer tourney results for a team during a season

### Include Features
- W/L percentage
- Point Differential
- Tournament Seed

### Team Seeds

In [4]:
Mseeds = Config.paths_dct["MNCAATourneySeeds"]
Wseeds = Config.paths_dct["WNCAATourneySeeds"]

seeds_df = pd.concat([
        pd.read_csv(Mseeds),
        pd.read_csv(Wseeds)
], ignore_index=True)

seeds_df.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


### Regular Season Results

In [5]:
# Create a dataframe of all the games from both men and women, regular season and tourneys
MRegularSeason = Config.paths_dct["MRegularSeasonCompactResults"]
WRegularSeason = Config.paths_dct["WRegularSeasonCompactResults"]
regular_season_df = pd.concat([
    pd.read_csv(MRegularSeason),
    pd.read_csv(WRegularSeason)
], ignore_index=True)

# Mark that the game is not a tourney game
regular_season_df["Tourney"] = 0

MTourney = Config.paths_dct["MNCAATourneyCompactResults"]
WTourney = Config.paths_dct["WNCAATourneyCompactResults"]
tourney_df = pd.concat([
    pd.read_csv(MTourney),
    pd.read_csv(WTourney)
], ignore_index=True)

# Mark that the game is a tourney game
tourney_df = tourney_df[tourney_df["Season"] < 2017]
tourney_df["Tourney"] = 1

# Combine the tourney games with the regular season games
full_season_df = pd.concat([
    regular_season_df,
    tourney_df
], ignore_index=True)

### Flatten the Dataframe so each team has its own row 

In [6]:
# Separate and reconnect the winning and losing teams
rename_w_cats = { "WTeamID": "TeamID", "WScore": "Score" }
rename_l_cats = { "LTeamID": "TeamID", "LScore": "Score" }

teams_df = pd.concat([
    pd.read_csv(paths["MTeams"]),
    pd.read_csv(paths["WTeams"])
], ignore_index=True).drop(["FirstD1Season", "LastD1Season"], axis=1)


_winning_df = full_season_df
_winning_df["PtDiff"] = _winning_df["WScore"] - _winning_df["LScore"]
_winning_df["Win"] = 1
_winning_df = _winning_df.rename(rename_w_cats, axis=1).drop(rename_l_cats.keys(), axis=1)

winning_teams_df = pd.merge(
    left=teams_df,
    right=_winning_df,
    on="TeamID"
)

_losing_df = full_season_df
_losing_df["PtDiff"] = _losing_df["LScore"] - _losing_df["WScore"]
_losing_df["Win"] = 0
_losing_df = _losing_df.rename(rename_l_cats, axis=1).drop(rename_w_cats.keys(), axis=1)

losing_teams_df = pd.merge(
    left=teams_df,
    right=_losing_df,
    on="TeamID"
)

teams_df = pd.concat([
    winning_teams_df,
    losing_teams_df
], ignore_index=True).sort_values(["Season", "TeamID", "DayNum"]).reset_index(drop=True).drop(["WLoc", "NumOT"], axis=1)

In [7]:
teams_df

Unnamed: 0,TeamID,TeamName,Season,DayNum,Score,Tourney,PtDiff,Win
0,1102,Air Force,1985,26,59,0,-5,0
1,1102,Air Force,1985,32,78,0,11,1
2,1102,Air Force,1985,33,55,0,-25,0
3,1102,Air Force,1985,40,85,0,29,1
4,1102,Air Force,1985,61,52,0,-15,0
...,...,...,...,...,...,...,...,...
619047,3477,TX A&M Commerce,2023,101,67,0,20,1
619048,3477,TX A&M Commerce,2023,103,66,0,-5,0
619049,3477,TX A&M Commerce,2023,108,83,0,13,1
619050,3477,TX A&M Commerce,2023,110,55,0,-11,0


### Calculate cumulative season statistics for each team going into each game

In [8]:
# Save win, tourney and score column to reattach
win_col = teams_df["Win"]
score_col = teams_df["Score"]
tourney_col = teams_df["Tourney"]

cum_sum = teams_df.groupby(["Season", "TeamID"])[["Score", "PtDiff", "Win"]].cumsum()
teams_df = teams_df[["TeamID", "TeamName", "Season", "DayNum"]]
teams_df[["Score", "PtDiff", "Win"]] = cum_sum
teams_df = teams_df.rename({ "Win": "CumWins" }, axis=1)

# reattach cols
teams_df["Win"] = win_col
teams_df["FinalScore"] = score_col
teams_df["Tourney"] = tourney_col

count = teams_df.groupby(["Season", "TeamID"]).cumcount() + 1
teams_df["count"] = count
# Calculate winning percentage
teams_df["WPct"] = teams_df["CumWins"] / teams_df["count"]
# Calculate pt differential per game
teams_df["PtDiff/g"] = teams_df["PtDiff"] / teams_df["count"]
# Calculate points scored per game
teams_df["Pts/g"] = teams_df["Score"] / teams_df["count"]


# def treatSeed(seed):
#     return int(re.sub('[^0-9]', "", seed))

# teams_seeds_df = pd.merge(
#     teams_df,
#     seeds_df,
#     left_on=["Season", "TeamID"],
#     right_on=["Season", "TeamID"]
# )

# teams_seeds_df["Seed"] = teams_seeds_df["Seed"].apply(treatSeed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [9]:
# Shift the day by one up so that each statistic corresponds to the statistic going into that game for a given team
shifted_day = teams_df.groupby(["Season", "TeamID"])["DayNum"].shift(-1)
teams_df["DayNum"] = shifted_day
teams_df = teams_df.dropna()
teams_df["DayNum"] = teams_df["DayNum"].astype("int")

In [10]:
def findComplimentaryIndices(arr, end, start=0):
    complimentary_values = set(range(start, end+1)) - set(arr)
    return list(complimentary_values)

def swapIDs(df, col1, col2):
    # Create a random array of indices to take from the winner column
    _rand_range = list(range(len(df) - 1))
    rand_index = []
    for i in range(len(_rand_range) // 2):
        _choice = random.choice(_rand_range)
        rand_index.append(_choice)
        _rand_range.remove(_choice)
    
    comp_index = findComplimentaryIndices(rand_index, end=len(df) - 1)
    
    _winners_toA = df.loc[rand_index, col1].sort_index()
    _losers_toA = df.loc[comp_index, col2].sort_index()
    
    _winners_toB = df.loc[comp_index, col1].sort_index()
    _losers_toB = df.loc[rand_index, col2].sort_index()
    
    _a_col = pd.concat([
        _winners_toA,
        _losers_toA
    ]).sort_index()
    
    _b_col = pd.concat([
        _winners_toB,
        _losers_toB
    ]).sort_index()
        
    df["ATeamID"] = _a_col
    df["BTeamID"] = _b_col
    
    df.drop([col1, col2], axis=1, inplace=True)

In [11]:
# Start a dataframe to get the original matchup for all games 
# full_games_df = full_season_df[["Season", "DayNum", "WTeamID", "LTeamID"]]
# full_games_df
# # randomly assign a team to be ATeam and a team to be BTeam
# swapIDs(full_games_df, "WTeamID", "LTeamID")

In [12]:
full_games_df = pd.read_csv("/kaggle/input/full-games-helper-file-mm-2023/full_games.csv")

In [13]:
full_games_df = full_games_df.drop(["Unnamed: 0"], axis=1)
full_games_df

Unnamed: 0,Season,DayNum,ATeamID,BTeamID
0,1985,20,1228,1328
1,1985,25,1106,1354
2,1985,25,1223,1112
3,1985,25,1432,1165
4,1985,25,1192,1447
...,...,...,...,...
309521,2016,147,3163,3400
309522,2016,147,3333,3124
309523,2016,153,3333,3163
309524,2016,153,3393,3449


In [14]:
full_season_df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,Tourney,PtDiff,Win
0,1985,20,1228,81,1328,64,N,0,0,-17,0
1,1985,25,1106,77,1354,70,H,0,0,-7,0
2,1985,25,1112,63,1223,56,H,0,0,-7,0
3,1985,25,1165,70,1432,54,H,0,0,-16,0
4,1985,25,1192,86,1447,74,H,0,0,-12,0
...,...,...,...,...,...,...,...,...,...,...,...
309521,2016,147,3163,86,3400,65,N,0,1,-21,0
309522,2016,147,3333,60,3124,57,N,0,1,-3,0
309523,2016,153,3163,80,3333,51,N,0,1,-29,0
309524,2016,153,3393,80,3449,59,N,0,1,-21,0


In [15]:
rename_a_map = { 'TeamID' : 'ATeamID', 'Pts/g' : 'APts/g', 'PtDiff/g': 'APtDiff/g', 'WPct': 'AWPct', 'Win': 'APrevWin', 'Score': 'ACumScore', 'PtDiff': 'APtDiff',
               'CumWins': 'ACumWins', 'FinalScore': 'APrevFinalScore', 'Seed': 'ASeed', 'Tourney': 'ATourney' }

rename_b_map = { 'TeamID' : 'BTeamID', 'Pts/g' : 'BPts/g', 'PtDiff/g': 'BPtDiff/g', 'WPct': 'BWPct', 'Win': 'BPrevWin', 'Score': 'BCumScore', 'PtDiff': 'BPtDiff',
               'CumWins': 'BCumWins', 'FinalScore': 'BPrevFinalScore', 'Seed': 'BSeed', 'Tourney': 'BTourney', }

full_df = pd.merge(
    left=full_games_df,
    right=teams_df,
    left_on=["ATeamID", "Season", "DayNum"], 
    right_on=["TeamID", "Season", "DayNum"]
).drop(["count", "TeamID", "TeamName"], axis=1).rename(rename_a_map, axis=1)

full_df = pd.merge(
    left=full_df,
    right=teams_df,
    left_on=["BTeamID", "Season", "DayNum"],
    right_on=["TeamID", "Season", "DayNum"]
).drop(["count", "TeamID", "TeamName"], axis=1).rename(rename_b_map, axis=1)

In [16]:
w_seasons_df = full_season_df[["Season", "DayNum", "WTeamID", "WScore"]]
l_seasons_df = full_season_df[["Season", "DayNum", "LTeamID", "LScore"]]
a_winner_df = pd.merge(
    full_df,
    w_seasons_df,
    left_on=["ATeamID", "Season", "DayNum"],
    right_on=["WTeamID", "Season", "DayNum"]
).rename({ "WScore": "AScore" }, axis=1).drop("WTeamID", axis=1)

a_winner_df = pd.merge(
    a_winner_df,
    l_seasons_df,
    left_on=["BTeamID", "Season", "DayNum"],
    right_on=["LTeamID", "Season", "DayNum"]
).rename({ "LScore": "BScore" }, axis=1).drop("LTeamID", axis=1).reset_index(drop=True)

a_loser_df = pd.merge(
    full_df,
    l_seasons_df,
    left_on=["ATeamID", "Season", "DayNum"],
    right_on=["LTeamID", "Season", "DayNum"]
).rename({ "LScore": "AScore" }, axis=1).drop("LTeamID", axis=1)

a_loser_df = pd.merge(
    a_loser_df,
    w_seasons_df,
    left_on=["BTeamID", "Season", "DayNum"],
    right_on=["WTeamID", "Season", "DayNum"]
).rename({ "WScore": "BScore" }, axis=1).drop("WTeamID", axis=1).reset_index(drop=True)

full_df = pd.concat([
    a_winner_df,
    a_loser_df
], ignore_index=True).sort_values(["Season", "DayNum", "ATeamID"]).reset_index(drop=True)

In [17]:
full_df["Win"] = full_df["AScore"] > full_df["BScore"]
full_df["ScoreDiff"] = full_df["AScore"] - full_df["BScore"]

In [18]:
full_df["Win"] = full_df["Win"].apply(lambda x: int(x))
full_df

Unnamed: 0,Season,DayNum,ATeamID,BTeamID,ACumScore,APtDiff,ACumWins,APrevWin,APrevFinalScore,ATourney,...,BPrevWin,BPrevFinalScore,BTourney,BWPct,BPtDiff/g,BPts/g,AScore,BScore,Win,ScoreDiff
0,1985,26,1397,1226,65,-5,0,0,65,0,...,0,44,0,0.000000,-20.000000,44.000000,65,59,1,6
1,1985,26,1412,1228,70,5,1,1,70,0,...,1,64,0,1.000000,18.500000,72.500000,59,52,1,7
2,1985,27,1268,1397,56,-2,0,0,56,0,...,1,65,0,0.500000,0.500000,65.000000,72,49,1,23
3,1985,27,1332,1228,49,-17,0,0,49,0,...,0,52,0,0.666667,10.000000,65.666667,72,75,0,-3
4,1985,27,1412,1242,129,12,2,1,59,0,...,1,66,0,1.000000,9.500000,62.000000,50,46,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296196,2023,113,3177,3437,2178,155,15,1,86,0,...,0,51,0,0.821429,13.500000,71.285714,64,67,0,-3
296197,2023,113,3237,3464,1910,-19,15,1,87,0,...,0,54,0,0.640000,5.000000,65.280000,87,80,1,7
296198,2023,113,3268,3234,2120,273,22,1,66,0,...,1,80,0,0.814815,18.370370,88.296296,96,68,1,28
296199,2023,113,3344,3207,1719,-36,13,0,64,0,...,0,34,0,0.423077,-3.730769,59.153846,69,73,0,-4


In [19]:
full_df[(full_df["Season"] == 2015) & (full_df["ATeamID"] == 1181)]

Unnamed: 0,Season,DayNum,ATeamID,BTeamID,ACumScore,APtDiff,ACumWins,APrevWin,APrevFinalScore,ATourney,...,BPrevWin,BPrevFinalScore,BTourney,BWPct,BPtDiff/g,BPts/g,AScore,BScore,Win,ScoreDiff
209374,2015,18,1181,1396,303,129,3,1,81,0,...,1,82,0,1.0,5.0,61.0,74,54,1,20
209475,2015,19,1181,1390,377,149,4,1,74,0,...,1,89,0,1.0,18.333333,82.333333,70,59,1,11
209917,2015,23,1181,1202,447,160,5,1,70,0,...,0,55,0,0.333333,-6.333333,59.666667,93,54,1,39
210576,2015,30,1181,1458,633,219,7,1,93,0,...,1,69,0,1.0,24.0,74.571429,80,70,1,10
211641,2015,45,1181,1163,788,242,9,1,75,0,...,1,106,0,0.571429,5.857143,68.285714,66,56,1,10
212334,2015,56,1181,1405,854,252,10,1,66,0,...,1,83,0,0.636364,8.818182,77.454545,86,69,1,17
212544,2015,58,1181,1459,940,269,11,1,86,0,...,0,44,0,0.7,2.2,62.6,84,55,1,29
213131,2015,65,1181,1448,1109,321,13,1,85,0,...,0,76,0,0.533333,2.933333,70.6,73,65,1,8
213725,2015,71,1181,1274,1257,317,14,0,75,0,...,1,60,0,0.733333,8.4,69.8,74,90,0,-16
214041,2015,75,1181,1257,1331,301,14,0,74,0,...,1,78,0,0.882353,16.941176,73.823529,63,52,1,11


In [20]:
features = ["AWPct", "APtDiff/g", "ACumWins", "ACumScore", "APts/g", "BWPct", "BPtDiff/g", "BCumWins", "BCumScore", "BPts/g"]

In [21]:
def rescale(features, df_train, df_val, df_test=None):
    min_ = df_train[features].min()
    max_ = df_train[features].max()
    
    df_train[features] = (df_train[features] - min_) / (max_ - min_)
    df_val[features] = (df_val[features] - min_) / (max_ - min_)
    
    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / (max_ - min_)
        
    return df_train, df_val, df_test

In [22]:
def KFolds(df, df_test=None, plot=False, verbose=False, mode='reg'):
    seasons = df['Season'].unique()
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "Win"
    
    fold = 0
    
    minScore = float("inf")
    best_index = 0
    
    for season in seasons[1:]:
        if verbose:
            print(f'\n Validating on season {season}')
            
            df_train = df[df['Season'] < season].reset_index(drop=True).copy()
            df_val = df[df['Season'] == season].reset_index(drop=True).copy()
#             df_test = df_test_.copy() if df_test.any() else None
            
            df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)
            
            if mode == "reg":
                model = LinearRegression()
            else:
                model = LogisticRegression(C=1)
                
            model.fit(df_train[features], df_train[target])
            
            if mode == "reg":
                pred = model.predict(df_val[features])
            else:
                pred = model.predict_proba(df_val[features])[:, 1]
        
            if df_test is not None:
                if mode == "reg":
                    pred_test = model.predict(df_test[features])

                    print( pred_test.max() - pred_test.min())
                    pred_test = (pred_test - pred_test.min()) / (pred_test.max() - pred_test.min())
                else:
                    pred_test = model.predict_proba(df_test[features])[:, 1]

                pred_tests.append(pred_test)
            
            if plot:
                plt.figure(figsize=(15, 6))
                plt.subplot(1, 2, 1)
                plt.scatter(pred, df_val['ScoreDiff'].values, s=5)
                plt.title('Prediction vs Score Diff')
                plt.grid(True)
                plt.subplot(1, 2, 2)
                sns.histplot(pred, bins=20)
                plt.title('Predictions probability repartition')
                plt.show()
            
            pred = (pred - pred.min()) / (pred.max() - pred.min())
            pred = np.clip(pred, 0, 1)
            

            score = ((df_val['Win'].values - pred) ** 2).mean()
            cvs.append(score)
            
            if score < minScore:
                best_index = fold
                minScore = score
            print(f'Best Fold: {best_index}')
            
            if verbose:
                print(f'\t -> Scored {score:.3f}')
                
            fold += 1

        print(f'\n Local CV is {np.mean(cvs):.3f}')

    return pred_tests

# Apply to testing df
____________________________________________________

In [23]:
team_season_avg = teams_df.groupby(["TeamID", "Season"]).mean().reset_index()
test_df = pd.read_csv(Config.paths_dct["SampleSubmissionWarmup"])

season = test_df["ID"].apply(lambda x: x.split("_")[0])
ATeamID = test_df["ID"].apply(lambda x: x.split("_")[1])
BTeamID = test_df["ID"].apply(lambda x: x.split("_")[2])

test_df["Season"] = season.astype("int")
test_df["ATeamID"] = ATeamID.astype("int")
test_df["BTeamID"] = BTeamID.astype("int")
test_df

Unnamed: 0,ID,Pred,Season,ATeamID,BTeamID
0,2017_1101_1102,0.5,2017,1101,1102
1,2017_1101_1103,0.5,2017,1101,1103
2,2017_1101_1104,0.5,2017,1101,1104
3,2017_1101_1105,0.5,2017,1101,1105
4,2017_1101_1106,0.5,2017,1101,1106
...,...,...,...,...,...
614314,2022_3469_3471,0.5,2022,3469,3471
614315,2022_3469_3472,0.5,2022,3469,3472
614316,2022_3470_3471,0.5,2022,3470,3471
614317,2022_3470_3472,0.5,2022,3470,3472


In [24]:
# Replace the two teams that didnt have a row for 2021
# Replace with the mean of the 2020 and 2022 season
# Team1: teamID = 3169
# Team2: teamID = 3197
team_1_fill = team_season_avg.iloc[14227:14229].mean()
team_2_fill = team_season_avg.iloc[14930:14932].mean()

team_1_fill["TeamID"] = team_1_fill["TeamID"].astype("int")
team_1_fill["Season"] = team_1_fill["Season"].astype("int")

team_2_fill["TeamID"] = team_2_fill["TeamID"].astype("int")
team_2_fill["Season"] = team_2_fill["Season"].astype("int")

In [25]:
team_season_avg.loc[(len(team_season_avg.index))] = team_1_fill
team_season_avg.loc[(len(team_season_avg.index))] = team_2_fill

team_season_avg[["TeamID", "Season"]] = team_season_avg[["TeamID", "Season"]].astype("int")
team_season_avg

Unnamed: 0,TeamID,Season,DayNum,Score,PtDiff,CumWins,Win,FinalScore,Tourney,count,WPct,PtDiff/g,Pts/g
0,1101,2014,67.900000,633.700000,-210.350000,0.750000,0.100000,63.500000,0.0,10.5,0.047414,-21.847326,59.899979
1,1101,2015,76.259259,878.444444,-148.703704,4.185185,0.259259,62.259259,0.0,14.0,0.267709,-12.769678,61.159269
2,1101,2016,75.961538,931.307692,-116.807692,3.884615,0.346154,69.884615,0.0,13.5,0.243308,-10.586515,69.219923
3,1101,2017,76.708333,831.541667,-62.000000,4.000000,0.375000,67.708333,0.0,12.5,0.342147,-3.983081,66.240762
4,1101,2018,73.153846,970.653846,-30.576923,6.384615,0.423077,70.730769,0.0,13.5,0.430008,-3.453277,72.093502
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21425,3475,2023,69.173913,770.869565,-53.434783,4.347826,0.347826,62.565217,0.0,12.0,0.389505,-4.281552,66.059219
21426,3476,2023,61.480000,724.080000,-113.800000,3.080000,0.240000,58.480000,0.0,13.0,0.202319,-10.205478,53.086281
21427,3477,2023,66.760000,836.400000,-118.840000,4.000000,0.400000,64.640000,0.0,13.0,0.221138,-10.392260,64.367660
21428,3169,2021,73.657931,787.920000,-132.394483,4.600690,0.329655,56.286897,0.0,14.0,0.286761,-12.328393,55.810160


In [26]:
test_df = pd.merge(
    test_df,
    team_season_avg,
    left_on=["Season", "ATeamID"],
    right_on=["Season", "TeamID"],
    how='left'
).drop(["count", "TeamID", "DayNum"], axis=1).rename(rename_a_map, axis=1)

test_df = pd.merge(
    test_df,
    team_season_avg,
    left_on=["Season", "BTeamID"],
    right_on=["Season", "TeamID"],
    how='left'
).drop(["count", "TeamID", "DayNum"], axis=1).rename(rename_b_map, axis=1)

In [27]:
preds = KFolds(full_df, test_df, plot=False, verbose=True)


 Validating on season 1986
90.97148125716338
Best Fold: 0
	 -> Scored 0.212

 Local CV is 0.212

 Validating on season 1987
6.846492065952141
Best Fold: 0
	 -> Scored 0.221

 Local CV is 0.217

 Validating on season 1988
8.286704337370761
Best Fold: 0
	 -> Scored 0.216

 Local CV is 0.217

 Validating on season 1989
7.230123781487848
Best Fold: 0
	 -> Scored 0.217

 Local CV is 0.217

 Validating on season 1990
7.833893879663288
Best Fold: 0
	 -> Scored 0.220

 Local CV is 0.217

 Validating on season 1991
7.124407867946758
Best Fold: 0
	 -> Scored 0.224

 Local CV is 0.218

 Validating on season 1992
7.048171178578878
Best Fold: 0
	 -> Scored 0.215

 Local CV is 0.218

 Validating on season 1993
6.857928473792605
Best Fold: 0
	 -> Scored 0.219

 Local CV is 0.218

 Validating on season 1994
7.471180583425501
Best Fold: 0
	 -> Scored 0.217

 Local CV is 0.218

 Validating on season 1995
7.13596003950719
Best Fold: 0
	 -> Scored 0.219

 Local CV is 0.218

 Validating on season 1996
6.9

## Prepare for Submission

In [28]:
submission_df = pd.DataFrame(test_df["ID"])
submission_df["Pred"] = preds[27]

In [29]:
submission_df

Unnamed: 0,ID,Pred
0,2017_1101_1102,0.562256
1,2017_1101_1103,0.707810
2,2017_1101_1104,0.628547
3,2017_1101_1105,0.317965
4,2017_1101_1106,0.389867
...,...,...
614314,2022_3469_3471,0.601543
614315,2022_3469_3472,0.529394
614316,2022_3470_3471,0.448459
614317,2022_3470_3472,0.376311


In [30]:
submission_df.to_csv("submission.csv", index=False)