In [1]:
import random
import numpy as np
import pandas as pd
import polars as pl

In [2]:
import glob
import os
from os import listdir, walk
from os.path import join, split

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [4]:
import sklearn as sk
import torch

In [5]:
class CFG:
    """Notebook-wide configuration constants."""

    # Seed kept here for reproducibility; none of the visible cells consume it yet.
    random_seed = 42
    # Directory holding the competition CSV files. The original local path stays
    # the default; set MARCH_MANIA_DATA_DIR to run the notebook on another machine
    # without editing this cell.
    dir = os.environ.get(
        "MARCH_MANIA_DATA_DIR",
        "/home/nick/Projects/kaggle/march_mania_24/data/",
    )

In [6]:
def load_df(df_name, describe=False):
    """Read ``<CFG.dir>/<df_name>.csv`` with polars, print a short report, and return it.

    Args:
        df_name: File name of the CSV without the ``.csv`` extension.
        describe: When true, additionally display the frame's ``describe()`` table.

    Returns:
        The loaded polars DataFrame.
    """
    csv_path = join(CFG.dir, f"{df_name}.csv")
    # ignore_errors=True asks polars to keep reading when some values fail to parse.
    frame = pl.read_csv(csv_path, ignore_errors=True)

    banner = "*" * 50
    print("\n" + banner)
    print(f"DATASET NAME: {df_name}")
    print(f"SHAPE: {frame.shape}")
    print(f"\nMEMORY USAGE: {frame.estimated_size('b')} bytes")

    print("\nDISPLAY\n")
    preview = frame.head(3)
    display(preview)

    if describe:
        print("\nDESCRIBE\n")
        display(frame.describe())

    return frame

In [7]:
# Load the three split parts of the Massey ordinals file, in order.
mMassey_partaa, mMassey_partab, mMassey_partac = [
    load_df(part_name)
    for part_name in (
        "MMasseyOrdinals_partaa",
        "MMasseyOrdinals_partab",
        "MMasseyOrdinals_partac",
    )
]


**************************************************
DATASET NAME: MMasseyOrdinals_partaa
SHAPE: (2386744, 5)

MEMORY USAGE: 114563712 bytes

DISPLAY



Season,RankingDayNum,SystemName,TeamID,OrdinalRank
i64,i64,str,i64,i64
2003,35,"""SEL""",1102,159
2003,35,"""SEL""",1103,229
2003,35,"""SEL""",1104,12



**************************************************
DATASET NAME: MMasseyOrdinals_partab
SHAPE: (2377078, 5)

MEMORY USAGE: 114099744 bytes

DISPLAY



Season,RankingDayNum,SystemName,TeamID,OrdinalRank
i64,i64,str,i64,i64
2015,65,"""BUR""",1234,41
2015,65,"""BUR""",1235,18
2015,65,"""BUR""",1236,283



**************************************************
DATASET NAME: MMasseyOrdinals_partac
SHAPE: (402283, 5)

MEMORY USAGE: 19309584 bytes

DISPLAY



Season,RankingDayNum,SystemName,TeamID,OrdinalRank
i64,i64,str,i64,i64
2023,79,"""JNG""",1117,301
2023,79,"""JNG""",1119,202
2023,79,"""JNG""",1120,19


In [8]:
# Stack the three part files back into a single ordinals table.
mMassey_list = [
    mMassey_partaa,
    mMassey_partab,
    mMassey_partac,
]
mMassey = pl.concat(mMassey_list, how="vertical")
mMassey

Season,RankingDayNum,SystemName,TeamID,OrdinalRank
i64,i64,str,i64,i64
2003,35,"""SEL""",1102,159
2003,35,"""SEL""",1103,229
2003,35,"""SEL""",1104,12
2003,35,"""SEL""",1105,314
2003,35,"""SEL""",1106,260
…,…,…,…,…
2024,100,"""WLK""",1474,275
2024,100,"""WLK""",1475,331
2024,100,"""WLK""",1476,357
2024,100,"""WLK""",1477,312


In [9]:
# Load the NCAA tournament detailed-results table.
mTourneyResults = load_df(df_name="MNCAATourneyDetailedResults")


**************************************************
DATASET NAME: MNCAATourneyDetailedResults
SHAPE: (1315, 34)

MEMORY USAGE: 368200 bytes

DISPLAY



Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
2003,134,1421,92,1411,84,"""N""",1,32,69,11,29,17,26,14,30,17,12,5,3,22,29,67,12,31,14,31,17,28,16,15,5,0,22
2003,136,1112,80,1436,51,"""N""",0,31,66,7,23,11,14,11,36,22,16,10,7,8,20,64,4,16,7,7,8,26,12,17,10,3,15
2003,136,1113,84,1272,71,"""N""",0,31,59,6,14,16,22,10,27,18,9,7,4,19,25,69,7,28,14,21,20,22,11,12,2,5,18


In [10]:
# Show the tournament-results frame (the rendered output abbreviates the middle rows).
mTourneyResults

Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
2003,134,1421,92,1411,84,"""N""",1,32,69,11,29,17,26,14,30,17,12,5,3,22,29,67,12,31,14,31,17,28,16,15,5,0,22
2003,136,1112,80,1436,51,"""N""",0,31,66,7,23,11,14,11,36,22,16,10,7,8,20,64,4,16,7,7,8,26,12,17,10,3,15
2003,136,1113,84,1272,71,"""N""",0,31,59,6,14,16,22,10,27,18,9,7,4,19,25,69,7,28,14,21,20,22,11,12,2,5,18
2003,136,1141,79,1166,73,"""N""",0,29,53,3,7,18,25,11,20,15,18,13,1,19,27,60,7,17,12,17,14,17,20,21,6,6,21
2003,136,1143,76,1301,74,"""N""",1,27,64,7,20,15,23,18,20,17,13,8,2,14,25,56,9,21,15,20,10,26,16,14,5,8,19
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2023,146,1274,88,1400,81,"""N""",0,29,49,2,8,28,32,4,19,11,12,9,1,14,30,60,10,25,11,15,9,14,20,13,6,1,23
2023,146,1361,57,1166,56,"""N""",0,25,66,3,13,4,6,12,23,6,7,1,3,11,22,55,2,17,10,11,8,24,9,9,3,2,11
2023,152,1163,72,1274,59,"""N""",0,28,57,9,26,7,13,13,27,19,14,5,5,11,20,62,7,20,12,12,12,17,10,9,8,1,12
2023,152,1361,72,1194,71,"""N""",0,25,57,9,18,13,22,12,23,8,8,3,2,17,23,52,9,22,16,21,7,24,6,9,6,2,17


Create a new column named `Round` that records which round of the tournament each game was played in.
Round 0 denotes the play-in games; any `DayNum` outside the listed schedule is marked -1.

In [11]:
# Derive the tournament round from DayNum and store it as `Round`:
#   0 = play-in games, 1-6 = successive rounds, -1 = DayNum not covered below.
# The predicates use pl.col('DayNum') so they are evaluated against the frame that
# with_columns is called on, instead of against a Series indexed out of the
# global variable.
# NOTE(review): the day numbers are a fixed schedule; confirm that no season
# (e.g. one with shifted tournament dates) ends up with Round == -1.
mTourneyResults = mTourneyResults.with_columns(
    pl.when(pl.col('DayNum').is_in([134, 135])).then(pl.lit(0))
    .when(pl.col('DayNum').is_in([136, 137])).then(pl.lit(1))
    .when(pl.col('DayNum').is_in([138, 139])).then(pl.lit(2))
    .when(pl.col('DayNum').is_in([143, 144])).then(pl.lit(3))
    .when(pl.col('DayNum').is_in([145, 146])).then(pl.lit(4))
    .when(pl.col('DayNum') == 152).then(pl.lit(5))
    .when(pl.col('DayNum') == 154).then(pl.lit(6))
    .otherwise(pl.lit(-1))
    .alias('Round')
)

In [13]:
# Inspect the newly derived Round column.
mTourneyResults['Round']

Round
i32
0
1
1
1
1
…
4
4
5
5


In [16]:
# Check whether the rankings keep being updated during the tournament
# by looking at the latest RankingDayNum present in the ordinals.
mMassey.select(pl.col("RankingDayNum").max())

RankingDayNum
i64
133


Next, I want to use the rankings as input features to see how well they predict the results of the tournament.