In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
%load_ext lab_black

In [3]:
# csv files
csv_files = [
    "Cities.csv",
    "Conferences.csv",
    "MConferenceTourneyGames.csv",
    "MGameCities.csv",
    "MMasseyOrdinals.csv",
    "MNCAATourneyCompactResults.csv",
    "MNCAATourneyDetailedResults.csv",
    "MNCAATourneySeedRoundSlots.csv",
    "MNCAATourneySeeds.csv",
    "MNCAATourneySlots.csv",
    "MRegularSeasonCompactResults.csv",
    "MRegularSeasonDetailedResults.csv",
    "MSampleSubmissionStage1.csv",
    "MSeasons.csv",
    "MSecondaryTourneyCompactResults.csv",
    "MSecondaryTourneyTeams.csv",
    "MTeamCoaches.csv",
    "MTeamConferences.csv",
    "MTeamSpellings.csv",
    "MTeams.csv",
]

# getting the csv files to dataframes
dataframes = list()
for file in csv_files:
    path = "../data/raw/" + file
    df = pd.read_csv(path, encoding="cp1252")
    dataframes.append(df)

# convert to dataframe name
csv_names = " ".join(list(map(lambda x: "df_" + x.split(".")[0], csv_files)))
csv_names

# names are corrected manually
(
    df_Cities,
    df_Conf,
    df_ConfTournGms,
    df_GameCities,
    df_MasseyOrdinals,
    df_NCAATourCR,
    df_NCAATourDR,
    df_NCAATourSeedRoundSlots,
    df_NCAATourSeeds,
    df_NCAATourSlots,
    df_RegSCR,
    df_RegSDR,
    df_SampleSubmissionStage1,
    df_Seasons,
    df_SecTourCR,
    df_SecTourTeams,
    df_TeamCoaches,
    df_TeamConf,
    df_TeamSpellings,
    df_Teams,
) = dataframes

### Selection Sunday is 13 March. 68 teams will be announced than.

## Exploring 2021 NCAA Tournement

In [4]:
# seeded teams for 2021
seeds_2021 = df_NCAATourSeeds[df_NCAATourSeeds.Season == 2021].TeamID.unique()

In [5]:
# NCAA matches 2021
df_ncaa_2021 = df_NCAATourDR[df_NCAATourDR.Season == 2021].copy()

In [6]:
# number of games played
gp = list(
    map(
        lambda x: df_ncaa_2021[
            (df_ncaa_2021.WTeamID == x) | (df_ncaa_2021.LTeamID == x)
        ].shape[0],
        seeds_2021,
    )
)

In [7]:
# number of wins
w = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].shape[0],
        seeds_2021,
    )
)

# number of losses
l = np.subtract(gp, w)

In [8]:
# psw = points scored in wins
psw = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WScore.sum(),
        seeds_2021,
    )
)

# psl = points scored in losses
psl = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LScore.sum(),
        seeds_2021,
    )
)

# paw = points allowed in wins
paw = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].LScore.sum(),
        seeds_2021,
    )
)

# pal = points allowed in losses
pal = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].WFGM.sum(),
        seeds_2021,
    )
)

In [9]:
# fgm = field goal made
fgm = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFGM.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFGM.sum(),
        seeds_2021,
    )
)

# fga = field goad attempts
fga = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFGA.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFGA.sum(),
        seeds_2021,
    )
)

# fgm3 = 3 points fg made
fgm3 = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFGM3.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFGM3.sum(),
        seeds_2021,
    )
)

# fga3 = 3 points attempts
fga3 = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFGA3.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFGA3.sum(),
        seeds_2021,
    )
)

# ftm = free throws made
ftm = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFTM.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFTM.sum(),
        seeds_2021,
    )
)

# fta = free throw attempts
fta = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WFTA.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LFTA.sum(),
        seeds_2021,
    )
)

# orb = offensive rebound
orb = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WOR.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LOR.sum(),
        seeds_2021,
    )
)

# drb = defensive rebound
drb = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WDR.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LDR.sum(),
        seeds_2021,
    )
)

# ast = assists
ast = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WAst.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LAst.sum(),
        seeds_2021,
    )
)

# to = turnover
to = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WTO.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LTO.sum(),
        seeds_2021,
    )
)

# stl = steal
stl = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WStl.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LStl.sum(),
        seeds_2021,
    )
)

# blk = block
blk = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WBlk.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LBlk.sum(),
        seeds_2021,
    )
)

# pf = personal fouls
pf = list(
    map(
        lambda x: df_ncaa_2021[(df_ncaa_2021.WTeamID == x)].WPF.sum()
        + df_ncaa_2021[(df_ncaa_2021.LTeamID == x)].LPF.sum(),
        seeds_2021,
    )
)

In [10]:
# Regular Season Standings Table
df_tt = pd.DataFrame(
    data=list(
        zip(
            seeds_2021,
            gp,
            w,
            l,
            psw,
            psl,
            paw,
            pal,
            fgm,
            fga,
            fgm3,
            fga3,
            ftm,
            fta,
            orb,
            drb,
            ast,
            to,
            stl,
            blk,
            pf,
        )
    ),
    columns=[
        "TeamID",
        "gp",
        "w",
        "l",
        "psw",
        "psl",
        "paw",
        "pal",
        "fgm",
        "fga",
        "fgm3",
        "fga3",
        "ftm",
        "fta",
        "orb",
        "drb",
        "ast",
        "to",
        "stl",
        "blk",
        "pf",
    ],
)
df_tt["ps"] = (df_tt.psw + df_tt.psl) / df_tt.gp
df_tt["pa"] = (df_tt.paw + df_tt.pal) / df_tt.gp
df_tt["psw"] = df_tt.psw / df_tt.w
df_tt["psl"] = df_tt.psl / df_tt.l
df_tt["paw"] = df_tt.paw / df_tt.w
df_tt["pal"] = df_tt.pal / df_tt.l
df_tt["fgm"] = df_tt.fgm / df_tt.gp
df_tt["fga"] = df_tt.fga / df_tt.gp
df_tt["fgm3"] = df_tt.fgm3 / df_tt.gp
df_tt["fga3"] = df_tt.fga3 / df_tt.gp
df_tt["ftm"] = df_tt.ftm / df_tt.gp
df_tt["fta"] = df_tt.fta / df_tt.gp
df_tt["orb"] = df_tt.orb / df_tt.gp
df_tt["drb"] = df_tt.drb / df_tt.gp
df_tt["ast"] = df_tt.ast / df_tt.gp
df_tt["to"] = df_tt.to / df_tt.gp
df_tt["stl"] = df_tt.stl / df_tt.gp
df_tt["blk"] = df_tt.blk / df_tt.gp
df_tt["pf"] = df_tt.pf / df_tt.gp
df_tt["wp"] = (df_tt.w / df_tt.gp) * 100
df_tt["fgp"] = (df_tt.fgm / df_tt.fga) * 100
df_tt["fgp3"] = (df_tt.fgm3 / df_tt.fga3) * 100
df_tt["mrg"] = df_tt.ps - df_tt.pa

In [11]:
# opponent winning percentages
def opponent_wp_calc(TeamID):
    try:
        opponents = df_ncaa_2021[(df_ncaa_2021.WTeamID == TeamID)].LTeamID.values
        opponents_wp = [df_tt[df_tt.TeamID == x].wp for x in opponents]
        return np.mean(opponents_wp)
    except:
        return 0


opp_wp = list(map(opponent_wp_calc, seeds_2021))
df_tt["opp_wp"] = opp_wp

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [12]:
df_tt.sort_values("wp", ascending=False)[
    [
        "TeamID",
        "w",
        "l",
        "wp",
        "mrg",
        "fgp",
        "ps",
        "pa",
        "psw",
        "psl",
        "opp_wp",
    ]
].head(10)

Unnamed: 0,TeamID,w,l,wp,mrg,fgp,ps,pa,psw,psl,opp_wp
52,1124,6,0,100.0,15.333333,45.844504,77.0,61.666667,77.0,,59.166667
11,1417,5,1,83.333333,17.0,45.945946,75.833333,58.833333,73.0,90.0,38.333333
18,1211,5,1,83.333333,23.166667,54.107649,86.0,62.833333,89.2,70.0,65.0
37,1222,4,1,80.0,17.2,38.720539,67.6,50.4,69.75,59.0,47.916667
0,1276,3,1,75.0,17.5,47.685185,73.25,55.75,81.333333,49.0,55.555556
54,1116,3,1,75.0,15.75,42.635659,74.25,58.5,75.0,72.0,38.888889
47,1333,3,1,75.0,18.0,44.390244,69.0,51.0,71.666667,61.0,38.888889
23,1425,3,1,75.0,24.25,50.434783,76.25,52.0,79.666667,66.0,50.0
46,1393,2,1,66.666667,14.0,44.966443,66.333333,52.333333,76.5,46.0,25.0
43,1260,2,1,66.666667,20.333333,43.670886,66.666667,46.333333,71.0,58.0,25.0
