# NCAA Men's Basketball Tournament Predictor

## Overview
Exploratory data analysis (see ncaa_tournament_eda.ipynb and 2017_2021_ncaa_tournament_teams.html) shows that the number of games a team wins is highly correlated with the team's Total Efficiency (Offensive Efficiency / Defensive Efficiency). This notebook predicts the 2022 NCAA Tournament's outcome by advancing, in every game, the team with the higher Total Efficiency.

In [1]:
import pandas as pd
import numpy as np
from random import randrange
import datetime as dt

from get_ncaa_data import get_ncaa_tournament_data



In [2]:
# Bracket entrants by region. Order matters: ncaa_champion pairs entries
# (0, 1), (2, 3), ... as opponents, so every row below is one opening game.
west = [
    'Gonzaga', 'Georgia State',
    'Boise State', 'Memphis',
    'UConn', 'New Mexico State',
    'Arkansas', 'Vermont',
    'Alabama', 'Notre Dame',
    'Texas Tech', 'Montana State',
    'Michigan State', 'Davidson',
    'Duke', 'CSU Fullerton',
]
east = [
    'Baylor', 'Norfolk State',
    'North Carolina', 'Marquette',
    "Saint Mary's", 'Indiana',
    'UCLA', 'Akron',
    'Texas', 'Virginia Tech',
    'Purdue', 'Yale',
    'Murray State', 'San Francisco',
    'Kentucky', "Saint Peter's",
]
south = [
    'Arizona', 'Wright State',
    'Seton Hall', 'TCU',
    'Houston', 'UAB',
    'Illinois', 'Chattanooga',
    'Colorado State', 'Michigan',
    'Tennessee', 'Longwood',
    'Ohio State', 'Loyola Chicago',
    'Villanova', 'Delaware',
]
midwest = [
    'Kansas', 'Texas Southern',
    'San Diego State', 'Creighton',
    'Iowa', 'Richmond',
    'Providence', 'South Dakota State',
    'LSU', 'Iowa State',
    'Wisconsin', 'Colgate',
    'USC', 'Miami',
    'Auburn', "J'Ville State",
]

# Every team in the field, used to filter the ratings table.
all_teams = [*west, *east, *south, *midwest]

In [5]:
# The bracket lists in this notebook are the 2022 field, so the ratings season
# is pinned to match them instead of following the wall clock (which would pair
# a later season's ratings with the 2022 bracket on any re-run).
year = 2022
df = get_ncaa_tournament_data([year],force_download=True)
df = df.rename(columns={
    'Offensive Efficiency':'AdjOE','Defensive Efficiency':'AdjDE','Total Efficiency':'AdjTE'})

# Source spelling -> bracket spelling. Substitutions run in insertion order, so
# the keys listed after 'St.' are matched against the already expanded 'State'.
to_replace = {
    'Connecticut':'UConn',
    'St.':'State',
    'Cal State Fullerton':'CSU Fullerton',
    'Miami FL':'Miami',
    'Jacksonville State':'J\'Ville State'
}

for source_name, bracket_name in to_replace.items():
    # regex=False: every key is a literal. Left to the pandas default, 'St.' is
    # a regex on pandas < 2.0 ('.' matches any character, so 'Stanford' would
    # become 'Statenford') and a literal on pandas >= 2.0.
    df['Team'] = df['Team'].str.replace(source_name, bracket_name, regex=False)

# A bracket team without a rating would silently lose every game it plays, so
# surface spelling mismatches here rather than in the prediction.
missing_teams = sorted(set(all_teams) - set(df['Team']))
if missing_teams:
    raise ValueError(
        'No efficiency rating found for bracket teams: {}'.format(', '.join(missing_teams)))

# Keep only the tournament field, best Total Efficiency first.
df = (
    df.loc[df['Team'].isin(all_teams), ['Team','AdjTE']]
    .sort_values('AdjTE',ascending=False)
    .reset_index(drop=True)
)

# Scale so the top-rated team has a win likelihood of exactly 1.0.
df['AdjTE'] = df['AdjTE'] / df['AdjTE'].max()

df.columns = ['team','win_likelihood']
df.head()

Unnamed: 0,team,win_likelihood
0,Gonzaga,1.0
1,Houston,0.974571
2,Kansas,0.959423
3,Texas Tech,0.955589
4,Baylor,0.949177


In [6]:
def play_game_random(teams):
    """Pick the winner of a game by coin flip.

    Only the first two entries of ``teams`` are candidates; each is returned
    with equal probability.
    """
    winner_index = randrange(2)
    return teams[winner_index]

In [7]:
def play_game(teams, ratings=None):
    """Return the team with the highest win likelihood among ``teams``.

    Parameters
    ----------
    teams : list or tuple of str
        Team names taking part in the game.
    ratings : pandas.DataFrame, optional
        Table with ``team`` and ``win_likelihood`` columns. Defaults to the
        notebook-level ``df``, looked up when the function is called.

    Returns
    -------
    str
        Name of the listed team with the largest ``win_likelihood``. Teams
        that are absent from ``ratings`` are ignored.

    Raises
    ------
    ValueError
        If none of ``teams`` appears in ``ratings``.
    """
    if ratings is None:
        ratings = df
    likelihoods = (
        ratings.loc[ratings['team'].isin(teams), ['team', 'win_likelihood']]
        .set_index('team')['win_likelihood']
    )
    if likelihoods.empty:
        raise ValueError(
            'None of the teams {} have a win_likelihood rating'.format(list(teams)))
    # Series.idxmax gives the team label directly; the previous
    # DataFrame.idxmax()[0] relied on positional indexing of a label-indexed
    # Series, which pandas has deprecated.
    return likelihoods.idxmax()

In [29]:
def _play_round(teams, play):
    """Pair adjacent entries of ``teams`` and return each pairing's winner, in order."""
    return [play(matchup) for matchup in zip(teams[::2], teams[1::2])]


def ncaa_champion(west,east,south,midwest,play=None):
    """Play out the whole bracket and record which teams appear in each round.

    Parameters
    ----------
    west, east, south, midwest : sequence of str
        The 16 teams of each region in bracket order: entries (0, 1), (2, 3),
        ... meet in the first round, and winners keep their relative order.
    play : callable, optional
        Called with a 2-tuple of team names and returns the winner. Defaults
        to the notebook-level ``play_game``, looked up when this function is
        called, so any other game function (e.g. ``play_game_random``) can be
        passed in without editing this one.

    Returns
    -------
    dict
        Maps, in this order, 'R64', 'R32', 'Sweet 16', 'Elite 8',
        'Final Four', 'Final' and 'Winner' to the list of teams present in
        that round. Regional rounds list teams in west, east, south, midwest
        order; 'Winner' is a one-element list.

    Raises
    ------
    ValueError
        If a region does not contain exactly 16 teams.
    """
    if play is None:
        play = play_game

    regional_rounds = ['R64','R32','Sweet 16','Elite 8']
    # One round halves the field, so four regional rounds need 2**4 entrants.
    teams_per_region = 2 ** len(regional_rounds)

    # Copy the inputs so the caller's lists are never touched.
    regions = [list(west), list(east), list(south), list(midwest)]
    for region in regions:
        if len(region) != teams_per_region:
            raise ValueError('each region must have {} teams, got {}'.format(
                teams_per_region, len(region)))

    outcomes = {}
    for round_name in regional_rounds:
        # Record who is still alive at the start of the round, then play it.
        outcomes[round_name] = [team for region in regions for team in region]
        regions = [_play_round(region, play) for region in regions]

    # Each region is now down to its champion.
    final_four = [region[0] for region in regions]
    outcomes['Final Four'] = final_four

    # Semifinals: west champion vs east champion, south vs midwest.
    finalists = _play_round(final_four, play)
    outcomes['Final'] = finalists

    outcomes['Winner'] = [play(tuple(finalists))]

    return outcomes

In [47]:
# Play the bracket, then reshape {round: [teams]} into flat (round, team) rows.
x = ncaa_champion(west, east, south, midwest)

# One list of (round, team) tuples per round, in the order the rounds were played.
tourney_arr = [
    [(round_name, team) for team in x[round_name]]
    for round_name in x
]

# Concatenate the per-round lists into a single list of rows.
arr = []
for round_rows in tourney_arr:
    arr.extend(round_rows)

In [52]:
# One row per (round, team) appearance; saved next to the notebook under a
# file name prefixed with the season in `year`.
tournament_prediction = pd.DataFrame(
    data=arr,
    columns=['tournament_round', 'predicted_team'],
)
tournament_prediction.to_csv(
    path_or_buf='{}_ncaa_tournament_prediction.csv'.format(year)
)
tournament_prediction.head()

Unnamed: 0,tournament_round,predicted_team
0,R64,Gonzaga
1,R64,Georgia State
2,R64,Boise State
3,R64,Memphis
4,R64,UConn
