In [1]:
from API.fetch.data_fetch import get_team_data

# Quick look at the basic school stats table for the first season of interest.
start_year = 1993
get_team_data(
    url=f"https://www.sports-reference.com/cbb/seasons/{start_year}-school-stats.html",
    attrs={'id': 'basic_school_stats'},
).head()

Unnamed: 0,Rk,School,G,W,L,W-L%,SRS,SOS,Unnamed: 8,W.1,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,Air Force,28,9,19,0.321,-7.45,2.05,,3,...,409,584,0.7,,965,285,178,109,385,546
1,2,Akron,26,8,18,0.308,-10.69,-5.07,,3,...,320,493,0.649,,795,316,163,47,352,573
2,3,Alabama-Birmingham,35,21,14,0.6,10.82,5.68,,5,...,456,650,0.702,,1273,501,246,135,498,650
3,4,Alabama State,27,14,13,0.519,-8.48,-9.7,,9,...,541,767,0.705,,1049,393,194,42,556,567
4,5,Alabama,29,16,13,0.552,9.66,7.83,,7,...,458,702,0.652,,1117,337,185,120,487,539


In [2]:
# Standard library
import re
import warnings
from datetime import datetime

# Third-party libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Silence all warnings for the session. Installed before the project modules
# below are imported, so warnings raised while importing them are hidden too.
warnings.filterwarnings("ignore")

# Season range: first season of interest and the calendar year at run time.
start_year = 1993
curr_year = datetime.now().year

# Custom API
import data_fetch as fetch
from data_pipeline import dataset_pipeline, feature_pipeline, bracket_pipeline
from data_visualizations import get_yearly_base_rates, get_seed_pairs, format_plot
from model_selection import get_cv_models
from model_evaluation import evaluate_cv_models, get_classification_report

KeyboardInterrupt: 

# Data Fetching

### Perceived Predictors

Naturally, it will be vitally important to scrape available data that is pertinent to deciding the outcome of an NCAA March Madness game between any two given teams. To successfully do so, we must break down what are generally the most influential elements of a basketball team's success.

<br>Overall team performance during the regular season is generally a good indicator of how a team will perform in March Madness. This would be captured by statistics, both basic and advanced, such as the following:
<br>**Season Record (%)
<br>Conference Record (%); could be important given that the tournament is split into regions
<br>Regular Season Record vs. Tourney Opponent (%); set to theoretical discrete probability of 50% if no such matchups exist 
<br>Strength of Schedule (SOS); measures the difficulty of the teams played (higher number = greater difficulty)
<br>Top 25 Ranking (boolean); considered a consensus top-tier team
<br>Shots Made per Game (FG, 3P, FT)
<br>Point Differential per Game; measures how dominant/unsuccessful you are at outscoring your opponent on average
<br>Misc. Team Stats per Game (Rebounds, Assists, Blocks, etc.)**

<br>It's important to note that in the NCAA, more so than in the NBA, experienced coaches can have just as much of an impact on a game's outcome as the players themselves. Hence, it's reasonable to assume that the following statistics could also be solid indicators:
**<br>Coach March Madness Appearances
<br>Coach Sweet Sixteen Appearances
<br>Coach Final Four Appearances
<br>Coach Championships Won**

<br>And lastly, we need the data for the structure of the tournaments themselves:
**<br>Favorite Seed
<br>Underdog Seed
<br>Round Number (0-6)
<br>Game Outcome (boolean); did the underdog upset the favorite?**

## Examples

### Team Regular Season Stats

In [None]:
# Preview the basic school stats table for the first season.
fetch.get_team_data(
    url=f"https://www.sports-reference.com/cbb/seasons/{start_year}-school-stats.html",
    attrs={'id': 'basic_school_stats'},
).head()

In [None]:
# Preview the advanced school stats table for the first season.
fetch.get_team_data(
    url=f"https://www.sports-reference.com/cbb/seasons/{start_year}-advanced-school-stats.html",
    attrs={'id': 'adv_school_stats'},
).head()

### Team Rankings

In [None]:
# Preview the ratings page data for the first season.
fetch.get_rankings_data(
    url=f"https://www.sports-reference.com/cbb/seasons/{start_year}-ratings.html"
).head()

### Coach Tournament Performance

In [None]:
# Preview the coaches page data for the first season.
fetch.get_coach_data(
    url=f"https://www.sports-reference.com/cbb/seasons/{start_year}-coaches.html"
).head()

### Tournament Game Data

In [None]:
# Preview the tournament game search results for the first season. The query
# URL is assembled from adjacent string literals; only the from/to years vary.
fetch.get_team_data(
    url=("https://apps.washingtonpost.com/sports/search/?pri_school_id=&pri_conference=&pri_coach"
         "=&pri_seed_from=1&pri_seed_to=16&pri_power_conference=&pri_bid_type=&opp_school_id"
         "=&opp_conference=&opp_coach=&opp_seed_from=1&opp_seed_to=16&opp_power_conference=&opp_bid_type"
         f"=&game_type=7&from={start_year}&to={start_year}&submit="),
    attrs={'class': 'search-results'},
    header=0,
).head()

# Data Cleaning

## Data Pipeline

In [None]:
# Reuse the cached historical dataset when the CSV exists; otherwise build it
# through the pipeline and cache it for the next run.
try:
    mm_matchups_df = pd.read_csv('march_madness_hist_data.csv')
except FileNotFoundError:
    # np.arange excludes its stop value, so the seasons passed to the pipeline
    # run from start_year through curr_year - 2.
    hist_seasons = np.arange(start_year, curr_year - 1)
    mm_matchups_df = dataset_pipeline(hist_seasons)
    mm_matchups_df.to_csv('march_madness_hist_data.csv', index=False)

mm_matchups_df

## Handling Missing Values

### Finding the Nulls

In [None]:
# Null summary for the matchup dataset. Its index is used as column names by
# the following cell (presumably one entry per null-bearing feature — confirm
# against fetch.get_feature_null_counts).
true_nulls = fetch.get_feature_null_counts(mm_matchups_df)
true_nulls

In [None]:
# Null-bearing columns that will be imputed rather than dropped, selected by
# substring match on the column name.
tov_null_fills = [col for col in true_nulls.index if 'TOV' in col]
pf_null_fills = [col for col in true_nulls.index if 'PF' in col]

# Every other null-bearing column is dropped. Built as an ordered list rather
# than through set arithmetic so `null_drops` has the same order on every run
# (it is reused later when building the current bracket).
imputed_cols = set(tov_null_fills) | set(pf_null_fills)
null_drops = [col for col in true_nulls.index if col not in imputed_cols]

# Rebind instead of dropping in place, and tolerate already-removed labels, so
# re-running this cell does not raise a KeyError.
mm_matchups_df = mm_matchups_df.drop(columns=null_drops, errors='ignore')

In [None]:
# Rows returned by the fetch helper for the TOV and PF columns to be imputed
# (presumably the rows holding nulls in those columns — confirm against
# fetch.get_null_rows).
tov_nulls_rows = fetch.get_null_rows(tov_null_fills, mm_matchups_df)
pf_nulls_rows = fetch.get_null_rows(pf_null_fills, mm_matchups_df)

# A single display call renders both frames. Writing `display(a), display(b)`
# makes the cell's last expression a tuple, which is echoed as `(None, None)`.
display(tov_nulls_rows, pf_nulls_rows)

### What are the Distributions of the Features We Wish to Impute?

In [None]:
# Distinct seasons that contain TOV nulls, in ascending order.
tov_null_years = sorted(set(tov_nulls_rows['Year']))

for year in tov_null_years:
    print(f"{year} feature distributions")
    year_df = mm_matchups_df[mm_matchups_df['Year'] == year]
    # One figure per season holding one histogram per TOV column, so a single
    # row is enough; sizing the grid by the number of seasons left empty rows
    # in every figure.
    year_df[tov_null_fills].hist(figsize=(10, 5), layout=(1, len(tov_null_fills)))

In [None]:
# Histograms of the PF columns over the whole dataset, laid out in one row.
mm_matchups_df[pf_null_fills].hist(
    layout=(1, len(pf_null_fills)),
    figsize=(10, 5),
)

### Impute TOV Nulls by Season

In [None]:
# Per-season means over the columns present in the TOV null rows frame, shown
# for the seasons that need imputation.
tov_col_means = (
    mm_matchups_df.loc[:, tov_nulls_rows.columns]
    .groupby('Year')
    .mean()
)
tov_col_means.loc[tov_null_years]

In [None]:
# Fill TOV nulls with that season's column mean, rounded to one decimal.
for year in tov_null_years:
    # Rows to fill for this season; the lookup does not depend on the column.
    season_fill_rows = tov_nulls_rows[tov_nulls_rows['Year'] == year].index

    for col in tov_null_fills:
        season_col_mean = np.round(tov_col_means.loc[year, col], 1)
        filled = mm_matchups_df.loc[season_fill_rows, col].fillna(season_col_mean)
        mm_matchups_df.loc[season_fill_rows, col] = filled

# Show the imputed cells.
mm_matchups_df.loc[tov_nulls_rows.index, tov_null_fills]

### Impute PF Nulls Using Entire Distribution

In [None]:
# Means of the PF columns across every season in the dataset.
pf_col_means = mm_matchups_df.loc[:, pf_null_fills].mean()
pf_col_means

In [None]:
# Fill PF nulls with the column's overall mean, rounded to one decimal.
for col in pf_null_fills:
    col_mean = np.round(mm_matchups_df[col].mean(), 1)
    # Assign the filled column back explicitly. Calling
    # `df[col].fillna(..., inplace=True)` operates on a temporary selection and
    # does not update the frame under pandas Copy-on-Write.
    mm_matchups_df[col] = mm_matchups_df[col].fillna(col_mean)

# Show the imputed cells.
mm_matchups_df.loc[pf_nulls_rows.index, pf_null_fills]

# Exploratory Data Analysis (EDA)

As any good data scientist should, I will use the EDA to address a few questions:

## What is a Bracket's Accuracy When Always Picking the Majority Class (Base Rate: Favorite Beats Underdog)?

In [None]:
# Per-season base rate, its overall mean, and a short moving average.
yearly_base_rates = get_yearly_base_rates(mm_matchups_df)
mean_base_rate = np.round(yearly_base_rates.mean(), 3)

years_ma = 2  # moving-average window in seasons; reused by the upsets plot
base_rate_ma = np.round(yearly_base_rates.rolling(years_ma).mean(), 3)

plt.figure(figsize=(9, 6))

# Flat reference line at the overall mean, then the moving average, then bars.
base_rate_seasons = yearly_base_rates.index
flat_mean_line = [mean_base_rate] * len(yearly_base_rates)
plt.plot(base_rate_seasons, flat_mean_line, color='k', linewidth=3, label=f'Mean ({mean_base_rate})')
plt.plot(base_rate_seasons, base_rate_ma, color='r', linewidth=3, label=f'{years_ma}-Yr MA')
plt.bar(base_rate_seasons, yearly_base_rates)

format_plot(title='Dataset Base Rate Trends', xlabel='Season', ylabel='Base Rate')

## How Often Do Upsets Occur in a Given Year's March Madness? 

In [None]:
# Per-season upset share: upsets divided by games played.
yearly_upsets = mm_matchups_df.groupby('Year').agg({'Underdog_Upset': ['sum', 'count']})
season_upset_totals = yearly_upsets[('Underdog_Upset', 'sum')]
season_game_counts = yearly_upsets[('Underdog_Upset', 'count')]
yearly_pct_upsets = season_upset_totals / season_game_counts

# `years_ma` is the moving-average window set in the base-rate cell.
pct_upsets_ma = yearly_pct_upsets.rolling(years_ma).mean()
mean_pct_upsets = np.round(yearly_pct_upsets.mean(), 3)

plt.figure(figsize=(9, 6))

# Flat reference line at the overall mean, then the moving average, then bars.
upset_seasons = yearly_pct_upsets.index
flat_upset_mean = [mean_pct_upsets] * len(yearly_pct_upsets)
plt.plot(upset_seasons, flat_upset_mean, color='k', linewidth=3,
         label=f'Mean ({mean_pct_upsets})')
plt.plot(upset_seasons, pct_upsets_ma, color='r', linewidth=3, label=f'{years_ma}-Yr MA')
plt.bar(upset_seasons, yearly_pct_upsets)

format_plot(title='Dataset Upsets Volume Trends', xlabel='Season', ylabel='Upsets (% of Games)')

## What is the Distribution of Upsets Across Each Tournament Round?

In [None]:
# Seed-pairing view of the matchups. Later cells read its 'Underdog_Upset',
# 'Round' and 'Pairs' columns.
seed_pairs = get_seed_pairs(mm_matchups_df)

In [None]:
# Keep only the games flagged as upsets, then chart each round's share of them.
is_upset = seed_pairs['Underdog_Upset'] == 1
upset_pairs = seed_pairs[is_upset]

upset_rounds_freq = upset_pairs['Round'].value_counts(normalize=True)
upset_rounds_freq.plot(kind='bar', rot=15, figsize=(9, 6))

format_plot(title='Dataset Upset Volumes by Round', xlabel='Round', ylabel='% of Upsets')

## Which Seeding Combinations are the Most Likely to Produce Upsets?

In [None]:
# Share of upsets per seed combination, limited to the 25 most frequent ones
# (value_counts orders by descending frequency).
upset_pairs_freq = (
    upset_pairs['Pairs']
    .value_counts(normalize=True)
    .iloc[:25]
    .round(3)
)
upset_pairs_freq.plot(kind='bar', rot=35, figsize=(9, 6))

format_plot(title='Dataset Upset Volumes by Seed Combo', xlabel='Seed Combo', ylabel='% of Upsets')

# Feature Engineering & Analysis

In [None]:
# Keep the round of every game before the frame goes through the feature
# pipeline; it is joined back onto the test rows during model evaluation.
all_rounds = mm_matchups_df['Round']

# Remove the identifier columns. Rebinding with errors='ignore' (rather than an
# in-place drop) lets this cell be re-run without raising a KeyError.
mm_matchups_df = mm_matchups_df.drop(
    columns=['Year', 'Team_Favorite', 'Team_Underdog'], errors='ignore'
)

# Feature matrix and target taken from the pipeline output.
scaled_mm_matchups_df = feature_pipeline(mm_matchups_df)
X = scaled_mm_matchups_df.drop('Underdog_Upset', axis=1)
y = scaled_mm_matchups_df['Underdog_Upset']

# A single display call renders both objects. Writing `display(X), display(y)`
# makes the cell's last expression a tuple, which is echoed as `(None, None)`.
display(X, y)

## What are the Distributions of all our Engineered Features?

In [None]:
# One histogram per feature column in X.
X.hist(figsize=(15, 10))
# Re-space the subplots so titles and tick labels do not overlap.
plt.tight_layout()

## What are the Correlations Between Features?

In [None]:
# Absolute correlation of every column with the target, sorted ascending so the
# strongest correlation is drawn at the top of the horizontal bar chart. The
# target's own entry is removed.
abs_desc_corr = (
    scaled_mm_matchups_df.corr()
    .loc['Underdog_Upset']
    .abs()
    .sort_values()
    .drop('Underdog_Upset')
)

plt.figure(figsize=(9, 6))
plt.barh(abs_desc_corr.index, abs_desc_corr.values)

format_plot(title='Target Variable Correlation', xlabel='Absolute Correlation Value', ylabel='Features')

## What Features have the Greatest Predictive Power?

In [None]:
# Seed the forest so the importance ranking is the same on every run; 42 is the
# same seed the train/test split uses.
rf = RandomForestClassifier(random_state=42).fit(X, y)

# Sort features and their corresponding values by ascending importance, so the
# most important feature is drawn at the top of the horizontal bar chart.
importances = rf.feature_importances_
importance_order = np.argsort(importances)
feat_importances = X.columns[importance_order]
feat_values = importances[importance_order]

# Plot feature importances calculated above
plt.figure(figsize=(9, 6))
plt.barh(feat_importances, feat_values)

format_plot(title='Feature Importances', xlabel='Normalized Information Gain', ylabel='Features')

# Model Selection

In [None]:
# Hold out 20% of the games, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 'Round' is removed from the model inputs in both splits.
X_train = X_train.drop(columns='Round')
X_test = X_test.drop(columns='Round')

# Build the candidate models and evaluate them on the training split only.
cv_models = get_cv_models(y)
model_performance = evaluate_cv_models(cv_models, X_train, y_train)

model_performance

In [None]:
# Compare the mean accuracy and AUC columns across models.
model_performance[['Mean_Accuracy', 'Mean_AUC']].plot(
    kind='barh',
    figsize=(9, 6),
    xticks=np.arange(0, 0.9, 0.05),
)

format_plot(title='Model Performance', xlabel='Metric Value', ylabel='Model')

In [None]:
# Compare the score standard deviations for the SVM and LogReg rows only.
stability_cols = ['Mean_Accuracy_Std', 'Mean_AUC_Std']
model_performance.loc[['SVM', 'LogReg'], stability_cols].plot(
    kind='barh',
    figsize=(9, 6),
    xticks=np.arange(0.01, 0.05, 0.01),
)

format_plot(title='Model Stability', xlabel='Standard Deviation (Std)', ylabel='Model')

# Model Evaluation

In [None]:
# Take the last element stored under 'SVM' (presumably a fitted search object,
# since it exposes `best_estimator_` — confirm against get_cv_models) and
# predict on the held-out games.
svm_search = cv_models['SVM'][-1]
best_model = svm_search.best_estimator_
y_preds = best_model.predict(X_test)

display(best_model)

In [None]:
# Rebuild the round and seeds for the test rows. The inner join on the index
# keeps only rows present in y_test; the true labels are dropped because the
# predicted ones are attached below.
test_game_data = pd.concat([y_test, all_rounds, mm_matchups_df[['Seed_Favorite', 'Seed_Underdog']]], 
                           join='inner', axis=1).drop('Underdog_Upset', axis=1)

# Attach predictions by index label rather than by position, so each one stays
# matched to its game regardless of the row order concat produced. y_preds
# follows X_test, which shares y_test's index from the same split.
test_game_data['Underdog_Upset'] = pd.Series(y_preds, index=y_test.index)

test_seed_pairs = get_seed_pairs(test_game_data)

In [None]:
# Keep the test games predicted as upsets, then chart each round's share.
predicted_upset = test_seed_pairs['Underdog_Upset'] == 1
test_upset_pairs = test_seed_pairs[predicted_upset]

test_upset_rounds_freq = test_upset_pairs['Round'].value_counts(normalize=True)
test_upset_rounds_freq.plot(kind='bar', rot=15, figsize=(9, 6))

format_plot(title='Test Set Upset Volumes by Round', xlabel='Round', ylabel='% of Upsets')

In [None]:
# Share of predicted upsets per seed combination, limited to the 25 most
# frequent ones (value_counts orders by descending frequency).
test_upset_pairs_freq = (
    test_upset_pairs['Pairs']
    .value_counts(normalize=True)
    .iloc[:25]
    .round(3)
)
test_upset_pairs_freq.plot(kind='bar', rot=35, figsize=(9, 6))

format_plot(title='Test Set Upset Volumes by Seed Combo', xlabel='Seed Combo', ylabel='% of Upsets')

In [None]:
# Compare the held-out labels with the model's predictions and print the
# report returned by the project helper.
report = get_classification_report(y_test, y_preds)
print("Test Set Metrics Report \n\n", report)

# 2021 March Madness Predictions

## Data Pipeline

In [None]:
# Reuse the cached starting bracket when the CSV exists; otherwise fetch it and
# cache it for the next run.
try:
    curr_bracket_df = pd.read_csv('march_madness_curr_start_bracket.csv')
except FileNotFoundError:
    curr_bracket_df = fetch.get_current_bracket('http://www.espn.com/mens-college-basketball/tournament/bracket')
    curr_bracket_df.to_csv('march_madness_curr_start_bracket.csv', index=False)

# The first four rows are treated as the play-in games; rows 2 and 3 are
# swapped by the reindex.
play_in = curr_bracket_df.iloc[:4].reindex([0, 1, 3, 2])

# The remaining rows form the first round, renumbered from zero. reset_index
# returns a new frame, so no index is assigned onto a slice of curr_bracket_df.
first_round = curr_bracket_df.iloc[4:].reset_index(drop=True)

# A single display call renders both frames. Writing `display(a), display(b)`
# makes the cell's last expression a tuple, which is echoed as `(None, None)`.
display(play_in, first_round)

In [None]:
# Run the bracket pipeline for the current year with the selected model.
# `null_drops` is the list of columns removed from the historical data during
# cleaning (presumably so the same columns are removed here — confirm against
# bracket_pipeline).
bracket_preds = bracket_pipeline(curr_year, play_in, first_round, best_model, null_drops)
display(bracket_preds)

## Pre-Tournament Model Evaluation (3/17/21)

In [None]:
# Predicted upsets in the current bracket, counted per round.
curr_seed_pairs = get_seed_pairs(bracket_preds)
curr_upset_pairs = curr_seed_pairs[curr_seed_pairs['Underdog_Upset'] == 1]

# value_counts() is not normalized here, so the bars are raw counts and the
# y-axis is labelled as a count rather than a percentage.
curr_upset_rounds_freq = curr_upset_pairs['Round'].value_counts()
curr_upset_rounds_freq.plot(figsize=(9, 6), kind='bar', rot=0)

format_plot(title='Current Upset Volumes by Round', xlabel='Round', ylabel='Number of Upsets')

In [None]:
# Predicted upsets in the current bracket, counted per seed combination.
# Recomputed here so the cell does not depend on the previous one having run.
curr_seed_pairs = get_seed_pairs(bracket_preds)
curr_upset_pairs = curr_seed_pairs[curr_seed_pairs['Underdog_Upset'] == 1]

# value_counts() is not normalized here, so the bars are raw counts and the
# y-axis is labelled as a count rather than a percentage.
curr_upset_pairs_freq = curr_upset_pairs['Pairs'].value_counts()
curr_upset_pairs_freq.plot(figsize=(9, 6), kind='bar', rot=35)

format_plot(title='Current Upset Volumes by Seed Combo', xlabel='Seed Combo', ylabel='Number of Upsets')

In [None]:
# Show the predicted games one round at a time, in order of first appearance.
for _round in bracket_preds['Round'].unique():
    in_round = bracket_preds['Round'] == _round
    display(bracket_preds[in_round])

## Post-Tournament Model Evaluation