# Classification Model to Predict NCAA D1 Men's Basketball Outcomes More Accurately Than Seed-Based Predictions


Philip Knott \
INFO 4604: Applied Machine Learning \
University of Colorado, Boulder \
May 2023

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut

# Import data

In [3]:
game_results = pd.read_csv('./Big_Dance_CSV.csv') # Game Results Dataset
team_stats = pd.read_csv('./cbb.csv') # Team Statistics Dataset

# Only use data from dates spanning both datasets
game_results = game_results.loc[(game_results['Year'] >= 2013) & (game_results['Year'] <= 2019)]
team_stats = team_stats.loc[(team_stats['YEAR'] >= 2013) & (team_stats['YEAR'] <= 2019)]

# Define target variable
game_results['Target'] = (game_results['Score1'] > game_results['Score2']).astype('int')

# Remove unnecessary columns
game_results.drop(columns=['Round', 'Region Number', 'Region Name', 'Seed1', 'Seed2', 'Score1', 'Score2'], inplace=True)
team_stats.drop(columns=['CONF', 'G', 'W', 'POSTSEASON', 'WAB'], inplace=True)

game_results.columns = ['Year', 'Team', 'Opp', 'Target']

In [25]:
team_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455 entries, 0 to 2454
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   TEAM     2455 non-null   object 
 1   ADJOE    2455 non-null   float64
 2   ADJDE    2455 non-null   float64
 3   BARTHAG  2455 non-null   float64
 4   EFG_O    2455 non-null   float64
 5   EFG_D    2455 non-null   float64
 6   TOR      2455 non-null   float64
 7   TORD     2455 non-null   float64
 8   ORB      2455 non-null   float64
 9   DRB      2455 non-null   float64
 10  FTR      2455 non-null   float64
 11  FTRD     2455 non-null   float64
 12  2P_O     2455 non-null   float64
 13  2P_D     2455 non-null   float64
 14  3P_O     2455 non-null   float64
 15  3P_D     2455 non-null   float64
 16  ADJ_T    2455 non-null   float64
 17  SEED     476 non-null    float64
 18  YEAR     2455 non-null   int64  
dtypes: float64(17), int64(1), object(1)
memory usage: 383.6+ KB


In [23]:
game_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Year    441 non-null    int64 
 1   Team    441 non-null    object
 2   Opp     441 non-null    object
 3   Target  441 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 13.9+ KB


# Pre-processing

In [6]:
# Get rid of rows in game_results that contain teams without data in team_stats
ts_teams = set(team_stats['TEAM'])
game_results = game_results.loc[(game_results['Team'].isin(ts_teams)) & (game_results['Opp'].isin(ts_teams))]

# Create a compliment for each row in game_results so that 'Team' and 'Opp' become arbitrary
for row in game_results.iterrows():
    year, team, opp, teamWon = row[1]
    new_row = [year, opp, team, int(not teamWon)]
    game_results.loc[game_results.size] = new_row
    
game_results.reset_index(inplace=True)
game_results.drop(columns=['index'], inplace=True)

In [7]:
# Join the two dataframes by attaching yearly team stats (and their differences) 
# for each team in game_results

predictors = team_stats.drop(columns=['YEAR', 'TEAM']).columns
def get_row(team, opp, year, target):
		stats1 = team_stats.loc[(team_stats['TEAM'] == team) & (team_stats['YEAR'] == year), predictors]
		stats2 = team_stats.loc[(team_stats['TEAM'] == opp) & (team_stats['YEAR'] == year), predictors]
		stats2.columns = [p + '_opp' for p in predictors]
		d = stats1.to_dict(orient='records')[0]
		d_opp = stats2.to_dict(orient='records')[0]
		return {**d, **d_opp, 'Target': target}

rows = []
for result in game_results.iterrows():
	year, team, opp, target = result[1]
	rows.append(get_row(team, opp, year, target))

df = pd.DataFrame()
df = df.append(rows, ignore_index=True)

In [8]:
df

Unnamed: 0,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,...,DRB_opp,FTR_opp,FTRD_opp,2P_O_opp,2P_D_opp,3P_O_opp,3P_D_opp,ADJ_T_opp,SEED_opp,Target
0,123.0,89.9,0.9736,55.2,44.7,14.7,17.5,30.4,25.4,29.1,...,28.7,32.9,36.6,52.8,41.9,36.5,29.7,67.5,3.0,1
1,118.4,91.5,0.9507,53.9,45.5,15.6,20.4,28.8,32.3,39.2,...,28.2,40.0,27.8,46.7,47.6,37.0,33.1,63.2,15.0,1
2,111.4,87.8,0.9392,50.6,44.5,20.8,19.2,36.1,27.6,36.6,...,28.2,41.7,36.8,55.7,44.5,37.1,33.8,65.6,14.0,1
3,108.0,87.5,0.9181,50.4,45.5,17.6,23.6,27.9,29.1,44.0,...,29.8,47.0,30.7,49.8,43.2,31.8,33.9,65.1,13.0,1
4,107.1,88.6,0.8983,48.8,44.5,18.2,21.7,31.3,30.8,40.5,...,27.2,38.4,33.3,49.1,44.9,33.3,33.4,69.2,12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,112.8,105.0,0.6956,52.1,51.6,17.1,19.3,32.8,33.4,32.7,...,28.2,35.5,29.5,49.4,43.6,35.6,32.4,65.3,2.0,0
437,104.9,99.9,0.6362,54.3,49.2,20.8,20.9,25.6,31.4,49.6,...,27.6,45.7,25.7,46.1,42.0,35.1,34.6,65.4,3.0,1
438,112.0,96.2,0.8516,51.9,49.3,17.1,21.3,29.0,34.2,31.3,...,33.8,31.7,36.8,46.8,46.5,35.8,33.3,63.0,4.0,1
439,108.9,96.2,0.8064,56.5,48.0,19.6,25.0,29.7,34.2,32.8,...,26.7,37.4,32.9,50.6,43.4,37.1,35.8,66.8,6.0,0


# Find statistically significant predictors

In [9]:
def get_p_value(label):
    s, i, r, p, std_err = stats.linregress(df[label], df['Target'])
    return p

p_values = [get_p_value(label) for label in predictors]

sig_df = pd.DataFrame({'predictor': predictors})
sig_df['p-value'] = p_values
sig_df['a=0.1'] = sig_df['p-value'] < .1
sig_df['a=0.05'] = sig_df['p-value'] < .05
sig_df['a=0.01'] = sig_df['p-value'] < .01
sig_df

Unnamed: 0,predictor,p-value,a=0.1,a=0.05,a=0.01
0,ADJOE,4.235524e-12,True,True,True
1,ADJDE,6.527617e-05,True,True,True
2,BARTHAG,1.005959e-15,True,True,True
3,EFG_O,0.003305653,True,True,True
4,EFG_D,0.2158915,False,False,False
5,TOR,0.0005088029,True,True,True
6,TORD,0.6149552,False,False,False
7,ORB,0.01334635,True,True,False
8,DRB,0.7352733,False,False,False
9,FTR,0.06093021,True,False,False


In [10]:
# Print out the statistically significant predictors
alpha = 0.01
predictors = list(sig_df.loc[sig_df[f'a={alpha}'] == True, 'predictor'])
print(f'# Predictors: {len(predictors)}')
print(predictors)

# Predictors: 7
['ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'TOR', '2P_O', 'SEED']


# Initial Models

In [11]:
# Get scaled data
X = df[predictors]
y = df['Target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X, y)

In [12]:
# Perform Leave-One-Out Cross Validation
cv = LeaveOneOut()

model1 = DecisionTreeClassifier()
model2 = RandomForestClassifier()
model3 = SVC()

scores1 = cross_val_score(model1, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores2 = cross_val_score(model2, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores3 = cross_val_score(model3, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

pd.DataFrame({
    	'Decision Tree': np.mean(abs(scores1)), 
     	'Random Forest': np.mean(abs(scores2)), 
			'Support Vector': np.mean(abs(scores3))
		}, index=['Mean Absolute Error'])

Unnamed: 0,Decision Tree,Random Forest,Support Vector
Mean Absolute Error,0.45805,0.396825,0.290249


# Hyperparameter Tuning

In [17]:
model = SVC(kernel='sigmoid', gamma='auto', C=.2)
scores = cross_val_score(model, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
print(f'Mean Cross Val Score: {np.mean(abs(scores))}')

Mean Cross Val Score: 0.3197278911564626


In [19]:
model = SVC(kernel='linear', gamma='scale', C=.5)
scores = cross_val_score(model, X_scaled, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
print(f'Mean Cross Val Score: {np.mean(abs(scores))}')

Mean Cross Val Score: 0.2857142857142857


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.20      0.33        35
           1       0.73      0.99      0.84        76

    accuracy                           0.74       111
   macro avg       0.80      0.59      0.58       111
weighted avg       0.77      0.74      0.68       111



In [None]:
# Find accuracy of just picking the team with the highest seed
seed_pred = np.where(df['SEED'] <= df['SEED_opp'], 1, 0)

print(classification_report(df['Target'], seed_pred))    

              precision    recall  f1-score   support

           0       0.67      0.23      0.35       149
           1       0.71      0.94      0.81       292

    accuracy                           0.70       441
   macro avg       0.69      0.59      0.58       441
weighted avg       0.70      0.70      0.65       441

