# Predicting NCAA Men's Basketball Games

This notebook uses the trained winner, spread, and totals models, together with two input teams, to predict the winner, point spread, and total score of a single game.

## Imports

In [173]:
import numpy as np
import pandas as pd
import xgboost as xgb

## Load Models

In [174]:
def _load_booster(model_path):
    """Create a 4-thread XGBoost Booster and load the saved model at ``model_path`` into it."""
    booster = xgb.Booster({'nthread': 4})
    booster.load_model(model_path)
    return booster


# One booster per prediction target, each restored from its saved model file.
winner_model = _load_booster('../models/winners.model')
spread_model = _load_booster('../models/spreads.model')
totals_model = _load_booster('../models/totals.model')

## Load Input Data

In [175]:
# Read the game results and keep only the first five columns of the file.
game_df = pd.read_csv('compact_test.csv').iloc[:, :5]
game_df

Unnamed: 0,WTeamID,WScore,LTeamID,LScore,OppWinPercentage
0,1345,77,1251,64,0.692308
1,1155,81,1345,70,0.9
2,1345,93,1324,50,0.230769
3,1345,68,1434,61,0.375
4,1274,58,1345,54,0.444444
5,1345,80,1232,68,0.333333
6,1345,67,1326,60,0.727273
7,1345,88,1323,78,0.333333
8,1234,70,1345,55,0.833333
9,1345,73,1268,70,0.5


In [176]:
# Team IDs for the matchup to predict. The "W" team is the one the models report
# as the winner when the winner output is 1 / the spread is positive.
wTeamId, lTeamId = 1211, 1339

## Format Data

In [177]:
## Build the single-row feature frame used by the winner and spread models

def _team_season_stats(games, team_id):
    """Summarise one team's results in ``games`` as an ordered dict of features.

    Parameters
    ----------
    games : DataFrame with columns WTeamID, WScore, LTeamID, LScore and
        OppWinPercentage (one row per game).
    team_id : ID of the team to summarise.

    Returns
    -------
    dict keyed by un-prefixed feature name, in the per-team order the model
    expects: NumWins, NumLosses, WinPercentage, AvgWinScore, AvgLossScore,
    AvgWinMargin, AvgLossMargin, SOS, AdjustedWinPercentage.
    Averages over an empty set of games are NaN; a team with no games at all
    gets NaN for WinPercentage, SOS and AdjustedWinPercentage.
    """
    wins = games[games['WTeamID'] == team_id]
    losses = games[games['LTeamID'] == team_id]
    num_wins, num_losses = len(wins), len(losses)
    games_played = num_wins + num_losses

    # Strength of schedule is the mean OppWinPercentage of the won games and of
    # the lost games, weighted by how many of each were played. A side with zero
    # games is skipped: its mean is NaN, and NaN * 0 would otherwise turn the SOS
    # of an unbeaten (or winless) team into NaN.
    opp_pct_total = 0.0
    if num_wins:
        opp_pct_total += wins['OppWinPercentage'].mean() * num_wins
    if num_losses:
        opp_pct_total += losses['OppWinPercentage'].mean() * num_losses

    if games_played:
        win_pct = num_wins / games_played
        sos = opp_pct_total / games_played
    else:
        win_pct = sos = float('nan')

    return {
        'NumWins': num_wins,
        'NumLosses': num_losses,
        'WinPercentage': win_pct,
        'AvgWinScore': wins['WScore'].mean(),
        'AvgLossScore': losses['LScore'].mean(),
        # Margins are always winner score minus loser score, so both are positive.
        'AvgWinMargin': wins['WScore'].mean() - wins['LScore'].mean(),
        'AvgLossMargin': losses['WScore'].mean() - losses['LScore'].mean(),
        'SOS': sos,
        'AdjustedWinPercentage': win_pct * sos,
    }


w_stats = _team_season_stats(game_df, wTeamId)
l_stats = _team_season_stats(game_df, lTeamId)

# Win/loss counts stay available under their original top-level names.
wNumWins, wNumLosses = w_stats['NumWins'], w_stats['NumLosses']
lNumWins, lNumLosses = l_stats['NumWins'], l_stats['NumLosses']

# (difference column, per-team stat it is computed from); each is W value - L value.
DIFFERENCE_FEATURES = [
    ('NumWinsDifference', 'NumWins'),
    ('NumLossesDifference', 'NumLosses'),
    ('AvgWinScoreDifference', 'AvgWinScore'),
    ('AvgLossScoreDifference', 'AvgLossScore'),
    ('AvgWinMarginDifference', 'AvgWinMargin'),
    ('AvgLossMarginDifference', 'AvgLossMargin'),
    ('WinPercentDifference', 'WinPercentage'),
]

features = {}
for prefix, stats in (('W', w_stats), ('L', l_stats)):
    for stat_name, value in stats.items():
        features[prefix + stat_name] = value
for column, stat_name in DIFFERENCE_FEATURES:
    features[column] = w_stats[stat_name] - l_stats[stat_name]

# IMPORTANT: The column names must be in the same order as the model!
# The order is spelled out by name: all W stats, then all L stats, then the differences.
feature_order = (
    ['W' + stat_name for stat_name in w_stats]
    + ['L' + stat_name for stat_name in l_stats]
    + [column for column, _ in DIFFERENCE_FEATURES]
)
predict_df = pd.DataFrame(features, index=[0], columns=feature_order)
cols = predict_df.columns.tolist()

predict_df

Unnamed: 0,WNumWins,WNumLosses,WWinPercentage,WAvgWinScore,WAvgLossScore,WAvgWinMargin,WAvgLossMargin,WSOS,WAdjustedWinPercentage,LNumWins,...,LAvgLossMargin,LSOS,LAdjustedWinPercentage,NumWinsDifference,NumLossesDifference,AvgWinScoreDifference,AvgLossScoreDifference,AvgWinMarginDifference,AvgLossMarginDifference,WinPercentDifference
0,7,5,0.583333,78.0,62.6,13.571429,8.6,0.568316,0.331518,8,...,16.0,0.47927,0.34856,-1,2,-4.625,-3.066667,0.446429,-7.4,-0.143939


In [178]:
## Create second DataFrame for point totals

def _avg_combined_score(games):
    """Average winner score plus average loser score over ``games`` (NaN if empty)."""
    return games['WScore'].mean() + games['LScore'].mean()


# Games split by which of the two teams won or lost them.
w_team_wins = game_df[game_df['WTeamID'] == wTeamId]
w_team_losses = game_df[game_df['LTeamID'] == wTeamId]
l_team_wins = game_df[game_df['WTeamID'] == lTeamId]
l_team_losses = game_df[game_df['LTeamID'] == lTeamId]

# Start from the score/margin features already computed in predict_df.
# .copy() gives an independent frame, so adding columns below neither triggers
# SettingWithCopyWarning nor risks writing through to predict_df.
predict_total_df = predict_df[[
    'WAvgWinScore', 'WAvgLossScore', 'WAvgWinMargin', 'WAvgLossMargin',
    'LAvgWinScore', 'LAvgLossScore', 'LAvgWinMargin', 'LAvgLossMargin',
]].copy()

# Combined scoring features (sums and averages across the two teams).
predict_total_df['AvgWinScoreSum'] = predict_total_df['WAvgWinScore'] + predict_total_df['LAvgWinScore']
predict_total_df['AvgLossScoreSum'] = predict_total_df['WAvgLossScore'] + predict_total_df['LAvgLossScore']
# Unweighted midpoint of the average winning score and the average losing score.
predict_total_df['WAvgScore'] = (predict_total_df['WAvgWinScore'] + predict_total_df['WAvgLossScore']) / 2
predict_total_df['LAvgScore'] = (predict_total_df['LAvgWinScore'] + predict_total_df['LAvgLossScore']) / 2

# Average total points (both teams combined) in each team's wins and losses.
predict_total_df['WAvgWinTotalScore'] = _avg_combined_score(w_team_wins)
predict_total_df['WAvgLossTotalScore'] = _avg_combined_score(w_team_losses)
predict_total_df['LAvgWinTotalScore'] = _avg_combined_score(l_team_wins)
predict_total_df['LAvgLossTotalScore'] = _avg_combined_score(l_team_losses)
predict_total_df['AvgTotalScore'] = (
    predict_total_df['WAvgWinTotalScore']
    + predict_total_df['WAvgLossTotalScore']
    + predict_total_df['LAvgWinTotalScore']
    + predict_total_df['LAvgLossTotalScore']
) / 4

# The totals model expects its features in exactly this order.
TOTALS_FEATURE_ORDER = [
    'WAvgWinScore', 'WAvgLossScore', 'WAvgWinTotalScore', 'WAvgLossTotalScore',
    'WAvgWinMargin', 'WAvgLossMargin',
    'LAvgWinScore', 'LAvgLossScore', 'LAvgWinTotalScore', 'LAvgLossTotalScore',
    'LAvgWinMargin', 'LAvgLossMargin',
    'AvgWinScoreSum', 'AvgLossScoreSum', 'WAvgScore', 'LAvgScore',
    'AvgTotalScore',
]
predict_total_df = predict_total_df[TOTALS_FEATURE_ORDER]

predict_total_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

Unnamed: 0,WAvgWinScore,WAvgLossScore,WAvgWinTotalScore,WAvgLossTotalScore,WAvgWinMargin,WAvgLossMargin,LAvgWinScore,LAvgLossScore,LAvgWinTotalScore,LAvgLossTotalScore,LAvgWinMargin,LAvgLossMargin,AvgWinScoreSum,AvgLossScoreSum,WAvgScore,LAvgScore,AvgTotalScore
0,78.0,62.6,142.428571,133.8,13.571429,8.6,82.625,65.666667,152.125,147.333333,13.125,16.0,160.625,128.266667,70.3,74.145833,143.921726


## Run Prediction Models

In [179]:
# Wrap each feature frame in the DMatrix container the boosters consume.
d_test = xgb.DMatrix(predict_df)
d_test2 = xgb.DMatrix(predict_total_df)

# (label to print, model to run, features to feed it) -- reported in this order.
reports = [
    ("Predicted Winner (1 for W team, 0 for L team): ", winner_model, d_test),
    ("Predicted Spread: (positive for W team, negative for L team)", spread_model, d_test),
    ("Predicted Total Score: ", totals_model, d_test2),
]

for label, model, model_input in reports:
    pred = model.predict(model_input)
    # Each frame holds a single matchup, so the first element is the prediction.
    print(label, pred[0])

Predicted Winner (1 for W team, 0 for L team):  0.5611118
Predicted Spread: (positive for W team, negative for L team) 3.0572507
Predicted Total Score:  144.40196
