# Generating Predictions

Using the AdaBoost model that we selected in the Selecting a Model notebook, we will create preditions for the 2018 NCAA Tournament.

In [1]:
# Import packages
import sys
sys.path.append('/Users/phil/Documents/Documents/College_Basketball')

import pandas as pd
import collegebasketball as cbb
cbb.__version__

import warnings
warnings.filterwarnings('ignore')

## Train the Model

Using the same method as before, we will train the model. To understand how I arrived at this model, please look at the Selecting a Model notebook for more information.

However, there is one major difference in how we will train the model this time. Since we were using the tournament data as a test set before, we did not use it to train the model. However, since we are now predicting on the 2018 data, we can use the tournament data to help train the model.

In [2]:
# Load the csv files that contain the scores/kenpom data
path = '../Data/Training/'
kenpom_season = cbb.load_csv('{}kenpom_season.csv'.format(path))
kenpom_march = cbb.load_csv('{}kenpom_march.csv'.format(path))

# Get a sense for the size of each data set
print('Length of kenpom data: {}'.format(len(kenpom_season) + len(kenpom_march)))

Length of kenpom data: 60252


In [3]:
# Combine regular season and march data
kenpom_data = pd.concat([kenpom_season, kenpom_march])

# Get feature names
exclude = ['Favored', 'Underdog', 'Year', 'Label']
features = list(kenpom_season.columns)
for col in exclude:
    features.remove(col)

In [4]:
# Train the classifier
log = cbb.LogisticRegression(penalty='l1', C=10)
log.fit(kenpom_data[features], kenpom_data[['Label']])

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## Predict Games Using the Classifier

Now that we have a trained model, we can use it to predict games in the 2018 NCAA Tournament. First we need to load in the games from the first round and create feature vectors.

In [5]:
# Load games csv
games = cbb.load_csv('/Users/phil/Documents/Documents/College_Basketball/Data/Tourney/2019.csv')
games.head()

Unnamed: 0,Home,Away
0,Duke,North Dakota
1,VCU,UCF
2,Mississippi State,Liberty
3,Virginia Tech,Saint Louis
4,Maryland,Belmont


In [6]:
# Set up the training data set
path = '/Users/phil/Documents/Documents/College_Basketball/Data/Kenpom/2019_kenpom.csv'
kenpom = cbb.load_csv(path)
kenpom = cbb.update_kenpom(kenpom)
kenpom.head()

Unnamed: 0,Rank,Team,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,AdjD Rank,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
0,1,Virginia,ACC,29,3,35.66,123.6,2,87.9,5,...,0.024,119,10.51,29,108.4,48,97.9,19,-2.82,248
1,2,Gonzaga,WCC,30,3,32.79,125.1,1,92.3,16,...,0.008,160,3.38,83,106.2,76,102.8,102,1.98,109
2,3,Duke,ACC,29,5,31.99,120.1,6,88.1,6,...,0.018,137,13.25,7,110.4,4,97.2,11,5.48,38
3,4,Michigan State,B10,28,6,31.36,121.7,4,90.4,8,...,-0.007,198,13.57,3,110.2,7,96.7,2,3.15,90
4,5,Michigan,B10,28,6,29.44,115.5,18,86.1,2,...,-0.014,227,11.65,17,109.3,29,97.6,16,-4.94,305


In [7]:
# Merge Games data with the different data sets
games = pd.merge(games, kenpom, left_on='Home', right_on='Team', sort=False)
games = pd.merge(games, kenpom, left_on='Away', right_on='Team', suffixes=('_Home', '_Away'), sort=False)
games.insert(0, 'Year', 2019)

games.head()

Unnamed: 0,Year,Home,Away,Rank_Home,Team_Home,Conf_Home,Wins_Home,Losses_Home,AdjEM_Home,AdjO_Home,...,Luck_Away,Luck Rank_Away,OppAdjEM_Away,OppAdjEM Rank_Away,OppO_Away,OppO Rank_Away,OppD_Away,OppD Rank_Away,NCSOS AdjEM_Away,NCSOS AdjEM Rank_Away
0,2019,Duke,North Dakota,3,Duke,ACC,29,5,31.99,120.1,...,-0.046,288,-2.99,223,104.8,123,107.8,316,2.18,104
1,2019,VCU,UCF,37,VCU,A10,25,7,15.69,104.3,...,0.029,104,3.61,78,105.2,109,101.6,65,-3.42,267
2,2019,Mississippi State,Liberty,21,Mississippi State,SEC,23,10,20.68,117.4,...,0.006,167,-4.61,268,101.3,283,105.9,223,-2.83,249
3,2019,Virginia Tech,Saint Louis,11,Virginia Tech,ACC,24,8,24.47,118.4,...,0.045,74,0.36,133,103.9,166,103.6,127,-1.05,199
4,2019,Maryland,Belmont,24,Maryland,B10,22,10,19.98,113.8,...,0.041,78,-3.23,230,102.7,227,105.9,221,3.44,87


Now that we have feature vectors for the first round of the tournament and a trained model, we can make our predictions for the 2018 NCAA Tournament.

In [16]:
# Make Predictions
predictions = cbb.predict(log, games, features)

In [17]:
# First Round
predictions.iloc[0:32,:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities
0,Duke,North Dakota,Duke,0.002309
1,VCU,UCF,UCF,0.574567
2,Mississippi State,Liberty,Mississippi State,0.165522
3,Virginia Tech,Saint Louis,Virginia Tech,0.091554
4,Maryland,Belmont,Maryland,0.176541
5,LSU,Yale,LSU,0.114991
6,Louisville,Minnesota,Minnesota,0.531294
7,Michigan State,Bradley,Michigan State,0.014714
8,Gonzaga,Fairleigh Dickinson,Gonzaga,0.007385
9,Syracuse,Baylor,Baylor,0.48403


In [24]:
seeds = []
for i in range(4):
    seeds.extend([1, 8, 5, 4, 6, 3, 7, 2])
data = predictions[0:32].copy()
data['Top Seed'] = seeds

data = data.sort_values(by=['Top Seed', 'Probabilities'])

winner = []
count = 0
for row in data.iterrows():
    if row[1]['Top Seed'] < 5:
        winner.append(row[1].loc['Favored'])
    elif count < 2:
        winner.append(row[1].loc['Favored'])
    else:
        winner.append(row[1].loc['Underdog'])
    count = count + 1
    if count > 3:
        count = 0
data['Winner'] = winner

data = data.sort_index()
actual_winner = ['Duke', 'UCF', 'Liberty', 'Virginia Tech', 'Maryland', 'LSU',
                'Minnesota', 'Michigan State', 'Gonzaga', 'Baylor', 'Murray State',
                'Florida State', 'Buffalo', 'Texas Tech', 'Florida', 'Michigan',
                'Virginia', 'Oklahoma', 'Oregon', 'UC Irvine', 'Villanova',
                'Purdue', 'Iowa', 'Tennessee', 'UNC', 'Washington', 'Auburn',
                'Kansas', 'Ohio State', 'Houston', 'Wofford', 'Kentucky']
data['Actual Winner'] = actual_winner

print(sum(data['Winner'] == data['Actual Winner']))
print(sum(data['Winner'] == data['Predicted Winner']))

data

27
26


Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Top Seed,Winner,Actual Winner
0,Duke,North Dakota,Duke,0.002309,1,Duke,Duke
1,VCU,UCF,UCF,0.574567,8,UCF,UCF
2,Mississippi State,Liberty,Mississippi State,0.165522,5,Mississippi State,Liberty
3,Virginia Tech,Saint Louis,Virginia Tech,0.091554,4,Virginia Tech,Virginia Tech
4,Maryland,Belmont,Maryland,0.176541,6,Maryland,Maryland
5,LSU,Yale,LSU,0.114991,3,LSU,LSU
6,Louisville,Minnesota,Minnesota,0.531294,7,Minnesota,Minnesota
7,Michigan State,Bradley,Michigan State,0.014714,2,Michigan State,Michigan State
8,Gonzaga,Fairleigh Dickinson,Gonzaga,0.007385,1,Gonzaga,Gonzaga
9,Syracuse,Baylor,Baylor,0.48403,8,Syracuse,Baylor


In [10]:
# Second Round
predictions.iloc[32:48,:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities
32,Duke,UCF,Duke,0.09381
33,Virginia Tech,Mississippi State,Virginia Tech,0.398811
34,LSU,Maryland,Maryland,0.458817
35,Michigan State,Minnesota,Michigan State,0.18031
36,Gonzaga,Baylor,Gonzaga,0.164791
37,Florida State,Marquette,Florida State,0.296769
38,Texas Tech,Buffalo,Texas Tech,0.312442
39,Michigan,Florida,Michigan,0.14557
40,Virginia,Ole Miss,Virginia,0.07078
41,Wisconsin,Kansas State,Kansas State,0.516616


In [11]:
# Later Rounds
predictions.iloc[48:,:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities
48,Duke,Virginia Tech,Duke,0.259127
49,Michigan State,Maryland,Michigan State,0.23513
50,Gonzaga,Florida State,Florida State,0.424107
51,Michigan,Texas Tech,Michigan,0.396311
52,Virginia,Kansas State,Virginia,0.191543
53,Tennessee,Purdue,Purdue,0.41467
54,UNC,Kansas,UNC,0.334829
55,Kentucky,Houston,Kentucky,0.344947
56,Duke,Michigan State,Michigan State,0.471141
57,Michigan,Florida State,Michigan,0.358982
