# Generating Predictions

Using the Logistic Regression model that we chose in the Selecting a Model notebook, we will create predictions for the 2021 NCAA Tournament.

In [1]:
# Import packages
import sys
sys.path.append('../')  # presumably so the local `collegebasketball` package one directory up can be imported -- confirm

import pandas as pd
from sklearn.linear_model import LogisticRegression
import collegebasketball as cbb
cbb.__version__  # NOTE(review): a bare expression that is not the last line of a cell is not displayed, so this shows nothing

import warnings
warnings.filterwarnings('ignore')  # NOTE(review): suppresses every warning for the rest of the notebook, including ones raised by pandas/scikit-learn

## Train the Model

We will train the model using the same method as before. For details on how this model was chosen, please see the Selecting a Model notebook.

However, there is one major difference in how we will train the model this time. Previously, we split the data into training and testing sets; since we are now predicting new games, we will train the model on the entire data set.

In [2]:
# Read the pre-built training set (scores joined with team statistics) from disk
path = '../Data/Training/training_feat_reduced.csv'
train = pd.read_csv(path)

# Report how many rows are available for training
print(f'Length of training data: {train.shape[0]}')

Length of training data: 14650


In [3]:
# Preview the first rows of the training data to check the available columns
train.head()

Unnamed: 0,Favored,Underdog,Year,Tournament,Win_Loss_Fav,Win_Loss,AdjEM_Fav,AdjEM,AdjO_Fav,AdjO,...,FT%_opp_Fav,FT%_opp,AST_Fav,AST,AST_opp_Fav,AST_opp,BLK_Fav,BLK,BLK_opp_Fav,BLK_opp
0,UNC,William & Mary,2010,NIT,0.540541,0.666667,13.39,6.58,107.4,110.0,...,0.699,0.681,15.594595,14.030303,14.378378,13.151515,5.675676,2.545455,4.486486,3.333333
1,Wake Forest,UNC,2010,,0.645161,0.540541,14.12,13.39,107.1,107.4,...,0.687,0.699,11.903226,15.594595,11.419355,14.378378,5.225806,5.675676,3.903226,4.486486
2,UNC,Nevada,2010,,0.540541,0.617647,13.39,10.2,107.4,112.8,...,0.699,0.713,15.594595,14.117647,14.378378,15.029412,5.675676,4.382353,4.486486,2.588235
3,Syracuse,UNC,2010,,0.857143,0.540541,27.62,13.39,118.5,107.4,...,0.637,0.699,19.228571,15.594595,15.514286,14.378378,6.314286,5.675676,2.285714,4.486486
4,Dayton,UNC,2010,NIT,0.675676,0.540541,18.92,13.39,108.1,107.4,...,0.657,0.699,14.702703,15.594595,11.135135,14.378378,3.459459,5.675676,3.162162,4.486486


In [4]:
# Columns that are not model inputs: the team names, the year and tournament
# columns, and the 'Label' column used as the training target
non_feature_cols = ['Favored', 'Underdog', 'Year', 'Tournament', 'Label']

# Teams play different numbers of games, so statistics recorded as absolute
# numbers rather than rates are left out as well
absolute_stat_cols = [
    '3PA_Fav', '3PA', '3PA_opp_Fav', '3PA_opp',
    'AST_Fav', 'AST', 'AST_opp_Fav', 'AST_opp',
    'BLK_Fav', 'BLK', 'BLK_opp_Fav', 'BLK_opp',
]
exclude = non_feature_cols + absolute_stat_cols

# Start from every training column and strike out the excluded ones;
# list.remove raises ValueError if an expected column is absent
features = train.columns.tolist()
for col in exclude:
    features.remove(col)

In [5]:
# Train the classifier
# L2-penalised logistic regression; random_state is fixed so the fit is reproducible
log = LogisticRegression(penalty='l2', C=10, solver='liblinear', random_state=77)

# Pass the target as a 1-D Series: train[['Label']] is a one-column DataFrame,
# which scikit-learn flattens while emitting a DataConversionWarning
log.fit(train[features], train['Label'])

LogisticRegression(C=10, random_state=77, solver='liblinear')

## Get Input Data for this Year

Next, we'll need to get the input data for this year so we can use it to predict game results for tournament games. We'll retrieve data from each source for this year, clean the data and combine it into a single data set.

In [6]:
# Season to generate predictions for; every data path below is derived from it
year = 2021
stats_path = '../Data/SportsReference/' + str(year) + '_stats.csv'

# Use `year` rather than a second hard-coded 2021 so the loader and the path
# cannot drift apart when the season is changed.
# NOTE(review): the returned frame is overwritten on the next line; presumably
# this call is kept because it creates/refreshes the CSV at stats_path -- confirm
stats = cbb.load_stats_dataframe(year=year, csv_file_path=stats_path)

# Re-read the CSV, rename 'School' to 'Team', then apply the package's
# basic-stats update step
stats = pd.read_csv(stats_path)
stats = cbb.update_basic(stats.rename(index=str, columns={'School': 'Team'}))

# Spot-check a single team
stats[stats['Team'] == 'Marquette']

Unnamed: 0,Team,G,SRS,SOS,Tm.,Opp.,MP,FG_opp,FGA_opp,FG%_opp,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
167,Marquette,27,9.27,9.34,1883.0,1885.0,1085.0,666.0,1609.0,0.414,...,382.0,520.0,0.735,258.0,979.0,333.0,137.0,100.0,388.0,465.0


In [7]:
# Kenpom ratings for the season being predicted
kp_path = f'../Data/Kenpom/{year}_kenpom.csv'

# NOTE(review): the frame returned here is replaced immediately below;
# presumably the call is kept because it creates/refreshes the CSV -- confirm
kenpom = cbb.load_kenpom_dataframe(csv_file_path=kp_path, year=year)

# Re-read the CSV from disk and run the package's Kenpom update step
kenpom = cbb.update_kenpom(pd.read_csv(kp_path))

# Spot-check a single team
kenpom.loc[kenpom['Team'] == 'Marquette']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
79,80,Marquette,,BE,13,14,10.76,107.6,87,96.8,...,-0.007,198,14.04,31,109.5,32,95.5,33,6.65,100


In [8]:
# T-Rank ratings for the season being predicted
TRank_path = f'../Data/TRank/{year}_TRank.csv'

# NOTE(review): the frame returned here is replaced immediately below;
# presumably the call is kept because it creates/refreshes the CSV -- confirm
TRank = cbb.load_TRank_dataframe(csv_file_path=TRank_path, year=year)

# Re-read the CSV from disk and run the package's T-Rank update step
TRank = cbb.update_TRank(pd.read_csv(TRank_path))

# Spot-check a single team
TRank.loc[TRank['Team'] == 'Marquette']

Unnamed: 0,Rk,Team,Conf,G,Wins,Losses,AdjOE,AdjOE Rank,AdjDE,AdjDE Rank,...,2P%D,2P%D Rank,3P%,3P% Rank,3P%D,3P%D Rank,Adj T.,Adj T. Rank,WAB,WAB Rank
64,65,Marquette,BE,27,13,14,107.4,83,95.8,65,...,45.8,34,32.3,239,33.9,174,67.9,201,-2.1,83


In [9]:
# Combine the three sources into one row per team, keyed on 'Team'.
# Columns dropped from T-Rank and SportsReference are the ones this cell
# treats as repeats of columns already present.
# Both merges use the default inner join, so a team missing from any
# source is absent from the result.
team_stats = (
    kenpom
    .merge(TRank.drop(columns=['Conf', 'Wins', 'Losses']), on='Team', sort=False)
    .merge(stats.drop(columns=['G', 'ORB', '3P%']), on='Team', sort=False)
)

# Spot-check a single team
team_stats.loc[team_stats['Team'] == 'Marquette']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,3PA,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF
79,80,Marquette,,BE,13,14,10.76,107.6,87,96.8,...,555.0,382.0,520.0,0.735,979.0,333.0,137.0,100.0,388.0,465.0


In [10]:
# Read this season's tournament match-ups (one row per game: Home, Away)
games_path = f'../Data/Tourney/{year}.csv'
games = pd.read_csv(games_path)

# Show the first few games
games.head(3)

Unnamed: 0,Home,Away
0,Gonzaga,Norfolk State
1,Oklahoma,Missouri
2,Creighton,UCSB


In [11]:
# Attach team statistics to every game: first for the Home side, then for
# the Away side. Column names that clash in the second merge receive the
# '_Home' / '_Away' suffixes.
# Both merges use the default inner join, so a game whose team name has no
# match in team_stats is absent from `data`.
data = (
    games
    .merge(team_stats, left_on='Home', right_on='Team', sort=False)
    .merge(team_stats, left_on='Away', right_on='Team',
           suffixes=('_Home', '_Away'), sort=False)
)

# Add the constant Year and Tournament columns at fixed positions (0 and 3)
data.insert(loc=0, column='Year', value=year)
data.insert(loc=3, column='Tournament', value='NCAA Tournament')

data.head(3)

Unnamed: 0,Year,Home,Away,Tournament,Rank_Home,Team_Home,Seed_Home,Conf_Home,Wins_Home,Losses_Home,...,3PA_Away,FT_Away,FTA_Away,FT%_Away,TRB_Away,AST_Away,STL_Away,BLK_Away,TOV_Away,PF_Away
0,2021,Gonzaga,Norfolk State,NCAA Tournament,1,Gonzaga,1.0,WCC,26,0,...,489.0,368.0,521.0,0.706,850.0,282.0,146.0,76.0,301.0,453.0
1,2021,Oklahoma,Missouri,NCAA Tournament,39,Oklahoma,8.0,B12,15,10,...,540.0,376.0,539.0,0.698,892.0,330.0,168.0,88.0,336.0,494.0
2,2021,Creighton,UCSB,NCAA Tournament,19,Creighton,5.0,BE,20,8,...,501.0,379.0,505.0,0.75,915.0,423.0,197.0,72.0,285.0,413.0


## Predict Games Using the Classifier

Now that we have a trained model and data for the tournament games this year, we can use it to predict games in the 2021 NCAA Tournament.

In [12]:
# Make Predictions
# Strip '_x' from the feature names (fix for an issue with the training data).
# NOTE(review): str.replace removes '_x' anywhere in a name, not only as a
# suffix -- confirm no feature legitimately contains '_x'
features = [name.replace('_x', '') for name in features]

# Score every tournament game with the trained model
predictions = cbb.predict(log, data, features)

# Derive the output file name from `year` instead of a hard-coded 2021 so it
# follows the season set earlier in the notebook (same path when year == 2021).
# The CSV is written before 'Upset' is added, so the file does not contain it.
predictions.to_csv('../Data/predictions/predictions_{}.csv'.format(year), index=False)

# Flag games where the model picks the underdog
predictions['Upset'] = predictions['Underdog'] == predictions['Predicted Winner']

In [13]:
# First Round: rows 0-31 of the predictions table
predictions.iloc[:32]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
0,Gonzaga,Norfolk State,Gonzaga,0.003954,False
1,Oklahoma,Missouri,Missouri,0.657428,True
2,Creighton,UCSB,Creighton,0.167511,False
3,Virginia,Ohio,Virginia,0.127899,False
4,USC,Drake,USC,0.318816,False
5,Kansas,Eastern Washington,Kansas,0.030479,False
6,Oregon,VCU,Oregon,0.420136,False
7,Iowa,Grand Canyon,Iowa,0.017802,False
8,Michigan,Texas Southern,Michigan,0.001852,False
9,St. Bonaventure,LSU,LSU,0.533925,True


In [14]:
# Second Round: rows 32-47 of the predictions table
predictions.iloc[32:48]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
32,Gonzaga,Missouri,Gonzaga,0.149259,False
33,Virginia,Creighton,Creighton,0.464976,True
34,USC,Kansas,Kansas,0.576325,True
35,Iowa,Oregon,Iowa,0.215813,False
36,Michigan,LSU,Michigan,0.163963,False
37,Florida State,Georgetown,Florida State,0.298596,False
38,BYU,Texas,Texas,0.532175,True
39,Alabama,Maryland,Maryland,0.319232,True
40,Baylor,Wisconsin,Baylor,0.298224,False
41,Villanova,Purdue,Purdue,0.616978,True


In [15]:
# Later Rounds: everything from row 48 onwards
predictions.iloc[48:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
48,Gonzaga,Creighton,Gonzaga,0.207331,False
49,Iowa,Kansas,Kansas,0.311212,True
50,Michigan,Florida State,Michigan,0.174874,False
51,Texas,Maryland,Maryland,0.472084,True
52,Baylor,Purdue,Purdue,0.320575,True
53,Ohio State,Colgate,Ohio State,0.242022,False
54,Illinois,Oklahoma State,Illinois,0.203641,False
55,Syracuse,Clemson,Clemson,0.677109,True
56,Gonzaga,Kansas,Gonzaga,0.228399,False
57,Michigan,Maryland,Michigan,0.163486,False


Congratulations to all Illinois fans because the model has predicted the Illini to win the 2021 NCAA Tournament!