# Generating Predictions

Using the Logistic Regression model that we chose in the Selecting a Model notebook, we will create predictions for the 2022 NCAA Tournament.

In [1]:
# Import packages
import sys
# Add the parent directory to the import path (presumably so the local
# `collegebasketball` package can be imported -- confirm against the repo layout)
sys.path.append('../')

import pandas as pd
from sklearn.linear_model import LogisticRegression
import collegebasketball as cbb
# NOTE(review): this bare expression is not the last line of the cell, so its value is not displayed
cbb.__version__

import warnings
# Suppress every warning raised after this point in the kernel session
warnings.filterwarnings('ignore')

## Train the Model

Using the same method as before, we will train the model. To understand how I arrived at this model, please look at the Selecting a Model notebook for more information.

However, there is one major difference in how we will train the model this time. Before, we split the data into training and testing sets, but since we are predicting for new games, we will use all of the training data to train the model.

In [2]:
# Read the training set (scores joined with kenpom data) from disk
path = '../Data/Training/training_feat_reduced.csv'
train = pd.read_csv(path)

# Report how many rows are available for training
print('Length of training data: {}'.format(train.shape[0]))

Length of training data: 14650


In [3]:
# Preview the first five rows of the training data
train.head(n=5)

Unnamed: 0,Favored,Underdog,Year,Tournament,Win_Loss_Fav,Win_Loss,AdjEM_Fav,AdjEM,AdjO_Fav,AdjO,...,FT%_opp_Fav,FT%_opp,AST_Fav,AST,AST_opp_Fav,AST_opp,BLK_Fav,BLK,BLK_opp_Fav,BLK_opp
0,UNC,William & Mary,2010,NIT,0.540541,0.666667,13.39,6.58,107.4,110.0,...,0.699,0.681,15.594595,14.030303,14.378378,13.151515,5.675676,2.545455,4.486486,3.333333
1,Wake Forest,UNC,2010,,0.645161,0.540541,14.12,13.39,107.1,107.4,...,0.687,0.699,11.903226,15.594595,11.419355,14.378378,5.225806,5.675676,3.903226,4.486486
2,UNC,Nevada,2010,,0.540541,0.617647,13.39,10.2,107.4,112.8,...,0.699,0.713,15.594595,14.117647,14.378378,15.029412,5.675676,4.382353,4.486486,2.588235
3,Syracuse,UNC,2010,,0.857143,0.540541,27.62,13.39,118.5,107.4,...,0.637,0.699,19.228571,15.594595,15.514286,14.378378,6.314286,5.675676,2.285714,4.486486
4,Dayton,UNC,2010,NIT,0.675676,0.540541,18.92,13.39,108.1,107.4,...,0.657,0.699,14.702703,15.594595,11.135135,14.378378,3.459459,5.675676,3.162162,4.486486


In [4]:
# Build the list of feature columns by removing everything the model should not see.

# Columns that identify the game or hold the target rather than describe the teams
non_stat_cols = ['Favored', 'Underdog', 'Year', 'Tournament', 'Label']

# Raw counting stats: teams play different numbers of games, so absolute totals
# are not comparable between teams and are left out
counting_stat_cols = ['3PA_Fav', '3PA', '3PA_opp_Fav', '3PA_opp', 'AST_Fav', 'AST',
                      'AST_opp_Fav', 'AST_opp', 'BLK_Fav', 'BLK', 'BLK_opp_Fav', 'BLK_opp']

exclude = non_stat_cols + counting_stat_cols

# Start from every column and strike the excluded ones; list.remove raises
# ValueError if an excluded column is absent from the training data
features = train.columns.tolist()
for unwanted in exclude:
    features.remove(unwanted)

In [5]:
# Train the classifier on the full training set (no train/test split here)
log = LogisticRegression(penalty='l2', C=10, solver='liblinear', random_state=77)
# Pass the target as a 1-D Series: scikit-learn expects y of shape (n_samples,),
# and a single-column DataFrame is only accepted with a DataConversionWarning
log.fit(train[features], train['Label'])

LogisticRegression(C=10, random_state=77, solver='liblinear')

## Get Input Data for this Year

Next, we'll need to get the input data for this year so we can use it to predict game results for tournament games. We'll retrieve data from each source for this year, clean the data and combine it into a single data set.

In [6]:
# Season whose tournament is being predicted; used to build the data file paths
year = 2022
stats_path = '../Data/SportsReference/' + str(year) + '_stats.csv'
# NOTE(review): the frame returned here is overwritten on the next line; presumably the call
# is kept for a side effect such as writing the CSV at stats_path -- confirm against the cbb package
stats = cbb.load_stats_dataframe(year=year, csv_file_path=stats_path)
stats = pd.read_csv(stats_path)
# Rename the 'School' column to 'Team' (and map index labels through str) before cbb.update_basic
stats = cbb.update_basic(stats.rename(index=str, columns={'School': 'Team'}))
# Spot-check a single team's row
stats[stats['Team'] == 'Marquette']

Unnamed: 0,Team,G,SRS,SOS,Tm.,Opp.,MP,FG_opp,FGA_opp,FG%_opp,...,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
168,Marquette,31,12.58,9.07,2306,2197,1250,790,1925,0.41,...,369,494,0.747,238,1076,496,244,158,386,541


In [7]:
# Path to this season's Kenpom CSV
kp_path = '../Data/Kenpom/' + str(year) + '_kenpom.csv'
# NOTE(review): the frame returned here is overwritten on the next line; presumably the call
# is kept for a side effect such as writing the CSV at kp_path -- confirm against the cbb package
kenpom = cbb.load_kenpom_dataframe(year=year, csv_file_path=kp_path)
kenpom = pd.read_csv(kp_path)
# Pass the raw frame through the package's Kenpom update step
kenpom = cbb.update_kenpom(kenpom)
# Spot-check a single team's row
kenpom[kenpom['Team'] == 'Marquette']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,Luck,Luck Rank,OppAdjEM,OppAdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank
46,47,Marquette,9.0,BE,19,12,14.0,109.8,62,95.8,...,-0.0,175,9.25,25,107.4,38,98.1,20,4.2,57


In [8]:
# Path to this season's T-Rank CSV
TRank_path = '../Data/TRank/' + str(year) + '_TRank.csv'
# NOTE(review): the frame returned here is overwritten on the next line; presumably the call
# is kept for a side effect such as writing the CSV at TRank_path -- confirm against the cbb package
TRank = cbb.load_TRank_dataframe(year=year, csv_file_path=TRank_path)
TRank = pd.read_csv(TRank_path)
# Pass the raw frame through the package's T-Rank update step
TRank = cbb.update_TRank(TRank)
# Spot-check a single team's row
TRank[TRank['Team'] == 'Marquette']

Unnamed: 0,Rk,Team,Conf,G,Wins,Losses,AdjOE,AdjOE Rank,AdjDE,AdjDE Rank,...,2P%D,2P%D Rank,3P%,3P% Rank,3P%D,3P%D Rank,Adj T.,Adj T. Rank,WAB,WAB Rank
51,52,Marquette,BE,31,19,12,108.8,61,95.6,49,...,46.2,39,34.7,104,31.7,73,71.0,21,0.6,52


In [9]:
# Combine the three sources into one row per team, keyed on 'Team'.
# Columns that repeat across sources are dropped from the right-hand frame first.
trank_unique = TRank.drop(columns=['Conf', 'Wins', 'Losses'])
stats_unique = stats.drop(columns=['G', 'ORB', '3P%'])

team_stats = (
    kenpom
    .merge(trank_unique, on='Team', sort=False)
    .merge(stats_unique, on='Team', sort=False)
)

# Spot-check a single team's merged row
team_stats[team_stats['Team'] == 'Marquette']

Unnamed: 0,Rank,Team,Seed,Conf,Wins,Losses,AdjEM,AdjO,AdjO Rank,AdjD,...,3PA,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PF
46,47,Marquette,9.0,BE,19,12,14.0,109.8,62,95.8,...,769,369,494,0.747,1076,496,244,158,386,541


In [10]:
# Read this year's tournament matchups (columns: Home, Away)
games_path = '../Data/Tourney/{}.csv'.format(year)
games = pd.read_csv(games_path)

# Show the first three matchups
games.head(n=3)

Unnamed: 0,Home,Away
0,Gonzaga,Georgia State
1,Boise State,Memphis
2,UConn,New Mexico State


In [11]:
# Attach the home team's stats first, then the away team's. In the second merge the
# overlapping stat columns are disambiguated with the _Home / _Away suffixes.
home_joined = games.merge(team_stats, left_on='Home', right_on='Team', sort=False)
data = home_joined.merge(
    team_stats,
    left_on='Away',
    right_on='Team',
    suffixes=('_Home', '_Away'),
    sort=False,
)

# Add the season and tournament name as identifying columns
data.insert(0, 'Year', year)
data.insert(3, 'Tournament', 'NCAA Tournament')
data.head(n=3)

Unnamed: 0,Year,Home,Away,Tournament,Rank_Home,Team_Home,Seed_Home,Conf_Home,Wins_Home,Losses_Home,...,3PA_Away,FT_Away,FTA_Away,FT%_Away,TRB_Away,AST_Away,STL_Away,BLK_Away,TOV_Away,PF_Away
0,2022,Gonzaga,Georgia State,NCAA Tournament,1,Gonzaga,1.0,WCC,26,3,...,654,364,508,0.717,1043,379,250,125,332,441
1,2022,Boise State,Memphis,NCAA Tournament,26,Boise State,8.0,MWC,27,7,...,546,472,682,0.692,1201,498,270,176,507,580
2,2022,UConn,New Mexico State,NCAA Tournament,18,UConn,5.0,BE,23,9,...,769,439,629,0.698,1231,445,162,133,448,508


## Predict Games Using the Classifier

Now that we have a trained model and data for the tournament games this year, we can use it to predict games in the 2022 NCAA Tournament.

In [12]:
# Make Predictions
# Remove every '_x' substring from the feature names (per the original note, a
# workaround for an issue with the training data's column names)
features = [x.replace('_x', '') for x in features] # Fix an issue with training data
predictions = cbb.predict(log, data, features)

# Build the output path from `year` rather than a hard-coded season, so changing
# `year` cannot silently overwrite another season's predictions file
predictions_path = '../Data/predictions/predictions_{}.csv'.format(year)
predictions.to_csv(predictions_path, index=False)

# Flag games where the underdog is the predicted winner; this column is added
# after the CSV is written, so it is not part of the saved file
predictions['Upset'] = predictions['Underdog'] == predictions['Predicted Winner']

In [13]:
# First Round: the first 32 rows of the predictions table
predictions.iloc[:32]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
0,Gonzaga,Georgia State,Gonzaga,0.029598,False
1,Boise State,Memphis,Boise State,0.390398,False
2,UConn,New Mexico State,UConn,0.286115,False
3,Arkansas,Vermont,Arkansas,0.110342,False
4,Alabama,Notre Dame,Notre Dame,0.394191,True
5,Texas Tech,Montana State,Texas Tech,0.038384,False
6,Michigan State,Davidson,Michigan State,0.354773,False
7,Duke,Cal State Fullerton,Duke,0.050186,False
8,Baylor,Norfolk State,Baylor,0.016985,False
9,UNC,Marquette,UNC,0.397027,False


In [14]:
# Second Round: rows 32 through 47 of the predictions table
predictions.iloc[32:48]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
32,Gonzaga,Boise State,Gonzaga,0.224664,False
33,UConn,Arkansas,Arkansas,0.57988,True
34,Texas Tech,Notre Dame,Texas Tech,0.18639,False
35,Duke,Michigan State,Michigan State,0.316923,True
36,Baylor,UNC,Baylor,0.197952,False
37,UCLA,Indiana,UCLA,0.276987,False
38,Purdue,Virginia Tech,Purdue,0.210865,False
39,Kentucky,San Francisco,Kentucky,0.208234,False
40,Arizona,TCU,Arizona,0.217964,False
41,Houston,Illinois,Illinois,0.440442,True


In [15]:
# Later Rounds: every row from 48 onward
predictions.iloc[48:]

Unnamed: 0,Favored,Underdog,Predicted Winner,Probabilities,Upset
48,Gonzaga,Arkansas,Gonzaga,0.28893,False
49,Texas Tech,Michigan State,Texas Tech,0.276606,False
50,Baylor,UCLA,UCLA,0.333325,True
51,Kentucky,Purdue,Purdue,0.454752,True
52,Arizona,Illinois,Arizona,0.281666,False
53,Villanova,Colorado State,Colorado State,0.374482,True
54,Kansas,Providence,Kansas,0.302777,False
55,USC,Iowa State,Iowa State,0.483924,True
56,Gonzaga,Texas Tech,Gonzaga,0.423187,False
57,UCLA,Purdue,Purdue,0.566776,True


Congratulations to all Kansas fans because the model has predicted the Jayhawks to win the 2022 NCAA Tournament!