## Data Collection

In [1]:
# Load libraries
import numpy as np
import pandas as pd

In [2]:
from nba_api.stats.endpoints import leaguegamefinder
# Query the league game finder for league id '00', for games dated from 10/22/2019 onward.
gamefinder = leaguegamefinder.LeagueGameFinder(
    league_id_nullable='00',
    date_from_nullable='10/22/2019',
)
# The first returned frame holds the game log (one row per team per game in the preview).
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42021,1610612738,BOS,Boston Celtics,42100406,2022-06-16,BOS vs. GSW,L,239,90,...,0.917,11,30,41,27,8,8,22,16,-13.0
1,42021,1610612744,GSW,Golden State Warriors,42100406,2022-06-16,GSW @ BOS,W,241,103,...,1.0,15,29,44,27,13,7,15,20,13.0
2,42021,1610612738,BOS,Boston Celtics,42100405,2022-06-13,BOS @ GSW,L,238,94,...,0.677,8,39,47,18,2,2,18,16,-10.0
3,42021,1610612744,GSW,Golden State Warriors,42100405,2022-06-13,GSW vs. BOS,W,240,104,...,0.867,4,35,39,23,9,2,6,28,10.0
4,42021,1610612744,GSW,Golden State Warriors,42100404,2022-06-10,GSW @ BOS,W,241,107,...,0.8,16,39,55,20,10,5,16,21,10.0


## Exploratory Data Analysis

In [3]:
# List the available columns before deciding which ones to keep.
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [4]:
# Keep the identifying columns plus the box-score statistics;
# SEASON_ID, TEAM_ID, TEAM_ABBREVIATION and MIN are left out.
games = games.loc[:, [
    'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL',
    'PTS', 'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    'PLUS_MINUS',
]]

In [5]:
# Check dtypes and non-null counts of the retained columns.
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7588 entries, 0 to 7587
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   7588 non-null   object 
 1   GAME_ID     7588 non-null   object 
 2   GAME_DATE   7588 non-null   object 
 3   MATCHUP     7588 non-null   object 
 4   WL          7588 non-null   object 
 5   PTS         7588 non-null   int64  
 6   FGM         7588 non-null   int64  
 7   FGA         7588 non-null   int64  
 8   FG_PCT      7588 non-null   float64
 9   FG3M        7588 non-null   int64  
 10  FG3A        7588 non-null   int64  
 11  FG3_PCT     7588 non-null   float64
 12  FTM         7588 non-null   int64  
 13  FTA         7588 non-null   int64  
 14  FT_PCT      7588 non-null   float64
 15  OREB        7588 non-null   int64  
 16  DREB        7588 non-null   int64  
 17  REB         7588 non-null   int64  
 18  AST         7588 non-null   int64  
 19  STL         7588 non-null  

In [6]:
# GAME_DATE is loaded as a plain object column; convert it to datetime64
# so that later sorting is chronological.
games = games.assign(GAME_DATE=pd.to_datetime(games['GAME_DATE']))

In [7]:
# Order rows oldest-first; the per-team rolling average computed next depends on this order.
games = games.sort_values('GAME_DATE')

In [8]:
# Per team: mean PLUS_MINUS over the 30 games preceding each game.
# closed='left' excludes the current game from its own window, so the feature
# only uses information available before tip-off. Rows without 30 prior games
# come out as NaN.
games['avg_30_plus_minus'] = (
    games
    .groupby('TEAM_NAME')['PLUS_MINUS']
    .transform(lambda team_pm: team_pm.rolling(30, closed='left').mean())
)

In [9]:
# MATCHUP reads 'AAA @ BBB' on the visiting team's row and 'BBB vs. AAA' on the
# host's row, so the presence of '@' identifies away rows.
games_ = games['MATCHUP'].str.contains('@')
games_home = games.loc[~games_]
games_away = games.loc[games_]

In [10]:
# Pair each game's home row with its away row (one merged row per GAME_ID);
# overlapping column names receive _home / _away suffixes.
games_merged = games_home.merge(
    games_away,
    on='GAME_ID',
    suffixes=('_home', '_away'),
)
# Model feature: home team's recent form minus the away team's recent form.
games_merged['avg_30_plus_minus_diff'] = (
    games_merged['avg_30_plus_minus_home']
    .sub(games_merged['avg_30_plus_minus_away'])
)

In [11]:
# Inspect the merged frame: row count and the suffixed home/away columns.
games_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3794 entries, 0 to 3793
Data columns (total 50 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   TEAM_NAME_home          3794 non-null   object        
 1   GAME_ID                 3794 non-null   object        
 2   GAME_DATE_home          3794 non-null   datetime64[ns]
 3   MATCHUP_home            3794 non-null   object        
 4   WL_home                 3794 non-null   object        
 5   PTS_home                3794 non-null   int64         
 6   FGM_home                3794 non-null   int64         
 7   FGA_home                3794 non-null   int64         
 8   FG_PCT_home             3794 non-null   float64       
 9   FG3M_home               3794 non-null   int64         
 10  FG3A_home               3794 non-null   int64         
 11  FG3_PCT_home            3794 non-null   float64       
 12  FTM_home                3794 non-null   int64   

In [12]:
# Remove identifiers, duplicated date/matchup columns, the home PLUS_MINUS and
# the away result (WL_away), none of which are used for modelling.
games_merged = games_merged.drop(
    columns=[
        'GAME_ID',
        'GAME_DATE_home', 'MATCHUP_home', 'PLUS_MINUS_home',
        'GAME_DATE_away', 'MATCHUP_away', 'WL_away',
    ]
)

In [13]:
# Preview the label and the single feature; the earliest rows are NaN because
# teams do not yet have 30 prior games.
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,W,
1,W,
2,W,
3,L,
4,W,
...,...,...
3789,W,-6.900000
3790,W,4.233333
3791,L,4.933333
3792,W,-3.200000


In [14]:
# Modelling table: label + feature, keeping only rows where both are present.
games_model = (
    games_merged
    .loc[:, ['WL_home', 'avg_30_plus_minus_diff']]
    .dropna()
)

In [15]:
# Encode the home result as a binary target: win -> 1, loss -> 0.
games_model = games_model.assign(
    WL_home=games_model['WL_home'].map({'W': 1, 'L': 0})
)
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
443,0,-5.933333
449,0,6.000000
455,1,-9.300000
457,0,0.700000
458,1,-13.833333
...,...,...
3789,1,-6.900000
3790,1,4.233333
3791,0,4.933333
3792,1,-3.200000


## Prediction Model Building, Tuning & Evaluation

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the games for testing. Stratifying on the label keeps the
# home-win ratio the same in both parts; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    games_model,
    test_size=0.2,
    random_state=7,
    stratify=games_model['WL_home'],
)

In [17]:
target = 'WL_home'

# Features are every column except the label; the label becomes y.
X_train, y_train = df_train.drop(columns=target), df_train[target]
X_test, y_test = df_test.drop(columns=target), df_test[target]

### XGBoost Classifier

In [18]:
import xgboost as xgb
# Baseline gradient-boosted classifier with default hyperparameters and a fixed seed.
clf = xgb.XGBClassifier(
    random_state=7,
    use_label_encoder=False,
)
clf.fit(X_train, y_train)



In [19]:
from sklearn.metrics import accuracy_score
# Predict home win (1) / loss (0) for the held-out games with the baseline model.
y_pred = clf.predict(X_test)

In [20]:
# Test-set accuracy of the untuned XGBoost baseline.
accuracy_score(y_test, y_pred)

0.6150375939849624

In [21]:
from sklearn.model_selection import RandomizedSearchCV
# loguniform ships with SciPy (>= 1.4). The sklearn.utils.fixes version was only a
# backport shim for older SciPy and is not available in recent scikit-learn releases.
from scipy.stats import loguniform

In [22]:
# Search space for the randomised hyperparameter search:
#   learning_rate - sampled log-uniformly between 1e-4 and 1
#   max_depth     - tree depth, 2 through 9
#   subsample     - fraction of rows used per boosting round
#   n_estimators  - number of boosting rounds
hyp_params = dict(
    learning_rate=loguniform(0.0001, 1),
    max_depth=list(range(2, 10)),
    subsample=[0.7, 0.8, 0.9, 1.0],
    n_estimators=list(range(50, 201, 50)),
)

In [23]:
# Randomised search over hyp_params: 20 sampled configurations, each scored by
# 7-fold cross-validated accuracy; seeded so the sampled configurations repeat.
random_hyp = RandomizedSearchCV(
    clf,
    hyp_params,
    n_iter=20,
    scoring='accuracy',
    cv=7,
    random_state=7,
)

In [24]:
# Run the search on the training data only; the test split stays untouched.
random_hyp.fit(X_train, y_train)



In [25]:
# Hyperparameters of the best-scoring configuration found by the search.
random_hyp.best_params_

{'learning_rate': 0.014255533717547383,
 'max_depth': 2,
 'n_estimators': 100,
 'subsample': 0.9}

In [26]:
# Take the estimator that the search fitted with the best parameters.
model_hyp = random_hyp.best_estimator_

In [27]:
# Predictions of the tuned model on the held-out games.
y_pred_hyp = model_hyp.predict(X_test)

In [28]:
# Test-set accuracy of the tuned XGBoost model, for comparison with the baseline.
accuracy_score(y_test, y_pred_hyp)

0.6300751879699248

## Model Deployment

In [38]:
from joblib import dump, load
# Persist the tuned model to disk, then load it back so later cells use the saved artifact.
dump(model_hyp, 'model_nba.joblib') 
model_saved = load('model_nba.joblib')

In [39]:
# Sanity check: the reloaded model should score the same as the in-memory tuned model.
accuracy_score(y_test, model_saved.predict(X_test))

0.6300751879699248

In [45]:
def predict_games(home_team, away_team, date_from='01/01/2021'):
    """Predict whether the home team wins an upcoming game.

    Fetches game logs from the league game finder, computes each team's mean
    PLUS_MINUS over its 30 most recent games, and feeds the home-minus-away
    difference to the saved model. This matches the training feature
    ``avg_30_plus_minus_diff``.

    Relies on ``leaguegamefinder``, ``pd``, ``np`` and ``model_saved`` being
    defined in the notebook namespace.

    Parameters
    ----------
    home_team, away_team : str
        Full team names exactly as they appear in the TEAM_NAME column,
        e.g. ``'Brooklyn Nets'``.
    date_from : str, optional
        Earliest game date to fetch, in ``MM/DD/YYYY`` format. Defaults to
        ``'01/01/2021'``.

    Returns
    -------
    tuple
        ``(predicted_class, home_win_probability)`` where the class is 1 for
        a home win and 0 for a home loss.

    Raises
    ------
    ValueError
        If no games are found for one of the teams. This would otherwise yield
        a NaN feature and a meaningless prediction.
    """
    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable=date_from,
        league_id_nullable='00')
    games = gamefinder.get_data_frames()[0]
    # Only these columns are needed; copy so the date conversion below writes
    # to an independent frame.
    games = games[['TEAM_NAME', 'GAME_DATE', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    def _recent_plus_minus(team):
        # Mean PLUS_MINUS over the team's 30 most recent games.
        team_games = games[games['TEAM_NAME'] == team]
        if team_games.empty:
            raise ValueError(
                f"No games found for team {team!r} since {date_from}; "
                "check the spelling against the TEAM_NAME column.")
        return team_games.sort_values('GAME_DATE').tail(30)['PLUS_MINUS'].mean()

    games_diff = _recent_plus_minus(home_team) - _recent_plus_minus(away_team)

    # Estimators expect a 2-D array of shape (n_samples, n_features):
    # one game, one feature.
    features = np.array([[games_diff]])

    predict_home_win = model_saved.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model_saved.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [57]:
# Example: home team first, away team second; returns (predicted class, home-win probability).
predict_games('Brooklyn Nets','Houston Rockets')

(1, 0.6890975)

### Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest like the other models (random_state=7) so the reported
# accuracy is reproducible across runs.
rfc = RandomForestClassifier(random_state=7)

In [33]:
# Fit the random forest on the same training split used for XGBoost.
rfc.fit(X_train, y_train)

In [34]:
# Test-set accuracy of the random forest.
# NOTE: this rebinds y_pred, replacing the XGBoost baseline predictions stored earlier.
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)

0.5639097744360902

### Naïve Bayes Model

In [35]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, as a simple comparison model.
nb = GaussianNB()

In [36]:
# Fit naive Bayes on the same training split as the other models.
nb.fit(X_train, y_train)

In [37]:
# Test-set accuracy of naive Bayes.
# NOTE: y_pred is rebound again here, replacing the random forest predictions.
y_pred = nb.predict(X_test)
accuracy_score(y_test, y_pred)

0.6285714285714286