# Collect data

In [1]:
#%pip install nba_api

In [2]:
from nba_api.stats.endpoints import leaguegamefinder
# Request every game dated 01/31/2020 or later for league id '00'
# (presumably the NBA; confirm against the nba_api documentation).
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/31/2020', league_id_nullable='00')
# Take the first DataFrame of the response. In the preview below each GAME_ID
# shows up once per participating team, i.e. one row per team per game.
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42022,1610612751,BKN,Brooklyn Nets,42200124,2023-04-22,BKN vs. PHI,,229,76,...,0.688,5,32,37,18,2,6,10,12,-7.4
1,42022,1610612755,PHI,Philadelphia 76ers,42200124,2023-04-22,PHI @ BKN,,215,85,...,0.917,14,35,49,16,7,3,7,17,8.0
2,42022,1610612752,NYK,New York Knicks,42200133,2023-04-21,NYK vs. CLE,W,241,99,...,0.611,11,34,45,21,14,5,14,17,20.0
3,42022,1610612737,ATL,Atlanta Hawks,42200113,2023-04-21,ATL vs. BOS,W,240,130,...,0.813,11,37,48,24,5,6,18,15,8.0
4,42022,1610612738,BOS,Boston Celtics,42200113,2023-04-21,BOS @ ATL,L,240,122,...,0.688,6,23,29,31,11,0,12,17,-8.0


# Clean and Explore data

In [3]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [4]:
# Keep only the columns used downstream. The explicit .copy() makes `games` an
# independent frame, so later column assignments (such as converting GAME_DATE)
# write to it directly instead of raising pandas' SettingWithCopyWarning.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [5]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Brooklyn Nets,0042200124,2023-04-22,BKN vs. PHI,,-7.4
1,Philadelphia 76ers,0042200124,2023-04-22,PHI @ BKN,,8.0
2,New York Knicks,0042200133,2023-04-21,NYK vs. CLE,W,20.0
3,Atlanta Hawks,0042200113,2023-04-21,ATL vs. BOS,W,8.0
4,Boston Celtics,0042200113,2023-04-21,BOS @ ATL,L,-8.0
...,...,...,...,...,...,...
8815,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0
8816,Dallas Mavericks,0021900722,2020-01-31,DAL @ HOU,L,-7.0
8817,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0
8818,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0


Goal: a final DataFrame with two columns: (1) the result of the game, which is the target, and (2) a score statistic comparing the two teams, which is the feature.

In [6]:
import pandas as pd # For Data Pre-processing

In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8820 entries, 0 to 8819
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   8820 non-null   object 
 1   GAME_ID     8820 non-null   object 
 2   GAME_DATE   8820 non-null   object 
 3   MATCHUP     8820 non-null   object 
 4   WL          8818 non-null   object 
 5   PLUS_MINUS  8820 non-null   float64
dtypes: float64(1), object(5)
memory usage: 413.6+ KB


In [8]:
# GAME_DATE arrives as strings (object dtype in games.info()); parse it so that
# sorting by this column is chronological.
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE']) # Convert date strings to datetime64

In [9]:
games['GAME_DATE']

0      2023-04-22
1      2023-04-22
2      2023-04-21
3      2023-04-21
4      2023-04-21
          ...    
8815   2020-01-31
8816   2020-01-31
8817   2020-01-31
8818   2020-01-31
8819   2020-01-31
Name: GAME_DATE, Length: 8820, dtype: datetime64[ns]

In [10]:
# Order the rows from oldest to newest game. The rolling average computed in a
# later cell walks each team's rows in this order.
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
8819,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0
8806,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0
8807,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0
8808,Houston Rockets,0021900722,2020-01-31,HOU vs. DAL,W,7.0
8809,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0
...,...,...,...,...,...,...
5,Cleveland Cavaliers,0042200133,2023-04-21,CLE @ NYK,L,-20.0
6,Denver Nuggets,0042200143,2023-04-21,DEN @ MIN,W,9.0
4,Boston Celtics,0042200113,2023-04-21,BOS @ ATL,L,-8.0
1,Philadelphia 76ers,0042200124,2023-04-22,PHI @ BKN,,8.0


In [11]:
def _prior_30_game_mean(plus_minus):
    """Average the 30 rows that precede each row of one team's PLUS_MINUS series.

    closed='left' leaves the current row out of its own window, so the value
    attached to a game only uses games that come before it in row order.
    Rows with fewer than 30 predecessors get NaN.
    """
    window = plus_minus.rolling(window=30, closed='left')
    return window.mean()


per_team_plus_minus = games.groupby('TEAM_NAME')['PLUS_MINUS']
games['avg_30_plus_minus'] = per_team_plus_minus.transform(_prior_30_game_mean)

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8819,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,
8806,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,
8807,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0,
8808,Houston Rockets,0021900722,2020-01-31,HOU vs. DAL,W,7.0,
8809,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0,
...,...,...,...,...,...,...,...
5,Cleveland Cavaliers,0042200133,2023-04-21,CLE @ NYK,L,-20.0,6.733333
6,Denver Nuggets,0042200143,2023-04-21,DEN @ MIN,W,9.0,3.466667
4,Boston Celtics,0042200113,2023-04-21,BOS @ ATL,L,-8.0,7.966667
1,Philadelphia 76ers,0042200124,2023-04-22,PHI @ BKN,,8.0,6.466667


In [13]:
games[games['TEAM_NAME']=='Toronto Raptors'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8818,Toronto Raptors,21900720,2020-01-31,TOR @ DET,W,13.0,
8779,Toronto Raptors,21900740,2020-02-02,TOR vs. CHI,W,27.0,
8734,Toronto Raptors,21900758,2020-02-05,TOR vs. IND,W,1.0,
8712,Toronto Raptors,21900772,2020-02-07,TOR @ IND,W,9.0,
8692,Toronto Raptors,21900781,2020-02-08,TOR vs. BKN,W,1.0,
8669,Toronto Raptors,21900796,2020-02-10,TOR vs. MIN,W,11.0,
8626,Toronto Raptors,21900809,2020-02-12,TOR @ BKN,L,-10.0,
8589,Toronto Raptors,21900829,2020-02-21,TOR vs. PHX,W,17.0,
8556,Toronto Raptors,21900843,2020-02-23,TOR vs. IND,W,46.0,
8527,Toronto Raptors,21900858,2020-02-25,TOR vs. MIL,L,-11.0,


In [14]:
# Split the per-team rows by side using the MATCHUP text: rows containing '@'
# (e.g. 'TOR @ DET') are treated as the away side, and all remaining rows
# (e.g. 'DET vs. TOR') as the home side.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

In [15]:
games_home.shape

(4410, 7)

In [16]:
games_away.shape

(4410, 7)

In [17]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8819,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,
8806,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,
8808,Houston Rockets,0021900722,2020-01-31,HOU vs. DAL,W,7.0,
8814,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,
8815,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,
...,...,...,...,...,...,...,...
9,Golden State Warriors,0042200163,2023-04-20,GSW vs. SAC,W,17.0,4.066667
2,New York Knicks,0042200133,2023-04-21,NYK vs. CLE,W,20.0,4.733333
3,Atlanta Hawks,0042200113,2023-04-21,ATL vs. BOS,W,8.0,1.033333
7,Minnesota Timberwolves,0042200143,2023-04-21,MIN vs. DEN,L,-9.0,-1.766667


In [18]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
8807,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0,
8809,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0,
8810,Denver Nuggets,0021900724,2020-01-31,DEN @ MIL,W,12.0,
8811,Chicago Bulls,0021900721,2020-01-31,CHI @ BKN,L,-15.0,
8818,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0,
...,...,...,...,...,...,...,...
8,Sacramento Kings,0042200163,2023-04-20,SAC @ GSW,L,-17.0,2.800000
5,Cleveland Cavaliers,0042200133,2023-04-21,CLE @ NYK,L,-20.0,6.733333
6,Denver Nuggets,0042200143,2023-04-21,DEN @ MIN,W,9.0,3.466667
4,Boston Celtics,0042200113,2023-04-21,BOS @ ATL,L,-8.0,7.966667


In [19]:
# Pair each home row with the away row of the same GAME_ID so that one game
# becomes one row. This uses pd.merge's default inner join, so a game missing
# either side is dropped. Columns present on both sides get the
# '_home' / '_away' suffix.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,
1,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,
2,Houston Rockets,0021900722,2020-01-31,HOU vs. DAL,W,7.0,,Dallas Mavericks,2020-01-31,DAL @ HOU,L,-7.0,
3,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,
4,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,Golden State Warriors,0042200163,2023-04-20,GSW vs. SAC,W,17.0,4.066667,Sacramento Kings,2023-04-20,SAC @ GSW,L,-17.0,2.800000
4406,New York Knicks,0042200133,2023-04-21,NYK vs. CLE,W,20.0,4.733333,Cleveland Cavaliers,2023-04-21,CLE @ NYK,L,-20.0,6.733333
4407,Atlanta Hawks,0042200113,2023-04-21,ATL vs. BOS,W,8.0,1.033333,Boston Celtics,2023-04-21,BOS @ ATL,L,-8.0,7.966667
4408,Minnesota Timberwolves,0042200143,2023-04-21,MIN vs. DEN,L,-9.0,-1.766667,Denver Nuggets,2023-04-21,DEN @ MIN,W,9.0,3.466667


In [20]:
# Model feature: the home team's rolling average margin minus the away team's.
# The result is NaN whenever either team's rolling average is NaN.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,,
1,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,,
2,Houston Rockets,0021900722,2020-01-31,HOU vs. DAL,W,7.0,,Dallas Mavericks,2020-01-31,DAL @ HOU,L,-7.0,,
3,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,,
4,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,Golden State Warriors,0042200163,2023-04-20,GSW vs. SAC,W,17.0,4.066667,Sacramento Kings,2023-04-20,SAC @ GSW,L,-17.0,2.800000,1.266667
4406,New York Knicks,0042200133,2023-04-21,NYK vs. CLE,W,20.0,4.733333,Cleveland Cavaliers,2023-04-21,CLE @ NYK,L,-20.0,6.733333,-2.000000
4407,Atlanta Hawks,0042200113,2023-04-21,ATL vs. BOS,W,8.0,1.033333,Boston Celtics,2023-04-21,BOS @ ATL,L,-8.0,7.966667,-6.933333
4408,Minnesota Timberwolves,0042200143,2023-04-21,MIN vs. DEN,L,-9.0,-1.766667,Denver Nuggets,2023-04-21,DEN @ MIN,W,9.0,3.466667,-5.233333


In [22]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,L,
1,L,
2,W,
3,L,
4,L,
...,...,...
4405,W,1.266667
4406,W,-2.000000
4407,W,-6.933333
4408,L,-5.233333


In [23]:
# Keep only the label and the feature, discarding rows where either is missing
# (no result recorded, or a team without a full rolling window yet).
# The trailing .copy() makes games_model an independent frame, so re-encoding
# WL_home in the next cell does not raise pandas' SettingWithCopyWarning.
games_model = games_merged[['WL_home', 'avg_30_plus_minus_diff']].dropna().copy()
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
398,W,1.933333
399,L,-6.233333
401,L,-5.366667
402,W,2.466667
403,L,-5.600000
...,...,...
4404,L,-3.233333
4405,W,1.266667
4406,W,-2.000000
4407,W,-6.933333


In [24]:
# Encode the label numerically for scikit-learn: home win 'W' -> 1, home loss 'L' -> 0.
games_model['WL_home'] = games_model['WL_home'].map({'W': 1, 'L': 0})

In [25]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
398,1,1.933333
399,0,-6.233333
401,0,-5.366667
402,1,2.466667
403,0,-5.600000
...,...,...
4404,0,-3.233333
4405,1,1.266667
4406,1,-2.000000
4407,1,-6.933333


# Build the predictive model, hyperparameter tuning, evaluation

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the games for testing. stratify keeps the win/loss proportion
# the same in both parts, and random_state makes the split repeatable.
# NOTE(review): the split is random rather than chronological, so test games can
# predate training games; confirm that this is intended.
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [28]:
df_train.shape

(3112, 2)

In [29]:
df_test.shape

(779, 2)

In [30]:
# Separate the label column from the remaining (feature) columns in each split.
target = 'WL_home'
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV



# Baseline: a random forest with default hyperparameters; the seed makes the
# result repeatable.
clf = RandomForestClassifier(random_state=7)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Fraction of held-out games whose home-win label is predicted correctly.
accuracy_score(y_test, y_pred)

0.5661103979460848

In [32]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

In [33]:
# Candidate values from which the randomized search samples random-forest settings.
hyp_params = {'n_estimators': [10, 50, 100, 150, 200, 250],  # number of trees in the forest
              'criterion': ['gini', 'entropy'], # function used to measure split quality
              'max_depth': [3, 5, 7, 9, 11, 13], # maximum depth of each tree
              'min_samples_split': [2, 4, 6, 8],  # minimum samples required to split a node
              'min_samples_leaf': [1, 2, 3, 4, 5]}  # minimum samples required at a leaf

In [34]:
# Randomized search over hyp_params: 20 sampled combinations, each scored by
# accuracy under 7-fold cross-validation; seeded so the sampled set is repeatable.
random_hyp = RandomizedSearchCV(estimator=clf, 
                                param_distributions=hyp_params, 
                                n_iter=20, 
                                cv=7,
                                scoring='accuracy',
                                random_state=7)


In [35]:
# Run the search on the training split only.
random_hyp.fit(X_train, y_train)

# A bare expression is rendered only when it is the last line of a cell, so the
# winning hyperparameters are printed explicitly here.
print('Best hyperparameters:', random_hyp.best_params_)

# best_estimator_ is the forest refit on the full training split with those settings.
model_hyp = random_hyp.best_estimator_

y_pred_hyp = model_hyp.predict(X_test)

# Held-out accuracy of the tuned model, comparable with the baseline score.
accuracy_score(y_test, y_pred_hyp)

0.6341463414634146

# Deploy the model with FastAPI

In [36]:
from joblib import dump, load
# Persist the tuned model to disk, then load it back to check that the file round-trips.
dump(model_hyp, 'model_nba.joblib') 
model_saved = load('model_nba.joblib') 

# Score the reloaded model on the same held-out split used for model_hyp.
accuracy_score(y_test, model_saved.predict(X_test))

0.6341463414634146

In [37]:
# Single-cell rebuild of the feature table: fetch the games, compute each team's
# rolling plus/minus, pair home and away rows, and take the difference. These are
# the same steps performed one at a time in the "Clean and Explore data" section.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/31/2020', league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() makes an independent frame so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

import pandas as pd

games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Chronological order is required by the rolling window below.
games = games.sort_values('GAME_DATE')

# Mean PLUS_MINUS over each team's 30 preceding games; closed='left' excludes the current game.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# Rows whose MATCHUP contains '@' are treated as the away side.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

# One row per game: the home row joined to the away row on GAME_ID.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [38]:
# Worked example of a single prediction for one hard-coded matchup.
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2021',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() makes an independent frame so the assignment below does not raise
# pandas' SettingWithCopyWarning.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
# Rows without a W/L result appear to be games still in progress (their
# PLUS_MINUS is not final), so they are left out of the recent-form average.
games = games[games['WL'].notna()]

# Mean plus/minus over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus
games_diff = np.array([games_diff])
# The model was fitted on a DataFrame, so hand it a one-row DataFrame with the
# same column name; a bare ndarray makes scikit-learn warn about feature names.
features = pd.DataFrame({'avg_30_plus_minus_diff': games_diff})
predict_home_win=model_saved.predict(features)[0]
predict_winning_probability=model_saved.predict_proba(features)[0][1]



In [39]:
def _recent_mean_plus_minus(games, team_name, n_games=30):
    """Return the mean PLUS_MINUS over the latest ``n_games`` rows of ``team_name``.

    Raises:
        ValueError: if ``games`` holds no rows for ``team_name`` (for example a
            misspelled team name), instead of silently producing NaN.
    """
    team_games = games[games['TEAM_NAME'] == team_name]
    if team_games.empty:
        raise ValueError(f"No completed games found for team {team_name!r}")
    return team_games.sort_values('GAME_DATE').tail(n_games)['PLUS_MINUS'].mean()


def predict_games(team_home, team_away):
    """Predict whether ``team_home`` beats ``team_away`` using the saved model.

    Fetches games dated 01/01/2021 or later, averages each team's PLUS_MINUS
    over its 30 most recent completed games, and feeds the home-minus-away
    difference to ``model_saved``.

    Args:
        team_home: Full team name as it appears in TEAM_NAME, e.g. 'Toronto Raptors'.
        team_away: Full team name of the visiting team.

    Returns:
        A tuple ``(predict_home_win, predict_winning_probability)``: the
        predicted class (1 = home win, 0 = home loss) and the model's
        probability for the second class, which is class 1 when the model was
        fitted on both 0 and 1 labels.

    Raises:
        ValueError: if either team has no completed games in the fetched data.
    """
    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable='01/01/2021',
        league_id_nullable='00')
    games = gamefinder.get_data_frames()[0]
    # .copy() makes an independent frame so the assignment below does not raise
    # pandas' SettingWithCopyWarning.
    games = games[
        ['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
    # Rows without a W/L result appear to be games still in progress (their
    # PLUS_MINUS is not final), so they are left out of the recent-form average.
    games = games[games['WL'].notna()]

    home_plus_minus = _recent_mean_plus_minus(games, team_home)
    away_plus_minus = _recent_mean_plus_minus(games, team_away)

    # The model was fitted on a DataFrame, so pass a one-row DataFrame with the
    # same column name; a bare ndarray makes scikit-learn warn about feature names.
    features = pd.DataFrame(
        {'avg_30_plus_minus_diff': [home_plus_minus - away_plus_minus]})
    predict_home_win = model_saved.predict(features)[0]
    predict_winning_probability = model_saved.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [40]:
predict_games('Boston Celtics','Toronto Raptors')



(1, 0.6991054176515216)