From the data analysis we will be using the following variables as independent variables in our model:

1. Field Goals Made
2. Field Goal Percentage
3. Three-Pointers Made
4. Free Throws Made
5. Free Throw Percentage
6. Offensive Rebounds
7. Defensive Rebounds
8. Assists
9. Steals
10. Blocks
11. Turnovers
12. Personal Fouls


In [463]:
#import all the libraries and functions we need
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [482]:
# Load the scraped box scores and discard the columns that are not needed, so
# the records are easier to read and use. These are the same transformations
# as in the data exploration and visualisation.
# Columns are removed by position, and each drop works on the result of the
# previous one, so the order of the three drops matters.
df = pd.read_excel("../DataScraper/NBAScores.xlsx")
df = df.drop(columns=df.columns[:3])
df = df.drop(columns=df.columns[2:4])
df = df.drop(columns=df.columns[-1])
def strip_last_four(s):
    """Return the final four characters of ``s``.

    When ``s`` holds fewer than four characters it is returned whole.
    """
    tail = slice(-4, None)
    return s[tail]

# Keep only the last four characters of each game date; the column is renamed
# to 'Year' below.
df['Game Date'] = df['Game Date'].apply(strip_last_four)

# Rename the columns in a single pass so that no name contains '%', '/' or a
# leading digit.
df = df.rename(columns={
    'Game Date': 'Year',
    'W/L': 'WinLoss',
    'FG%': "FGpercent",
    '3P%': 'threePcent',
    '3PM': 'threeMade',
    '3PA': 'threeAtt',
    'FT%': 'ftpercent',
})

# Encode the outcome as an integer target: 'W' -> 1, 'L' -> 0.
# Series.map with an explicit cast is used instead of Series.replace because
# replace() on an object column depends on silent downcasting, which pandas
# has deprecated; without it the target would stay object dtype. Any label
# other than 'W' or 'L' maps to NaN and makes the cast raise instead of
# slipping through unnoticed.
df['WinLoss'] = df['WinLoss'].map({'W': 1, 'L': 0}).astype('int64')
df.head()

Unnamed: 0,Year,WinLoss,FGM,FGA,FGpercent,threeMade,threeAtt,threePcent,FTM,FTA,ftpercent,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,2023,1,45,83,54.2,18,44,40.9,15,20,75.0,7,37,44,30,10,3,18,20
1,2023,0,35,89,39.3,10,37,27.0,15,16,93.8,11,33,44,27,9,5,16,24
2,2023,1,37,81,45.7,6,32,18.8,26,30,86.7,10,39,49,23,8,5,19,14
3,2023,0,41,97,42.3,14,45,31.1,13,21,61.9,14,37,51,28,11,5,13,21
4,2023,1,40,95,42.1,6,30,20.0,17,22,77.3,11,37,48,21,15,7,4,16


In [540]:
# Feature frame holding every available variable: all columns of df except
# 'Year' and the target 'WinLoss'. It is used both as the source for the
# variables we want in the model and for comparing results with all variables.
allVarX = df.drop(columns=['Year', 'WinLoss'])
allVarX.head()

Unnamed: 0,FGM,FGA,FGpercent,threeMade,threeAtt,threePcent,FTM,FTA,ftpercent,OREB,DREB,REB,AST,STL,BLK,TOV,PF
0,45,83,54.2,18,44,40.9,15,20,75.0,7,37,44,30,10,3,18,20
1,35,89,39.3,10,37,27.0,15,16,93.8,11,33,44,27,9,5,16,24
2,37,81,45.7,6,32,18.8,26,30,86.7,10,39,49,23,8,5,19,14
3,41,97,42.3,14,45,31.1,13,21,61.9,14,37,51,28,11,5,13,21
4,40,95,42.1,6,30,20.0,17,22,77.3,11,37,48,21,15,7,4,16


In [541]:
# Target vector for the models: the WinLoss column of df.
y = df['WinLoss']

In [542]:
# Columns selected for the reduced model. Keeping them in one list guarantees
# that the training and test subsets use the same features in the same order.
selected_columns = ['FGM', 'FGpercent', 'threeMade', 'FTM', 'ftpercent', 'DREB', 'OREB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

# A fixed random_state makes the split, and therefore every score computed
# from it, reproducible from run to run.
# NOTE(review): train_size=0.4 fits on 40% of the rows and evaluates on the
# remaining 60% -- confirm this is intended rather than test_size=0.4.
X_train, X_test, y_train, y_test = train_test_split(allVarX, y, train_size=0.4, random_state=42)

# Same rows as X_train / X_test, restricted to the selected columns.
X_train_setVar = X_train[selected_columns]
X_test_setVar = X_test[selected_columns]

In [543]:
# Results with all variables included: fit a logistic regression on every
# column of X_train and report its accuracy on the held-out rows.
model = LogisticRegression(max_iter=1000, solver='lbfgs').fit(X_train, y_train)
# The predictions are computed but not stored; the accuracy on the last line
# is the value this cell reports.
model.predict(X_test)
model.score(X_test, y_test)

0.8373983739837398

In [544]:
# Results with the variables we chose: same estimator settings as the
# all-variable model, fitted on the selected columns only. Rebinding `model`
# replaces the previously fitted estimator.
model = LogisticRegression(max_iter=1000, solver='lbfgs').fit(X_train_setVar, y_train)
# The predictions are computed but not stored; the accuracy on the last line
# is the value this cell reports.
model.predict(X_test_setVar)
model.score(X_test_setVar, y_test)

0.8384146341463414

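There seems to be a small improvement in test accuracy with our selected variables: 0.2%-0.5% in our runs, and about 0.1 percentage points in the run shown above (0.8384 versus 0.8374). The exact figures depend on how the data is split into training and test sets.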

In [546]:
# Class probabilities for each row of the selected-variable test set, one row
# per game and one column per class.
# NOTE(review): columns follow model.classes_, expected to be [0, 1]
# (loss, win) given the WinLoss encoding -- confirm if the labels change.
model.predict_proba(X_test_setVar)

array([[7.94590660e-01, 2.05409340e-01],
       [4.04652561e-01, 5.95347439e-01],
       [9.19341877e-01, 8.06581233e-02],
       ...,
       [3.21436363e-01, 6.78563637e-01],
       [6.99181488e-04, 9.99300819e-01],
       [3.53291364e-01, 6.46708636e-01]])

In [547]:
# Predicted class label (0 or 1) for each row of the selected-variable test set.
model.predict(X_test_setVar)

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)