In [1]:
##Author: Nishank Rainghani
##Project: Predicting the 2021 NBA Finals Champion using a SKLearn Decision Tree Regressor
##Data from Sports Reference, csv created by Nishank

##Import neccessary packages and modules
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.preprocessing import scale


##load in your data using the pandas module
pastDataDF = pd.read_csv("nbaadvancedstats22.csv")
currentDataDF = pd.read_csv("nbacurrentstats11.csv")

##clean up your data so all of it is scaled
##THIS IS THE MOST IMPORTANT PART
##If you have unscaled, bad data then there is no way you get good results
##The machine just processes the data, YOU need to make sure that it is relevent
currentDataDF = currentDataDF.drop(['Unnamed: 17','Unnamed: 22','Unnamed: 27','Unnamed: 31', 'Unnamed: 32'], axis=1)
currentDataDF['DRB%'] = (currentDataDF['DRB%'].values)/100
pastDataDF['DRB%'] = (pastDataDF['DRB%'].values)/100
currentDataDF['DTOV%'] = (currentDataDF['DTOV%'].values)/100
pastDataDF['DTOV%'] = (pastDataDF['DTOV%'].values)/100
currentDataDF['ORB%'] = (currentDataDF['ORB%'].values)/100
pastDataDF['ORB%'] = (pastDataDF['ORB%'].values)/100
currentDataDF['TOV%'] = (currentDataDF['TOV%'].values)/100
pastDataDF['TOV%'] = (pastDataDF['TOV%'].values)/100
currentDataDF['FTr'] = (currentDataDF['FTr'].values)*1
pastDataDF['FTr'] = (pastDataDF['FTr'].values)*1
currentDataDF['Pace'] = (currentDataDF['Pace'].values)/100
pastDataDF['Pace'] = (pastDataDF['Pace'].values)/100
currentDataDF['DFT/FGA'] = (currentDataDF['DFT/FGA'].values)*1
pastDataDF['DFT/FGA'] = (pastDataDF['DFT/FGA'].values)*1
currentDataDF['ORtg'] = (currentDataDF['ORtg'].values)/100
pastDataDF['ORtg'] = (pastDataDF['ORtg'].values)/100
currentDataDF['Drtg'] = (currentDataDF['DRtg'].values)/100
pastDataDF['DRtg'] = (pastDataDF['DRtg'].values)/100
currentDataDF['SRS'] = (currentDataDF['SRS'].values)/10
pastDataDF['SRS'] = (pastDataDF['SRS'].values)/10
currentDataDF['MOV'] = (currentDataDF['MOV'].values)/10
pastDataDF['MOV'] = (pastDataDF['MOV'].values)/10
currentDataDF['NRtg'] = (currentDataDF['NRtg'].values)/10
pastDataDF['NRtg'] = (pastDataDF['NRtg'].values)/10


##create your features. We are using various advanced statistics and only the most relevant ones will be used.
##For this project we are using SRS, FTr, 3Pr, TS%, eFG%, TOV%, ORB%, DeFG%, DTOV%, NRtg, ORtg, DRtg, FT/FGA, DFT/FGA, and MOV
##You can reference the basketball reference glossary for any terms you may be unfamiliar with
features = list(pastDataDF.columns[[4,9,10,11,12,13,14,15,16,18,19,23,24]])
features1 = list(currentDataDF.columns[[4,9,10,11,12,13,14,15,16,17,18,21,22]])


##Here is where we apply the features and define out input and output variables
y= pastDataDF['RANK']
x = pastDataDF[features]
X = currentDataDF[features1]

#Initialize your regressor and fit your model
clf = tree.DecisionTreeRegressor()
clf = clf.fit(x,y)

##Get your predictions and sort them in order
##The top team is your winner, the bottom team is the first pick in the draft
##Note: All of this data is from the regular season, therefore it does not account for injuries,
##So teams like the Nets may be a little low.
##But, according to our model, the winner should be the either the Suns, or Jazz
##As a basketball fan, I seem skeptical about these results, but keep in mind this is using raw stats,
##Whereas the game of basketball is a little subjective as well.
predictions = clf.predict(X)
predictionDF = DataFrame({'predictions':predictions})
finalDF = predictionDF.join(currentDataDF)
finalDF.sort_values("predictions")

Unnamed: 0,predictions,TeamID,Team,Age,W,L,PW,PL,MOV,SOS,...,ORB%,FT/FGA,DeFG%,DTOV%,DRB%,DFT/FGA,Arena,Attend.,Attend./G,Drtg
23,1.0,24,Phoenix Suns*,26.6,51,21,49,23,0.582,-0.15,...,0.208,0.177,0.534,0.124,0.785,0.194,Phoenix Suns Arena,104027.0,2890.0,1.113
28,1.0,29,Utah Jazz*,28.5,52,20,55,17,0.925,-0.29,...,0.245,0.195,0.507,0.103,0.793,0.159,Vivint Smart Home Arena,151300.0,4203.0,1.083
12,2.0,13,Los Angeles Clippers*,28.8,47,25,49,23,0.618,-0.16,...,0.227,0.186,0.531,0.119,0.791,0.186,STAPLES Center,13901.0,386.0,1.112
6,2.0,7,Dallas Mavericks*,26.3,42,30,41,31,0.226,-0.01,...,0.211,0.189,0.534,0.115,0.778,0.197,American Airlines Center,94849.0,2635.0,1.13
16,2.0,17,Milwaukee Bucks*,28.1,46,26,48,24,0.589,-0.32,...,0.233,0.177,0.536,0.115,0.797,0.157,Fiserv Forum,64780.0,1799.0,1.114
7,3.0,8,Denver Nuggets*,26.1,47,25,47,25,0.493,-0.11,...,0.247,0.176,0.545,0.129,0.789,0.2,Ball Arena,54563.0,1516.0,1.121
2,6.0,3,Brooklyn Nets*,28.2,48,24,46,26,0.45,-0.27,...,0.214,0.208,0.531,0.111,0.773,0.187,Barclays Center,30491.0,847.0,1.138
22,7.0,23,Philadelphia 76ers*,27.1,49,23,48,24,0.558,-0.31,...,0.232,0.225,0.521,0.138,0.782,0.2,Wells Fargo Center,68583.0,1905.0,1.076
11,9.0,12,Indiana Pacers,26.5,34,38,36,36,-0.004,-0.08,...,0.202,0.18,0.531,0.127,0.749,0.201,Bankers Life Fieldhouse,,,1.124
19,10.0,20,New York Knicks*,25.6,41,31,41,31,0.231,-0.18,...,0.219,0.19,0.509,0.117,0.786,0.196,Madison Square Garden (IV),42131.0,1170.0,1.082


In [2]:
##You can also look split up the data into a training and testing model to calculate your error
##As you can see there is in fact a decent bit of error with this model.
##You can toy with the stats and try and get a better result, but it is going to be hard to get a better accuracy than this
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
trainDF, testDF = train_test_split(pastDataDF, test_size = 0.2, random_state = 50, shuffle = True)
xx = trainDF[features]
yy = trainDF['RANK']
x1 = testDF[features]
clf2 = clf.fit(xx,yy)
pr2 = clf2.predict(x1)
print(pr2)
r2 = r2_score(testDF['RANK'].values, pr2)
print(r2)

[ 8.  7.  5. 30. 30.  8. 12. 20.  6.  5.  8. 20.  3.  8. 20. 18. 10.  9.
 25.  8. 24. 16.  1. 24. 18. 29. 26. 29. 17.  6. 29.  8. 25. 15.  2.  2.
  5. 23. 13. 11.  5.  4.]
0.6511627906976745
