## Part 1 - Data Preparation and Exploration 

In [1]:
%%capture
# Due to the configuration of the base Jupyter image, the following pinned package versions must be
# installed for the regressions in the assignment to report the correct metrics.
# Each package is first uninstalled from the interpreter running this kernel (sys.executable)
# and then reinstalled at an exact version; %%capture hides the pip output of this cell.
# NOTE(review): if any of these packages were already imported in this kernel, a kernel
# restart is presumably needed before the pinned versions take effect - confirm.

import sys 
!{sys.executable} -m pip uninstall statsmodels --yes 
!{sys.executable} -m pip uninstall numpy --yes
!{sys.executable} -m pip uninstall pandas --yes 
!{sys.executable} -m pip uninstall patsy --yes 
!{sys.executable} -m pip install numpy==1.17
!{sys.executable} -m pip install pandas==1.0
!{sys.executable} -m pip install patsy==0.5.2
!{sys.executable} -m pip install statsmodels==0.11.1

In [2]:
#Import Libraries

import pandas as pd
import datetime as dt
import scipy.stats as sp
import numpy as np
import statsmodels.formula.api as sm 

In [3]:
# Load the 2014-15 shot log and the per-player statistics from the assignment data folder.
Shotlog_1415 = pd.read_csv("Assignment Data/Week 6/Shotlog_14_15.csv")
Player_Stats = pd.read_csv("Assignment Data/Week 6/Player_Stats_14_15.csv")

# Show the (truncated) shot log to get a first look at its columns.
display(Shotlog_1415)

Unnamed: 0,game_id,date,match,home_team,away_team,home_away,result,final_margin,shot_number,quarter,...,closest_defender,closest_defender_id,closest_def_dist,current_shot_hit,points_earned,shoot_player,player_id,average_hit,shot_count,shot_per_game
0,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,1,1,...,"Lopez, Brook",201572,6.6,1,2,al horford,201143,0.541259,715,10
1,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,2,1,...,"Lopez, Brook",201572,5.6,0,0,al horford,201143,0.541259,715,10
2,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,3,1,...,"Lopez, Brook",201572,4.7,0,0,al horford,201143,0.541259,715,10
3,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,4,1,...,"Lopez, Brook",201572,5.8,0,0,al horford,201143,0.541259,715,10
4,21400280,5-Dec-14,ATL @ BKN,BKN,ATL,A,W,23,5,2,...,"Lopez, Brook",201572,6.4,0,0,al horford,201143,0.541259,715,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128064,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,11,3,...,"Burke, Trey",203504,4.7,1,2,john wall,202322,0.448513,874,15
128065,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,12,3,...,"Exum, Dante",203957,3.4,1,2,john wall,202322,0.448513,874,15
128066,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,13,4,...,"Kanter, Enes",202683,1.2,0,0,john wall,202322,0.448513,874,15
128067,21400350,14-Dec-14,WAS vs. UTA,WAS,UTA,H,W,9,14,4,...,"Kanter, Enes",202683,1.4,1,2,john wall,202322,0.448513,874,15


In [4]:
# Convert the textual "date" column to pandas datetimes, store it back, and summarise it.
game_dates = pd.to_datetime(Shotlog_1415['date'])
Shotlog_1415['date'] = game_dates
Shotlog_1415['date'].describe()

count                  128069
unique                    120
top       2015-01-07 00:00:00
freq                     1941
first     2014-10-28 00:00:00
last      2015-03-04 00:00:00
Name: date, dtype: object

Create a lagged variable to indicate the result of the previous shot by the same player in the same game.

We will first sort the shots by shot number;

We will then group the data by player and game and use the "shift" method to create the lag variable.

In [5]:
# Order the shots by shot number, then shift each (player, game) sequence by one so that
# every row carries the outcome of that player's previous shot in the same game.
# The first shot of each (player, game) group has no predecessor and gets NaN.
shots_in_order = Shotlog_1415.sort_values(by=['shot_number'], ascending=[True])
hits_by_player_game = shots_in_order.groupby(['player_id', 'game_id'])['current_shot_hit']
Shotlog_1415['lag_shot_hit'] = hits_by_player_game.shift(1)
Shotlog_1415.head()

Unnamed: 0,game_id,date,match,home_team,away_team,home_away,result,final_margin,shot_number,quarter,...,closest_defender_id,closest_def_dist,current_shot_hit,points_earned,shoot_player,player_id,average_hit,shot_count,shot_per_game,lag_shot_hit
0,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,1,1,...,201572,6.6,1,2,al horford,201143,0.541259,715,10,
1,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,2,1,...,201572,5.6,0,0,al horford,201143,0.541259,715,10,1.0
2,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,3,1,...,201572,4.7,0,0,al horford,201143,0.541259,715,10,0.0
3,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,4,1,...,201572,5.8,0,0,al horford,201143,0.541259,715,10,0.0
4,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,5,2,...,201572,6.4,0,0,al horford,201143,0.541259,715,10,0.0


Create a variable “error” to indicate the prediction error for each shot and a variable “lagerror” for the prediction error for the previous shot.

In [6]:
# Prediction error = observed outcome minus the average_hit value on the same row,
# computed for the current shot ("error") and for the previous shot ("lagerror").
expected_hit = Shotlog_1415['average_hit']
Shotlog_1415['error'] = Shotlog_1415['current_shot_hit'].sub(expected_hit)
Shotlog_1415['lagerror'] = Shotlog_1415['lag_shot_hit'].sub(expected_hit)
Shotlog_1415.head()

Unnamed: 0,game_id,date,match,home_team,away_team,home_away,result,final_margin,shot_number,quarter,...,current_shot_hit,points_earned,shoot_player,player_id,average_hit,shot_count,shot_per_game,lag_shot_hit,error,lagerror
0,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,1,1,...,1,2,al horford,201143,0.541259,715,10,,0.458741,
1,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,2,1,...,0,0,al horford,201143,0.541259,715,10,1.0,-0.541259,0.458741
2,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,3,1,...,0,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259
3,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,4,1,...,0,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259
4,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,5,2,...,0,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259


Create summary statistics for "error" and "lagerror" variables.

In [7]:
# Summary statistics of the current-shot prediction error.
Shotlog_1415.loc[:, 'error'].describe()

count    1.280690e+05
mean    -5.770049e-18
std      4.949640e-01
min     -7.124682e-01
25%     -4.491979e-01
50%     -3.850837e-01
75%      5.395973e-01
max      6.914894e-01
Name: error, dtype: float64

In [8]:
# Summary statistics of the previous-shot prediction error (NaN rows are excluded from the count).
Shotlog_1415.loc[:, 'lagerror'].describe()

count    113726.000000
mean          0.006303
std           0.496035
min          -0.712468
25%          -0.449198
50%          -0.382143
75%           0.542254
max           0.691489
Name: lagerror, dtype: float64

## Part 2 - Conditional Probability and Autocorrelation

Create a dummy variable “conse_shot” (renamed “conse_shot_hit” once it is averaged per player) to indicate that a player made both the previous and the current shot.

In [13]:
# Flag shots where both the current and the previous shot were made.
# Rows without a previous shot (lag_shot_hit is NaN) fail the comparison and are coded 0.
made_current = Shotlog_1415['current_shot_hit'] == 1
made_previous = Shotlog_1415['lag_shot_hit'] == 1
Shotlog_1415['conse_shot'] = (made_current & made_previous).astype(int)
Shotlog_1415.head()

Unnamed: 0,game_id,date,match,home_team,away_team,home_away,result,final_margin,shot_number,quarter,...,points_earned,shoot_player,player_id,average_hit,shot_count,shot_per_game,lag_shot_hit,error,lagerror,conse_shot
0,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,1,1,...,2,al horford,201143,0.541259,715,10,,0.458741,,0
1,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,2,1,...,0,al horford,201143,0.541259,715,10,1.0,-0.541259,0.458741,0
2,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,3,1,...,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259,0
3,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,4,1,...,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259,0
4,21400280,2014-12-05,ATL @ BKN,BKN,ATL,A,W,23,5,2,...,0,al horford,201143,0.541259,715,10,0.0,-0.541259,-0.541259,0


Create a dataframe for the probability of making the previous shot and the joint probability for making both the previous and current shots.

In [15]:
# Per-player means of the consecutive-hit dummy and of the previous-shot outcome.
# The columns are selected with a list (double brackets): selecting several groupby
# columns with a bare tuple is deprecated in pandas 1.0 and rejected by later versions.
Player_Prob = (
    Shotlog_1415
    .groupby('shoot_player')[['conse_shot', 'lag_shot_hit']]
    .mean()
    .reset_index()
    .rename(columns={'lag_shot_hit': 'average_lag_hit',
                     'conse_shot': 'conse_shot_hit'})
)
Player_Prob.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit
0,aaron brooks,0.153298,0.418
1,aaron gordon,0.201923,0.532468
2,al farouq aminu,0.162791,0.465686
3,al horford,0.262937,0.537994
4,al jefferson,0.2075,0.48


Calculate the conditional probability for a player to make a shot given that he made the previous shot.

In [17]:
# Conditional probability of a hit given the previous shot was a hit:
# P(current and previous hit) / P(previous hit), per player.
# NOTE(review): conse_shot_hit is a mean over all of a player's shots (shots without a
# previous shot count as 0), whereas average_lag_hit is a mean that skips those NaN rows,
# so the two probabilities use different denominators and this ratio is pulled downward.
# Confirm against the assignment specification before interpreting the comparison with
# average_hit.
joint_prob = Player_Prob['conse_shot_hit']
previous_hit_prob = Player_Prob['average_lag_hit']
Player_Prob['conditional_prob'] = joint_prob.div(previous_hit_prob)
Player_Prob.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob
0,aaron brooks,0.153298,0.418,0.366741
1,aaron gordon,0.201923,0.532468,0.379221
2,al farouq aminu,0.162791,0.465686,0.349572
3,al horford,0.262937,0.537994,0.488736
4,al jefferson,0.2075,0.48,0.432292


Merge the “Player_Prob” dataframe into the “Player_Stats” dataframe.

In [18]:
# Attach the per-player probabilities to the player statistics (inner join on shoot_player).
# NOTE: Player_Stats is overwritten, so re-running this cell without reloading the data
# merges the probabilities a second time.
Player_Stats = Player_Prob.merge(Player_Stats, on=['shoot_player'])
Player_Stats.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob,average_hit
0,aaron brooks,0.153298,0.418,0.366741,0.41533
1,aaron gordon,0.201923,0.532468,0.379221,0.528846
2,al farouq aminu,0.162791,0.465686,0.349572,0.430233
3,al horford,0.262937,0.537994,0.488736,0.541259
4,al jefferson,0.2075,0.48,0.432292,0.4775


Calculate summary statistics for the unconditional probability of players making a shot, the conditional probability of players making a shot given they make the previous one, and the probability of players making consecutive shots.

In [19]:
# Unconditional probability of making a shot, summarised across players.
Player_Stats.loc[:, 'average_hit'].describe()

count    281.000000
mean       0.451545
std        0.059392
min        0.308511
25%        0.413223
50%        0.446078
75%        0.480480
max        0.712468
Name: average_hit, dtype: float64

In [20]:
# Probability of making a shot given the previous one was made, summarised across players.
Player_Stats.loc[:, 'conditional_prob'].describe()

count    281.000000
mean       0.380233
std        0.062320
min        0.225801
25%        0.336689
50%        0.381570
75%        0.422801
max        0.613209
Name: conditional_prob, dtype: float64

In [21]:
# Probability of making consecutive shots, summarised across players.
Player_Stats.loc[:, 'conse_shot_hit'].describe()

count    281.000000
mean       0.176987
std        0.047943
min        0.076190
25%        0.144543
50%        0.171625
75%        0.203512
max        0.422392
Name: conse_shot_hit, dtype: float64

Perform a t-test on the difference between conditional and unconditional probabilities.

In [22]:
# Independent two-sample t-test of the players' conditional vs. unconditional hit probabilities.
# `sp` is already the scipy.stats module (see the import cell), so ttest_ind is called on it
# directly rather than through the deprecated scipy.stats.stats submodule.
# NOTE(review): both columns describe the same players, so a paired test (sp.ttest_rel)
# may be the more appropriate choice - confirm against the assignment specification.
sp.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])

Ttest_indResult(statistic=-13.885932802814914, pvalue=6.925846314604593e-38)

Calculate the first order autocorrelation coefficient on making a shot for the entire shotlog dataset.

In [24]:
# First-order autocorrelation over the whole shot log: correlation between the outcome
# of a shot and the outcome of the same player's previous shot in that game.
current_hit = Shotlog_1415['current_shot_hit']
previous_hit = Shotlog_1415['lag_shot_hit']
current_hit.corr(previous_hit)

-0.010502388301693177

Calculate the first order autocorrelation coefficient on making a shot for each player.

In [25]:
# Correlation matrix of current vs. previous shot outcome, computed separately per player.
player_corr = Shotlog_1415.groupby('shoot_player')[['current_shot_hit', 'lag_shot_hit']].corr()

# unstack() lays each player's matrix out on a single row; positional column 1 is the
# (current_shot_hit, lag_shot_hit) entry, i.e. the first-order autocorrelation.
Autocorr_Hit = player_corr.unstack().iloc[:, 1].reset_index()

# Keep only the first level of the column labels, then give the coefficient a clear name.
Autocorr_Hit.columns = Autocorr_Hit.columns.get_level_values(0)
Autocorr_Hit = Autocorr_Hit.rename(columns={'current_shot_hit': 'autocorr'})

# Ten players with the highest autocorrelation.
Autocorr_Hit.sort_values(by='autocorr', ascending=False).head(10)

Unnamed: 0,shoot_player,autocorr
131,joey dorsey,0.334252
54,cole aldrich,0.174666
200,nate robinson,0.122107
267,tyler hansbrough,0.120608
7,alex len,0.118461
50,cj mccollum,0.115949
114,jason smith,0.105903
190,matt bonner,0.098577
143,jusuf nurkic,0.097465
195,mike miller,0.089366


## Part 3 - Regression Analyses

Reg1: linear least squares regression using the entire shotlog dataframe

In [None]:
# OLS of the current-shot prediction error on the previous-shot error plus shot, game,
# shooter and defender controls, fitted on the full shot log.
reg1_model = sm.ols(
    formula=(
        'error ~ lagerror+shot_dist+dribbles+touch_time'
        '+C(points)+C(quarter)+home_away'
        '+shoot_player+closest_defender+closest_def_dist'
    ),
    data=Shotlog_1415,
)
reg1 = reg1_model.fit()
print(reg1.summary())

Reg2: Weighted least squares regression using the entire shotlog dataframe, weighted by the inverse of shot_per_game (weights = 1/shot_per_game).

Dependent variable: error
Independent variables: lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, shoot_player, closest_defender, and closest_def_dist

In [None]:
# Same specification as the OLS model on the full shot log, estimated by weighted least
# squares with each shot weighted by 1 / shot_per_game.
reg2_weights = 1/Shotlog_1415['shot_per_game']
reg2_model = sm.wls(
    formula=(
        'error ~ lagerror+shot_dist+dribbles+touch_time'
        '+C(points)+C(quarter)+home_away'
        '+shoot_player+closest_defender+closest_def_dist'
    ),
    weights=reg2_weights,
    data=Shotlog_1415,
)
reg2 = reg2_model.fit()
print(reg2.summary())

Reg3_player: linear least squares regressions on individual players

Dependent variable: error
Independent variables: lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, and closest_def_dist

In [None]:
def Reg3_player(player):
    """Fit and print an OLS regression of shot prediction error for one player.

    The model regresses ``error`` on ``lagerror`` and the shot/game controls
    using only the rows of ``Shotlog_1415`` whose ``shoot_player`` equals
    ``player``.

    Parameters
    ----------
    player : str
        Player name, matched exactly against ``Shotlog_1415.shoot_player``
        (e.g. ``'stephen curry'``).

    Returns
    -------
    None
        The regression summary is printed rather than returned.

    Raises
    ------
    ValueError
        If ``Shotlog_1415`` contains no shots for ``player``.
    """
    player_shots = Shotlog_1415[Shotlog_1415.shoot_player == player]
    # Fail early with a clear message (e.g. on a misspelled name) instead of
    # handing an empty frame to the formula API.
    if player_shots.empty:
        raise ValueError(f"No shots found for player {player!r} in Shotlog_1415")
    player_fit = sm.ols(
        formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+closest_def_dist',
        data=player_shots,
    ).fit()
    print(player_fit.summary())

Show regression results for given players.

In [None]:
# Per-player OLS results for Andrew Wiggins.
Reg3_player(player='andrew wiggins')

In [None]:
# Per-player OLS results for Stephen Curry.
Reg3_player(player='stephen curry')

In [None]:
# Per-player OLS results for James Harden.
Reg3_player(player='james harden')

In [None]:
# Per-player OLS results for Russell Westbrook.
Reg3_player(player='russell westbrook')

Reg4_wls_player: weighted least squares regressions on individual players, weighted by the inverse of shot_per_game (weights = 1/shot_per_game).

Dependent variable: error
Independent variables: lagerror, shot_dist, dribbles, touch_time, points, quarter, home_away, and closest_def_dist

In [None]:
def Reg4_wls_player(player):
    """Fit and print a weighted least squares regression of shot error for one player.

    The model regresses ``error`` on ``lagerror`` and the shot/game controls
    using only the rows of ``Shotlog_1415`` whose ``shoot_player`` equals
    ``player``; each shot is weighted by ``1 / shot_per_game``.

    Parameters
    ----------
    player : str
        Player name, matched exactly against ``Shotlog_1415.shoot_player``
        (e.g. ``'stephen curry'``).

    Returns
    -------
    None
        The regression summary is printed rather than returned.

    Raises
    ------
    ValueError
        If ``Shotlog_1415`` contains no shots for ``player``.
    """
    player_shots = Shotlog_1415[Shotlog_1415.shoot_player == player]
    # Fail early with a clear message (e.g. on a misspelled name) instead of
    # handing an empty frame to the formula API.
    if player_shots.empty:
        raise ValueError(f"No shots found for player {player!r} in Shotlog_1415")
    player_fit = sm.wls(
        formula='error ~ lagerror+shot_dist+dribbles+touch_time+C(points)+C(quarter)+home_away+closest_def_dist',
        weights=1/player_shots['shot_per_game'],
        data=player_shots,
    ).fit()
    print(player_fit.summary())

In [None]:
# Per-player WLS results for Reggie Jackson.
Reg4_wls_player(player='reggie jackson')

In [None]:
# Per-player WLS results for Alonzo Gee.
Reg4_wls_player(player='alonzo gee')

In [None]:
# Per-player WLS results for Cole Aldrich.
Reg4_wls_player(player='cole aldrich')

In [None]:
# Per-player WLS results for Stephen Curry.
Reg4_wls_player(player='stephen curry')
