In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix

In [28]:
# Read the data set from the relative path "./all_data" (parsed as CSV) and
# preview the first rows as the cell output.
data = pd.read_csv("./all_data")
data.head()

Unnamed: 0.1,Unnamed: 0,PLAYER_ID,PTS,GP,PLAYER_NAME,POSITION,HEIGHT_WO_SHOES,WEIGHT,WINGSPAN,STANDING_REACH,STANDING_VERTICAL_LEAP,MAX_VERTICAL_LEAP,LANE_AGILITY_TIME
0,0,2403,15.6,965,Nene,PF,81.25,253.0,88.5,109.0,30.0,34.0,10.73
1,1,203500,12.8,588,Steven Adams,C,82.75,254.5,88.5,109.5,28.5,33.0,11.85
2,2,1628389,16.3,287,Edrice Adebayo,PF-C,80.75,242.6,86.75,108.0,33.5,38.5,11.94
3,3,200746,20.4,1029,LaMarcus Aldridge,PF-C,82.0,234.0,88.75,110.0,26.5,34.0,12.02
4,4,1628960,16.1,126,Grayson Allen,SG,75.0,198.0,79.25,97.0,32.5,40.5,10.31


In [20]:
# Outcome: the PTS column as a NumPy array.
y = np.array(data["PTS"])

# Treatment indicator: 1 where MAX_VERTICAL_LEAP is strictly greater than 35, else 0.
# The indicator is also stored on the frame as column "A" so it can be selected below.
vertical = np.array(data["MAX_VERTICAL_LEAP"])
A = (vertical > 35).astype(int)
data["A"] = A

# Design matrices: the propensity model (g) sees only the covariates,
# the outcome model (Q) sees the covariates plus the treatment column.
covariate_cols = ["HEIGHT_WO_SHOES", "WEIGHT", "WINGSPAN", "LANE_AGILITY_TIME"]
X_g = data[covariate_cols]
X_Q = data[covariate_cols + ["A"]]

In [25]:
# Outcome model Q: gradient-boosted regression of y on the covariates plus treatment.
# The loss argument is left at its default (least squares): the old spelling
# loss='ls' is rejected by recent scikit-learn releases, while the default is
# squared error on both old and new versions.
Q = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=0)
Q.fit(X_Q, y)

# Propensity model g: cross-validated logistic regression of treatment on the covariates.
g = LogisticRegressionCV(random_state=0)
g.fit(X_g, A)
A_pred = g.predict(X_g)

# In-sample confusion matrix of observed vs. predicted treatment (cell output).
confusion_matrix(A, A_pred)


array([[ 48,  46],
       [ 23, 116]])

In [39]:
def aiptw(Q, g, A_name, x_names, X_Q, X_g, A, y):
    """Compute the augmented inverse-propensity-weighted (AIPTW) summands and summarize them.

    Parameters
    ----------
    Q : fitted outcome model exposing ``predict(X)``.
    g : fitted propensity model exposing ``predict_proba(X)``; column 1 is
        used as the treatment probability.
    A_name : name of the treatment column inside ``X_Q``.
    x_names : accepted for interface compatibility; not used in the body.
    X_Q : DataFrame passed to ``Q`` (covariates plus the treatment column).
        It is copied before the treatment column is overwritten, so the
        caller's frame is not modified.
    X_g : DataFrame passed to ``g``.
    A : array of treatment indicators, aligned row-wise with ``X_Q``/``X_g``
        (the formula weights by ``A`` and ``1 - A``; assumed to be 0/1).
    y : array of observed outcomes, aligned with ``A``.

    Returns
    -------
    tuple
        ``(np.mean(summands), np.var(summands))`` over the per-row summands.

    Notes
    -----
    Propensities are used as-is; there is no guard against values of exactly
    0 or 1, which would produce a division by zero in the correction terms.
    """
    # Counterfactual design matrices: everyone treated / nobody treated.
    treated_design = X_Q.assign(**{A_name: 1})
    control_design = X_Q.assign(**{A_name: 0})

    outcome_if_treated = Q.predict(treated_design)
    outcome_if_control = Q.predict(control_design)

    # Probability of the positive class (second column of predict_proba).
    propensity = g.predict_proba(X_g)[:, 1]

    plug_in = outcome_if_treated - outcome_if_control
    treated_correction = A * (y - outcome_if_treated) / propensity
    control_correction = (1 - A) * (y - outcome_if_control) / (1 - propensity)
    scores = plug_in + treated_correction - control_correction

    return np.mean(scores), np.var(scores)

In [40]:
# AIPTW on the full sample, using the Q and g models fitted on that same sample;
# the cell output is the (mean, variance) tuple returned by aiptw.
aiptw(Q, g, "A", ["HEIGHT_WO_SHOES", "WEIGHT", "WINGSPAN", "LANE_AGILITY_TIME"], X_Q, X_g, A, y)

(0.5869044458181463, 28.88961172484478)

In [113]:
def fold(train_index, test_index, A_name, x_names, X_Q, X_g, A, y):
    """Fit Q and g on the training rows and evaluate AIPTW on the held-out rows.

    Parameters
    ----------
    train_index, test_index : positional row indices (used with ``.iloc`` on
        the frames and plain indexing on the arrays).
    A_name : name of the treatment column inside ``X_Q``.
    x_names : forwarded unchanged to ``aiptw``.
    X_Q : DataFrame for the outcome model (covariates plus treatment column).
    X_g : DataFrame for the propensity model.
    A : array of treatment indicators.
    y : array of observed outcomes.

    Returns
    -------
    tuple
        Whatever ``aiptw`` returns for the held-out rows (mean and variance
        of the summands).
    """
    # The loss argument is left at its default (least squares): the old spelling
    # loss='ls' is rejected by recent scikit-learn releases, while the default is
    # squared error on both old and new versions.
    # NOTE(review): learning_rate is 0.2 here but 0.1 for the full-sample model
    # fitted earlier in the notebook — confirm the difference is intended.
    Q = GradientBoostingRegressor(n_estimators=50, learning_rate=0.2, max_depth=3, random_state=0)
    Q.fit(X_Q.iloc[train_index, :], y[train_index])

    g = LogisticRegressionCV(random_state=0)
    g.fit(X_g.iloc[train_index, :], A[train_index])

    # Nuisance models are fitted on the training rows only; the estimate is
    # computed on rows they have not seen.
    return aiptw(
        Q, g, A_name,
        x_names,
        X_Q.iloc[test_index, :], X_g.iloc[test_index, :], A[test_index], y[test_index])

In [114]:
from sklearn.model_selection import KFold

# 5-fold cross-fitting: for each split, fit the models on the training rows and
# collect the AIPTW mean/variance computed on the held-out rows.
# random_state pins the shuffled fold assignment so that re-running the notebook
# reproduces the same splits (without it every run yields different numbers).
kf = KFold(n_splits=5, shuffle=True, random_state=0)
covariate_names = ["HEIGHT_WO_SHOES", "WEIGHT", "WINGSPAN", "LANE_AGILITY_TIME"]
mean = []
var = []
for train_index, test_index in kf.split(A):
    mean_temp, var_temp = fold(train_index, test_index, "A", covariate_names, X_Q, X_g, A, y)
    mean.append(mean_temp)
    var.append(var_temp)

In [115]:
# Average of the per-fold AIPTW means collected in the loop above.
np.mean(mean)

0.7027459831129865

In [116]:
# Average of the per-fold summand variances, divided by sqrt(5) (5 matches n_splits above).
# NOTE(review): this value is still in variance units and is not a standard error.
# If a standard error of the cross-fitted estimate is intended, something like
# np.sqrt(np.mean(var) / len(y)) is probably what is wanted — confirm before reporting.
np.mean(var)/np.sqrt(5)

44.08619250945192