In [356]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [357]:
# Load the raw shot log. Blank shot_made_flag cells are parsed as NaN, and
# every row containing any NaN is discarded right after loading.
filename = "../data/kobe/kobe_bryant_shot_data.csv"
df = pd.read_csv(filename, na_values={'shot_made_flag': ''}).dropna()

# Columns removed before feature engineering: identifier/date columns first,
# then the location and zone columns.
unused_columns = [
    'game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name',
    'game_date', 'shot_id',
    'loc_x', 'loc_y', 'shot_type', 'shot_zone_basic', 'shot_zone_range',
]
df = df.drop(unused_columns, axis=1)

In [358]:
# home is 1 when the matchup string contains no '@', otherwise 0.
is_away = df['matchup'].str.contains('@', regex=False)
df['home'] = (~is_away).astype('int64')

# Collapse the two clock columns into a single count of seconds.
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']

# The source columns are no longer needed once the derived ones exist.
df = df.drop(['matchup', 'minutes_remaining', 'seconds_remaining'], axis=1)

# Move the target column to the last position.
cols = [name for name in df.columns if name != 'shot_made_flag']
cols.append('shot_made_flag')
df = df[cols]

In [359]:
# Persist the refined frame; index=False keeps the row index out of the file.
filename= "../data/intro/kobe_bryant_shot_data_refined.csv"
df.to_csv(filename, index=False)

In [360]:
# Reload the refined data from disk; this replaces df with a frame that has a
# fresh 0..n-1 index.
filename= "../data/intro/kobe_bryant_shot_data_refined.csv"
df = pd.read_csv(filename)

In [361]:
# Preview the first rows of the refined frame.
df.head()

Unnamed: 0,action_type,combined_shot_type,period,playoffs,season,shot_distance,shot_zone_area,opponent,home,time_remaining,shot_made_flag
0,Jump Shot,Jump Shot,1,0,2000-01,15,Left Side(L),POR,0,622,0.0
1,Jump Shot,Jump Shot,1,0,2000-01,16,Left Side Center(LC),POR,0,465,1.0
2,Jump Shot,Jump Shot,1,0,2000-01,22,Right Side Center(RC),POR,0,412,0.0
3,Driving Dunk Shot,Dunk,2,0,2000-01,0,Center(C),POR,0,379,1.0
4,Jump Shot,Jump Shot,3,0,2000-01,14,Left Side(L),POR,0,572,0.0


In [362]:
# Keep an untouched copy of the human-readable columns; the scaling and
# encoding below overwrite df, and the copy is joined with predictions later.
original_df = df.copy()

In [363]:
# 75th percentile of shot distance (exploratory; not used by later cells).
q75 = np.percentile(df.shot_distance, 75)

In [364]:
# Display the percentile computed above.
q75

21.0

In [365]:
# Cap shot distance at 45 (presumably to tame very long shots before scaling;
# the threshold is hard-coded — TODO confirm how it was chosen).
df['shot_distance'] = df['shot_distance'].clip(upper=45)

In [366]:
from sklearn.preprocessing import StandardScaler

In [367]:
# Scaler instance used for the time_remaining column.
scaler = StandardScaler()

In [368]:
# Standardize time_remaining with the StandardScaler created above.
# Series.reshape is not available in current pandas, so the scaler is given a
# one-column DataFrame (already 2-D) and the (n, 1) result is flattened back
# to 1-D before being stored in the column.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# further down — confirm that this is acceptable.
df['time_remaining'] = scaler.fit_transform(df[['time_remaining']]).ravel()



In [369]:
# Standardize shot_distance with its own scaler instance.
# Series.reshape is not available in current pandas, so the scaler is given a
# one-column DataFrame (already 2-D) and the (n, 1) result is flattened back
# to 1-D before being stored in the column.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# further down — confirm that this is acceptable.
dist_scaler = StandardScaler()
df['shot_distance'] = dist_scaler.fit_transform(df[['shot_distance']]).ravel()



In [370]:
# Reduce shot_zone_area to the first character inside the parentheses, e.g.
# "Left Side(L)" -> "L". Only one character is captured, so two-letter codes
# collapse onto their first letter ("Left Side Center(LC)" -> "L").
# The pattern is a raw string so that "\(" and "\w" are not treated as string
# escapes, and expand=False makes the single capture group come back as a
# Series rather than depending on the version-specific default.
df['shot_zone_area'] = df['shot_zone_area'].str.extract(r'\((\w).*\)', expand=False)

  if __name__ == '__main__':


In [371]:
# Cap period at 5, so every period above 5 shares one value.
df['period'] = df['period'].clip(upper=5)

In [372]:
# Preview the frame after clipping, scaling and zone-code extraction.
df.head()

Unnamed: 0,action_type,combined_shot_type,period,playoffs,season,shot_distance,shot_zone_area,opponent,home,time_remaining,shot_made_flag
0,Jump Shot,Jump Shot,1,0,2000-01,0.170067,L,POR,0,1.442489,0.0
1,Jump Shot,Jump Shot,1,0,2000-01,0.278142,L,POR,0,0.688794,1.0
2,Jump Shot,Jump Shot,1,0,2000-01,0.926591,R,POR,0,0.434362,0.0
3,Driving Dunk Shot,Dunk,2,0,2000-01,-1.451055,C,POR,0,0.275942,1.0
4,Jump Shot,Jump Shot,3,0,2000-01,0.061993,L,POR,0,1.202459,0.0


In [373]:
# Turn categorical variables into dummy (one-hot) variables.
# A single get_dummies call replaces the concat/drop loop: the listed columns
# are removed and their indicator columns are appended in list order, each
# prefixed with "<column>_". This also avoids passing `axis` positionally to
# pd.concat / DataFrame.drop, which newer pandas versions reject.
categorical_vars = ['combined_shot_type', 'season', 'period', 'shot_zone_area', 'opponent', 'action_type']
df = pd.get_dummies(df, columns=categorical_vars)

In [374]:
# Preview the frame after one-hot encoding.
df.head()

Unnamed: 0,playoffs,shot_distance,home,time_remaining,shot_made_flag,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,...,action_type_Running Tip Shot,action_type_Slam Dunk Shot,action_type_Step Back Jump shot,action_type_Tip Layup Shot,action_type_Tip Shot,action_type_Turnaround Bank shot,action_type_Turnaround Fadeaway shot,action_type_Turnaround Finger Roll Shot,action_type_Turnaround Hook Shot,action_type_Turnaround Jump Shot
0,0,0.170067,0,1.442489,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.278142,0,0.688794,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.926591,0,0.434362,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,-1.451055,0,0.275942,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.061993,0,1.202459,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [375]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old cross_validation module was removed in 0.20. Fall back to it only
# on installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [376]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable.
features = df.drop('shot_made_flag', axis=1)
target = df['shot_made_flag']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=42)

In [377]:
# Preview the training features (the index shows the shuffled row labels).
X_train.head()

Unnamed: 0,playoffs,shot_distance,home,time_remaining,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,...,action_type_Running Tip Shot,action_type_Slam Dunk Shot,action_type_Step Back Jump shot,action_type_Tip Layup Shot,action_type_Tip Shot,action_type_Turnaround Bank shot,action_type_Turnaround Fadeaway shot,action_type_Turnaround Finger Roll Shot,action_type_Turnaround Hook Shot,action_type_Turnaround Jump Shot
1119,0,0.386217,0,0.458365,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14464,0,-0.586456,1,-0.482554,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8725,0,0.710441,0,1.850541,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20743,0,-1.451055,0,-1.457077,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24844,1,1.250815,1,-1.533886,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [378]:
# Preview the training targets; row labels line up with X_train.
y_train.head()

1119     1.0
14464    0.0
8725     1.0
20743    0.0
24844    0.0
Name: shot_made_flag, dtype: float64

In [379]:
# Random forest baseline: 40 entropy-criterion trees, depth-limited to 10,
# seeded for repeatability and trained on all available cores.
model = RandomForestClassifier(
    n_estimators=40,
    max_depth=10,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
)

In [380]:
# Fit the random forest on the training split.
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [381]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

In [382]:
# Class probabilities for the held-out rows (one column per class).
y_pred_proba = model.predict_proba(X_test)

In [383]:
# Confusion matrix on hard predictions, and log loss on the probability of
# the second class column.
test_confusion = confusion_matrix(y_test, y_pred)
test_log_loss = log_loss(y_test, y_pred_proba[:, 1])
test_confusion, test_log_loss

(array([[4028,  670],
        [2052, 1731]]), 0.61514625298114312)

In [384]:
# Pair each training column with its importance in the fitted forest and show
# the five largest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
importances.sort_values('importance', ascending=False).head()

Unnamed: 0,feature,importance
97,action_type_Jump Shot,0.184661
5,combined_shot_type_Dunk,0.126575
1,shot_distance,0.095734
98,action_type_Layup Shot,0.063318
7,combined_shot_type_Jump Shot,0.056714


In [385]:
# Attach test-set predictions to the readable (pre-scaling) rows. The join is
# on the row index, so rows outside the test split get NaN in shot_made_pred.
test_predictions = pd.DataFrame(y_pred, columns=['shot_made_pred'], index=X_test.index)
pred_df = original_df.join(test_predictions)

In [386]:
# Keep only the rows that received a prediction.
pred_df = pred_df[pred_df['shot_made_pred'].notnull()]

In [387]:
# Preview the rows that have both the actual flag and the predicted flag.
pred_df.head()

Unnamed: 0,action_type,combined_shot_type,period,playoffs,season,shot_distance,shot_zone_area,opponent,home,time_remaining,shot_made_flag,shot_made_pred
3,Driving Dunk Shot,Dunk,2,0,2000-01,0,Center(C),POR,0,379,1.0,1.0
6,Jump Shot,Jump Shot,3,0,2000-01,12,Left Side(L),POR,0,372,1.0,0.0
17,Jump Shot,Jump Shot,3,0,2000-01,18,Right Side Center(RC),UTA,1,544,0.0,0.0
29,Jump Shot,Jump Shot,2,0,2000-01,27,Center(C),VAN,0,4,1.0,0.0
30,Jump Shot,Jump Shot,3,0,2000-01,18,Left Side(L),VAN,0,672,0.0,0.0


In [388]:
# Rows where the prediction disagrees with the recorded outcome.
is_wrong = pred_df['shot_made_flag'].ne(pred_df['shot_made_pred'])
pred_df[is_wrong]

Unnamed: 0,action_type,combined_shot_type,period,playoffs,season,shot_distance,shot_zone_area,opponent,home,time_remaining,shot_made_flag,shot_made_pred
6,Jump Shot,Jump Shot,3,0,2000-01,12,Left Side(L),POR,0,372,1.0,0.0
29,Jump Shot,Jump Shot,2,0,2000-01,27,Center(C),VAN,0,4,1.0,0.0
31,Layup Shot,Layup,3,0,2000-01,0,Center(C),VAN,0,613,1.0,0.0
34,Layup Shot,Layup,3,0,2000-01,0,Center(C),VAN,0,201,1.0,0.0
35,Reverse Dunk Shot,Dunk,3,0,2000-01,0,Center(C),VAN,0,175,0.0,1.0
49,Driving Layup Shot,Layup,3,0,2000-01,5,Center(C),LAC,1,37,0.0,1.0
54,Jump Shot,Jump Shot,1,0,2000-01,25,Center(C),HOU,0,191,1.0,0.0
75,Jump Shot,Jump Shot,2,0,2000-01,18,Left Side Center(LC),SAS,0,195,1.0,0.0
91,Driving Layup Shot,Layup,1,0,2000-01,0,Center(C),HOU,1,617,0.0,1.0
101,Jump Shot,Jump Shot,3,0,2000-01,15,Right Side(R),HOU,1,518,1.0,0.0


In [389]:
# TODO: Try other models.
# Predict from the previous shot's outcome? Is that even legal?
# 'action_type' made a huge difference! Why?

In [394]:
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18, the same
# release that renamed the 'log_loss' scorer to 'neg_log_loss'; the old module
# and scorer name were later removed. Select the matching pair so the cell
# runs on both old and current installations.
try:
    from sklearn.model_selection import GridSearchCV
    log_loss_scoring = 'neg_log_loss'
except ImportError:
    from sklearn.grid_search import GridSearchCV
    log_loss_scoring = 'log_loss'

# Hyper-parameter grid for the random forest (8 x 6 x 2 combinations).
param_grid = {
    'n_estimators': list(range(10, 50, 5)),
    'max_depth': list(range(9, 15)),
    'criterion': ['entropy', 'gini'],
}

# The scorer reports negated log loss, so a larger (less negative) score is better.
grid = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1),
                    param_grid=param_grid, scoring=log_loss_scoring)

In [395]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 15, 20, 25, 30, 35, 40, 45], 'criterion': ['entropy', 'gini'], 'max_depth': [9, 10, 11, 12, 13, 14]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [396]:
# Best parameter combination found by the search.
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [397]:
# Score of the best combination; the value shown is negative, presumably
# because the scorer reports negated log loss.
grid.best_score_

-0.60592606779360292

In [None]:
# Second model: linear discriminant analysis with default settings.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

In [None]:
# Fit LDA on the same training split used for the random forest.
lda.fit(X_train, y_train)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the
# matrix comes out transposed and cannot be compared with the random-forest
# matrix computed earlier, which uses (y_test, y_pred).
confusion_matrix(y_test, lda.predict(X_test))

In [None]:
# Log loss of the LDA class probabilities on the held-out rows.
log_loss(y_test, lda.predict_proba(X_test))

In [None]:
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18, the same
# release that renamed the 'log_loss' scorer to 'neg_log_loss'; select the
# matching pair so the cell runs on both old and current installations.
try:
    from sklearn.model_selection import GridSearchCV
    log_loss_scoring = 'neg_log_loss'
except ImportError:
    from sklearn.grid_search import GridSearchCV
    log_loss_scoring = 'log_loss'
from sklearn.svm import LinearSVC, SVC

# LinearSVC accepts no `probability` argument and has no predict_proba, so it
# cannot be scored with log loss. SVC with a linear kernel and
# probability=True provides the probability estimates the scorer needs.
# NOTE(review): this is expected to train much more slowly than LinearSVC on
# the full training split — confirm the runtime is acceptable.
# The empty param_grid evaluates only the default parameter set.
grid = GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=42),
                    param_grid={}, scoring=log_loss_scoring)

In [None]:
# Cross-validate the SVM on the training split (this rebinds the earlier `grid`).
grid.fit(X_train, y_train)

In [None]:
# Estimator refit by the SVM search.
grid.best_estimator_

In [None]:
# Cross-validated score of the SVM under the log-loss scorer.
grid.best_score_