In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
filename= "./kobe_bryant_shot_data_refined.csv"
df = pd.read_csv(filename)

In [3]:
df.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,shot_made_flag,home,seconds_from_period_end
0,Jump Shot,1,0,2000-01,15,0.0,0,622
1,Jump Shot,1,0,2000-01,16,1.0,0,465
2,Jump Shot,1,0,2000-01,22,0.0,0,412
3,Dunk,2,0,2000-01,0,1.0,0,379
4,Jump Shot,3,0,2000-01,14,0.0,0,572


In [4]:
# turn categorical variables into dummy variables
categorical_vars = ['combined_shot_type', 'season', 'period']
for var in categorical_vars:
    df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], 1)
    df = df.drop(var, 1)

In [5]:
df.head()

Unnamed: 0,playoffs,shot_distance,shot_made_flag,home,seconds_from_period_end,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,...,season_2013-14,season_2014-15,season_2015-16,period_1,period_2,period_3,period_4,period_5,period_6,period_7
0,0,15,0.0,0,622,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,16,1.0,0,465,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,22,0.0,0,412,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,1.0,0,379,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,14,0.0,0,572,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.cross_validation import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('shot_made_flag', axis=1), 
                                                    df['shot_made_flag'], 
                                                    test_size=0.33, 
                                                    random_state=42)

In [8]:
X_train.head()

Unnamed: 0,playoffs,shot_distance,home,seconds_from_period_end,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,...,season_2013-14,season_2014-15,season_2015-16,period_1,period_2,period_3,period_4,period_5,period_6,period_7
1119,0,17,0,417,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14464,0,8,1,221,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8725,0,20,0,707,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20743,0,0,0,18,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
24844,1,25,1,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
y_train.head()

1119     1.0
14464    0.0
8725     1.0
20743    0.0
24844    0.0
Name: shot_made_flag, dtype: float64

In [10]:
model = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)

In [11]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
y_pred = model.predict(X_test)

In [13]:
y_pred_proba = model.predict_proba(X_test)

In [16]:
confusion_matrix(y_test, y_pred), log_loss(y_test, y_pred_proba[:,1])

(array([[4010,  688],
        [2696, 1087]]), 0.66153582422470292)

In [17]:
pd.DataFrame({'feature': X_train.columns, 'importance': model.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,seconds_from_period_end,0.360981
1,shot_distance,0.232609
5,combined_shot_type_Dunk,0.088051
7,combined_shot_type_Jump Shot,0.062369
2,home,0.031337
8,combined_shot_type_Layup,0.030191
0,playoffs,0.024019
32,period_3,0.013919
31,period_2,0.013261
30,period_1,0.013107


In [None]:
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid={'n_estimators': range(10, 50, 5), 
                                                          'max_depth': range(10, 20)}, scoring='log_loss')

In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_estimator_

In [None]:
grid.best_score_

In [18]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()

In [19]:
lda.fit(X_train, y_train)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [20]:
confusion_matrix(lda.predict(X_test), y_test)

array([[4014, 2621],
       [ 684, 1162]])

In [21]:
log_loss(y_test, lda.predict_proba(X_test))

0.65635158248976777

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC, SVC
grid = GridSearchCV(estimator=SVC(probability=True), param_grid={}, scoring='log_loss')

In [None]:
grid.fit(X_train, y_train)

In [None]:
grid.best_estimator_

In [None]:
grid.best_score_

In [None]:
df = df.drop([u'action_type', u'game_event_id', u'game_id',
       u'lat', u'lon', u'team_id', u'team_name', u'game_date',
       u'opponent', u'shot_id'], axis=1)
df = df.drop(['loc_x', 'loc_y', 'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range'], axis=1)
df['home'] = df.matchup.apply(lambda matchup: 0 if '@' in matchup else 1)
df = df.drop(['matchup'], axis=1)
df['seconds_from_period_end'] = 60 * df['minutes_remaining'] + df['seconds_remaining']
df = df.drop(['minutes_remaining', 'seconds_remaining'], axis=1)

In [None]:
# turn categorical variables into dummy variables
categorical_vars = ['combined_shot_type', 'season', 'period']
for var in categorical_vars:
    df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], 1)
    df = df.drop(var, 1)

In [None]:
df = df[df.shot_made_flag.isnull()]