In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Path to the refined Kobe Bryant shot-data CSV, relative to the notebook's working directory.
filename= "../data/kobe/kobe_bryant_shot_data_refined.csv"
# Load the CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filename)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,combined_shot_type,period,playoffs,season,shot_distance,home,time_remaining,shot_made_flag
0,Jump Shot,1,0,2000-01,15,0,622,0.0
1,Jump Shot,1,0,2000-01,16,0,465,1.0
2,Jump Shot,1,0,2000-01,22,0,412,0.0
3,Dunk,2,0,2000-01,0,0,379,1.0
4,Jump Shot,3,0,2000-01,14,0,572,0.0


In [4]:
# Snapshot the frame before `df` is reassigned by the dummy-encoding step;
# the snapshot is joined with the model's predictions further down.
original_df = df.copy()

In [5]:
# Turn categorical variables into dummy (one-hot) variables.
# Each listed column is replaced by indicator columns named "<column>_<value>",
# appended after the remaining columns in the order the columns are listed.
categorical_vars = ['combined_shot_type', 'season', 'period']

# Encode from the untouched `original_df` snapshot rather than from `df` itself:
# the cell can then be re-run without a KeyError on already-dropped columns.
# A single `get_dummies(..., columns=...)` call also avoids the positional
# `axis` arguments to `pd.concat` / `DataFrame.drop`, which pandas 2.0 rejects.
df = pd.get_dummies(original_df, columns=categorical_vars)

In [6]:
# Preview the frame after the categorical columns were replaced by dummy columns.
df.head()

Unnamed: 0,playoffs,shot_distance,home,time_remaining,shot_made_flag,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,...,season_2013-14,season_2014-15,season_2015-16,period_1,period_2,period_3,period_4,period_5,period_6,period_7
0,0,15,0,622,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,16,0,465,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,22,0,412,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,379,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,14,0,572,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

In [8]:
# Separate the predictors from the `shot_made_flag` target and hold out 33% of
# the rows for evaluation; `random_state` pins the shuffle so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='shot_made_flag'),
    df['shot_made_flag'],
    test_size=0.33,
    random_state=42,
)

In [9]:
# Preview the training predictors (the target column has been dropped).
X_train.head()

Unnamed: 0,playoffs,shot_distance,home,time_remaining,combined_shot_type_Bank Shot,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,...,season_2013-14,season_2014-15,season_2015-16,period_1,period_2,period_3,period_4,period_5,period_6,period_7
1119,0,17,0,417,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14464,0,8,1,221,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8725,0,20,0,707,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
20743,0,0,0,18,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
24844,1,25,1,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
# Preview the training targets; they share the row index of X_train.
y_train.head()

1119     1.0
14464    0.0
8725     1.0
20743    0.0
24844    0.0
Name: shot_made_flag, dtype: float64

In [11]:
# Random forest of 20 trees, each limited to depth 9, splitting on entropy;
# the seed makes the fitted forest repeatable.
model = RandomForestClassifier(
    n_estimators=20,
    max_depth=9,
    criterion='entropy',
    random_state=42,
)

In [12]:
# Fit the forest on the training split only.
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [13]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)

In [14]:
# Class-probability estimates for the held-out rows (one column per class).
y_pred_proba = model.predict_proba(X_test)

In [15]:
# Evaluate on the held-out split: confusion matrix of the hard predictions, and
# log loss of the second probability column (presumably the "made" class, 1.0,
# given the 0.0/1.0 labels — confirm against model.classes_).
confusion_matrix(y_test, y_pred), log_loss(y_test, y_pred_proba[:,1])

(array([[4051,  647],
        [2661, 1122]]), 0.65433924298995227)

In [None]:
# Rank the predictors by the fitted forest's importance scores and show the top rows.
(
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .head()
)

In [None]:
# Attach the test-set predictions to the un-encoded frame by row index;
# rows that were not in the test split get NaN in `shot_made_pred`.
pred_df = original_df.join(
    pd.Series(y_pred, index=X_test.index, name='shot_made_pred')
)

In [None]:
# Keep only the rows that received a prediction, i.e. the test split.
pred_df = pred_df[pred_df['shot_made_pred'].notnull()]

In [None]:
# Preview the test rows alongside their predicted label.
pred_df.head()

In [None]:
# Show the test rows where the predicted label disagrees with the recorded outcome.
pred_df[pred_df['shot_made_flag'] != pred_df['shot_made_pred']]

In [None]:
# `sklearn.grid_search` was removed in scikit-learn 0.20;
# GridSearchCV now lives in `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to try (4 x 4 x 2 = 32 candidates).
param_grid = {
    'n_estimators': list(range(10, 50, 10)),
    'max_depth': list(range(8, 12)),
    'criterion': ['gini', 'entropy'],
}

# The scorer is named 'neg_log_loss' (the old 'log_loss' name was removed):
# scores are the negated log loss, so larger (closer to 0) is better.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='neg_log_loss',
)

In [None]:
# Run the cross-validated search over `param_grid` on the training split only.
grid.fit(X_train, y_train)

In [None]:
# Estimator configured with the best parameter combination found by the search.
grid.best_estimator_

In [None]:
# Cross-validated score of the best combination, under the `scoring` given to GridSearchCV.
grid.best_score_