In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the refined shot log, relative to this notebook's directory.
filename = "../data/kobe/kobe_bryant_shot_data_refined.csv"

# Read the whole CSV into the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Preview the first rows of the frame just read from the CSV.
df.head()

In [None]:
# Keep an independent snapshot of the frame as loaded, so the untouched
# columns are still available after `df` is transformed further down.
original_df = df.copy(deep=True)

In [None]:
# Turn categorical variables into dummy (one-hot) variables.
#
# `pd.get_dummies(..., columns=...)` removes each listed column and appends
# its indicator columns, named `<column>_<value>`, after the remaining
# columns -- the same layout the previous concat/drop loop produced.
# The loop passed `axis` positionally (`pd.concat(objs, 1)`, `df.drop(var, 1)`),
# which current pandas rejects, so the single keyword-based call is used instead.
#
# The encoding is built from `original_df` (the untouched snapshot of the
# loaded data) rather than from `df` itself, so this cell can be re-run
# without failing on columns that were already dropped.
categorical_vars = ['combined_shot_type', 'season', 'period']
df = pd.get_dummies(original_df, columns=categorical_vars)

In [None]:
# Preview the frame again now that the categorical columns have been
# replaced by their dummy columns.
df.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` is provided by `sklearn.model_selection` (0.18+).
from sklearn.model_selection import train_test_split

In [None]:
# Separate the predictors from the label column before splitting.
features = df.drop('shot_made_flag', axis=1)
target = df['shot_made_flag']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Preview the training predictors (every column except `shot_made_flag`).
X_train.head()

In [None]:
# Preview the training labels (the `shot_made_flag` column).
y_train.head()

In [None]:
# Baseline random forest: library defaults everywhere except a fixed seed,
# so repeated runs build the same trees.
rf_settings = {'random_state': 42}
model = RandomForestClassifier(**rf_settings)

In [None]:
# Train the baseline forest on the training partition; the fitted
# estimator is the cell's displayed value.
model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
# Per-class probability estimates for the held-out rows
# (one column per class, as returned by `predict_proba`).
y_pred_proba = model.predict_proba(X=X_test)

In [None]:
# Evaluate the baseline on the held-out rows: a confusion matrix from the
# hard predictions, and log loss from the second probability column.
baseline_confusion = confusion_matrix(y_test, y_pred)
baseline_log_loss = log_loss(y_test, y_pred_proba[:, 1])
baseline_confusion, baseline_log_loss

In [None]:
# Pair every training column with the importance the fitted forest assigned
# to it, then show the five highest-ranked features.
feature_ranking = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
feature_ranking.sort_values(by='importance', ascending=False).head(n=5)

In [None]:
# Attach the predictions to the original (un-encoded) rows. The predictions
# are indexed like X_test, so rows outside the test partition get NaN in
# the new `shot_made_pred` column.
test_predictions = pd.Series(y_pred, index=X_test.index, name='shot_made_pred')
pred_df = original_df.join(test_predictions)

In [None]:
# Keep only the rows that actually received a prediction.
has_prediction = pred_df['shot_made_pred'].notnull()
pred_df = pred_df[has_prediction]

In [None]:
# Preview the original columns alongside the new `shot_made_pred` column.
pred_df.head()

In [None]:
# Show every row where the predicted outcome disagrees with the recorded one.
is_wrong = pred_df['shot_made_flag'].ne(pred_df['shot_made_pred'])
pred_df[is_wrong]

In [None]:
# `sklearn.grid_search` was removed in scikit-learn 0.20; GridSearchCV is
# provided by `sklearn.model_selection` (0.18+).
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to try: 4 forest sizes x 4 depth limits
# x 2 split criteria = 32 candidate models.
param_grid = {
    'n_estimators': range(10, 50, 10),
    'max_depth': range(8, 12),
    'criterion': ['gini', 'entropy'],
}

# The scorer is named 'neg_log_loss' (the old 'log_loss' name is no longer
# accepted). It reports the negated log loss so that "higher is better"
# holds for the search; scores are therefore <= 0.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='neg_log_loss',
)

In [None]:
# Run the cross-validated search over every parameter combination using
# the training partition only; the held-out rows are not touched here.
grid.fit(X=X_train, y=y_train)

In [None]:
# Show the estimator (with its parameter settings) that the search selected.
grid.best_estimator_

In [None]:
# Cross-validated score of the selected estimator.
# NOTE(review): scikit-learn's log-loss scorer reports the negated loss, so
# this is expected to be <= 0 with values nearer zero being better -- confirm
# against the installed scikit-learn version.
grid.best_score_