In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

import matplotlib.pyplot as plt

In [2]:
kobe = pd.read_csv('./datasets/data.csv')

### create a separate year column

In [4]:
kobe['game_year'] = pd.DatetimeIndex(kobe['game_date']).year

### Fillna Kobe's NaN shots

In [None]:
###Only 0.827% of Kobes made shots are unnaccounted for in this dataset

In [5]:
kobe['shot_made_flag'] = kobe['shot_made_flag'].fillna(0)

### how many different values in each column

In [26]:
#source code: https://stackoverflow.com/questions/23197324/pandas-value-counts-applied-to-each-column
for c in kobe_objects.columns:
    print(kobe_objects[c].value_counts())

Jump Shot                             18880
Layup Shot                             2567
Driving Layup Shot                     1978
Turnaround Jump Shot                   1057
Fadeaway Jump Shot                     1048
Running Jump Shot                       926
Pullup Jump shot                        476
Turnaround Fadeaway shot                439
Slam Dunk Shot                          411
Reverse Layup Shot                      395
Jump Bank Shot                          333
Driving Dunk Shot                       310
Dunk Shot                               262
Tip Shot                                182
Alley Oop Dunk Shot                     122
Step Back Jump shot                     118
Floating Jump shot                      114
Driving Reverse Layup Shot               97
Hook Shot                                84
Driving Finger Roll Shot                 82
Alley Oop Layup shot                     80
Reverse Dunk Shot                        75
Running Layup Shot              

In [12]:
#create a classification dataframe
kobe_objects = kobe[['period','season','shot_type','shot_zone_area','shot_zone_basic','shot_zone_range']]

In [13]:
#get dummies
kobe_objects_dummies = pd.get_dummies(kobe_objects)

In [14]:
kobe_objects_dummies.shape

(30697, 41)

### Classification

In [15]:
X = kobe_objects_dummies
y = kobe['shot_made_flag']

In [16]:
y.value_counts(normalize=True)

0.0    0.626511
1.0    0.373489
Name: shot_made_flag, dtype: float64

### Train/ test/ split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

## Model instantiation

In [18]:
rfc = RandomForestClassifier()
etc = ExtraTreesClassifier()

## Model Evaluation

In [19]:
rfc_cv_scores = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_scores)
print(rfc_cv_scores.mean())

[0.5980456  0.60108578 0.61142485 0.59926151 0.60447437]
0.6028584217472226


In [20]:
etc_cv_scores = cross_val_score(etc, X_train, y_train)
print(etc_cv_scores)
print(etc_cv_scores.mean())

[0.59891422 0.61216069 0.61164205 0.607298   0.60881842]
0.6077666778923299


## Grid Search

In [21]:
RandomForestClassifier()

RandomForestClassifier()

In [22]:
grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 3, 5]
}

gs = GridSearchCV(rfc, param_grid=grid)

In [23]:
gs.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 3, 5, 7, 10],
                         'min_samples_split': [2, 3, 5],
                         'n_estimators': [100, 150, 200]})

In [24]:
gs.best_score_, gs.best_params_

(0.6324387611773175,
 {'max_depth': 7, 'min_samples_split': 3, 'n_estimators': 150})

In [25]:
gs.best_estimator_

RandomForestClassifier(max_depth=7, min_samples_split=3, n_estimators=150)

In [26]:
gs.score(X_train, y_train), gs.score(X_test, y_test)

(0.6385196768308574, 0.6297068403908794)

In [29]:
from sklearn import metrics

In [30]:
# Testing R2
metrics.r2_score(y_true=y_test, y_pred=gs.predict(X_test))

-0.5823782534283344