In [147]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.tree import export_graphviz
import pydot

In [148]:
def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_1 = f1_score(y_true,y_pred,average='binary',pos_label=1)
    f1_score_0 = f1_score(y_true,y_pred,average='binary',pos_label=0)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_1,f1_score_0,f1_score_macro]

results = pd.DataFrame(columns=['Accuracy', 'F1-score (class 1)', 'F1-score (class 0)', 'F1-score (macro avg)'])

# Random Forrest on regualr season vs playoff

In [149]:
train = pd.read_csv('../data/data3_train.csv')
test = pd.read_csv('../data/data3_test.csv')
target_column = "shot_made_flag"  # y_column_name

In [150]:
def split_x_y(df, target):
    y = df.loc[:, [target]]
    x = df.drop([target], axis=1)
    return x, y

In [151]:
x_train, y_train = split_x_y(train, target_column)
x_test, y_test = split_x_y(test, target_column)

We are using out of the bag score (oob_score) as a way to validate our model.
Instead of using validation set and introducing data leakage, with the oob_score we are estimating the validation score.
# VERIFY

In [152]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(x_train, y_train)

  model_rf1 = RandomForestClassifier(oob_score=True).fit(x_train, y_train)


In [153]:
pred = model_rf1.predict(x_train)

Performing a sanity check

In [154]:
confusion_matrix(y_train,pred)

array([[734,   0],
       [  0, 610]])

In [155]:
print('OOB accuracy=', model_rf1.oob_score_)

OOB accuracy= 0.6830357142857143


In [156]:
y_pred = model_rf1.predict(x_test)

In [157]:
confusion_matrix(y_test,y_pred)

array([[204,  30],
       [109,  85]])

In [158]:
results.loc['RF-basic', :] = compute_metrics(y_test, y_pred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
RF-basic,0.675234,0.550162,0.745887,0.648024


# are the reults ok? not overfitted?

# try balanced

In [159]:
model_rf2 = RandomForestClassifier(n_estimators=100,
                                   oob_score=True,
                                   class_weight='balanced').fit(x_train, y_train)

  model_rf2 = RandomForestClassifier(n_estimators=100,


In [160]:
print('OOB accuracy=', model_rf2.oob_score_)

OOB accuracy= 0.6800595238095238


In [161]:
y_pred = model_rf2.predict(x_test)

In [162]:
results.loc['RF-balanced', :] = compute_metrics(y_test, y_pred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
RF-balanced,0.682243,0.566879,0.749077,0.657978
RF-basic,0.675234,0.550162,0.745887,0.648024


# Check the tree that we generated

In [137]:
x_train

Unnamed: 0,loc_x,loc_y,period,playoffs,shot_distance,shot_zone_basic,shot_zone_range,time_remaining,last_5_games_avg,streak_before_shot,...,shot_type_2PT Field Goal,shot_type_3PT Field Goal,shot_zone_area_Back Court(BC),shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),matchup_away,matchup_home
0,0,0,1,0,0,0,0,660,0.00,0,...,1,0,0,1,0,0,0,0,0,1
1,0,0,4,0,0,0,0,113,0.00,0,...,1,0,0,1,0,0,0,0,0,1
2,143,28,4,0,14,2,1,216,0.00,0,...,1,0,0,0,0,0,0,1,0,1
3,-56,279,4,0,28,4,3,339,0.00,0,...,0,1,0,1,0,0,0,0,0,1
4,34,82,4,0,8,1,1,596,0.00,0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,0,0,1,0,0,0,0,386,0.43,0,...,1,0,0,1,0,0,0,0,0,1
1340,-109,110,1,0,15,2,1,641,0.43,0,...,1,0,0,0,0,1,0,0,0,1
1341,-87,290,4,0,30,4,3,49,0.43,0,...,0,1,0,1,0,0,0,0,0,1
1342,70,132,3,0,14,1,1,622,0.43,0,...,1,0,0,1,0,0,0,0,0,1


In [141]:
# Import tools needed for visualization

# Pull out one tree from the forest
def visualize_rf_tree(model):
    tree = model.estimators_[0]# Import tools needed for visualization
    tree = model.estimators_[0]# Export the image to a dot file
    export_graphviz(tree, out_file = 'tree.dot', feature_names = list(x_train.columns), rounded = True, precision = 1)# Use dot file to create a graph
    (graph, ) = pydot.graph_from_dot_file('tree.dot')# Write graph to a png file
    graph.write_png('tree.png')

In [142]:
visualize_rf_tree(model_rf1)

# depth3

In [163]:
model_rf3_small = RandomForestClassifier(max_depth=3, n_estimators=10)
model_rf3_small.fit(x_train, y_train)

  model_rf3_small.fit(x_train, y_train)


RandomForestClassifier(max_depth=3, n_estimators=10)

In [164]:
y_pred = model_rf3_small.predict(x_test)

results.loc['RF-small', :] = compute_metrics(y_test, y_pred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
RF-balanced,0.682243,0.566879,0.749077,0.657978
RF-small,0.672897,0.573171,0.734848,0.65401
RF-basic,0.675234,0.550162,0.745887,0.648024


# GridSearch
## What do we do with CV?

In [185]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer


model = RandomForestClassifier()

ntrees = [10, 100, 200,None]
max_depth = [5, 10, 50, 100, None]
min_samples_split = [4,6]
min_samples_leaf = [4,6]
balance = [None, 'balanced', 'balanced_subsample']

trc = GridSearchCV(estimator=model,
                   # scoring=scoring_dict,
                   param_grid={
                       'n_estimators': ntrees,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'min_samples_leaf':min_samples_leaf,
                       'class_weight':balance
                   },
                   cv=None,
                   return_train_score=False,
                   refit=False,
                   n_jobs=-1)

# model_rf4_gs = trc.fit(x_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

In [188]:
trc.best_params_

{'class_weight': None,
 'max_depth': 5,
 'min_samples_leaf': 4,
 'min_samples_split': 6,
 'n_estimators': 100}

In [196]:
model_rf4_gs = RandomForestClassifier(**trc.best_params_)


In [197]:
model_rf4_gs.fit(x_train, y_train)

  model_rf4_gs.fit(x_train, y_train)


RandomForestClassifier(max_depth=5, min_samples_leaf=4, min_samples_split=6)

In [198]:
y_pred = model_rf4_gs.predict(x_test)
results.loc['RF-gridsearch', :] = compute_metrics(y_test, y_pred)
results.sort_values(by='F1-score (macro avg)', ascending=False)

Unnamed: 0,Accuracy,F1-score (class 1),F1-score (class 0),F1-score (macro avg)
RF-balanced,0.682243,0.566879,0.749077,0.657978
RF-gridsearch,0.684579,0.557377,0.754991,0.656184
RF-small,0.672897,0.573171,0.734848,0.65401
RF-basic,0.675234,0.550162,0.745887,0.648024


In [199]:
confusion_matrix(y_test, y_pred)

array([[208,  26],
       [109,  85]])

# Check feature importance

# Run with reduced features