In [295]:
%matplotlib inline

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


In [296]:
# Widen the display limit first so the preview below shows every column
# of the shot log instead of eliding the middle ones.
pd.set_option("display.max_columns", 100)

# Raw shot log; rows with a missing `shot_made_flag` are the ones to predict.
data = pd.read_csv("data.csv")
data.head(10)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_made_flag,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,0,2000-01,27,18,,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,0,2000-01,22,15,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,0,2000-01,45,16,1.0,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,0,2000-01,52,22,0.0,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,0,2000-01,19,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,9,3,0,2000-01,32,14,0.0,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,6
6,Layup Shot,Layup,251,20000012,34.0443,0,0,-118.2698,8,3,0,2000-01,52,0,1.0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,7
7,Jump Shot,Jump Shot,254,20000012,34.0163,1,28,-118.2688,8,3,0,2000-01,5,2,,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,8
8,Jump Shot,Jump Shot,265,20000012,33.9363,-65,108,-118.3348,6,3,0,2000-01,12,12,1.0,2PT Field Goal,Left Side(L),In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,9
9,Running Jump Shot,Jump Shot,294,20000012,33.9193,-33,125,-118.3028,3,3,0,2000-01,36,12,0.0,2PT Field Goal,Center(C),In The Paint (Non-RA),8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,10


In [297]:
from sklearn.preprocessing import StandardScaler

# Tunable thresholds used by the engineered features below.
CLUTCH_SECONDS = 5           # shots with fewer seconds left in the period are "clutch"
MIN_ACTION_TYPE_COUNT = 20   # action types seen this many times or fewer are merged

scaler = StandardScaler()

# Time left in the period, in seconds. The raw value drives the `clutch`
# flag; the standardized value is used as the single numeric model feature.
total_seconds = data["minutes_remaining"] * 60 + data["seconds_remaining"]
data["clutch"] = total_seconds < CLUTCH_SECONDS
seconds_remaining = scaler.fit_transform(total_seconds.values.reshape(-1, 1))

# Match the literal text "vs." -- with the default regex matching the "."
# would match any character. Presumably "vs." marks home games, while away
# games use "@" as in the sample rows (TODO confirm).
data["home"] = data["matchup"].str.contains("vs.", regex=False)

# Keep the detailed `action_type` only when it is frequent enough; otherwise
# fall back to the coarser `combined_shot_type`. Done with vectorized
# map/where rather than a Python-level row-wise apply.
actiontypes = dict(data["action_type"].value_counts())
action_type_counts = data["action_type"].map(actiontypes)
data["type"] = data["action_type"].where(
    action_type_counts > MIN_ACTION_TYPE_COUNT, data["combined_shot_type"]
)

# Categorical columns that are one-hot encoded, in output column order.
features = ['clutch', 'type', 'home', 'shot_zone_area', 'shot_zone_basic', 'season', 'shot_type', 'playoffs', 'shot_zone_range', 'period']

# Build every dummy block first and concatenate once, instead of growing
# the frame inside the loop (each concat copies the whole frame again).
seconds_frame = pd.DataFrame(
    seconds_remaining, columns=["seconds_remaining"], index=data.index
)
dummy_frames = [pd.get_dummies(data[f], prefix=f) for f in features]
X_all = pd.concat([seconds_frame] + dummy_frames, axis=1)

# Attach the label so labelled and unlabelled rows can be separated.
X_all = X_all.join(data["shot_made_flag"])

# Rows with a known label form the training set; rows whose label is
# missing are the ones predictions are generated for.
has_label = X_all["shot_made_flag"].notnull()

y = X_all.loc[has_label, "shot_made_flag"]
X = X_all.loc[has_label].drop("shot_made_flag", axis=1)
X_test = X_all.loc[~has_label].drop("shot_made_flag", axis=1)

# Every row must land in exactly one of the two partitions.
assert len(X) + len(X_test) == len(X_all)




In [298]:
# Preview the first five rows of the training feature matrix.
X.head(n=5)

Unnamed: 0,seconds_remaining,clutch_False,clutch_True,type_Alley Oop Dunk Shot,type_Alley Oop Layup shot,type_Bank Shot,type_Driving Dunk Shot,type_Driving Finger Roll Layup Shot,type_Driving Finger Roll Shot,type_Driving Jump shot,type_Driving Layup Shot,type_Driving Reverse Layup Shot,type_Driving Slam Dunk Shot,type_Dunk,type_Dunk Shot,type_Fadeaway Bank shot,type_Fadeaway Jump Shot,type_Finger Roll Layup Shot,type_Finger Roll Shot,type_Floating Jump shot,type_Hook Shot,type_Jump Bank Shot,type_Jump Hook Shot,type_Jump Shot,type_Layup,type_Layup Shot,type_Pullup Jump shot,type_Reverse Dunk Shot,type_Reverse Layup Shot,type_Running Bank shot,type_Running Hook Shot,type_Running Jump Shot,type_Running Layup Shot,type_Slam Dunk Shot,type_Step Back Jump shot,type_Tip Shot,type_Turnaround Bank shot,type_Turnaround Fadeaway shot,type_Turnaround Jump Shot,home_False,home_True,shot_zone_area_Back Court(BC),shot_zone_area_Center(C),shot_zone_area_Left Side Center(LC),shot_zone_area_Left Side(L),shot_zone_area_Right Side Center(RC),shot_zone_area_Right Side(R),shot_zone_basic_Above the Break 3,shot_zone_basic_Backcourt,shot_zone_basic_In The Paint (Non-RA),shot_zone_basic_Left Corner 3,shot_zone_basic_Mid-Range,shot_zone_basic_Restricted Area,shot_zone_basic_Right Corner 3,season_1996-97,season_1997-98,season_1998-99,season_1999-00,season_2000-01,season_2001-02,season_2002-03,season_2003-04,season_2004-05,season_2005-06,season_2006-07,season_2007-08,season_2008-09,season_2009-10,season_2010-11,season_2011-12,season_2012-13,season_2013-14,season_2014-15,season_2015-16,shot_type_2PT Field Goal,shot_type_3PT Field Goal,playoffs_0,playoffs_1,shot_zone_range_16-24 ft.,shot_zone_range_24+ ft.,shot_zone_range_8-16 ft.,shot_zone_range_Back Court Shot,shot_zone_range_Less Than 8 ft.,period_1,period_2,period_3,period_4,period_5,period_6,period_7
1,1.443507,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0
2,0.689322,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
3,0.434725,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
4,0.276202,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0
5,1.203321,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0


In [299]:
# Report (rows, columns) for the full matrix, then the training and
# prediction partitions, one tuple per line.
for frame in (X_all, X, X_test):
    print(frame.shape)

(30697, 91)
(25697, 90)
(5000, 90)


In [300]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Alternative estimators that were tried by hand (kept for reference):
# model = XGBClassifier(max_depth=7, learning_rate=0.012, n_estimators=1000, subsample=0.62, colsample_bytree=0.6, seed=1)
# model = LogisticRegression()
# model = RandomForestClassifier()

# Tune the logistic regression with 5-fold cross-validation.
# * solver='liblinear' is pinned because the grid includes the 'l1' penalty,
#   which not every solver supports; it is also the solver shown in the
#   recorded output of this cell.
# * 'neg_log_loss' is the supported name of the log-loss scorer; the legacy
#   'log_loss' alias is rejected by current scikit-learn. Scores are negated
#   log loss, so values closer to zero are better.
# NOTE(review): C=0.1 is absent from the grid -- confirm this is intentional.
lr_grid = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=1),
    param_grid={
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 1, 10, 100, 1000],
    },
    cv=5,
    scoring='neg_log_loss',
    n_jobs=-1,
)

lr_grid.fit(X, y)

print(lr_grid.best_score_)
print(lr_grid.best_params_)

# GridSearchCV refits the best configuration on all of X, y by default
# (refit=True), so best_estimator_ is already trained -- no second fit needed.
model = lr_grid.best_estimator_

# Accuracy-based evaluation used for the hand-tried models above:
# cv_score = cross_val_score(model, X, y, cv = 5)
# mean_cv_score = np.mean(cv_score)
# training_score = model.score(X, y)

# print("Mean CV score:\t\t", mean_cv_score)
# print("Training set score:\t", training_score)

# Display the fitted estimator as the cell result.
model

-0.623479914011
{'C': 1, 'penalty': 'l2'}


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Log loss: -0.656206901113

Best params: {'C': 0.01, 'penalty': 'l1'}

| LogisticRegression||
|--------------------|----------------|
| Mean CV score      | 0.603805710963 |
| Training set score | 0.617270498502 |

| XGBClassifier||
|--------------------|----------------|
| Mean CV score      | 0.587306513175 |
| Training set score | 0.675993306612 |

| RandomForestClassifier||
|--------------------|----------------|
| Mean CV score      | 0.563879064724 |
| Training set score | 0.957037786512 |

In [301]:
# Class-probability estimates for the rows in X_test, one row per input row.
predictions = model.predict_proba(X_test)

In [302]:
# Keep only the second probability column for the submission.
# NOTE(review): presumably this is P(shot_made_flag == 1); the column order
# should follow model.classes_ -- confirm.
submit_pred = predictions[:, 1]

In [303]:
# Shot ids of the rows whose `shot_made_flag` is missing, as a plain array.
data.loc[data["shot_made_flag"].isnull(), "shot_id"].values

array([    1,     8,    17, ..., 30683, 30687, 30694], dtype=int64)

In [304]:
# Ids of the unlabelled shots, selected with a single .loc lookup; they are
# paired positionally with the predicted probabilities in `submit_pred`.
unlabelled_rows = data["shot_made_flag"].isnull()
data_ids = data.loc[unlabelled_rows, "shot_id"]

submission = pd.DataFrame(
    {
        "shot_id": data_ids,
        "shot_made_flag": submit_pred,
    }
)

# Write the two-column submission file without the DataFrame index.
submission.to_csv("_dave_submit.csv", index=False)