## 3 Preprocessing

In [1]:
import pandas as pd

data = pd.read_csv("data.csv")
data = data[data.shot_made_flag.isnull()==False] # 같은 표현 : data = data.dropna()

In [2]:
X = data.loc[:,data.columns != 'shot_made_flag'].copy() # 'shot_made_flag'를 제외한 모든 columns를 선택
y = data.shot_made_flag.copy()

### 3-1 불필요한 feature 제거

In [3]:
for col in X.columns:
    print(col)

action_type
combined_shot_type
game_event_id
game_id
lat
loc_x
loc_y
lon
minutes_remaining
period
playoffs
season
seconds_remaining
shot_distance
shot_type
shot_zone_area
shot_zone_basic
shot_zone_range
team_id
team_name
game_date
matchup
opponent
shot_id


In [4]:
X.drop('game_id', axis=1, inplace=True) # Independent
X.drop('game_event_id', axis=1, inplace=True) # Independent

X.drop('lat', axis=1, inplace=True) # Correlated with loc_x
X.drop('lon', axis=1, inplace=True) # Correlated with loc_y

X.drop('team_id', axis=1, inplace=True) # Always one number
X.drop('team_name', axis=1, inplace=True) # Always LA Lakers

In [5]:
X.head()

Unnamed: 0,action_type,combined_shot_type,loc_x,loc_y,minutes_remaining,period,playoffs,season,seconds_remaining,shot_distance,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,game_date,matchup,opponent,shot_id
1,Jump Shot,Jump Shot,-157,0,10,1,0,2000-01,22,15,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,-101,135,7,1,0,2000-01,45,16,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,138,175,6,1,0,2000-01,52,22,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,0,0,6,2,0,2000-01,19,0,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,2000-10-31,LAL @ POR,POR,5
5,Jump Shot,Jump Shot,-145,-11,9,3,0,2000-01,32,14,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,2000-10-31,LAL @ POR,POR,6


## 3-2 Data Transformation

In [6]:
X.corr()

Unnamed: 0,loc_x,loc_y,minutes_remaining,period,playoffs,seconds_remaining,shot_distance,shot_id
loc_x,1.0,-0.017578,0.006624,-0.030059,-0.007751,0.001512,0.022307,-0.012453
loc_y,-0.017578,1.0,-0.077399,0.039737,0.000857,-0.057766,0.818124,0.033476
minutes_remaining,0.006624,-0.077399,1.0,-0.047021,0.009583,0.024232,-0.064159,-0.008251
period,-0.030059,0.039737,-0.047021,1.0,0.003905,0.007153,0.047311,-0.003357
playoffs,-0.007751,0.000857,0.009583,0.003905,1.0,-0.005951,-0.007751,0.612991
seconds_remaining,0.001512,-0.057766,0.024232,0.007153,-0.005951,1.0,-0.055875,-0.004833
shot_distance,0.022307,0.818124,-0.064159,0.047311,-0.007751,-0.055875,1.0,0.020464
shot_id,-0.012453,0.033476,-0.008251,-0.003357,0.612991,-0.004833,0.020464,1.0


In [7]:
# Remaining time
X['seconds_from_period_end'] = 60 * X['minutes_remaining'] + X['seconds_remaining']
X['last_5_sec_in_period'] = X['seconds_from_period_end'] < 5

X.drop('minutes_remaining', axis=1, inplace=True)
X.drop('seconds_remaining', axis=1, inplace=True)
X.drop('seconds_from_period_end', axis=1, inplace=True)

## Matchup - (away/home)
X['home_play'] = X['matchup'].str.contains('vs').astype('int')   #??? 어떻게 처리하는거지?
X.drop('matchup', axis=1, inplace=True)

# drop correlated featuers
X.drop('action_type', axis=1, inplace=True)
X.drop('shot_zone_basic', axis=1, inplace=True)
X.drop('season', axis=1, inplace=True)

# Game date
X['game_date'] = pd.to_datetime(X['game_date'])
X['game_year'] = X['game_date'].dt.year
X['game_month'] = X['game_date'].dt.month
X.drop('game_date', axis=1, inplace=True)

# Loc_x, and loc_y binning
X['loc_x'] = pd.cut(X['loc_x'], 25)     #? 25개로 분류해줌
# X['loc_y'] = pd.cut(X['loc_y'], 25)

# Replace 20 least common action types with value 'Other'
# rare_action_types = X['action_type'].value_counts().sort_values().index.values[:20]
# X.loc[X['action_type'].isin(rare_action_types), 'action_type'] = 'Other'

In [8]:
X.head()

Unnamed: 0,combined_shot_type,loc_x,loc_y,period,playoffs,shot_distance,shot_type,shot_zone_area,shot_zone_range,opponent,shot_id,last_5_sec_in_period,home_play,game_year,game_month
1,Jump Shot,"(-170.32, -150.4]",0,1,0,15,2PT Field Goal,Left Side(L),8-16 ft.,POR,2,False,0,2000,10
2,Jump Shot,"(-110.56, -90.64]",135,1,0,16,2PT Field Goal,Left Side Center(LC),16-24 ft.,POR,3,False,0,2000,10
3,Jump Shot,"(128.48, 148.4]",175,1,0,22,2PT Field Goal,Right Side Center(RC),16-24 ft.,POR,4,False,0,2000,10
4,Dunk,"(-10.96, 8.96]",0,2,0,0,2PT Field Goal,Center(C),Less Than 8 ft.,POR,5,False,0,2000,10
5,Jump Shot,"(-150.4, -130.48]",-11,3,0,14,2PT Field Goal,Left Side(L),8-16 ft.,POR,6,False,0,2000,10


In [9]:
# X 좌표만 의미. y 좌표는 거리로 커버 가능
X.drop('loc_y', axis=1, inplace=True)

In [10]:
X.drop('shot_zone_area', axis=1, inplace=True)
X.drop('shot_type', axis=1, inplace=True)
X.drop('shot_zone_range', axis=1, inplace=True)

In [11]:
X.drop('shot_id', axis=1, inplace=True)

In [12]:
X.head()

Unnamed: 0,combined_shot_type,loc_x,period,playoffs,shot_distance,opponent,last_5_sec_in_period,home_play,game_year,game_month
1,Jump Shot,"(-170.32, -150.4]",1,0,15,POR,False,0,2000,10
2,Jump Shot,"(-110.56, -90.64]",1,0,16,POR,False,0,2000,10
3,Jump Shot,"(128.48, 148.4]",1,0,22,POR,False,0,2000,10
4,Dunk,"(-10.96, 8.96]",2,0,0,POR,False,0,2000,10
5,Jump Shot,"(-150.4, -130.48]",3,0,14,POR,False,0,2000,10


In [13]:
X.columns

Index(['combined_shot_type', 'loc_x', 'period', 'playoffs', 'shot_distance',
       'opponent', 'last_5_sec_in_period', 'home_play', 'game_year',
       'game_month'],
      dtype='object')

In [14]:
X.corr()

Unnamed: 0,period,playoffs,shot_distance,last_5_sec_in_period,home_play,game_year,game_month
period,1.0,0.003905,0.047311,-0.022286,-0.020769,-0.01905,-0.018274
playoffs,0.003905,1.0,-0.007751,0.005305,-0.008559,-0.051543,-0.061754
shot_distance,0.047311,-0.007751,1.0,0.155063,-0.043032,0.113341,-0.012793
last_5_sec_in_period,-0.022286,0.005305,0.155063,1.0,-0.009046,-0.027351,-0.005117
home_play,-0.020769,-0.008559,-0.043032,-0.009046,1.0,-0.010225,0.038485
game_year,-0.01905,-0.051543,0.113341,-0.027351,-0.010225,1.0,-0.016996
game_month,-0.018274,-0.061754,-0.012793,-0.005117,0.038485,-0.016996,1.0


## 3-3 Categorical variable to dummies

In [15]:
categorial_cols = [
    'combined_shot_type', 'period', 'game_year','game_month', 'opponent', 'loc_x']

for cc in categorial_cols:
    dummies = pd.get_dummies(X[cc])
    dummies = dummies.add_prefix("{}-".format(cc))
    X.drop(cc, axis=1, inplace=True)
    X = X.join(dummies)

In [16]:
X.head()

Unnamed: 0,playoffs,shot_distance,last_5_sec_in_period,home_play,combined_shot_type-Bank Shot,combined_shot_type-Dunk,combined_shot_type-Hook Shot,combined_shot_type-Jump Shot,combined_shot_type-Layup,combined_shot_type-Tip Shot,...,"loc_x-(48.8, 68.72]","loc_x-(68.72, 88.64]","loc_x-(88.64, 108.56]","loc_x-(108.56, 128.48]","loc_x-(128.48, 148.4]","loc_x-(148.4, 168.32]","loc_x-(168.32, 188.24]","loc_x-(188.24, 208.16]","loc_x-(208.16, 228.08]","loc_x-(228.08, 248.0]"
1,0,15,False,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,16,False,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,22,False,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,False,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,14,False,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


- 총 207개 features

## 4 Model

### 4-1 Base model
- 가장 기본적인 모델, Logistic regression, LDA, KNN, Decision Tree, Naive Bayes, SVM을 사용하여 평가해보자

In [None]:
# 파라미터 서치
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator = rf,)

In [20]:
from sklearn.model_selection import cross_val_score

# setting parameters
seed = 7
processors=2   
kfold=5
num_instances=len(X)
# scoring='log_loss'    #accuracy 도 가능. 
scoring='accuracy'


In [18]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# base model를 활용하기 위한 package
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [21]:
# Prepare some basic models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

results = []
names = []
print('all features selected')
for name, model in models:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) # accuracy를 보려면 scoring을 지우면 default처리
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))   
print('\n')  

all features selected
LR: (0.592) +/- (0.022)
LDA: (0.594) +/- (0.022)
K-NN: (0.546) +/- (0.014)
CART: (0.531) +/- (0.012)
NB: (0.568) +/- (0.015)




In [None]:

# C가 클수록 weak regularization

#LR 파라미터 점검
penalty_set = ['l1', 'l2']
C_set = [0.1, 1, 10, 1e2, 1e3]

names_test=[]
models_test = []
results_test = []
for penalty in penalty_set:
    for C in C_set:
        models_test.append(('LR_{}_{}'.format(penalty,C), LogisticRegression(penalty=penalty, C=C, class_weight='balanced')))
        

#K-NN 파라미터 점검

weights_set = ['uniform', 'distance']
n_neighbors_set = [3, 5, 7, 9, 11]

for weights in weights_set:
    for n_neighbors in n_neighbors_set:
        models_test.append(('KNN_{}_{}'.format(weights, n_neighbors), KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)))


for name, model in models_test:
    cv_results = cross_val_score(model, X, y, cv=kfold, n_jobs=processors) 
    results_test.append(cv_results)
    names_test.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))   
print('\n')  

LR_l1_0.1: (0.585) +/- (0.017)
LR_l1_1: (0.578) +/- (0.027)
LR_l1_10: (0.576) +/- (0.027)
LR_l1_100.0: (0.575) +/- (0.027)
LR_l1_1000.0: (0.576) +/- (0.027)
LR_l2_0.1: (0.579) +/- (0.027)
LR_l2_1: (0.577) +/- (0.028)
LR_l2_10: (0.577) +/- (0.028)
LR_l2_100.0: (0.577) +/- (0.028)
LR_l2_1000.0: (0.577) +/- (0.028)
KNN_uniform_3: (0.540) +/- (0.018)
KNN_uniform_5: (0.546) +/- (0.014)
KNN_uniform_7: (0.552) +/- (0.014)
KNN_uniform_9: (0.559) +/- (0.013)
KNN_uniform_11: (0.562) +/- (0.012)
KNN_distance_3: (0.540) +/- (0.018)
KNN_distance_5: (0.546) +/- (0.014)


In [22]:
for name, model in models:
    print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
GaussianNB(priors=None)


In [None]:
corr = X.corr()
cut_off = 0.8

for col in corr.columns:
    tmp = (corr[col] < 1) & (corr[col] > cut_off)
    try:
        if corr[col][tmp.values].values:
            print(col, corr[col][tmp.values])
    except:
        print(corr[col][tmp.values].values)

In [None]:
corr

In [None]:
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})