In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw CSV from the local data directory into the working DataFrame `df`.
df = pd.read_csv('./data/nba_original.csv')

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(128069, 21)

In [4]:
# List the available columns before deciding which ones to keep.
df.columns

Index(['GAME_ID', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN', 'SHOT_NUMBER',
       'PERIOD', 'GAME_CLOCK', 'SHOT_CLOCK', 'DRIBBLES', 'TOUCH_TIME',
       'SHOT_DIST', 'PTS_TYPE', 'SHOT_RESULT', 'CLOSEST_DEFENDER',
       'CLOSEST_DEFENDER_PLAYER_ID', 'CLOSE_DEF_DIST', 'FGM', 'PTS',
       'player_name', 'player_id'],
      dtype='object')

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0:00,,3,...,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148


In [6]:
# Remove identifier, game-outcome and defender-identity columns, keeping only
# the shot-context features plus FGM.
# NOTE(review): SHOT_RESULT and PTS appear to restate FGM in the preview rows,
# so leaving them in would presumably leak the label -- confirm intent.
#
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0 (it raises TypeError there).
df = df.drop(columns=[
    'GAME_ID', 'MATCHUP', 'LOCATION', 'W',
    'FINAL_MARGIN', 'SHOT_NUMBER', 'SHOT_RESULT', 'CLOSEST_DEFENDER',
    'CLOSEST_DEFENDER_PLAYER_ID', 'PTS',
    'player_name', 'player_id',
])
df.head()

Unnamed: 0,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,FGM
0,1,1:09,10.8,2,1.9,7.7,2,1.3,1
1,1,0:14,3.4,0,0.8,28.2,3,6.1,0
2,1,0:00,,3,2.7,10.1,2,0.9,0
3,2,11:47,10.3,2,1.9,17.2,2,3.4,0
4,2,10:34,10.9,2,2.7,3.7,2,1.1,0


In [7]:
# Tally the null entries in every column and show the per-column totals.
null_counts = df.isnull().sum()
display(null_counts)

PERIOD               0
GAME_CLOCK           0
SHOT_CLOCK        5567
DRIBBLES             0
TOUCH_TIME           0
SHOT_DIST            0
PTS_TYPE             0
CLOSE_DEF_DIST       0
FGM                  0
dtype: int64

In [8]:
shape_before = df.shape
print("Original:", shape_before)

# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the dropna defaults, spelled out here).
df = df.dropna(axis=0, how='any')
shape_after = df.shape
print("Remove NAN:", shape_after)

Original: (128069, 9)
Remove NAN: (122502, 9)


In [9]:
# Turn each "minutes:seconds" game-clock string into a total number of seconds.
# Unpacking into exactly two names still fails loudly if a value does not
# contain exactly one ':' separator.
GAME_CLOCK_SEC = [
    int(minutes) * 60 + int(seconds)
    for minutes, seconds in (clock.split(':') for clock in df['GAME_CLOCK'])
]

In [10]:
# Overwrite the string GAME_CLOCK column with the per-row seconds in GAME_CLOCK_SEC.
# assign() lines a plain list up by position, so the list must have been built
# from this same frame (same rows, same order).
# Gotcha: after this cell GAME_CLOCK is numeric, so re-running the parsing cell
# without reloading the data will fail on `.split`.
df = df.assign(GAME_CLOCK=GAME_CLOCK_SEC)

In [11]:
# Check that GAME_CLOCK now holds seconds; gaps in the index are rows removed by dropna.
df.head()

Unnamed: 0,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,FGM
0,1,69,10.8,2,1.9,7.7,2,1.3,1
1,1,14,3.4,0,0.8,28.2,3,6.1,0
3,2,707,10.3,2,1.9,17.2,2,3.4,0
4,2,634,10.9,2,2.7,3.7,2,1.1,0
5,2,495,9.1,2,4.4,18.4,2,2.6,0


In [12]:
# Separate the frame into the feature matrix X and the FGM label column y.
#
# `columns=` replaces the positional axis argument (`drop('FGM', 1)`), which
# pandas deprecated in 1.3 and removed in 2.0 (it raises TypeError there).
X = df.drop(columns='FGM')

# Double brackets select FGM as a one-column DataFrame whose header is already
# 'FGM', so no pd.DataFrame wrapping or column renaming is needed.
y = df[['FGM']]

# Features and labels must describe the same rows.
assert X.shape[0] == y.shape[0]
X.head()

Unnamed: 0,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,1,69,10.8,2,1.9,7.7,2,1.3
1,1,14,3.4,0,0.8,28.2,3,6.1
3,2,707,10.3,2,1.9,17.2,2,3.4
4,2,634,10.9,2,2.7,3.7,2,1.1
5,2,495,9.1,2,4.4,18.4,2,2.6


In [13]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [14]:
# Each feature split must have exactly as many rows as its label split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    assert features.shape[0] == labels.shape[0]

In [17]:
# Write each split to its own UTF-8 CSV. index=False leaves the row labels out
# of the file, so a reloaded frame gets a fresh default index.
for frame, csv_path in (
    (X_train, './data/X_train.csv'),
    (y_train, './data/y_train.csv'),
    (X_test, './data/X_test.csv'),
    (y_test, './data/y_test.csv'),
):
    frame.to_csv(csv_path, encoding='utf-8', index=False)

In [18]:
# Read the saved training features back in to check the CSV round trip.
X_train_new = pd.read_csv('./data/X_train.csv')

In [19]:
# In-memory training features; the index still carries the original (shuffled) row labels.
X_train.head()

Unnamed: 0,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
83027,1,358,2.4,0,3.2,20.6,2,4.5
3198,1,585,8.3,0,1.2,3.0,2,0.5
90812,1,540,19.9,0,0.6,3.5,2,3.2
51737,1,392,9.0,0,0.9,21.1,2,4.9
112660,3,401,22.7,0,0.7,4.1,2,2.9


In [20]:
# Reloaded training features: values should match X_train row for row, but the
# index restarts at 0 because the CSV was written with index=False.
X_train_new.head()

Unnamed: 0,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST
0,1,358,2.4,0,3.2,20.6,2,4.5
1,1,585,8.3,0,1.2,3.0,2,0.5
2,1,540,19.9,0,0.6,3.5,2,3.2
3,1,392,9.0,0,0.9,21.1,2,4.9
4,3,401,22.7,0,0.7,4.1,2,2.9


In [21]:
# Read the saved training labels back in for the same round-trip check.
y_train_new = pd.read_csv('./data/y_train.csv')

In [22]:
# In-memory training labels, indexed by the original row labels.
y_train.head()

Unnamed: 0,FGM
83027,0
3198,1
90812,1
51737,1
112660,1


In [23]:
# Shape of the in-memory labels, for comparison with the reloaded copy.
y_train.shape

(85751, 1)

In [24]:
# Shape of the reloaded labels; it should equal y_train.shape.
y_train_new.shape

(85751, 1)

In [25]:
# Reloaded labels: same FGM values as y_train.head(), with a fresh 0-based index.
y_train_new.head()

Unnamed: 0,FGM
0,0
1,1
2,1
3,1
4,1
