This notebook is used to initially play around with keras and the kobe data set.

In [75]:
import numpy as np
import csv
import pandas as pd
from scipy import stats

from keras.models import Sequential
from keras.layers import Dense

## Train/validation/test split

In [100]:
data = pd.read_csv('../data/data.csv')

In [101]:
data['shot_made_flag']

0        NaN
1        0.0
2        1.0
3        0.0
4        1.0
        ... 
30692    0.0
30693    NaN
30694    1.0
30695    0.0
30696    0.0
Name: shot_made_flag, Length: 30697, dtype: float64

# Trying to split with Pandas

In [161]:
# Create train/test split with pandas, WIP
split_on = int(len(data) * .8)
print(split_on)

train = data.iloc[:split_on, :]
validation = data.iloc[split_on:, :]
print(len(train))
print(len(validation))
print(len(validation) + len(train))
print(len(data))


24557
24557
6140
30697
30697


In [125]:
data.dtypes

action_type            object
combined_shot_type     object
game_event_id           int64
game_id                 int64
lat                   float64
loc_x                   int64
loc_y                   int64
lon                   float64
minutes_remaining       int64
period                  int64
playoffs                int64
season                 object
seconds_remaining       int64
shot_distance           int64
shot_made_flag        float64
shot_type              object
shot_zone_area         object
shot_zone_basic        object
shot_zone_range        object
team_id                 int64
team_name              object
game_date              object
matchup                object
opponent               object
shot_id                 int64
dtype: object

# Trying to split with numpy

In [77]:
with open('../data/data.csv', 'r') as f:
    data = list(csv.reader(f, delimiter=","))
data = np.array(data)

header = data[0]
data = data[1:]

In [78]:
# Extract the 5000 rows that have a missing label, i.e. the test set

def split(arr, cond):
    return [arr[cond], arr[~cond]]

split_data = split(data, data[:, 14] != '')

non_test = split_data[0]
test = split_data[1]

print('length of non_test set:', len(non_test))
print('length of test set:', len(test))

length of non_test set: 25697
length of test set: 5000


# TODO: To avoid leakage: method should only train on events that occurred prior to the shot for which you are predicting!

### Avoid leakage
For more info on leakage, see:

https://www.kaggle.com/dansbecker/data-leakage

In [79]:
# Split the non_test set into a training and a validation set

def train_test_split(data, test_percentage: float):
    if test_percentage < 0 or test_percentage > 1:
        raise ValueError('argument test_percentage must be a float between 0 and 1')

    data_len = len(data)
    test_size = int(data_len * test_percentage)
    
    return data[:test_size,:], data[test_size:, :]

train, validation = train_test_split(data, 0.2)

print('length of train set:', len(train))
print('length of validation set:', len(validation))

print(train[0:10])

length of train set: 6139
length of validation set: 24558
[['Jump Shot' 'Jump Shot' '10' '20000012' '33.9723' '167' '72'
  '-118.1028' '10' '1' '0' '2000-01' '27' '18' '' '2PT Field Goal'
  'Right Side(R)' 'Mid-Range' '16-24 ft.' '1610612747'
  'Los Angeles Lakers' '2000-10-31' 'LAL @ POR' 'POR' '1']
 ['Jump Shot' 'Jump Shot' '12' '20000012' '34.0443' '-157' '0'
  '-118.4268' '10' '1' '0' '2000-01' '22' '15' '0' '2PT Field Goal'
  'Left Side(L)' 'Mid-Range' '8-16 ft.' '1610612747' 'Los Angeles Lakers'
  '2000-10-31' 'LAL @ POR' 'POR' '2']
 ['Jump Shot' 'Jump Shot' '35' '20000012' '33.9093' '-101' '135'
  '-118.3708' '7' '1' '0' '2000-01' '45' '16' '1' '2PT Field Goal'
  'Left Side Center(LC)' 'Mid-Range' '16-24 ft.' '1610612747'
  'Los Angeles Lakers' '2000-10-31' 'LAL @ POR' 'POR' '3']
 ['Jump Shot' 'Jump Shot' '43' '20000012' '33.8693' '138' '175'
  '-118.1318' '6' '1' '0' '2000-01' '52' '22' '0' '2PT Field Goal'
  'Right Side Center(RC)' 'Mid-Range' '16-24 ft.' '1610612747'
  'Los A

# TODO: Encode categoricals to integers

In [98]:
from sklearn.preprocessing import LabelEncoder  

le = LabelEncoder()
x = le.fit_transform(train[0])

Set input dimensions equal to the number of columns in the training data

In [90]:
input_dim = len(train[0])
print(input_dim)

25


# TODO: Needs actual data from the kobe data set.

In [160]:
# Dummy data
x_train = np.array([[1,2,3], [1,2,2], [1,1,3]])
y_train = np.array([1, 0, 1])
input_dim = len(x_train[0])

## Create model
The model works on dummy data, see above

In [135]:
model = Sequential()

model.add(Dense(units=64, activation='relu', input_dim=input_dim))
model.add(Dense(units=1, activation='sigmoid'))

In [136]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [137]:
# x_train and y_train are Numpy arrays --just like in the Scikit-Learn API.
model.fit(x_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x143835510>

In [156]:
x_validation = np.array([[1,2,3], [1,2,2], [1,1,3], [2,3,2], [2,1,3]])
y_validation = np.array([1, 0, 1, 1, 1])

In [157]:
loss_and_metrics = model.evaluate(x_validation, y_validation, batch_size=128)



In [158]:
loss_and_metrics

[0.7115203142166138, 0.4000000059604645]

In [159]:
classes = model.predict(x_validation, batch_size=128)
classes

array([[0.50366366],
       [0.48296374],
       [0.49760363],
       [0.47506434],
       [0.46307984]], dtype=float32)