<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the neural network to predict kobe's shots.

## To Do from Trello
- [x] Implementeren van cross validation.
- [ ] Connecten van nieuwe cross validation module met de nn model module.
- [ ] Bouwen van verschillende netwerken (vorm, aantal nodes etc.)
- [ ] Kijken welke loss function we moeten gebruiken, cross entropy vs log loss. Log loss sowieso proberen om te vergelijken met competition entries.
- [ ] Implementeren van model export functie.

In [1]:
import numpy as np
import csv
import pandas as pd
from scipy import stats

from keras.models import Sequential
from keras.layers import Dense
import keras
from sklearn import preprocessing

Using TensorFlow backend.


## Load the data

In [5]:
df = pd.read_csv('../../data/data.csv')

## Data preprocessing

The following cell combines the two columns `minutes_remaining` and `seconds_remaining` into the new column `time_remaining`.

In [7]:
# Combine minutes and seconds remaining into decimal minutes remaining, e.g. 6.5 for 6 mins and 30 secs.
df['time_remaining'] = round(df['minutes_remaining'] + (df['seconds_remaining'] / 60), 2)
df[['time_remaining']]

Unnamed: 0,time_remaining
0,10.45
1,10.37
2,7.75
3,6.87
4,6.32
...,...
30692,6.08
30693,6.08
30694,3.47
30695,2.17


### Convert the data types of the columns to categoricals when relevant

In [8]:
# Categorize all columns based on their data type
categoricals = [
    'action_type',
    'combined_shot_type',
    'game_event_id', # Meaning?
    'game_id',
    'season',
    'shot_type',
    'shot_zone_area',
    'shot_zone_basic',
    'shot_zone_range',
    'team_id',
    'team_name',
    'matchup',
    'opponent'
]

temporal = [
    'game_date'
]

remaining = [
    'lat',
    'loc_x',
    'loc_y',
    'lon',
    'period',
    'shot_distance',
    'time_remaining',
    'shot_made_flag'  # y label
]

perhaps_to_exclude = [
    'shot_id',            # Just an auto-increment id, does not mean anything
    'minutes_remaining',  # Not needed, since we use the engineered field 'time_remaining'
    'seconds_remaining'   # Not needed, since we use the engineered field 'time_remaining'
]

# Convert to categorical
df[categoricals] = df[categoricals].astype('category')


## Encode categoricals to integers

In [9]:
cat_cols = df[categoricals]
cat_cols

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,season,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,matchup,opponent
0,Jump Shot,Jump Shot,10,20000012,2000-01,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,LAL @ POR,POR
1,Jump Shot,Jump Shot,12,20000012,2000-01,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,LAL @ POR,POR
2,Jump Shot,Jump Shot,35,20000012,2000-01,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,LAL @ POR,POR
3,Jump Shot,Jump Shot,43,20000012,2000-01,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,LAL @ POR,POR
4,Driving Dunk Shot,Dunk,155,20000012,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL @ POR,POR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30692,Jump Shot,Jump Shot,397,49900088,1999-00,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. IND,IND
30693,Tip Shot,Tip Shot,398,49900088,1999-00,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. IND,IND
30694,Running Jump Shot,Jump Shot,426,49900088,1999-00,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,LAL vs. IND,IND
30695,Jump Shot,Jump Shot,448,49900088,1999-00,3PT Field Goal,Center(C),Above the Break 3,24+ ft.,1610612747,Los Angeles Lakers,LAL vs. IND,IND


In [10]:
cat_cols[cat_cols['action_type'] == 'Alley Oop Dunk Shot']

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,season,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,matchup,opponent
144,Alley Oop Dunk Shot,Dunk,77,20000108,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. DEN,DEN
181,Alley Oop Dunk Shot,Dunk,328,20000124,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL @ SAC,SAC
194,Alley Oop Dunk Shot,Dunk,73,20000140,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL @ DEN,DEN
242,Alley Oop Dunk Shot,Dunk,121,20000168,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. GSW,GSW
244,Alley Oop Dunk Shot,Dunk,192,20000168,2000-01,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. GSW,GSW
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28415,Alley Oop Dunk Shot,Dunk,379,40700402,2007-08,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL @ BOS,BOS
28830,Alley Oop Dunk Shot,Dunk,371,40800313,2008-09,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL @ DEN,DEN
28868,Alley Oop Dunk Shot,Dunk,309,40800315,2008-09,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. DEN,DEN
29395,Alley Oop Dunk Shot,Dunk,382,40900401,2009-10,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,LAL vs. BOS,BOS


In [11]:
def one_hot_encode(df_with_only_categoricals):
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(df_with_only_categoricals)
    result = encoder.transform(df_with_only_categoricals).toarray()
    return result

one_hot_encoded_df = pd.DataFrame(one_hot_encode(cat_cols))

In [12]:
remaining_df = df[remaining]

In [13]:
preprocessed_df = pd.concat([one_hot_encoded_df, remaining_df], axis=1)
# preprocessed_df[0]
# encoder[144]  # Verify that row 144 has a 1 in the first col

Set input dimensions equal to the number of columns in the training data

## Train/validation/test split

Do this only after data preprocessing

Perhaps use:

```python
sklearn.model_selection.train_test_split(*arrays, **options)
```

See https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

https://www.kaggle.com/c/kobe-bryant-shot-selection/leaderboard

In [16]:
df = preprocessed_df

In [None]:
# Filter out the rows that do not have a 'shot_made_flag', i.e. the test set with 5000 rows
test_set = df[df['shot_made_flag'].isnull()]
test_set.shape

(5000, 2399)

In [None]:
# Obtain the rows with a 'shot_made_flag', i.e. the train set
train_set = df[~df['shot_made_flag'].isnull()]
train_set.shape

(25697, 2399)

In [None]:
def split_train_test(df, split_float = 0.8):
    random_sample = np.random.rand(len(df)) < split_float
    train = df[random_sample]
    test = df[~random_sample]
    return train, test

train, test = split_train_test(train_set, 0.8)

In [None]:
print(train.shape)
print(test.shape)

(20488, 2399)
(5209, 2399)


In [None]:
input_dim = len(train[0])
print(input_dim)

20488


In [None]:
# Dummy data
# x_train = np.array([[1,2,3], [1,2,2], [1,1,3]])
# y_train = np.array([1, 0, 1])
# input_dim = len(x_train.columns)

In [None]:
def get_x_y_input(data):
    """Returns the input data X, the target output label Y and the number of 
    dimensions in the input
    """
    Y = train['shot_made_flag']
    X = train.drop(columns=['shot_made_flag'])
    input_dim = X.shape[1]  # number of columns
    return X, Y, input_dim

In [None]:
# The input data (x_train) and the label data used for training the model
x_train, y_train, input_dim = get_x_y_input(train)

## New split module

In [17]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

from multiple_train_test_splits import MultipleTrainTestSplits

SyntaxError: invalid syntax (multiple_train_test_splits.py, line 18)

In [8]:
mtts = MultipleTrainTestSplits(csv_path='../../data/data.csv')

test_set = mtts.test_set
for train_set, validation_set in mtts.train_validation_split():
    # Still need to exclude the label (Y/shot_made_flag)
    train_set = preprocess(train_set)
    validation_set = preprocess(validation_set)

[['Jump Shot' 'Jump Shot' 12 ... 'POR' 2 0]
 ['Jump Shot' 'Jump Shot' 35 ... 'POR' 3 1]
 ['Jump Shot' 'Jump Shot' 43 ... 'POR' 4 2]
 ...
 ['Layup Shot' 'Layup' 98 ... 'MEM' 6120 5138]
 ['Jump Shot' 'Jump Shot' 108 ... 'MEM' 6121 5139]
 ['Jump Shot' 'Jump Shot' 115 ... 'MEM' 6122 5140]]
[['Jump Shot' 'Jump Shot' 12 ... 'POR' 2 0]
 ['Jump Shot' 'Jump Shot' 35 ... 'POR' 3 1]
 ['Jump Shot' 'Jump Shot' 43 ... 'POR' 4 2]
 ...
 ['Jump Shot' 'Jump Shot' 414 ... 'DEN' 12271 10277]
 ['Layup Shot' 'Layup' 21 ... 'SAS' 12272 10278]
 ['Layup Shot' 'Layup' 39 ... 'SAS' 12273 10279]]
[['Jump Shot' 'Jump Shot' 12 ... 'POR' 2 0]
 ['Jump Shot' 'Jump Shot' 35 ... 'POR' 3 1]
 ['Jump Shot' 'Jump Shot' 43 ... 'POR' 4 2]
 ...
 ['Jump Shot' 'Jump Shot' 368 ... 'CLE' 18387 15416]
 ['Pullup Jump shot' 'Jump Shot' 371 ... 'CLE' 18388 15417]
 ['Jump Shot' 'Jump Shot' 443 ... 'CLE' 18389 15418]]


## Create model
The model works on dummy data, see above

In [None]:
model = Sequential()

model.add(Dense(units=64, activation='relu', input_dim=input_dim))
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x142aa9990>

In [None]:
# # Dummy
# x_validation = np.array([[1,2,3], [1,2,2], [1,1,3], [2,3,2], [2,1,3]])
# y_validation = np.array([1, 0, 1, 1, 1])

In [None]:
x_validation, y_validation, input_dim = get_x_y_input(test) 

In [None]:
loss_and_metrics = model.evaluate(x_validation, y_validation, batch_size=128)



In [None]:
loss_and_metrics

NameError: name 'loss_and_metrics' is not defined

In [None]:
classes = model.predict(x_validation, batch_size=128)
classes

array([[0.52622545],
       [0.26044014],
       [0.9890055 ],
       ...,
       [0.6824401 ],
       [0.39076325],
       [0.05106422]], dtype=float32)