<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the neural networks used to predict whether Kobe Bryant's shots were made (the `shot_made_flag` target).

## To Do from Trello
- [x] Implement cross validation.
- [ ] Connect the new cross validation module to the nn model module.
- [ ] Build different networks (shape, number of nodes, etc.).
- [ ] Decide which loss function to use, cross entropy vs log loss. Try log loss in any case so the results can be compared with competition entries.
- [ ] Implement a model export function.

In [2]:
# Colab only: clone the repository into the runtime's file system so that the
# project's Python modules (see the commented-out Colab import in the import
# cell) can be imported. Not needed in a local checkout.
!git  clone https://github.com/misharigot/kobe.git

Cloning into 'kobe'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (178/178), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 178 (delta 74), reused 126 (delta 34), pack-reused 0[K
Receiving objects: 100% (178/178), 1.36 MiB | 19.90 MiB/s, done.
Resolving deltas: 100% (74/74), done.


In [1]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

# Use the line below in Colab
# from kobe.src.multiple_train_test_splits import MultipleTrainTestSplits

# Use the line below in a local env
from multiple_train_test_splits import MultipleTrainTestSplits
from preprocessor import Preprocessor

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing

Using TensorFlow backend.


In [2]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Return the feature columns of `data`.

    The result is a new frame holding every column except the
    `shot_made_flag` target; `data` itself is left untouched.
    """
    return data.drop(columns=['shot_made_flag'])

def get_y(data: pd.DataFrame) -> pd.Series:
    """Return the target column of `data`.

    The `shot_made_flag` column is copied, so modifying the returned
    series does not write back into `data`.
    """
    target = data['shot_made_flag']
    return target.copy()


In [3]:
def create_model_1(input_dim: int):
    """Simple one hidden layer network.

    Architecture: Dense(32, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the 'rmsprop' optimizer and the
    accuracy metric.

    Args:
        input_dim: Number of input features (columns of the feature matrix).

    Returns:
        The compiled `Sequential` model.
    """
    model = Sequential()

    model.add(Dense(units=32, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    # The sigmoid unit is the output layer, so no Dropout may follow it:
    # dropping the prediction itself would zero half of the outputs (and
    # rescale the rest) during training, corrupting the loss.
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def create_model_2(input_dim: int):
    """Network with two hidden layers.

    Architecture: Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss, the 'rmsprop' optimizer and the accuracy metric.

    `input_dim` is the number of input features.
    """
    layer_stack = [
        Dense(units=64, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(units=32, activation='relu'),
        Dropout(0.5),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='binary_crossentropy',
                    optimizer='rmsprop',
                    metrics=['accuracy'])
    return network


def create_model_3(input_dim: int):
    """Network with one hidden layer whose width follows the input size.

    The hidden layer has `int(input_dim / 2)` relu units and is followed by
    Dropout(0.5) and a single sigmoid output unit. The model is compiled
    with binary cross-entropy loss, the 'rmsprop' optimizer and the accuracy
    metric.

    `input_dim` is the number of input features.
    """
    hidden_units = int(input_dim / 2)

    network = Sequential()
    for layer in (
        Dense(units=hidden_units, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(units=1, activation='sigmoid'),
    ):
        network.add(layer)

    network.compile(loss='binary_crossentropy',
                    optimizer='rmsprop',
                    metrics=['accuracy'])
    return network


def get_models_dict():
    """Return a fresh mapping from model name to its factory function.

    Each value is a callable that takes `input_dim` and returns a compiled
    model; the factories are not invoked here.
    """
    return {
        'model_1': create_model_1,
        'model_2': create_model_2,
        'model_3': create_model_3,
    }


In [4]:
# import datetime as dt
# first_recorded_game = str(dt.datetime.strptime(
#             min(pp.raw_data['game_date']), '%Y-%m-%d').strftime('%Y-%m-%d'))
# print(first_recorded_game)

In [5]:
# Location of the raw data CSV. It is passed to both MultipleTrainTestSplits
# and Preprocessor further down.

# Colab (path inside the cloned repository):
# csv_path = 'kobe/data/data.csv'

# Local environment (path relative to this notebook's directory):
csv_path = '../../data/data.csv'

In [22]:
mtts = MultipleTrainTestSplits(csv_path=csv_path)
pp = Preprocessor(path_to_raw_data=csv_path)

test_set = mtts.test_set

# Results are stored as loss_and_metrics[model_name][fold_number], holding
# whatever model.evaluate returns for that fold.
loss_and_metrics = {}
models = get_models_dict()

# Cross-validate every candidate architecture.
for model_name, model_func in models.items():
    loss_and_metrics[model_name] = {}

    # Walk over the train/validation folds; folds are numbered from 1.
    for n_fold, (train_set, validation_set) in enumerate(
            mtts.train_validation_split(as_dataframe=True), start=1):
        print(f'Training model: {model_name}, Fold: {n_fold}')
        # Only needed when the checkpoint callback below is re-enabled.
        checkpoint_path = f"{model_name}_fold_{n_fold}_weights-improvement" + "-{epoch:02d}-{val_acc:.2f}.hdf5"

        # Training fold: preprocess, then separate features from target.
        preprocessed_train_set = pp.preprocess(train_set)
        x_train = get_x(preprocessed_train_set)
        y_train = get_y(preprocessed_train_set)

        # Validation fold: same treatment.
        preprocessed_validation_set = pp.preprocess(validation_set)
        x_validation = get_x(preprocessed_validation_set)
        y_validation = get_y(preprocessed_validation_set)

        # The input layer needs one dimension per feature column.
        input_dim = x_train.shape[1]

        # A fresh, untrained model for every fold.
        model = model_func(input_dim)

        # Disabled: checkpointing to resume after a crash during training.
        # checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        # callbacks_list = [checkpoint]
        # model.fit(x_train, y_train, epochs=2, batch_size=10,
        #           validation_data=(x_validation, y_validation),
        #           callbacks=callbacks_list, verbose=0)
        model.fit(x_train, y_train, epochs=1, batch_size=128)

        loss_and_metrics[model_name][n_fold] = model.evaluate(x_validation, y_validation, batch_size=128)
    

Training model: model_1, Fold: 1
Epoch 1/1
Training model: model_1, Fold: 2
Epoch 1/1
Training model: model_1, Fold: 3
Epoch 1/1
Training model: model_2, Fold: 1
Epoch 1/1
Training model: model_2, Fold: 2
Epoch 1/1
Training model: model_2, Fold: 3
Epoch 1/1
Training model: model_3, Fold: 1
Epoch 1/1
Training model: model_3, Fold: 2
Epoch 1/1
Training model: model_3, Fold: 3
Epoch 1/1


In [24]:
# Display the per-model, per-fold results collected by the cross-validation
# loop (each entry is the value returned by model.evaluate for that fold).
loss_and_metrics

{'model_1': {1: [0.8631778114337034, 0.5641175508499146],
  2: [0.8561778308979197, 0.540182888507843],
  3: [1.7360578366443093, 0.5395991206169128]},
 'model_2': {1: [0.6847195672163079, 0.5765713453292847],
  2: [0.6894150146437795, 0.540182888507843],
  3: [0.6744304086290275, 0.5866900086402893]},
 'model_3': {1: [1.1515919211588612, 0.5641175508499146],
  2: [0.6852728027526357, 0.6337808966636658],
  3: [1.7372203638266441, 0.46040084958076477]}}

In [5]:
def print_average_metrics(loss_and_metrics):
    """Print the average of the second metric over the folds of each model.

    Args:
        loss_and_metrics: Mapping of model name to a mapping of fold number
            to that fold's result sequence. The element at index 1 of each
            result is averaged; in this notebook the results come from
            `model.evaluate` on models compiled with `metrics=['accuracy']`.

    For every model this prints the line 'folds', then
    'average for model <name>', then the average. A model without any folds
    prints `nan` instead of raising ZeroDivisionError.
    """
    for model_name, model_folds in loss_and_metrics.items():
        # Only the fold results matter here; the fold numbers are not used.
        fold_metrics = [fold[1] for fold in model_folds.values()]
        print('folds')
        print(f'average for model {model_name}')
        if fold_metrics:
            print(sum(fold_metrics) / len(fold_metrics))
        else:
            # Nothing to average: report "not a number" rather than crash.
            print(float('nan'))

# Requires `loss_and_metrics` from the cross-validation cell; running this on
# a fresh kernel without that cell raises NameError.
print_average_metrics(loss_and_metrics)

NameError: name 'loss_and_metrics' is not defined

In [2]:
# Debug helper: list the contents of the notebook's working directory.
!ls -lsa

total 248
  0 drwxr-xr-x  7 Misha  staff    224 Mar 28 21:21 [1m[36m.[m[m
  0 drwxr-xr-x  9 Misha  staff    288 Mar 28 17:35 [1m[36m..[m[m
  0 drwxr-xr-x  6 Misha  staff    192 Mar 26 23:41 [1m[36m.ipynb_checkpoints[m[m
 32 -rw-r--r--  1 Misha  staff  15084 Mar 28 14:22 decision_tree.ipynb
144 -rw-r--r--  1 Misha  staff  70555 Mar 26 23:41 knn_classifier.ipynb
 24 -rw-r--r--  1 Misha  staff   9549 Mar 28 20:08 knn_v2.ipynb
 48 -rw-r--r--  1 Misha  staff  23091 Mar 28 21:21 nn.ipynb


# Final test

In [6]:
# Rebuild the split helper and the preprocessor so this section does not rely
# on objects created by the cross-validation cell.
mtts = MultipleTrainTestSplits(csv_path=csv_path)
pp = Preprocessor(path_to_raw_data=csv_path)

# For the final run the model is fitted on the train+validation data and
# scored on the test set.
train_validation_set = mtts.train_validation_set
test_set = mtts.test_set

# Preprocess the combined training+validation set
preprocessed_train_validation_set = pp.preprocess(train_validation_set)
# Split the features from the target
x_train_val = get_x(preprocessed_train_validation_set)
y_train_val = get_y(preprocessed_train_validation_set)

# Preprocess the test set
# NOTE(review): the two sets are preprocessed in separate calls; confirm that
# Preprocessor yields identical feature columns for both.
preprocessed_test_set = pp.preprocess(test_set)
# Split the features from the target
x_test = get_x(preprocessed_test_set)
y_test = get_y(preprocessed_test_set)

In [7]:
input_dim = x_train_val.shape[1]  # number of columns (dimensions for the input layer of the model)

# Winning model here
# NOTE(review): create_model_1 is hard-coded as the winner; confirm this
# choice against the cross-validation averages before trusting the result.
model = create_model_1(input_dim)

# Fit on the combined train+validation data (a single epoch).
model.fit(x_train_val, y_train_val, epochs=1, batch_size=128)

# Score the fitted model on the test set.
final_loss_and_metrics = model.evaluate(x_test, y_test, batch_size=128)

Epoch 1/1


In [8]:
# Show the test-set result returned by model.evaluate in the previous cell.
print(final_loss_and_metrics)

[0.7411688035576927, 0.5829927921295166]
