<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the neural network to predict kobe's shots.

## To Do from Trello
- [x] Implementeren van cross validation.
- [ ] Connecten van nieuwe cross validation module met de nn model module.
- [ ] Bouwen van verschillende netwerken (vorm, aantal nodes etc.)
- [ ] Kijken welke loss function we moeten gebruiken, cross entropy vs log loss. Log loss sowieso proberen om te vergelijken met competition entries.
- [ ] Implementeren van model export functie.

In [1]:
# When using this notebook in Google Colab, clone the repo in the file system in
# order to use the python modules from the repo.
!git  clone https://github.com/misharigot/kobe.git
!ls -lsa

fatal: destination path 'kobe' already exists and is not an empty directory.
total 20
4 drwxr-xr-x 1 root root 4096 Mar 29 09:40 .
4 drwxr-xr-x 1 root root 4096 Mar 29 09:39 ..
4 drwxr-xr-x 1 root root 4096 Mar 25 16:11 .config
4 drwxr-xr-x 6 root root 4096 Mar 29 09:40 kobe
4 drwxr-xr-x 1 root root 4096 Mar 18 16:23 sample_data


In [0]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

# Use the line below in Colab
from kobe.src.multiple_train_test_splits import MultipleTrainTestSplits
from kobe.src.preprocessor import Preprocessor

# Use the line below in a local env
# from multiple_train_test_splits import MultipleTrainTestSplits
# from preprocessor import Preprocessor

In [19]:
# Because colab uses a different Keras version than we do.
!pip install keras=="2.3.1"

Collecting keras==2.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
[K     |████████████████████████████████| 378kB 1.4MB/s 
Installing collected packages: keras
  Found existing installation: Keras 2.2.5
    Uninstalling Keras-2.2.5:
      Successfully uninstalled Keras-2.2.5
Successfully installed keras-2.3.1


In [3]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing

Using TensorFlow backend.


In [0]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Returns the features.
    """
    X = data.drop(columns=['shot_made_flag'])
    return X

def get_y(data: pd.DataFrame) -> pd.Series:
    """Returns the target.
    """
    Y = data['shot_made_flag'].copy()
    return Y


In [0]:
def create_model_1(input_dim: int):
    """Simple one hidden layer network.
    """
    model = Sequential()
    model.add(Dense(units=32, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def create_model_2(input_dim: int):
    """2 hidden layers network.
    """
    model = Sequential()

    model.add(Dense(units=64, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def create_model_3(input_dim: int):
    """1 hidden layer network with a lot of neurons.
    """
    model = Sequential()

    model.add(Dense(units=int(input_dim/2), activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def create_model_4(input_dim: int):
    """2 hidden layers network with more neurons per layer.
    """
    model = Sequential()

    model.add(Dense(units=int(input_dim/2), activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=int(input_dim/4), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def create_model_5(input_dim: int):
    """3 hidden layers network.
    """
    model = Sequential()

    model.add(Dense(units=128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def create_model_6(input_dim: int):
    """Simple one hidden layer network with double the dim of model 1.
    """
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def get_models_dict():
    models = {}
    # models['model_1'] = create_model_1
    # models['model_2'] = create_model_2
    # models['model_3'] = create_model_3
    # models['model_4'] = create_model_4
    # models['model_5'] = create_model_5
    models['model_6'] = create_model_6

    return models


In [0]:
# Use in Colab
csv_path = 'kobe/data/data.csv'

# Use in local env
# csv_path = '../../data/data.csv'

In [27]:
mtts = MultipleTrainTestSplits(csv_path=csv_path)
pp = Preprocessor(path_to_raw_data=csv_path)

test_set = mtts.test_set

loss_and_metrics = {}
models = get_models_dict()

# Loop over the models
for model_name, model_func in models.items():
    # checkpoint_path = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
    
    loss_and_metrics[model_name] = {}
    
    # Loop over the train/validation splits/folds
    n_fold = 0
    for train_set, validation_set in mtts.train_validation_split(as_dataframe=True):
        n_fold += 1
        print(f'Training model: {model_name}, Fold: {n_fold}')
        checkpoint_path = f"{model_name}_fold_{n_fold}_weights-improvement" + "-{epoch:02d}-{val_acc:.2f}.hdf5"

        # Preprocess the training set
        preprocessed_train_set = pp.preprocess(train_set)
        # Split the features from the target
        x_train = get_x(preprocessed_train_set)
        y_train = get_y(preprocessed_train_set)

        # Preprocess the validation set
        preprocessed_validation_set= pp.preprocess(validation_set)
        # Split the features from the target
        x_validation = get_x(preprocessed_validation_set)
        y_validation = get_y(preprocessed_validation_set)

        input_dim = x_train.shape[1]  # number of columns (dimensions for the input layer of the model)
        
        # model = create_model(input_dim=input_dim)
        model = model_func(input_dim)

#         # Create model checkpoint to be able to resume at a checkpoint when training crashes.
#         checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#         callbacks_list = [checkpoint]

        # Fit the model
#         model.fit(x_train, y_train, epochs=2, batch_size=10, 
#                   validation_data=(x_validation, y_validation),
#                   callbacks=callbacks_list, verbose=0)
        model.fit(x_train, y_train, epochs=50, batch_size=128)
    
        loss_and_metrics[model_name][n_fold] = model.evaluate(x_validation, y_validation, batch_size=128)
    

Training model: model_6, Fold: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training model: model_6, Fold: 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoc

In [13]:
run1_metrics = loss_and_metrics
run1_metrics

{'model_1': {1: [0.7175507575163328, 0.5962249636650085],
  2: [0.7793352637547977, 0.5539988279342651],
  3: [0.7826963207396489, 0.5483556985855103]},
 'model_2': {1: [0.6633978218251941, 0.6063436269760132],
  2: [0.6620032916298217, 0.5866900086402893],
  3: [0.6733418847248466, 0.5670363903045654]},
 'model_3': {1: [0.6824695722535788, 0.6322241425514221],
  2: [0.7345550649243585, 0.5139132142066956],
  3: [0.7718580756746022, 0.4928974509239197]}}

In [28]:
loss_and_metrics

{'model_6': {1: [0.6284209717268794, 0.6763961911201477],
  2: [0.6330148733457858, 0.655964195728302],
  3: [0.6457746217259639, 0.6598560214042664]}}

In [29]:
def print_average_metrics(loss_and_metrics):
    for model_name, model_folds in loss_and_metrics.items():
        sum_list = []
        for i, fold in model_folds.items():
            sum_list.append(fold[1])
            'folds:'
        print(f'average for model {model_name}')
        print(sum(sum_list)/len(model_folds))

print_average_metrics(loss_and_metrics)

average for model model_6
0.6640721360842387


In [0]:
!ls -lsa

total 248
  0 drwxr-xr-x  7 Misha  staff    224 Mar 28 21:21 [1m[36m.[m[m
  0 drwxr-xr-x  9 Misha  staff    288 Mar 28 17:35 [1m[36m..[m[m
  0 drwxr-xr-x  6 Misha  staff    192 Mar 26 23:41 [1m[36m.ipynb_checkpoints[m[m
 32 -rw-r--r--  1 Misha  staff  15084 Mar 28 14:22 decision_tree.ipynb
144 -rw-r--r--  1 Misha  staff  70555 Mar 26 23:41 knn_classifier.ipynb
 24 -rw-r--r--  1 Misha  staff   9549 Mar 28 20:08 knn_v2.ipynb
 48 -rw-r--r--  1 Misha  staff  23091 Mar 28 21:21 nn.ipynb


In [0]:
# Final test

In [0]:
mtts = MultipleTrainTestSplits(csv_path=csv_path)
pp = Preprocessor(path_to_raw_data=csv_path)

train_validation_set = mtts.train_validation_set
test_set = mtts.test_set

# Preprocess the training+validation
preprocessed_train_validation_set = pp.preprocess(train_validation_set)
# Split the features from the target
x_train_val = get_x(preprocessed_train_validation_set)
y_train_val = get_y(preprocessed_train_validation_set)

# Preprocess the test set
preprocessed_test_set = pp.preprocess(test_set)
# Split the features from the target
x_test = get_x(preprocessed_test_set)
y_test = get_y(preprocessed_test_set)

In [0]:
input_dim = x_train_val.shape[1]  # number of columns (dimensions for the input layer of the model)

# Winning model here
model = create_model_1(input_dim)

model.fit(x_train_val, y_train_val, epochs=1, batch_size=128)

final_loss_and_metrics = model.evaluate(x_test, y_test, batch_size=128)

Epoch 1/1


In [0]:
print(final_loss_and_metrics)

[0.7411688035576927, 0.5829927921295166]
