<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the neural network used to predict Kobe's shots.

## To Do from Trello
- [x] Implementeren van cross validation.
- [ ] Connecten van nieuwe cross validation module met de nn model module.
- [ ] Bouwen van verschillende netwerken (vorm, aantal nodes etc.)
- [ ] Kijken welke loss function we moeten gebruiken, cross entropy vs log loss. Log loss sowieso proberen om te vergelijken met competition entries.
- [ ] Implementeren van model export functie.

In [2]:
# When using this notebook in Google Colab, clone the repo in the file system in
# order to use the python modules from the repo.
!git  clone https://github.com/misharigot/kobe.git

Cloning into 'kobe'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects:   0% (1/178)[Kremote: Counting objects:   1% (2/178)[Kremote: Counting objects:   2% (4/178)[Kremote: Counting objects:   3% (6/178)[Kremote: Counting objects:   4% (8/178)[Kremote: Counting objects:   5% (9/178)[Kremote: Counting objects:   6% (11/178)[Kremote: Counting objects:   7% (13/178)[Kremote: Counting objects:   8% (15/178)[Kremote: Counting objects:   9% (17/178)[Kremote: Counting objects:  10% (18/178)[Kremote: Counting objects:  11% (20/178)[Kremote: Counting objects:  12% (22/178)[Kremote: Counting objects:  13% (24/178)[Kremote: Counting objects:  14% (25/178)[Kremote: Counting objects:  15% (27/178)[Kremote: Counting objects:  16% (29/178)[Kremote: Counting objects:  17% (31/178)[Kremote: Counting objects:  18% (33/178)[Kremote: Counting objects:  19% (34/178)[Kremote: Counting objects:  20% (36/178)[Kremote: Counting objects:  21% (38/17

In [3]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

# Use the line below in Colab
from kobe.src.multiple_train_test_splits import MultipleTrainTestSplits

# Use the line below in a local env
# from multiple_train_test_splits import MultipleTrainTestSplits

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing

Using TensorFlow backend.


In [0]:
def combine_time(df: pd.DataFrame) -> pd.DataFrame:
    """Combine the minutes and seconds remaining columns into one column.

    Works on a copy, so the frame passed in by the caller is left untouched.

    Args:
        df: Frame with 'minutes_remaining' and 'seconds_remaining' columns
            whose values can be cast to int.

    Returns:
        A new frame in which both columns are cast to int and a
        'time_remaining' column is added, holding decimal minutes rounded to
        two places, e.g. 6.5 for 6 mins and 30 secs.
    """
    # Copy first: assigning columns directly on the argument would silently
    # change the caller's frame.
    df = df.copy()
    df['minutes_remaining'] = df['minutes_remaining'].astype(int)
    df['seconds_remaining'] = df['seconds_remaining'].astype(int)

    # Combine minutes and seconds remaining into decimal minutes remaining.
    df['time_remaining'] = (df['minutes_remaining'] + df['seconds_remaining'] / 60).round(2)
    return df

In [0]:
def one_hot_encode(df: pd.DataFrame, encoder: preprocessing.OneHotEncoder = None) -> tuple:
    """One-hot encode all categorical columns.

    Optionally provide an encoder. Use the training set encoder to one-hot
    encode the test set.

    Args:
        df: Frame containing the categorical columns and the pass-through
            columns listed below (including the engineered 'time_remaining'
            column and the 'shot_made_flag' target). It is not modified.
        encoder: An already fitted OneHotEncoder. When None, a new encoder
            with handle_unknown='ignore' is created and fitted on df.

    Returns:
        A (resulting_df, encoder) tuple. resulting_df holds the one-hot
        encoded columns (labelled with integer positions) followed by the
        pass-through columns, and shares its index with df. encoder is the
        encoder that was used for the transformation.
    """
    # Columns that are treated as categories and one-hot encoded.
    categorical_columns = [
        'action_type',
        'combined_shot_type',
        'game_event_id',  # Meaning unclear
        'game_id',
        'season',
        'shot_type',
        'shot_zone_area',
        'shot_zone_basic',
        'shot_zone_range',
        'team_id',
        'team_name',
        'matchup',
        'opponent'
    ]

    # Columns that are copied to the result unchanged.
    remaining_columns = [
        'lat',
        'loc_x',
        'loc_y',
        'lon',
        'period',
        'shot_distance',
        'time_remaining',
        'shot_made_flag'  # y label
    ]

    # Columns that are deliberately left out of the result:
    #   'game_date'          - temporal column, not used
    #   'shot_id'            - just an auto-increment id, does not mean anything
    #   'minutes_remaining'  - replaced by the engineered field 'time_remaining'
    #   'seconds_remaining'  - replaced by the engineered field 'time_remaining'

    # Convert on a separate frame so the caller's df keeps its original dtypes.
    categorical_df = df[categorical_columns].astype('category')

    # Fit a new encoder only when the caller did not supply one.
    if encoder is None:
        encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
        encoder.fit(categorical_df)

    # Reuse df's index: with a default 0..n-1 index the concat below would align
    # rows by label and produce NaN-filled, misaligned rows whenever df is not
    # indexed 0..n-1 (e.g. a subset of a larger frame).
    one_hot_encoded_df = pd.DataFrame(
        encoder.transform(categorical_df).toarray(),
        index=df.index,
    )

    # Combine the one hot encoded part of the df with the remaining df
    non_categorical_df = df[remaining_columns]
    resulting_df = pd.concat([one_hot_encoded_df, non_categorical_df], axis=1)
    return resulting_df, encoder

In [0]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with every column except the 'shot_made_flag' target.
    """
    return data.drop('shot_made_flag', axis=1)

def get_y(data: pd.DataFrame) -> pd.Series:
    """Return an independent copy of the 'shot_made_flag' target column.
    """
    target = data.loc[:, 'shot_made_flag']
    return target.copy()


In [0]:
def preprocess(data: pd.DataFrame, encoder: preprocessing.OneHotEncoder = None) -> tuple:
    """Preprocess the raw kobe data from Kaggle.

    Runs combine_time followed by one_hot_encode.
    Optionally provide an encoder. Use the training set encoder to one-hot
    encode the test set.

    Args:
        data: Raw data frame. It is handed to combine_time without copying, so
            any in-place changes combine_time makes are visible to the caller.
        encoder: Forwarded to one_hot_encode; None lets one_hot_encode decide
            which encoder to use.

    Returns:
        The (frame, encoder) pair returned by one_hot_encode.
    """
    df = combine_time(data)
    df, encoder = one_hot_encode(df, encoder)

    return df, encoder

In [0]:
def create_model_1(input_dim: int):
    """Simple one hidden layer network.

    Args:
        input_dim: Number of input features.

    Returns:
        A compiled Sequential model: Dense(32, relu) -> Dropout(0.5) ->
        Dense(1, sigmoid), using binary cross-entropy loss, the rmsprop
        optimizer and the accuracy metric.
    """
    model = Sequential()

    model.add(Dense(units=32, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    # The sigmoid unit is the prediction itself, so no Dropout may follow it:
    # during training it would zero half of the predictions and scale up the
    # rest, so the loss would be computed on corrupted outputs.
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def create_model_2(input_dim: int):
    """Build and compile a network with two hidden layers.

    The stack is Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid); `input_dim` is the number of input
    features expected by the first layer.
    """
    layer_stack = [
        Dense(units=64, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(units=32, activation='relu'),
        Dropout(0.5),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network


def create_model_3(input_dim: int):
    """Build and compile a network with one wide hidden layer.

    The hidden layer has half as many units as there are input features
    (`input_dim / 2`, truncated to an int), followed by Dropout(0.5) and a
    single sigmoid output unit.
    """
    hidden_units = int(input_dim / 2)
    layer_stack = [
        Dense(units=hidden_units, activation='relu', input_dim=input_dim),
        Dropout(0.5),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network


def get_models_dict():
    """Return a dict mapping model names to the functions that build them.

    Only 'model_3' is enabled; the other builders are kept as commented-out
    entries so they can be switched back on.
    """
    return {
        # 'model_1': create_model_1,
        # 'model_2': create_model_2,
        'model_3': create_model_3,
    }


In [13]:
# Train every model returned by get_models_dict() on every train/validation
# split and record the validation results per model and fold.

# Use in Colab
mtts = MultipleTrainTestSplits(csv_path='kobe/data/data.csv')

# Use in local
# mtts = MultipleTrainTestSplits(csv_path='../../data/data.csv')

# Test set exposed by the splitter; it is not referenced again in this cell.
test_set = mtts.test_set

# Maps '<model_name>_fold_<n>' to the value returned by model.evaluate for
# that fold.
loss_and_metrics = {}
models = get_models_dict()

# Loop over the models
for model_name, model_func in models.items():

    # Loop over the train/validation splits/folds
    n_fold = 0
    for train_set, validation_set in mtts.train_validation_split(as_dataframe=True):
        n_fold = n_fold + 1
        # Only the first half is an f-string; the {epoch}/{val_acc} placeholders
        # in the second half are left as-is for ModelCheckpoint to fill in when
        # it writes a file.
        checkpoint_path = f"{model_name}_fold_{n_fold}_weights-improvement" + "-{epoch:02d}-{val_acc:.2f}.hdf5"

        # Preprocess the training set
        preprocessed_train_set, one_hot_encoder = preprocess(train_set)
        # Split the features from the target
        x_train = get_x(preprocessed_train_set)
        y_train = get_y(preprocessed_train_set)

        # Preprocess the validation set (use the one hot encoder that was fit on the training set)
        preprocessed_validation_set, _ = preprocess(validation_set, encoder=one_hot_encoder)
        # Split the features from the target
        x_validation = get_x(preprocessed_validation_set)
        y_validation = get_y(preprocessed_validation_set)

        input_dim = x_train.shape[1]  # number of columns (dimensions for the input layer of the model)
        
        # A fresh model is built for every fold, so folds do not share weights.
        model = model_func(input_dim)

        # Create model checkpoint to be able to resume at a checkpoint when training crashes.
        # With save_best_only=True and mode='max' a file is written only when
        # the monitored 'val_acc' improves.
        checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
        callbacks_list = [checkpoint]

        # Fit the model
        model.fit(x_train, y_train, epochs=2, batch_size=10, 
                  validation_data=(x_validation, y_validation),
                  callbacks=callbacks_list, verbose=0)

        # Evaluates the in-memory model as it stands after the last epoch, which
        # is not necessarily the best checkpoint written above.
        # NOTE(review): with the compile settings of the create_model_*
        # functions the stored value is expected to be [loss, accuracy] - confirm.
        loss_and_metrics[f'{model_name}_fold_{n_fold}'] = (model.evaluate(x_validation, y_validation, batch_size=128))
    


Epoch 00001: val_acc improved from -inf to 0.52909, saving model to model_3_fold_1_weights-improvement-01-0.53.hdf5

Epoch 00002: val_acc improved from 0.52909 to 0.61374, saving model to model_3_fold_1_weights-improvement-02-0.61.hdf5

Epoch 00001: val_acc improved from -inf to 0.59720, saving model to model_3_fold_2_weights-improvement-01-0.60.hdf5

Epoch 00002: val_acc improved from 0.59720 to 0.60809, saving model to model_3_fold_2_weights-improvement-02-0.61.hdf5

Epoch 00001: val_acc improved from -inf to 0.61004, saving model to model_3_fold_3_weights-improvement-01-0.61.hdf5

Epoch 00002: val_acc did not improve from 0.61004


In [0]:
def print_average_metrics(loss_and_metrics):
    """Print the average accuracy over a collection of evaluation results.

    Args:
        loss_and_metrics: Either a dict whose values are result rows, or a
            plain sequence of result rows. Each row must be indexable and
            hold the accuracy at position 1 (e.g. [loss, accuracy]).

    Prints 'Average accuracy: <value>' rounded to four decimals, or a
    placeholder message when there are no rows to average.
    """
    # Iterating a dict yields its keys, so row[1] would be a character of the
    # key string instead of the accuracy; read the values explicitly.
    if isinstance(loss_and_metrics, dict):
        rows = loss_and_metrics.values()
    else:
        rows = loss_and_metrics

    # Get average accuracy
    accuracies = [row[1] for row in rows]
    if not accuracies:
        # Avoid a ZeroDivisionError when nothing has been evaluated yet.
        print('Average accuracy: n/a (no results)')
        return
    avg_accuracy = sum(accuracies) / len(accuracies)

    print('Average accuracy:', round(avg_accuracy, 4))


# Summarise the validation results collected per model and fold.
print_average_metrics(loss_and_metrics)

In [14]:
!ls -lsa

total 75216
    4 drwxr-xr-x 1 root root     4096 Mar 26 02:10 .
    4 drwxr-xr-x 1 root root     4096 Mar 26 02:02 ..
    4 drwxr-xr-x 1 root root     4096 Mar 24 16:59 .config
    4 drwxr-xr-x 7 root root     4096 Mar 26 02:04 kobe
  272 -rw-r--r-- 1 root root   277168 Mar 26 02:04 model_1_weights-improvement-01-0.45.hdf5
  352 -rw-r--r-- 1 root root   358328 Mar 26 02:04 model_1_weights-improvement-01-0.46.hdf5
  432 -rw-r--r-- 1 root root   442040 Mar 26 02:04 model_1_weights-improvement-01-0.47.hdf5
  700 -rw-r--r-- 1 root root   716464 Mar 26 02:05 model_2_weights-improvement-01-0.58.hdf5
  864 -rw-r--r-- 1 root root   883888 Mar 26 02:05 model_2_weights-improvement-01-0.61.hdf5
  700 -rw-r--r-- 1 root root   716464 Mar 26 02:05 model_2_weights-improvement-02-0.59.hdf5
  864 -rw-r--r-- 1 root root   883888 Mar 26 02:06 model_2_weights-improvement-02-0.65.hdf5
 3952 -rw-r--r-- 1 root root  4043104 Mar 26 02:09 model_3_fold_1_weights-improvement-01-0.53.hdf5
 3952 -rw-r--r-- 1 root

In [0]:
# `model` and `x_validation` are whatever the training loop left behind, i.e.
# the model and validation features of the last fold that was processed.
# The values are the network's sigmoid outputs (floats between 0 and 1, as in
# the output shown below), not hard 0/1 class labels.
classes = model.predict(x_validation, batch_size=128)
classes

array([[0.26074135],
       [0.2928418 ],
       [0.2698071 ],
       ...,
       [0.29890507],
       [0.9255194 ],
       [0.4461364 ]], dtype=float32)