<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains the neural network to predict kobe's shots.

## To Do from Trello
- [x] Implementeren van cross validation.
- [ ] Connecten van nieuwe cross validation module met de nn model module.
- [ ] Bouwen van verschillende netwerken (vorm, aantal nodes etc.)
- [ ] Kijken welke loss function we moeten gebruiken, cross entropy vs log loss. Log loss sowieso proberen om te vergelijken met competition entries.
- [ ] Implementeren van model export functie.

In [1]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

from multiple_train_test_splits import MultipleTrainTestSplits

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from sklearn import preprocessing

Using TensorFlow backend.


In [22]:
def combine_time(df: pd.DataFrame) -> pd.DataFrame:
    """Combine the minutes and seconds remaining columns into one column.
    """
    df['minutes_remaining'] = df['minutes_remaining'].astype(int)
    df['seconds_remaining'] = df['seconds_remaining'].astype(int)

    # Combine minutes and seconds remaining into decimal minutes remaining, e.g. 6.5 for 6 mins and 30 secs.
    df['time_remaining'] = round(df['minutes_remaining'] + (df['seconds_remaining'] / 60), 2)
    return df

In [48]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode all categorical columns.
    """
     # Categorize all columns based on their data type
    categorical_columns = [
        'action_type',
        'combined_shot_type',
        'game_event_id', # Meaning?
        'game_id',
        'season',
        'shot_type',
        'shot_zone_area',
        'shot_zone_basic',
        'shot_zone_range',
        'team_id',
        'team_name',
        'matchup',
        'opponent'
    ]

    temporal_columns = [
        'game_date'
    ]

    remaining_columns = [
        'lat',
        'loc_x',
        'loc_y',
        'lon',
        'period',
        'shot_distance',
        'time_remaining',
        'shot_made_flag'  # y label
    ]

    excluded_columns = [
        'shot_id',            # Just an auto-increment id, does not mean anything
        'minutes_remaining',  # Not needed, since we use the engineered field 'time_remaining'
        'seconds_remaining'   # Not needed, since we use the engineered field 'time_remaining'
    ]

    # Convert relevant columns to categorical columns
    df[categorical_columns] = df[categorical_columns].astype('category')
    df_with_only_categoricals = df[categorical_columns]
    
    print(df_with_only_categoricals.describe())
    print('#######################################################\n####')
    
    # One hot encode categorical columns
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(df_with_only_categoricals)
    one_hot_encoded_df = pd.DataFrame(encoder.transform(df_with_only_categoricals).toarray())
    

    # Combine the one hot encoded part of the df with the remaining df
    non_categorical_df = df[remaining_columns]
    resulting_df = pd.concat([one_hot_encoded_df, non_categorical_df], axis=1)
    return resulting_df

In [49]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Returns the features.
    """
    temp = data.copy()
    X = temp.drop(columns=['shot_made_flag'])
    return X

def get_y(data: pd.DataFrame) -> pd.Series:
    """Returns the target.
    """
    return data['shot_made_flag'].copy()


In [50]:
def preprocess(data: pd.DataFrame) -> np.array:
    """Preprocess the raw kobe data from Kaggle.
    """
    df = combine_time(data)
    df = one_hot_encode(df)

    return df

In [51]:
def create_model(input_dim: int):
    model = Sequential()

    model.add(Dense(units=64, activation='relu', input_dim=input_dim))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


In [56]:
mtts = MultipleTrainTestSplits(csv_path='../../data/data.csv')

test_set = mtts.test_set

loss_and_metrics = []

for train_set, validation_set in mtts.train_validation_split(as_dataframe=True):
    # Preprocess the training set
    preprocessed_train_set = preprocess(train_set)
    # Split the features from the target
    x_train = get_x(preprocessed_train_set)
    y_train = get_y(preprocessed_train_set)

    # Preprocess the validation set
    preprocessed_validation_set = preprocess(validation_set)
    # Split the features from the target
    x_validation = get_x(preprocessed_validation_set)
    y_validation = get_y(preprocessed_validation_set)

    input_dim = x_train.shape[1]  # number of columns (dimensions for the input layer of the model)
    
    model = create_model(input_dim=input_dim)
    model.fit(x_train, y_train, epochs=2, batch_size=32)

    loss_and_metrics.append(model.evaluate(x_validation, y_validation, batch_size=128))


       action_type combined_shot_type  game_event_id   game_id   season  \
count         5141               5141           5141      5141     5141   
unique          26                  5            557       291        4   
top      Jump Shot          Jump Shot            313  20200069  2002-03   
freq          3235               3817             22        39     1598   

             shot_type shot_zone_area shot_zone_basic  shot_zone_range  \
count             5141           5141            5141             5141   
unique               2              6               7                5   
top     2PT Field Goal      Center(C)       Mid-Range  Less Than 8 ft.   
freq              4422           2408            2300             1742   

           team_id           team_name      matchup opponent  
count         5141                5141         5141     5141  
unique           1                   1           60       30  
top     1610612747  Los Angeles Lakers  LAL vs. SAS      SAS  
f

ValueError: Error when checking input: expected dense_75_input to have shape (1002,) but got array with shape (1000,)

In [None]:
classes = model.predict(x_validation, batch_size=128)
classes

array([[0.52622545],
       [0.26044014],
       [0.9890055 ],
       ...,
       [0.6824401 ],
       [0.39076325],
       [0.05106422]], dtype=float32)