<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

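This notebook trains and evaluates a model that predicts whether Kobe Bryant's shots are made. The model currently fitted below is a decision tree classifier, not a neural network.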

In [9]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

from multiple_train_test_splits import MultipleTrainTestSplits

import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from sklearn import preprocessing


import numpy as np
import csv
import pandas as pd
from scipy import stats

from keras.models import Sequential
from keras.layers import Dense
import keras
from sklearn import preprocessing
import tensorflow as ft

from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt

In [10]:
def combine_time(df: pd.DataFrame) -> pd.DataFrame:
    """Combine the minutes and seconds remaining columns into one column.

    The work is done on a copy, so the frame passed in by the caller is left
    untouched and re-running a cell with the same input gives the same result.

    Args:
        df: Frame holding 'minutes_remaining' and 'seconds_remaining' columns
            whose values can be cast to int.

    Returns:
        A new frame in which both columns are cast to int and a
        'time_remaining' column is added: decimal minutes remaining rounded
        to two places, e.g. 6.5 for 6 mins and 30 secs.

    Raises:
        KeyError: If either source column is missing.
    """
    result = df.copy()
    result['minutes_remaining'] = result['minutes_remaining'].astype(int)
    result['seconds_remaining'] = result['seconds_remaining'].astype(int)

    # Series.round is the explicit pandas form of what the built-in round() dispatches to.
    result['time_remaining'] = (
        result['minutes_remaining'] + result['seconds_remaining'] / 60
    ).round(2)
    return result

In [11]:
def one_hot_encode(
    df: pd.DataFrame, encoder: preprocessing.OneHotEncoder = None
) -> "tuple[pd.DataFrame, preprocessing.OneHotEncoder]":
    """One-hot encode all categorical columns.
    Optionally provide an encoder. Use the training set encoder to one-hot encode the test set.

    The input frame is not modified: the categorical cast is applied to a
    selection of `df`, not written back into it.

    Args:
        df: Frame containing every column listed in `categorical_columns` and
            `remaining_columns` below.
        encoder: An already fitted OneHotEncoder. When None, a new encoder
            (handle_unknown='ignore') is created and fitted on `df`.

    Returns:
        A tuple `(resulting_df, encoder)`: the dense one-hot columns (labelled
        with integers) followed by the `remaining_columns` of `df`, and the
        encoder that produced the one-hot columns.

    Raises:
        KeyError: If `df` lacks one of the expected columns.
    """
    # Columns that get one-hot encoded
    categorical_columns = [
        'action_type',
        'combined_shot_type',
        'game_event_id',  # Meaning?
        'game_id',
        'season',
        'shot_type',
        'shot_zone_area',
        'shot_zone_basic',
        'shot_zone_range',
        'team_id',
        'team_name',
        'matchup',
        'opponent'
    ]

    # Columns that are passed through unchanged
    remaining_columns = [
        'lat',
        'loc_x',
        'loc_y',
        'lon',
        'period',
        'shot_distance',
        'time_remaining',
        'shot_made_flag'  # y label
    ]

    # Columns that are deliberately left out of the result:
    #   'game_date'          - temporal column, not used
    #   'shot_id'            - Just an auto-increment id, does not mean anything
    #   'minutes_remaining'  - Not needed, since we use the engineered field 'time_remaining'
    #   'seconds_remaining'  - Not needed, since we use the engineered field 'time_remaining'

    # Cast the relevant columns to categorical on a fresh frame (the caller's df stays as it was)
    df_with_only_categoricals = df[categorical_columns].astype('category')

    # One hot encode categorical columns
    if encoder is None:
        encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
        encoder.fit(df_with_only_categoricals)

    # Reuse df's index: pd.concat(axis=1) aligns on the index, so a fresh 0..n-1 index here
    # would misalign rows (and introduce NaNs) whenever df does not carry a default index.
    one_hot_encoded_df = pd.DataFrame(
        encoder.transform(df_with_only_categoricals).toarray(),
        index=df.index,
    )

    # Combine the one hot encoded part of the df with the remaining df
    non_categorical_df = df[remaining_columns]
    resulting_df = pd.concat([one_hot_encoded_df, non_categorical_df], axis=1)
    return resulting_df, encoder

In [12]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Return the feature columns, i.e. `data` without the 'shot_made_flag' column.

    A new frame is returned; `data` itself keeps all of its columns.
    """
    target_column = 'shot_made_flag'
    return data.drop(target_column, axis=1)

def get_y(data: pd.DataFrame) -> pd.Series:
    """Return the target: an independent copy of the 'shot_made_flag' column.
    """
    target = data.loc[:, 'shot_made_flag']
    return target.copy()


In [13]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from mlxtend.plotting import plot_decision_regions

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# # Plot the decision boundary with the first 50 points in the test set
# numpy_x = train[['stature','span']].as_matrix()
# numpy_y = train.Gender.cat.codes.values

# # This is necessary if pandas read the CSV files as integers
# # (seems to depend on version/OS)
# numpy_x = numpy_x.astype(float)

# # Rebuild the classifier 
# # (a classifier trained on pandas data doesn't interoperate well with pure numpy data)
# tree = DecisionTreeClassifier()
# tree.fit(numpy_x, numpy_y)

# plot_decision_regions(numpy_x[:25, :], numpy_y[:25], clf=tree, res=0.1);


In [14]:
def preprocess(
    data: pd.DataFrame, encoder: preprocessing.OneHotEncoder = None
) -> "tuple[pd.DataFrame, preprocessing.OneHotEncoder]":
    """Preprocess the raw kobe data from Kaggle.
    Optionally provide an encoder. Use the training set encoder to one-hot encode the test set.

    Runs `combine_time` followed by `one_hot_encode`.

    Args:
        data: The raw frame; it is handed to `combine_time` as-is.
        encoder: Forwarded to `one_hot_encode`. Pass None to have a new
            encoder fitted on `data`.

    Returns:
        The `(preprocessed_frame, encoder)` tuple produced by `one_hot_encode`.
    """
    df = combine_time(data)
    df, encoder = one_hot_encode(df, encoder)

    return df, encoder

In [15]:
# def print_df():
    
#     df.info(verbose=True) 
    
# print_df()

In [26]:
# Evaluate a decision tree on every train/validation split and report the accuracy per split.
mtts = MultipleTrainTestSplits(csv_path='../../data/data.csv')

test_set = mtts.test_set  # not used in this cell

loss_and_metrics = []
scores = []  # validation accuracy of every split, in iteration order
i = 0        # stays 0 when the splitter yields no splits

splits = mtts.train_validation_split(as_dataframe=True)
for i, (train_set, validation_set) in enumerate(splits, start=1):
    # Preprocess the training set
    preprocessed_train_set, one_hot_encoder = preprocess(train_set)
    # Split the features from the target
    x_train = get_x(preprocessed_train_set)
    y_train = get_y(preprocessed_train_set)

    # Preprocess the validation set (use the one hot encoder that was fit on the training set)
    preprocessed_validation_set, _ = preprocess(validation_set, encoder=one_hot_encoder)
    # Split the features from the target
    x_validation = get_x(preprocessed_validation_set)
    y_validation = get_y(preprocessed_validation_set)

    # A fixed random_state keeps the fitted tree reproducible between runs
    tree = DecisionTreeClassifier(random_state = 5)
    tree.fit(x_train, y_train.astype('int'))

    y_predicted = tree.predict(x_validation)
    acc = accuracy_score(y_validation.astype('int'), y_predicted.astype('int'))
    scores.append(acc)

    print(f' i: {i} - acc: {acc}')

# NOTE: from here on `tree` is the classifier fitted on the LAST split only,
# so the depth and parameters below describe that one tree.
depth = tree.get_depth()
params = tree.get_params(deep=True)

print()
print('Tree depth is: ', depth)
print()
print('Params of the tree are: ', params)

 i: 1 - acc: 0.6199649737302977
 i: 2 - acc: 0.62405137186223
 i: 3 - acc: 0.616656937147305

Tree depth is:  184

Params of the tree are:  {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': 'deprecated', 'random_state': 5, 'splitter': 'best'}


In [None]:
# Predict the class of every validation sample with the decision tree fitted in the cell above.
# `model` is not defined anywhere in this notebook (NameError on Restart & Run All), and
# DecisionTreeClassifier.predict takes no `batch_size` argument, so use `tree` directly.
classes = tree.predict(x_validation)
classes