<a href="https://colab.research.google.com/github/misharigot/kobe/blob/master/src/model/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

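This notebook trains a k-nearest-neighbours (KNN) classifier to predict whether Kobe's shots are made, and compares the validation accuracy for different values of k.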

In [6]:
import sys; sys.path.insert(0, '..')  # Needed to make the import below work

from multiple_train_test_splits import MultipleTrainTestSplits
from preprocessor import Preprocessor

import numpy as np
import pandas as pd


from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [7]:
def get_x(data: pd.DataFrame) -> pd.DataFrame:
    """Return the feature columns of ``data``.

    The result is a new frame holding every column except the
    ``shot_made_flag`` target. A ``KeyError`` is raised by pandas when the
    target column is not present.
    """
    features = data.drop(labels='shot_made_flag', axis='columns')
    return features

def get_y(data: pd.DataFrame) -> pd.Series:
    """Return an independent copy of the target column.

    The ``shot_made_flag`` column is selected and copied, so changes made to
    the returned series do not write through to ``data``.
    """
    target = data.loc[:, 'shot_made_flag']
    return target.copy(deep=True)


In [None]:
# Grid search over k for a KNN classifier: for every k, fit on each training
# split, score on the matching validation split, and record the accuracies.
mtts = MultipleTrainTestSplits(csv_path='../../data/data.csv')
pp = Preprocessor('../../data/data.csv')

# Held-out test set; it is not used while searching over k below.
test_set = mtts.test_set

# accuracies[k][fold] -> validation accuracy of that fold for that k.
accuracies = {}

ks = range(1, 51)
# all_averages[i] is the mean validation accuracy over the folds for ks[i].
all_averages = []

for k in ks:
    accuracies[k] = {}
    scores = []

    # Folds are numbered from 1 so they match the keys used in `accuracies`.
    for fold, (train_set, validation_set) in enumerate(
            mtts.train_validation_split(as_dataframe=True), start=1):
        # Preprocess the training set, then split the features from the target.
        preprocessed_train_set = pp.preprocess(train_set)
        x_train = get_x(preprocessed_train_set)
        y_train = get_y(preprocessed_train_set)

        # Preprocess the validation set the same way.
        # NOTE(review): the original comment states this reuses the one-hot
        # encoder fit on the training set -- confirm in Preprocessor.
        preprocessed_validation_set = pp.preprocess(validation_set)
        x_validation = get_x(preprocessed_validation_set)
        y_validation = get_y(preprocessed_validation_set)

        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train.astype(int))

        y_predicted = knn.predict(x_validation)
        # Score once and reuse the value for both bookkeeping structures.
        accuracy = accuracy_score(y_validation.astype(int), y_predicted.astype(int))
        accuracies[k][fold] = accuracy
        scores.append(accuracy)

        print(f'k = {k}. Accuracy = {accuracy}')

    avg_accuracy = sum(scores) / len(scores)
    all_averages.append(avg_accuracy)
    print('Average per k:', avg_accuracy)
    print()
    
    
# Results with old pre-processing
# Loop: 3 accuracy  0.5660634364662386
# Loop: 3 accuracy  0.5598365440747227
# Loop: 3 accuracy  0.5471881689044561

k = 1. Accuracy = 0.5571122786534345
k = 1. Accuracy = 0.5508853862619186
k = 1. Accuracy = 0.5489394823895699
Average per k: 0.5523123824349744

k = 2. Accuracy = 0.5752091846662775
k = 2. Accuracy = 0.5631445806577156
k = 2. Accuracy = 0.569955244210936
Average per k: 0.569436336511643

k = 3. Accuracy = 0.5742362327301032
k = 3. Accuracy = 0.5586690017513135
k = 3. Accuracy = 0.5504962054874489
Average per k: 0.5611338133229552

k = 4. Accuracy = 0.5882467406110138
k = 4. Accuracy = 0.5755983654407473
k = 4. Accuracy = 0.5732632807939287
Average per k: 0.5790361289485633

k = 5. Accuracy = 0.5767659077641565
k = 5. Accuracy = 0.5682039307258221
k = 5. Accuracy = 0.5555555555555556
Average per k: 0.566841798015178

k = 6. Accuracy = 0.5870791982876046
k = 6. Accuracy = 0.573068690406694
k = 6. Accuracy = 0.5693714730492314
Average per k: 0.5765064539145099

k = 7. Accuracy = 0.5779334500875657
k = 7. Accuracy = 0.5691768826619965
k = 7. Accuracy = 0.5610040863981319
Average per k: 0.

In [None]:
get_ipython().run_line_magic('matplotlib', 'inline')

# Plot the mean validation accuracy (averaged over the folds) against k.
# Only the k values that actually have an average are plotted, so the cell
# still works when the search cell was interrupted before reaching the last k.
evaluated_ks = list(ks)[:len(all_averages)]

fig, ax = plt.subplots()
ax.plot(np.array(evaluated_ks), np.array(all_averages))
ax.set_xlabel('Value of K for KNN')
# The scores come from the validation splits, not from the held-out test set.
ax.set_ylabel('Validation Accuracy')
plt.show()

In [None]:
def print_average_metrics(accuracies):
    """Print the mean accuracy over the folds for every k, one line per k.

    Args:
        accuracies: Mapping of k -> {fold: accuracy}. Entries are visited in
            the mapping's iteration order.

    A k whose fold mapping is empty (for example because the search was
    interrupted before its first fold finished) prints ``nan`` instead of
    raising ``ZeroDivisionError``, so the output keeps one line per k.
    """
    for folds in accuracies.values():
        if not folds:
            print(float('nan'))
            continue
        print(sum(folds.values()) / len(folds))
    
    


# Print the mean validation accuracy per k from the `accuracies` mapping
# filled in by the grid-search cell.
print_average_metrics(accuracies)

In [16]:
# Plot the relationship between k and the mean validation accuracy.
# `ks` (the range searched by the grid-search cell) replaces the undefined
# name `k_range`, which raised NameError on a fresh kernel.
# plt.plot(x_axis, y_axis)
plt.plot(np.array(ks), np.array(all_averages))
plt.xlabel('Value of K for KNN')
# The scores come from the validation splits, not from the held-out test set.
plt.ylabel('Validation Accuracy')

In [None]:
# Predict the class labels for the last validation split with the most
# recently fitted KNN classifier. The previous call used an undefined `model`
# and a Keras-style `batch_size` argument that KNeighborsClassifier.predict
# does not accept.
classes = knn.predict(x_validation)
classes