In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from sklearn import preprocessing # for label encoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Turn off pandas' chained-assignment check for the whole notebook, so
# assigning columns on a filtered/sliced DataFrame emits no
# SettingWithCopyWarning.
pd.options.mode.chained_assignment = None  # default='warn'

### Read the data
We read the data from a local CSV file and construct our matrix of feature vectors.

In [18]:
# Load the 2019 play-by-play data. Every column is read with dtype=object, so
# values arrive as strings (or NaN) and numeric comparisons need an explicit
# cast. The path uses a forward slash because it works on every platform and
# avoids the invalid "\p" escape sequence a backslash separator would create.
data_orig = pd.read_csv("data/play_by_play_2019.csv", dtype=object) # maybe should be dtype=str
data = data_orig.copy()

# Keep only plays from weeks earlier than week 19 (week 19 and later are dropped).
# The "week" column itself is left unshifted.
data = data.loc[data["week"].astype("int") < 19]

# Trim the data down to the features we care about (ELO ratings are added later).
# The explicit copy makes feature_vectors an independent frame, so adding or
# overwriting columns on it never touches `data`.
feature_columns = [
    "touchdown", "season_type", "yardline_100", "quarter_seconds_remaining",
    "half_seconds_remaining", "game_seconds_remaining", "game_half", "drive",
    "sp", "qtr", "down", "goal_to_go", "ydstogo", "play_type", "rush_attempt",
    "pass_attempt", "td_prob",
]
feature_vectors = data[feature_columns].copy()
# TODO: once this works, try adding data from other csv files to this dataframe as well

In [19]:
# Add ELO ratings to each row in the data.
#
# eloVals.json is indexed as elos[team][year][week]["off" | "def"]. The rating a
# team carries into a game is looked up under the previous week's key; when
# that lookup fails (e.g. the team did not play the week before) the entry from
# two weeks back is used instead.
#
# There is a known inconsistency between the ELO values and the play-by-play
# data. An idea that is NOT implemented here: fall back to an ELO of 1500
# whenever no entry can be found.

with open("eloVals.json") as file:
    elos = json.load(file)

YEAR = 2019


def _elo_going_into(team, week, side):
    """Return the ELO of `team` for `side` ("off" or "def") going into `week`.

    The entry for ``week - 1`` is tried first, then ``week - 2``.

    Raises:
        KeyError: if the team or season is unknown, or if neither of the two
            candidate weeks has an entry for `side`.
    """
    season = elos[team][str(YEAR)]
    for weeks_back in (1, 2):
        try:
            return season[str(week - weeks_back)][side]
        except KeyError:
            continue
    raise KeyError(
        "no '{}' elo for {} in the two weeks before week {} of {}".format(
            side, team, week, YEAR
        )
    )


def _elo_column(team_column, side):
    """Build the per-play list of `side` ELO values for the teams in `team_column`."""
    return [
        _elo_going_into(team, week, side)
        for team, week in zip(data[team_column], play_weeks)
    ]


# The "week" column holds strings; convert once instead of once per lookup.
play_weeks = [int(week) for week in data["week"]]

home_off_elo = _elo_column("home_team", "off")
home_def_elo = _elo_column("home_team", "def")
away_off_elo = _elo_column("away_team", "off")
away_def_elo = _elo_column("away_team", "def")

# Attach the ELO lists as new columns. `assign` returns a new DataFrame, so no
# write happens on a frame that may be a slice of `data`. The lists line up
# with feature_vectors row by row because both come from the same rows of `data`.
feature_vectors = feature_vectors.assign(
    home_off_elo=home_off_elo,
    home_def_elo=home_def_elo,
    away_off_elo=away_off_elo,
    away_def_elo=away_def_elo,
)

### Clean the data
The data needs to be cleaned up so that it can be processed by the KNN algorithm.

In [21]:
# Drop the rows of plays that are missing a yard line, a down, or a play type
# (i.e. kickoffs, field goals). Such rows are removed rather than filled in
# with placeholder values.
required_columns = ["yardline_100", "down", "play_type"]
has_required_values = feature_vectors[required_columns].notnull().all(axis=1)
feature_vectors = feature_vectors[has_required_values]

# Encode the string-valued columns as integer codes so that no strings remain
# in them. One encoder object is refit for every column, so after this cell
# `le` only remembers the classes of the last column, "play_type".
le = preprocessing.LabelEncoder()
for text_column in ("season_type", "game_half", "play_type"):
    feature_vectors[text_column] = le.fit_transform(feature_vectors[text_column])

### Split the data into the training and test sets

In [22]:
# Hold out 20% of the plays as a test set. The rows are shuffled first and the
# fixed random_state makes the split reproducible.
train_data, test_data = train_test_split(feature_vectors, test_size=0.2, random_state=35, shuffle=True)

# The label is the "touchdown" column, kept as single-column DataFrames.
train_labels = train_data[["touchdown"]]
test_labels = test_data[["touchdown"]]

# Remove the label from the feature matrices. The `columns=` keyword is used
# because passing the axis positionally (drop("touchdown", 1)) is deprecated
# in pandas 1.3 and rejected by pandas 2.0.
train_data = train_data.drop(columns="touchdown")
test_data = test_data.drop(columns="touchdown")

### Set up the KNN model

In [23]:
# Fit a 5-nearest-neighbours classifier on the training plays and predict a
# touchdown label for every test play.
# NOTE(review): the features appear to be passed in unscaled, and KNN distances
# are sensitive to feature scale -- confirm whether that is intended.
flat_train_labels = np.ravel(train_labels.values)  # fit() wants 1-D labels

model = KNeighborsClassifier(n_neighbors=5)
model.fit(train_data, flat_train_labels)

# `predicted` is an ndarray, whereas test_labels is still a DataFrame.
predicted = model.predict(test_data)

### Calculate accuracy
Compare the predicted labels with the test labels and output the accuracy of our model, both overall and on the plays that were actually touchdowns.

In [26]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


print("length predicted: ", predicted.shape)
print("shape actual labels: ", test_labels.shape)

test_labels_arr = test_labels.values.ravel() # Convert our test labels into a flat array
assert len(predicted) == len(test_labels_arr), "Predictions and test labels should have same len"

# Pair every prediction with its true label. Labels are compared against the
# string '1' because that is how the touchdown flag is checked throughout this
# cell (the CSV is read with dtype=object).
label_pairs = list(zip(predicted, test_labels_arr))

# Plays where the prediction equals the true label.
matches = sum(1 for guess, actual in label_pairs if guess == actual)

# Plays that really were touchdowns, and how many of those the model caught.
num_touchdowns = sum(1 for _, actual in label_pairs if actual == '1')
correctly_pred_touchdowns = sum(
    1 for guess, actual in label_pairs if actual == '1' and guess == '1'
)

gen_accuracy = _ratio(matches, len(predicted))
# Share of actual touchdowns that were predicted as touchdowns (i.e. recall).
# Guarded so a test split without any touchdown reports NaN instead of
# raising ZeroDivisionError.
td_accuracy = _ratio(correctly_pred_touchdowns, num_touchdowns)

print("General Accuracy of model on test data: ", gen_accuracy)

print("Accuracy on touchdowns: ", td_accuracy)
    

length predicted:  (7753,)
shape actual labels:  (7753, 1)
General Accuracy of model on test data:  0.9650457887269444
Accuracy on touchdowns:  0.0
