In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import random

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Folder the notebook reads its CSV inputs from; label CSVs sit in its "labels/" sub-folder.
DATA_DIR = "data/"
LABELS_DIR = f"{DATA_DIR}labels/"

In [2]:
# Dense feed-forward classifier (an MLP -- the network has no convolutional layers)
def getModel_class(X_train, y_train, epochs=100, batch_size=8, patience=10):
    """Build, compile and fit a small fully-connected classifier.

    Parameters
    ----------
    X_train : 2-D array, shape (n_samples, n_features)
        Training features; the second dimension sets the input width.
    y_train : 2-D array, shape (n_samples, n_classes)
        One-hot encoded targets; the second dimension sets the output width.
    epochs : int, default 100
        Maximum number of training epochs.
    batch_size : int, default 8
        Mini-batch size passed to ``model.fit``.
    patience : int, default 10
        Number of epochs without validation-loss improvement before
        training stops early.

    Returns
    -------
    The fitted Keras ``Sequential`` model, carrying the weights of the epoch
    with the best validation loss.
    """
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = Sequential([
        layers.Input(shape=(n_features,)),
        layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.4),
        layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        layers.Dropout(0.4),
        # softmax (not sigmoid): categorical cross-entropy on one-hot targets
        # expects the outputs to form a probability distribution over classes.
        layers.Dense(n_classes, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Monitor the held-out loss produced by validation_split below; watching the
    # training loss would make the validation split pointless for early stopping.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True
    )
    model.fit(
        X_train,
        y_train,
        verbose=0,
        epochs=epochs,
        validation_split=0.2,
        batch_size=batch_size,
        callbacks=[early_stopping]
    )
    return model

In [3]:
# Load the Universal Sentence Encoder model (version 4 URL) from TensorFlow Hub.
# Later cells call `embed` with a list of strings and convert the result with .numpy().
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(model_url)

In [4]:
# single primary key
def addPrimaryKey(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'primary_key' column of the form '<key>-<num>' to ``df``.

    The frame is modified in place and the same object is returned, so the
    result can be used either way.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'key' and 'num'.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the new 'primary_key' column.

    Raises
    ------
    KeyError
        If 'key' or 'num' is missing from ``df``.
    """
    # Vectorised concatenation instead of a row-wise apply: faster, and it also
    # works for an empty frame and for keys that are not already strings.
    df['primary_key'] = df['key'].astype(str) + '-' + df['num'].astype(str)
    return df

# Read the "allTables" CSV from DATA_DIR into `df`
all_tables_csv = "%s.csv" % (DATA_DIR + "allTables")
df = pd.read_csv(all_tables_csv)

In [23]:
# Read the CSV for one label and attach the combined primary key
labelName = "isExtraPoint"
labels_csv = "%s.csv" % (LABELS_DIR + labelName)
labels: pd.DataFrame = addPrimaryKey(pd.read_csv(labels_csv))

# Features are the sentence embeddings of the 'detail' text; the target is the label column
detail_texts = labels['detail'].tolist()
X = embed(detail_texts).numpy()
y = labels[labelName]

In [24]:
# build model: random-forest baseline on the sentence embeddings
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Seed the forest as well as the split, so re-running the cell reproduces the same score.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
print(f"Accuracy: {acc}")

Accuracy: 1.0


In [25]:
# tf predict: train the Keras classifier on the same 80/20 split
# Seed every RNG in play (Python, NumPy, TensorFlow) so that weight initialisation,
# dropout and batch shuffling -- and therefore the reported accuracy -- are repeatable.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the targets; the encoder is fitted on all of y so that every
# class seen anywhere in the data gets a column.
encoder = OneHotEncoder()
encoder.fit((y.values).reshape(-1, 1))
y_train = (encoder.transform((y_train.values).reshape(-1, 1))).toarray()
y_test = (encoder.transform((y_test.values).reshape(-1, 1))).toarray()

# normalize: statistics are learned from the training features only and then
# applied to both splits. No input_shape is passed -- adapt() builds the layer
# from the data it is given.
normalizer = layers.Normalization(axis=-1)
normalizer.adapt(np.array(X_train))
normal_X_train = normalizer(X_train).numpy()
normal_X_test = normalizer(X_test).numpy()

tf_model: Sequential = getModel_class(normal_X_train, y_train)
# evaluate() returns [loss, accuracy] because the model is compiled with metrics=['accuracy']
score = tf_model.evaluate(normal_X_test, y_test)
print(f"Accuracy: {score[1]}")

Accuracy: 1.0


In [26]:
# test: predict the label for every row of one game and write the result to temp.csv
test_key = '202309240gnb'
# NOTE: `df` is narrowed to a single game here, so the load cell has to be re-run
# before this cell can be used with a different test_key.
# The non-in-place reset_index returns a fresh frame, so adding the label column
# below does not write into a filtered slice (which can raise a chained-assignment
# warning on some pandas versions).
df = df.loc[df['key'] == test_key].reset_index(drop=True)
tf_preds = tf_model.predict(normalizer(embed(list(df['detail'])).numpy()).numpy())
# Replace any NaN scores with numbers before decoding
tf_preds = np.nan_to_num(tf_preds)
# inverse_transform maps the score rows back to the original label values as a
# 2-D array; one flatten is enough to get a 1-D column.
tf_preds = encoder.inverse_transform(tf_preds).flatten()
df[labelName] = tf_preds
df[[labelName, 'detail']].to_csv("%s.csv" % "temp", index=False)

In [27]:
# Spot-check both classifiers on a single description.
# Alternative sample:
# test = ['Jordan Love pass complete deep left to Romeo Doubs for 30 yards. Penalty on Tyrann Mathieu: Illegal Contact, 5 yards (declined)']
test = ['Jordan Love sacked by Alontae Taylor for -14 yards']
test_embedding = embed(test).numpy()

# Random-forest prediction on the raw embedding
print(model.predict(test_embedding))

# Keras prediction: normalise, score, clean NaNs, then decode back to a label
tf_scores = tf_model.predict(normalizer(test_embedding).numpy())
tf_preds = encoder.inverse_transform(np.nan_to_num(tf_scores)).flatten()
print(tf_preds)

[0]
[0]


In [28]:
# Print the sum of the predicted label column for the test game, followed by the
# game's combined home + away pass attempts.
# NOTE(review): only pass attempts are summed here (no run attempts) -- confirm this
# is the intended comparison for the current labelName.
# NOTE(review): this path starts with "../data/" while DATA_DIR is "data/" -- confirm.
cd = pd.read_csv("%s.csv" % "../data/gameData")
cd = cd.loc[cd['key'] == test_key]

predicted_total = sum(df[labelName])
print(predicted_total)

home_attempts = cd['home_pass_attempts'].values[0]
away_attempts = cd['away_pass_attempts'].values[0]
print(sum([home_attempts, away_attempts]))

3
78
