In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

import warnings

# silence every warning for the rest of the session
warnings.simplefilter("ignore")

# show all columns when a DataFrame is displayed
pd.options.display.max_columns = None

# set seed for reproducibility
seed = 99

### Neural Network (NN)

Neural Networks are "black box" models, meaning that it is often impossible to determine how exactly the models arrive at their predictions. This is because the networks process chunks of the training data one at a time and determine their own associations between the features and the result. This process can be controlled to a degree by manipulating the model layers, but there will still be an information gap when determining the exact relationships between variables. For a simple explanation of NNs, see AWS's [What is a Neural Network](https://aws.amazon.com/what-is/neural-network/#:~:text=A%20neural%20network%20is%20a,that%20resembles%20the%20human%20brain.).

I will create a NN with the full dataset, the PCA dataset, and the combined feature dataset, since NNs are often better able to handle larger numbers of features. The data needs to be scaled with the `MinMaxScaler` so that all the data values fall between 0 and 1. I will use the `Adam` optimizer, a common general-purpose choice, together with the `binary_crossentropy` loss function, which is designed for binary classification tasks. Additionally, the final layer needs to have `sigmoid` activation so the model outputs a probability between 0 and 1 that can be thresholded into a binary prediction.




The Neural Network results are underwhelming. The model tends to reach its maximum accuracy in relatively few epochs, which is possibly due to the relatively small dataset (at least by NN standards), a lack of distinguishing features, or too much collinearity within the data. Given the computational cost and iteration that are required to create a NN model, the performance is far too poor to consider this method moving forward.

I have tested all the machine learning modeling methods, but model performance is still far short of the target accuracy of 68%. Also, the similarity between the models means that there is no one model that stands out. The final method I will attempt is creating an Elo rating system, which is described in-depth below.

In [None]:
# create a function to build the neural network model with a variable number of layers
def create_model(num_layers=5, hidden_units=16, activation='relu', optimizer='adam', input_dim=None):
    """Build and compile a fully connected network with a single sigmoid output.

    Parameters
    ----------
    num_layers : int
        Number of hidden layers. The first hidden layer is always added, so
        values below 1 still produce one hidden layer.
    hidden_units : int
        Number of units in every hidden layer.
    activation : str
        Activation used by every hidden layer.
    optimizer : str or optimizer instance
        Passed straight to ``model.compile``.
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``X_train`` is used, which keeps existing calls working but requires
        ``X_train`` to exist when the model is built.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    if input_dim is None:
        # Backward-compatible fallback: read the feature count from the global
        # training matrix, exactly as the function did before.
        input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, activation=activation))

    for _ in range(num_layers - 1):  # Add additional hidden layers
        model.add(Dense(hidden_units, activation=activation))

    # single sigmoid unit: the output is a value between 0 and 1
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# NOTE(review): this cell expects X_train, y_train, X_test and y_test to already
# exist in the kernel; none of them is created here.
# NOTE(review): the notebook text says the inputs should be MinMax-scaled, but the
# unscaled X_train / X_test are used below -- confirm whether the *_scaled arrays
# were intended.

# seed TensorFlow so the random parts of training repeat on Restart & Run All
tf.random.set_seed(seed)

# Create a KerasClassifier for use with GridSearchCV.
# epochs and batch_size are given explicitly: without them Keras' fit() falls back
# to a single epoch, which is too little training to compare candidates fairly.
# The values mirror the ones used for the stand-alone networks in this notebook.
model = KerasClassifier(build_fn=create_model, epochs=30, batch_size=16, verbose=0)

# Define hyperparameters and their possible values
param_grid = {
    'num_layers': [1, 2, 3],  # Try different numbers of layers
    'hidden_units': [16, 32, 64],
    'activation': ['relu', 'sigmoid'],
    'optimizer': ['adam', 'sgd']
}

# Use GridSearchCV for hyperparameter tuning (3-fold cross-validation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Print the best parameters and their corresponding accuracy
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Test the best model on the test data
best_model = grid_result.best_estimator_
# verbose=0 keeps the Keras progress bar out of the output, like the other evaluate calls
test_loss, test_accuracy = best_model.model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# full dataset: every column from 'a_FG' through 'h_TOVp' is used as a feature
stat_columns = team_full_20.loc[:, 'a_FG':'h_TOVp'].columns
X, y = team_full_20[stat_columns], team_full_20['result']

# keep 80% of the rows for training; the shared seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=seed,
)

# scale the data: fit the min/max range on the training rows only,
# then apply that same range to both splits
mm_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = mm_scaler.transform(X_train)
X_test_scaled = mm_scaler.transform(X_test)

In [None]:
# seed TensorFlow so the network's random initialisation repeats on Restart & Run All
tf.random.set_seed(seed)

# explicitly set learning rate
learning_rate = 0.001
adam_optimizer = Adam(learning_rate=learning_rate)

# NOTE(review): X_train_pca is not created in this cell or the one directly above it;
# it must come from an earlier cell -- confirm its rows still line up with y_train.
# Two hidden layers (16 -> 8 units) followed by a single output unit.
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(X_train_pca.shape[1],)))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid activation for binary classification

model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# set epochs and batch_size for model training
epochs = 30
batch_size = 16
# train the model; validation_split=0.2 holds back part of the training data for validation
model.fit(X_train_pca, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

In [None]:
# calculate model test score
loss, accuracy = model.evaluate(X_test_pca, y_test, verbose=0)
# add model results to the results_df for comparison
nn_results = {'model_name': 'nn_pca', 'cv_score': None, 'gs_score': None, 'train_score': None, 'test_score': accuracy}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; concatenating a
# one-row frame adds the same row and works on old and new pandas alike.
# NOTE: re-running this cell adds another 'nn_pca' row to results_df.
results_df = pd.concat([results_df, pd.DataFrame([nn_results])], ignore_index=True)

In [None]:
# combined dataset: all columns of team_full_20_combined are used as features
# NOTE(review): the features come from team_full_20_combined while the target comes
# from team_factor_20 -- presumably both frames share the same row order; verify.
X, y = team_full_20_combined, team_factor_20['result']

# keep 80% of the rows for training; the shared seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=seed,
)

# scale the data: fit the min/max range on the training rows only,
# then apply that same range to both splits
mm_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = mm_scaler.transform(X_train)
X_test_scaled = mm_scaler.transform(X_test)

In [None]:
# combined dataset restricted to the columns listed in best_selected_feature_names
# NOTE(review): this cell was originally labelled "combined dataset PCA", but no PCA
# transform is applied here, only a column selection -- confirm the intended input.
# NOTE(review): the target comes from team_factor_20, a different frame from the
# features -- presumably both share the same row order; verify.
X, y = team_full_20_combined[best_selected_feature_names], team_factor_20['result']

# keep 80% of the rows for training; the shared seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=seed,
)

# scale the data: fit the min/max range on the training rows only,
# then apply that same range to both splits
mm_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = mm_scaler.transform(X_train)
X_test_scaled = mm_scaler.transform(X_test)

In [None]:
# seed TensorFlow so the network's random initialisation repeats on Restart & Run All
tf.random.set_seed(seed)

# explicitly set learning rate
learning_rate = 0.001
adam_optimizer = Adam(learning_rate=learning_rate)

# Three hidden layers (32 -> 16 -> 8 units) followed by a single output unit.
# The input width is taken from whichever X_train_scaled was built most recently.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid activation for binary classification

model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# set epochs and batch_size for model training
epochs = 30
batch_size = 16
# train the model; validation_split=0.2 holds back part of the training data for validation
model.fit(X_train_scaled, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

In [None]:
# calculate model test score
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
# add model results to the results_df for comparison
# NOTE(review): the row is labelled 'nn_full', but in the cell order shown here
# X_test_scaled was last built from the feature-selected combined dataset -- confirm
# the label matches the data the model was actually trained on.
nn_results = {'model_name': 'nn_full', 'cv_score': None, 'gs_score': None, 'train_score': None, 'test_score': accuracy}
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; concatenating a
# one-row frame adds the same row and works on old and new pandas alike.
# NOTE: re-running this cell adds another 'nn_full' row to results_df.
results_df = pd.concat([results_df, pd.DataFrame([nn_results])], ignore_index=True)