In [None]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

from collections import defaultdict

import time

from warnings import simplefilter

In [None]:
simplefilter(action="ignore",category=FutureWarning)

In [None]:
# Turn off pandas' chained-assignment warning for the whole session.
pd.set_option("mode.chained_assignment", None)

# Load the pre-processed data frame; the snapshot date is embedded in the file name.
# NOTE(review): unpickling can execute code stored in the file - only load pickles
# that come from a trusted source.
curr_date = "2023-03-16"
pickle_file = f"sa-preprocessing-{curr_date}.pkl"
pickle_file_path = f"data/{pickle_file}"
df_main_nn = pd.read_pickle(pickle_file_path)

In [None]:
# Create the dataframe for modeling
# Per-team attributes are listed as home/away pairs.
# NOTE(review): the height pair previously listed the away column twice, which fed two
# identical columns to every model. The home column name below is inferred from the
# other home/away pairs - confirm it exists in df_main_nn.
feature_names = [
    "last_ten", "last_five", "last_three",
    "home_player_height_average", "away_player_height_average",
    "home_player_weight_average", "away_player_weight_average",
    "home_player_age_average", "away_player_age_average",
    "home_player_rating_average", "away_player_rating_average",
    "home_team_bmi", "away_team_bmi",
]

# Fail early with a readable message if the pre-processed frame lacks a feature column.
missing_features = [name for name in feature_names if name not in df_main_nn.columns]
if missing_features:
    raise KeyError(f"df_main_nn is missing feature columns: {missing_features}")

X = df_main_nn[feature_names].values
y = df_main_nn['outcome'].values

# A fixed seed keeps the train/test partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preview the last rows of the feature matrix, labelled with the feature names.
pd.DataFrame(X, columns=feature_names).tail(100)

In [None]:
# PCA
from sklearn import preprocessing
from sklearn.decomposition import PCA

# Standardise every feature to zero mean / unit variance so that no feature dominates
# the components merely because of its measurement scale.
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Keep one component per input column and fit on the standardised matrix
# (fitting on the raw X would leave the scaling step above unused).
pca = PCA(n_components=X_scaled.shape[1])
pca.fit(X_scaled)

# Single-column frame (column 0): share of total variance explained by each component.
pca_evr = pd.DataFrame(pca.explained_variance_ratio_)

# One bar per principal component, numbered from 1.
fig, ax = plt.subplots()
ax.bar(range(1, len(pca_evr) + 1), pca_evr[0])
ax.set_title("PCA", fontsize=10)
ax.set_xlabel('Principal component #', fontsize=10)
ax.set_ylabel('Explained variance ratio', fontsize=10)
plt.show()


In [None]:
# Baseline models
# Both estimators draw random numbers while fitting; a fixed seed makes the printed
# accuracies repeatable for a given train/test split.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(decision_tree.score(X_test, y_test))
print(random_forest.score(X_test, y_test))

In [None]:
# Feature importance (RF) - based on mean decrease in impurity
# Spread of each feature's importance across the individual trees, used as error bars.
per_tree_importances = np.array(
    [estimator.feature_importances_ for estimator in random_forest.estimators_]
)
std = per_tree_importances.std(axis=0)

# Forest-level importances, labelled by feature name.
importances = random_forest.feature_importances_
forest_importances = pd.Series(data=importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot(kind="bar", yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance shuffles feature columns at random; seeding it keeps the
# plotted values repeatable for a given fitted model and test split.
# n_repeats and n_jobs are left at their defaults.
result = permutation_importance(random_forest, X_test, y_test, random_state=42)

# Mean drop in score per feature, labelled by feature name.
forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
# Logistic regression
from sklearn import linear_model

# fit() returns the estimator itself, so construction and training are chained.
logistic_regression = linear_model.LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(logistic_regression.score(X=X_test, y=y_test))

In [None]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained.
gauss_nb = GaussianNB().fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(gauss_nb.score(X=X_test, y=y_test))

In [None]:
# K nearest neighbor
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training are chained.
k_neighbors = KNeighborsClassifier().fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(k_neighbors.score(X=X_test, y=y_test))

In [None]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier

# 200 boosting rounds; the fixed seed makes the fitted ensemble (and its printed
# accuracy) repeatable for a given train/test split.
ada_boost = AdaBoostClassifier(n_estimators=200, random_state=42)
ada_boost.fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(ada_boost.score(X_test, y_test))

In [None]:
# Gradient Boost Classifier
from sklearn.ensemble import GradientBoostingClassifier

# The fixed seed makes the fitted ensemble (and its printed accuracy) repeatable
# for a given train/test split.
gradient_boost = GradientBoostingClassifier(random_state=42)
gradient_boost.fit(X_train, y_train)

# Mean accuracy on the held-out split.
print(gradient_boost.score(X_test, y_test))

In [None]:
# Comparison of Models
# Map each display label to its fitted estimator; insertion order drives the bar order.
fitted_models = {
    'Decision Tree': decision_tree,
    'Random Forest': random_forest,
    'Logistic Regression': logistic_regression,
    'Gaussian Naive Bayes': gauss_nb,
    'KNN': k_neighbors,
    "Adaboost": ada_boost,
    "Gradient Boosting": gradient_boost,
}
model_names = list(fitted_models)
scores = [estimator.score(X_test, y_test) for estimator in fitted_models.values()]

# Bar chart of the held-out score of every model.
plt.bar(model_names, scores)
plt.xlabel('Model')
plt.ylabel('Mean score')
plt.title('Model comparison')

# Write each score, rounded to two decimals, at mid-height of its bar.
for position, score in enumerate(scores):
    plt.text(position, score / 2, f'{score:.2f}', ha='center', va='center')

# Rotate the model labels so they do not overlap, then render the figure.
plt.xticks(rotation=90)
plt.show()