In [None]:
# !pip install lightgbm
# !pip install optuna

In [None]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.inspection import permutation_importance
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA

from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

import lightgbm as lgb
from sklearn.metrics import accuracy_score
import optuna
from optuna.integration import OptunaSearchCV

from collections import defaultdict

import time

from warnings import simplefilter

In [None]:
# Silence every FutureWarning raised from here on in this session.
simplefilter("ignore", category=FutureWarning)

In [None]:
# Switch off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Load the pre-processed data frame for the snapshot dated `curr_date`.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by a trusted pipeline.
curr_date = "2023-04-19"
pickle_file = f"sa-preprocessing-{curr_date}.pkl"
pickle_file_path = f"data/{pickle_file}"
df_main_nn = pd.read_pickle(pickle_file_path)

In [None]:
# Build the modelling inputs.
# An earlier, commented-out draft of this list used separate home_/away_
# team averages (height, weight, age, rating, BMI); the list below replaced it.
feature_names = [
    "last_ten",
    "last_five",
    "last_three",
    "last_10_diff",
    "avg_height_diff",
    "avg_weight_diff",
    "avg_rating_diff",
    "avg_age_diff",
    "avg_bmi_diff",
]

# Feature matrix (one column per entry of feature_names) and target vector.
X = df_main_nn.loc[:, feature_names].values
y = df_main_nn.loc[:, 'outcome'].values

In [None]:
scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y)
pd.DataFrame(X).tail(100)

In [None]:
# Last 100 rows of the selected feature columns as stored in the data frame
# (the frame itself is not modified by the scaling applied to X).
df_main_nn.loc[:, feature_names].tail(100)

In [None]:
# PCA: share of the total variance captured by each principal component

pca = PCA(n_components=len(feature_names))
pca.fit(X)

pca_evr = pd.DataFrame(pca.explained_variance_ratio_)

# Draw one bar per principal component. A histogram (DataFrame.hist) would
# bin the ratio values themselves, so its x-axis would not be the component
# index and its y-axis would be a count rather than explained variance.
component_ids = np.arange(1, len(pca.explained_variance_ratio_) + 1)

fig, ax = plt.subplots()
ax.bar(component_ids, pca.explained_variance_ratio_)
ax.set_xticks(component_ids)
ax.set_title("PCA", fontsize=10)
# Components are linear combinations of the inputs, not individual features.
ax.set_xlabel('Principal component #', fontsize=10)
ax.set_ylabel('Explained variance ratio', fontsize=10)
plt.show()


In [None]:
# Baseline models
# Both estimators draw random numbers while fitting; pinning random_state
# makes the printed accuracies repeatable across kernel restarts.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

# Accuracy on the held-out split
print(decision_tree.score(X_test, y_test))
print(random_forest.score(X_test, y_test))

In [None]:
# Random-forest feature importance, measured as mean decrease in impurity (MDI)
importances = random_forest.feature_importances_

# Spread of each feature's importance across the individual trees of the
# forest; used as the error bar in the chart.
per_tree_importances = np.array(
    [estimator.feature_importances_ for estimator in random_forest.estimators_]
)
std = per_tree_importances.std(axis=0)

forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(
    title="Feature importances using MDI",
    ylabel="Mean decrease in impurity",
)
fig.tight_layout()

In [None]:

# Permutation importance of the random forest, measured on the held-out split.
# The column shuffling is random; a fixed random_state makes the bars (and
# their error bars) reproducible between runs.
result = permutation_importance(
    random_forest, X_test, y_test, random_state=42)

forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
# Error bars show the standard deviation over the shuffling repeats.
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

In [None]:
# Logistic regression: fit on the training split, report held-out accuracy

logistic_regression = linear_model.LogisticRegression().fit(X_train, y_train)

logistic_regression_accuracy = logistic_regression.score(X_test, y_test)
print(logistic_regression_accuracy)

In [None]:
# Gaussian Naive Bayes: fit on the training split, report held-out accuracy

gauss_nb = GaussianNB().fit(X_train, y_train)

gauss_nb_accuracy = gauss_nb.score(X_test, y_test)
print(gauss_nb_accuracy)

In [None]:
# K nearest neighbours (library defaults): fit on the training split,
# report held-out accuracy

k_neighbors = KNeighborsClassifier().fit(X_train, y_train)

k_neighbors_accuracy = k_neighbors.score(X_test, y_test)
print(k_neighbors_accuracy)

In [None]:
# Adaboost

# 200 boosting rounds; random_state is pinned so the printed accuracy is
# repeatable across kernel restarts.
ada_boost = AdaBoostClassifier(n_estimators = 200, random_state = 42)
ada_boost.fit(X_train, y_train)

# Accuracy on the held-out split
print(ada_boost.score(X_test, y_test))

In [None]:
# Gradient Boost Classifier

# Library defaults, with random_state pinned so the printed accuracy is
# repeatable across kernel restarts.
gradient_boost = GradientBoostingClassifier(random_state=42)
gradient_boost.fit(X_train, y_train)

# Accuracy on the held-out split
print(gradient_boost.score(X_test, y_test))

In [None]:
# Define the objective function
def objective(trial, cv=5):
    """Optuna objective: cross-validated accuracy of one LightGBM configuration.

    The hyper-parameters are sampled from ``trial`` and the resulting
    classifier is scored with cross-validation on the training split only.
    The held-out test split is deliberately not touched here: selecting
    hyper-parameters on it would make the final test accuracy optimistically
    biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    float
        Mean accuracy over the cross-validation folds (the study maximises it).
    """
    params = {
        # Fixed settings, identical for every trial.
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        # Tuned settings. suggest_float replaces the deprecated
        # suggest_loguniform / suggest_uniform with identical ranges.
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    model = lgb.LGBMClassifier(**params)
    # cross_val_score clones and refits the model for each fold.
    fold_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    return float(fold_scores.mean())

In [None]:
# Search space written as Optuna distribution objects.
# NOTE(review): this dict is not passed to study.optimize below -- the ranges
# that are actually sampled are the ones inside `objective`. It is kept in
# case another cell uses it (presumably with OptunaSearchCV; confirm or drop).
search_space = {
    'num_leaves': optuna.distributions.IntLogUniformDistribution(2, 256),
    'learning_rate': optuna.distributions.LogUniformDistribution(0.001, 0.1),
    'max_depth': optuna.distributions.IntUniformDistribution(3, 10),
    'feature_fraction': optuna.distributions.UniformDistribution(0.1, 1.0),
    'bagging_fraction': optuna.distributions.UniformDistribution(0.1, 1.0),
    'bagging_freq': optuna.distributions.IntUniformDistribution(1, 10),
    'min_child_samples': optuna.distributions.IntUniformDistribution(5, 100),
}

# Create a study object and run the optimization.
# The sampler is seeded so the same 50 trials are proposed on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Train the final model with the best hyperparameters.
# study.best_params only holds the values sampled through trial.suggest_*;
# the fixed settings used inside `objective` are added back here so the final
# model is the same configuration that was tuned.
best_params = study.best_params
final_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'random_state': 42,
    **best_params,
}
model = lgb.LGBMClassifier(**final_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_lgbm = accuracy_score(y_test, y_pred)

# Print the results
print('Best accuracy:', study.best_value)
print('Best parameters:', study.best_params)
print('Accuracy on test data:', accuracy_lgbm)

In [None]:
# Side-by-side held-out accuracy of every model fitted above
model_names = ['Decision Tree', 'Random Forest', 'Logistic Regression', 'Gaussian Naive Bayes', 'KNN',
               "Adaboost", "Gradient Boosting", "LGBM"]

# The scikit-learn estimators are re-scored here, in the same order as
# model_names; the LightGBM entry re-uses the accuracy computed after tuning.
sklearn_models = [decision_tree, random_forest, logistic_regression, gauss_nb,
                  k_neighbors, ada_boost, gradient_boost]
scores = [estimator.score(X_test, y_test) for estimator in sklearn_models]
scores.append(accuracy_lgbm)

# Bar chart with axis labels and title
plt.bar(model_names, scores)
plt.xlabel('Model')
plt.ylabel('Mean score')
plt.title('Model comparison')

# Write each score (two decimals) centred at half the height of its bar
for position, value in enumerate(scores):
    plt.text(position, value / 2, f'{value:.2f}', ha='center', va='center')
plt.xticks(rotation=90)
plt.show()