In [None]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

from collections import defaultdict

import warnings
import time


In [None]:
# Open a connection to the SQLite database file and create a cursor on it.
conn = sqlite3.connect("data/database.sqlite")
cur = conn.cursor()

In [None]:
# function to execute queries
def executeQuery(cur, query, params=None):
    """Execute a SQL statement on a cursor and return all resulting rows.

    Args:
        cur: A DB-API cursor (e.g. from ``sqlite3.Connection.cursor()``).
        query: The SQL text to execute.
        params: Optional sequence or mapping of values bound to the
            placeholders in ``query``. When omitted, the statement is
            executed without bound parameters, as before.

    Returns:
        The list produced by ``cur.fetchall()`` (empty when no rows match).
    """
    # The original message ended in ": " but never showed the statement.
    print("executing query: " + str(query))
    if params is None:
        cur.execute(query)
    else:
        # Bound parameters keep values out of the SQL string itself.
        cur.execute(query, params)
    return cur.fetchall()

In [None]:
# List the names of all tables by querying SQLite's sqlite_master catalog.
q_all_tables = """SELECT name FROM sqlite_master
    WHERE type='table';"""
# executeQuery returns whatever cur.fetchall() yields for this statement.
all_tables = executeQuery(cur, q_all_tables)


## Load the DataFrame produced by preprocessing step 1

In [None]:
# Build the path of the dated preprocessing pickle and load it into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
curr_date = "2023-03-05"
pickle_file = "".join(['sa-preprocessing-', curr_date, '.pkl'])
pickle_file_path = "".join(['data/', pickle_file])
df_main_nn = pd.read_pickle(pickle_file_path)

In [None]:
# calculate age
# Turn off pandas' chained-assignment warning for the column writes below.
pd.options.mode.chained_assignment = None

hp = 'home_player_'
ap = 'away_player_'

# The match date is the same for every player slot, so parse it once
# instead of re-converting the column on each of the 11 iterations.
df_main_nn['date'] = pd.to_datetime(df_main_nn['date'])
match_year = df_main_nn['date'].dt.year

for i in range(1, 12):
    hp_n = hp+str(i)
    ap_n = ap+str(i)
    for player in (hp_n, ap_n):
        df_main_nn[player+"_birthday"] = pd.to_datetime(df_main_nn[player+"_birthday"])
        # NOTE: age is the difference in calendar years only (month and day
        # are ignored), so it can be up to one year above the exact age.
        df_main_nn[player+"_age"] = (match_year - df_main_nn[player+"_birthday"].dt.year)

In [None]:
# Sanity check: print the frame's dimensions, then display its last rows.
print(df_main_nn.shape)

df_main_nn.tail()

In [None]:
# Display the first rows of the frame.
df_main_nn.head()

In [None]:
# Total height per team: add up the 11 per-player height columns on each side.
# (Only the totals are stored here; no average height column is created.)
hp = 'home_player_'
ap = 'away_player_'

hp_col_list_height = [hp + str(slot) + '_height' for slot in range(1, 12)]
ap_col_list_height = [ap + str(slot) + '_height' for slot in range(1, 12)]

# Row-wise sums across the selected player columns.
df_main_nn[hp + 'height_total'] = df_main_nn.loc[:, hp_col_list_height].sum(axis="columns")
df_main_nn[ap + 'height_total'] = df_main_nn.loc[:, ap_col_list_height].sum(axis="columns")



In [None]:
# Per-team totals for weight, rating and age, plus average age and rating.
# Each side contributes 11 player slots (<prefix>1 .. <prefix>11).

hp = 'home_player_'
hp_col_list_weight = [hp+str(i)+'_weight' for i in range(1,12)]
hp_col_list_age = [hp+str(i)+'_age' for i in range(1,12)]
hp_col_list_rating = [hp+str(i)+'_rating' for i in range(1,12)]

ap = 'away_player_'
ap_col_list_weight = [ap+str(i)+'_weight' for i in range(1,12)]
ap_col_list_age = [ap+str(i)+'_age' for i in range(1,12)]
# Fix: this list used to be built from the home prefix, so the away rating
# total/average silently duplicated the home team's values.
# Assumes away_player_<n>_rating columns exist like the home ones - TODO confirm.
ap_col_list_rating = [ap+str(i)+'_rating' for i in range(1,12)]

# Row-wise totals across the 11 player columns of each side.
df_main_nn[hp+'weight_total'] = df_main_nn[hp_col_list_weight].sum(axis=1)
df_main_nn[ap+'weight_total'] = df_main_nn[ap_col_list_weight].sum(axis=1)

df_main_nn[hp+'rating_total'] = df_main_nn[hp_col_list_rating].sum(axis=1)
df_main_nn[ap+'rating_total'] = df_main_nn[ap_col_list_rating].sum(axis=1)

df_main_nn[hp+'age_total'] = df_main_nn[hp_col_list_age].sum(axis=1)
df_main_nn[ap+'age_total'] = df_main_nn[ap_col_list_age].sum(axis=1)

# Averages divide by the number of player slots (11).
df_main_nn[hp+'age_average'] = (df_main_nn[hp+'age_total']/len(hp_col_list_age))
df_main_nn[ap+'age_average'] = (df_main_nn[ap+'age_total']/len(ap_col_list_age))

df_main_nn[hp+'rating_average'] = (df_main_nn[hp+'rating_total']/len(hp_col_list_rating))
df_main_nn[ap+'rating_average'] = (df_main_nn[ap+'rating_total']/len(ap_col_list_rating))


In [None]:
# Team-level BMI from the per-team weight and height totals.
# Weight totals are divided by 2.20462 and height totals by 100
# (presumably lb -> kg and cm -> m; confirm against the preprocessing step).
# The final * 11 converts totals over 11 players into the BMI of the
# average player: (W/11) / (H/11)^2 == 11 * W / H^2.
df_main_nn['home_team_bmi'] = (
    df_main_nn['home_player_weight_total']
    .div(2.20462)
    .div(df_main_nn['home_player_height_total'].div(100).pow(2))
    .mul(11)
)
df_main_nn['away_team_bmi'] = (
    df_main_nn['away_player_weight_total']
    .div(2.20462)
    .div(df_main_nn['away_player_height_total'].div(100).pow(2))
    .mul(11)
)

In [None]:
# Display the last rows of the frame.
df_main_nn.tail()

In [None]:
# Binary target: True only when the home side scored strictly more goals;
# draws and away wins are both False.
df_main_nn['outcome'] = df_main_nn["home_team_goal"].gt(df_main_nn["away_team_goal"])

In [None]:
# Display the last rows of the frame.
df_main_nn.tail()

In [None]:
# 10-bin histogram of the home team's BMI column.
hist_home_team_bmi = df_main_nn[['home_team_bmi']].hist(bins=10)

In [None]:
# 10-bin histogram of the away team's BMI column.
hist_away_team_bmi = df_main_nn[['away_team_bmi']].hist(bins=10)

In [None]:
# 10-bin histogram of the home team's average player age.
hist_home_player_age_average = df_main_nn[['home_player_age_average']].hist(bins=10)

In [None]:
# 10-bin histogram of the away team's average player age.
hist_away_player_age_average = df_main_nn[['away_player_age_average']].hist(bins=10)

In [None]:
# Tally rows where 'outcome' is True vs. False and report the share of True
# rows (home wins) as a percentage of all games.
outcome_counts = df_main_nn['outcome'].value_counts()
count_home_team_win = outcome_counts[True]
count_home_team_not_win = outcome_counts[False]
game_sum = count_home_team_win + count_home_team_not_win

print("Home team win count", count_home_team_win, sep="\n")
print("\nHome team no win count", count_home_team_not_win, sep="\n")
print("\nActual win percentage for home team", (count_home_team_win / game_sum )* 100, sep="\n")

In [None]:
# Create the feature matrix and target vector for modeling.
# Features: team BMI, average player age and average player rating, for
# both the home and the away side. Target: the boolean 'outcome' column.
X = df_main_nn[[
    "home_team_bmi",
    "away_team_bmi",
    "home_player_age_average",
    "away_player_age_average",
    "home_player_rating_average",
    "away_player_rating_average",
]].values
y = df_main_nn['outcome'].values

# Fixed random_state so a fresh Restart & Run All reproduces the same
# train/test partition; without it the split is reshuffled on every run.
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42)

In [None]:
# Fit a decision tree and a random forest on the training split and print
# each model's score on the held-out test split.
# random_state pins the estimators' internal randomness so that re-running
# this cell on the same split yields the same fitted models and scores.
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)

print(decision_tree.score(X_test, y_test))
print(random_forest.score(X_test, y_test))


In [None]:
# Always run this once you are done: it closes the SQLite connection.
conn.close()