In [None]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

from collections import defaultdict

import warnings
import time

## Read the dataframe from EDA and visualization

In [None]:
# Load the dataframe from the dated pickle under data/.
# The date stamp is hard-coded, so this always reads the 2023-03-16 snapshot.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
curr_date = "2023-03-16"
pickle_file = f'sa-eda-{curr_date}.pkl'
pickle_file_path = f'data/{pickle_file}'
df_main_nn = pd.read_pickle(pickle_file_path)

In [None]:
# Lift the column display limit so wide frames are shown with every column.
pd.set_option("display.max_columns", None)

In [None]:
# calculate age
# Silence pandas' chained-assignment (SettingWithCopy) warning for the rest of the session.
pd.options.mode.chained_assignment = None

hp = 'home_player_'
ap = 'away_player_'

# The match date is shared by every player slot, so parse it once up front
# instead of re-parsing the same column on each of the 11 loop iterations.
df_main_nn['date'] = pd.to_datetime(df_main_nn['date'])

for i in range(1, 12):
    hp_n = hp+str(i)
    ap_n = ap+str(i)

    # Make sure both birthday columns for this slot are datetimes.
    df_main_nn[hp_n+"_birthday"] = pd.to_datetime(df_main_nn[hp_n+"_birthday"])
    df_main_nn[ap_n+"_birthday"] = pd.to_datetime(df_main_nn[ap_n+"_birthday"])

    # Age is the difference between calendar years only (month and day are
    # ignored), so it can be one year higher than the exact age on match day.
    df_main_nn[hp_n+"_age"] = (df_main_nn['date'].dt.year - df_main_nn[hp_n+"_birthday"].dt.year)
    df_main_nn[ap_n+"_age"] = (df_main_nn['date'].dt.year - df_main_nn[ap_n+"_birthday"].dt.year)

In [None]:
# Sanity check: print the frame's dimensions, then show its final five rows.
print(df_main_nn.shape)

df_main_nn.tail(n=5)

In [None]:
# 'outcome' is True only when the home team scored strictly more goals;
# draws and away wins are both False.
home_goals = df_main_nn["home_team_goal"]
away_goals = df_main_nn["away_team_goal"]
df_main_nn['outcome'] = home_goals > away_goals

In [None]:
# Order rows by home team, then by match date within each team.
sort_keys = ['home_team_api_id', 'date']
df_main_nn = df_main_nn.sort_values(by=sort_keys)

In [None]:
# Create the 'last_ten' column with a False placeholder; the following cell
# overwrites it with the rolling count of previous home wins.
df_main_nn['last_ten'] = False

In [None]:
# For each home team, count its home wins ('outcome' is True) over the preceding
# home matches. closed="left" keeps the current match out of its own window, and
# min_periods=1 yields a value as soon as one earlier match exists.
# NOTE(review): the column is named 'last_ten' but the window spans 30 matches —
# confirm which one is intended.
home_outcomes = df_main_nn.groupby('home_team_api_id')['outcome']
prior_home_wins = home_outcomes.rolling(window=30, min_periods=1, closed="left").sum()
# Drop the team level from the result's index so the values align with df_main_nn's index.
df_main_nn['last_ten'] = prior_home_wins.reset_index(level=0, drop=True)

# Keep only rows with no missing value in any column. This also removes each
# team's first home match, whose left-closed window is empty (NaN).
rows_with_missing = df_main_nn.isnull().any(axis=1)
df_main_nn = df_main_nn[~rows_with_missing]

In [None]:
# Proof of concept for the per-team rolling win count, on a 20-game toy schedule.
# NOTE(review): the column is called 'last_ten', but the window used here is
# 15 games with at least 5 prior games required — confirm the intended size.
poc_team_ids = list('AAABBBAABBABABABBAAB')
poc_results = [
    True, False, True, True, True,
    False, True, True, False, True,
    True, False, True, False, True,
    False, True, True, True, False,
]
df = pd.DataFrame({
    'team_id': poc_team_ids,
    'game_date': pd.date_range(start='2022-01-01', periods=20, freq='D'),
    'game_result': poc_results,
})

# Group each team's games together in chronological order before rolling.
df = df.sort_values(by=['team_id', 'game_date'])

# Wins over the previous games of the same team; closed="left" excludes the
# current game, so a team's first five rows stay NaN (fewer than 5 prior games).
poc_rolling_wins = (
    df.groupby(['team_id'])['game_result']
    .rolling(window=15, min_periods=5, closed="left")
    .sum()
)
df['last_ten'] = poc_rolling_wins.reset_index(level=0, drop=True)


In [None]:
# Show the frame's dimensions after the null-row filter.
print(df_main_nn.shape)

# Columns needed to eyeball the rolling win count against the raw results.
export_list = [
    'id_main', 'date', 'match_api_id',
    'home_team_api_id', 'away_team_api_id',
    'home_team_goal', 'away_team_goal',
    'outcome', 'last_ten',
]
df_main_streak = df_main_nn.loc[:, export_list]

# Write the final 400 rows to CSV for manual inspection.
# to_csv returns None when given a path, so df_2_csv is None after this line.
last_rows = df_main_streak.tail(400)
df_2_csv = last_rows.to_csv('data/last_ten.csv')

In [None]:
# Show the first five rows of the filtered frame.
df_main_nn.head(n=5)

In [None]:
# Total height per side: sum of the 11 per-player height columns
# (these are totals, not averages).
hp = 'home_player_'
ap = 'away_player_'
player_slots = range(1, 12)

hp_col_list_height = [f'{hp}{slot}_height' for slot in player_slots]
ap_col_list_height = [f'{ap}{slot}_height' for slot in player_slots]

home_heights = df_main_nn[hp_col_list_height]
away_heights = df_main_nn[ap_col_list_height]

df_main_nn[hp+'height_total'] = home_heights.sum(axis=1)
df_main_nn[ap+'height_total'] = away_heights.sum(axis=1)



In [None]:
# Per-side aggregates over the 11 player slots: weight totals, plus totals and
# averages for age and rating.

hp = 'home_player_'
hp_col_list_weight = [hp+str(i)+'_weight' for i in range(1,12)]
hp_col_list_age = [hp+str(i)+'_age' for i in range(1,12)]
hp_col_list_rating = [hp+str(i)+'_rating' for i in range(1,12)]

ap = 'away_player_'
ap_col_list_weight = [ap+str(i)+'_weight' for i in range(1,12)]
ap_col_list_age = [ap+str(i)+'_age' for i in range(1,12)]
# Fix: this list was previously built with the home prefix (hp), which made the
# away rating total/average an exact copy of the home team's values.
ap_col_list_rating = [ap+str(i)+'_rating' for i in range(1,12)]

# Row-wise sums across the 11 player columns of each side.
df_main_nn[hp+'weight_total'] = df_main_nn[hp_col_list_weight].sum(axis=1)
df_main_nn[ap+'weight_total'] = df_main_nn[ap_col_list_weight].sum(axis=1)

df_main_nn[hp+'rating_total'] = df_main_nn[hp_col_list_rating].sum(axis=1)
df_main_nn[ap+'rating_total'] = df_main_nn[ap_col_list_rating].sum(axis=1)

df_main_nn[hp+'age_total'] = df_main_nn[hp_col_list_age].sum(axis=1)
df_main_nn[ap+'age_total'] = df_main_nn[ap_col_list_age].sum(axis=1)

# Averages divide by the fixed squad size of 11 player slots.
df_main_nn[hp+'age_average'] = (df_main_nn[hp+'age_total']/11)
df_main_nn[ap+'age_average'] = (df_main_nn[ap+'age_total']/11)

df_main_nn[hp+'rating_average'] = (df_main_nn[hp+'rating_total']/11)
df_main_nn[ap+'rating_average'] = (df_main_nn[ap+'rating_total']/11)


In [None]:
# Team BMI from the per-side weight and height totals.
# Presumably weights are in pounds (÷ 2.20462 -> kg) and heights in centimetres
# (÷ 100 -> m) — TODO confirm against the data source.
# With totals W and H over 11 players, the BMI of the average player is
# (W/11) / (H/11)**2 = 11 * W / H**2, hence the final factor of 11.
for side in ('home', 'away'):
    total_weight_kg = df_main_nn[side + '_player_weight_total'] / 2.20462
    total_height_m = df_main_nn[side + '_player_height_total'] / 100
    df_main_nn[side + '_team_bmi'] = (total_weight_kg / (total_height_m ** 2)) * 11

In [None]:
# Show the final five rows, including the newly added aggregate columns.
df_main_nn.tail(n=5)

In [None]:
# Same inspection again: display the final five rows of the frame.
df_main_nn.tail(n=5)

In [None]:
# Distribution of the home-side team BMI in 10 bins.
hist_home_team_bmi = df_main_nn.hist(column=['home_team_bmi'], bins=10)

In [None]:
# Distribution of the away-side team BMI in 10 bins.
hist_away_team_bmi = df_main_nn.hist(column=['away_team_bmi'], bins=10)

In [None]:
# Distribution of the home side's average player age in 10 bins.
hist_home_player_age_average = df_main_nn.hist(column=['home_player_age_average'], bins=10)

In [None]:
# Distribution of the away side's average player age in 10 bins.
hist_away_player_age_average = df_main_nn.hist(column=['away_player_age_average'], bins=10)

In [None]:
# Baseline: how often the home team wins. 'outcome' is True for a home win and
# False otherwise, so the False bucket covers draws as well as losses.
outcome_counts = df_main_nn['outcome'].value_counts()
count_home_team_win = outcome_counts[True]
count_home_team_not_win = outcome_counts[False]
game_sum = count_home_team_win + count_home_team_not_win
home_win_percentage = (count_home_team_win / game_sum) * 100

print("Home team win count", count_home_team_win, sep="\n")
print("\nHome team no win count", count_home_team_not_win, sep="\n")
print("\nActual win percentage for home team", home_win_percentage, sep="\n")

In [None]:
# Persist the preprocessed frame under data/, stamped with today's date (YYYY-MM-DD).
curr_date = date.today().strftime('%Y-%m-%d')
pickle_file = f'sa-preprocessing-{curr_date}.pkl'
pickle_file_path = f'data/{pickle_file}'
df_main_nn.to_pickle(pickle_file_path)
print('Saved dataframe into .pkl file')