In [None]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

from collections import defaultdict

import warnings
import time


## Acquire data

Our data comes from a database file in SQLite format. We import it into a Pandas DataFrame for preprocessing. 

In [None]:
# Open a connection to the SQLite database file and create a cursor on it.
# Both names are used by the query cells that follow.
conn = sqlite3.connect("data/database.sqlite")
cur = conn.cursor()

In [None]:
# function to execute queries
def executeQuery(cur, query, params=()):
    """Run ``query`` on ``cur`` and return every resulting row.

    Parameters
    ----------
    cur : sqlite3.Cursor
        Cursor the statement is executed on.
    query : str
        SQL text; may contain ``?`` placeholders.
    params : sequence, optional
        Values bound to the placeholders. Defaults to no parameters, which
        keeps the original two-argument call working unchanged.

    Returns
    -------
    list
        The rows returned by ``cur.fetchall()``.
    """
    # Log the statement itself; the original message ended after the colon.
    print(f"executing query: {query}")
    cur.execute(query, params)
    return cur.fetchall()

In [None]:
# Query sqlite_master for the name of every table in the database.
q_all_tables = """SELECT name FROM sqlite_master
    WHERE type='table';"""
all_tables = executeQuery(cur, q_all_tables)


In [None]:
# Read the whole Match table into a pandas DataFrame.
q_matches = "SELECT * FROM MATCH;"
df_matches = pd.read_sql_query(q_matches, conn)


### Preprocessing Data
- We drop the columns that are not needed for the preliminary analysis.

In [None]:
# Run this cell only once per kernel session: every drop below is positional,
# so re-running it on the already-trimmed frame would remove the wrong columns.

# drop betting (everything from position 85 onwards)
df_matches = df_matches.drop(columns=df_matches.columns[85:])

# drop statistics (everything from position 77 onwards of what is left)
df_matches = df_matches.drop(columns=df_matches.columns[77:])
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line to the output).
df_matches.info()
# drop X, Y positions (positions 11 through 54)
df_matches = df_matches.drop(columns=df_matches.columns[11:55])

# drop the columns at positions 1, 2 and 4, which are irrelevant here
# NOTE(review): the original comment called one of these "fifa_api_id" --
# confirm against the column list printed above.
df_matches = df_matches.drop(columns=df_matches.columns[[1, 2, 4]])
df_matches.info()

In [None]:

# Preview the first rows of df_matches.
df_matches.head()
# print(df_matches.shape)


In [None]:
# Read the Player table into a pandas DataFrame, then drop the
# 'player_fifa_api_id' and 'id' columns.
q_player = "SELECT * FROM PLAYER;"
df_player = pd.read_sql_query(q_player, conn)

print(df_player.shape)  # shape before the two columns are dropped
# df_player = df_player.set_index('player_api_id')
df_player = df_player.drop(['player_fifa_api_id', 'id'], axis=1)
print(df_player.shape)  # shape after the drop
df_player.tail()

In [None]:
# import data from player attribute table

q_player_attr = "SELECT * FROM Player_Attributes;"
df_player_attr = pd.read_sql_query(q_player_attr, conn)

print(df_player_attr.shape)
# Keep only what the rating lookup needs.
df_player_attr = df_player_attr.loc[:, ['player_api_id', 'date', 'overall_rating']]

# Lookup table: player_api_id -> {date string: overall_rating}.
# Zipping the three columns avoids DataFrame.iterrows(), which builds a full
# Series object for every row and is far slower on a table of this kind.
dict_player_attr = defaultdict(dict)
for player_id, rating_date, rating in zip(
    df_player_attr['player_api_id'],
    df_player_attr['date'],
    df_player_attr['overall_rating'],
):
    dict_player_attr[player_id][rating_date] = rating

# Sanity checks: the number of keys should equal the number of distinct players.
print(len(dict_player_attr))
# .get() is used for the spot check because indexing a defaultdict with a
# missing key would silently insert that key.
print(dict_player_attr.get(39902, {}))
print(df_player_attr['player_api_id'].nunique())


In [None]:
def mostRecentRating(dates_dict, given_date_str):
    """Return the rating recorded on the latest date not after ``given_date_str``.

    Parameters
    ----------
    dates_dict : dict
        Maps date strings (anything ``dateutil.parser.parse`` accepts) to
        ratings.
    given_date_str : str
        Reference date, parsed the same way; only its calendar date is used.

    Returns
    -------
    The rating stored for the most recent date on or before the reference
    date, or ``float('nan')`` when ``dates_dict`` has no such entry (the
    original raised ``ValueError`` from ``max()`` on an empty sequence).
    """
    given_date = parser.parse(given_date_str).date()
    latest_date = None
    latest_rating = float("nan")
    for date_str, rating in dates_dict.items():
        rating_date = parser.parse(date_str).date()
        if rating_date > given_date:
            continue
        # ">=" keeps the original tie-break: when two keys parse to the same
        # calendar date, the one that appears later in the dict wins.
        if latest_date is None or rating_date >= latest_date:
            latest_date = rating_date
            latest_rating = rating
    return latest_rating

# Hand-made example to try out mostRecentRating.
# NOTE: '2022-01-15' appears twice in dates_list; dict(zip(...)) keeps only the
# last pairing for a repeated key, so the rating 32 is replaced by 45.
dates_list = ['2022-01-01', '2022-01-05', '2022-01-15', '2022-01-15', '2022-01-20', '2022-01-11']
dates_ratings = [80, 89, 32, 45, 11, 33]

all_dates = dict(zip(dates_list, dates_ratings))
print(all_dates)
# given date
given_date_str = '2022-01-05'
# print(all_dates)
print(mostRecentRating(all_dates, given_date_str))
# print(mostRecentRating(dict_player_attr[39902], '2009-02-21'))


In [None]:
# Quick look at the player frame: its shape once, then the last rows as the
# cell's value so the notebook renders them as a table instead of plain text.
print(df_player.shape)

df_player.tail()


In [None]:
# Load the Team table, keeping only the id and the two name columns.
q_team = "SELECT * FROM Team"
team_columns = ['team_api_id', 'team_long_name', 'team_short_name']
df_team = pd.read_sql_query(q_team, conn)[team_columns]

df_team.tail(10)


In [None]:
# Load the Country table for inspection only; it is not used further because
# it does not add value to the model.
q_country = "SELECT * FROM Country"
df_country = pd.read_sql_query(q_country, conn)
df_country.tail()


In [None]:
# Load the League table for inspection only; it is not used further because
# it does not add value to the model.
q_league = "SELECT * FROM League"
df_league = pd.read_sql_query(q_league, conn)
df_league.tail()

In [None]:
# Load the Team_Attributes table for inspection only; it is not used further
# because it does not add value to the model.
q_team_attr = "SELECT * FROM Team_Attributes"
df_team_attr = pd.read_sql_query(q_team_attr, conn)
df_team_attr.tail()

## Consolidating features from Matches, Players, and other dataframes into a single DF
This is the basis for model building 

In [None]:
# Start the consolidated frame from the match table.
# NOTE: this is a plain assignment, so df_main and df_matches refer to the
# same DataFrame object at this point (no copy is made).
df_main = df_matches
df_main.tail()



## Feb 22: Join player details (name, birthday, height, weight) into df_main


In [None]:
# Shape of df_main before the cells below merge the player details in.
print(df_main.shape)

In [None]:
# NB RUN THIS CODE ONLY ONCE or restart needed
# Setting chained_assignment to None switches off pandas'
# SettingWithCopyWarning for the rest of the session.
pd.options.mode.chained_assignment = None

# Give the generic "id" column a frame-specific name.
df_main = df_main.rename(columns={"id":"id_main"})
# NOTE(review): an earlier cell drops 'id' from df_player; if that cell has
# run, this rename finds no "id" column and leaves df_player unchanged --
# confirm whether it is still needed.
df_player = df_player.rename(columns={"id":"id_player"})


In [None]:

# Attach name / birthday / height / weight of all 22 players to every match.
# NOTE: run once per kernel session -- a second run would merge the player
# columns in again on top of the ones that are already present.
hp = 'home_player_'
ap = 'away_player_'

for side in (hp, ap):
    for i in range(1, 12):
        slot = side + str(i)
        # Give the player-side join key a slot-specific temporary name. Merging
        # on a bare "player_api_id" 22 times makes pandas add _x/_y suffixes
        # that eventually collide (duplicate "player_api_id_x"), which current
        # pandas rejects with a MergeError.
        key = slot + "_player_api_id"
        slot_players = df_player.rename(columns={
            "player_api_id": key,
            "id_player": slot + "_id",
            "player_name": slot + "_name",
            "birthday": slot + "_birthday",
            "height": slot + "_height",
            "weight": slot + "_weight",
        })
        # Inner join (the pandas default, as before): matches whose slot has no
        # matching player row are dropped. The temporary key duplicates the
        # slot column, so it is removed right away -- the original tried to
        # drop the leftovers at the end but discarded the result of drop().
        df_main = df_main.merge(slot_players, left_on=slot, right_on=key).drop(columns=key)

df_main.shape

In [None]:
# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None

## Merging Overall_rating into main DataFrame

In [None]:
# Keep only the matches in which every column has a value.
no_nans = df_main.dropna(axis=0, how="any")
print(no_nans.shape)
no_nans.tail()

## Most recent rating of each player as of the match date

In [None]:
# dropped null values from the dataframe
# df_main_nn is a second name for no_nans; no copy is made here.
df_main_nn = no_nans
# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# # Ratings for home players
# hp = 'home_player_'
# ap = 'away_player_'
# start_time = time.time()
# for i in range(1, 12):
#     hp_n = hp+str(i)
#     ap_n = ap+str(i)
#     hp_n_rating = hp_n+"_rating"
#     ap_n_rating = ap_n+"_rating"
#     for index, row in df_main_nn.iterrows():
#         df_main_nn.at[index, hp_n_rating] = mostRecentRating(dict_player_attr[row[hp_n]], row['date'])
#         df_main_nn.at[index, ap_n_rating] = mostRecentRating(dict_player_attr[row[ap_n]], row['date'])
  
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Elapsed time: {elapsed_time:.4f} seconds")
# # print(df_main_nn.tail())


In [None]:
# import pandas as pd
# from collections import defaultdict
# from datetime import datetime

# Keep only the columns needed for the rating lookup.
df_player_attr = df_player_attr.loc[:, ['player_api_id', 'date', 'overall_rating']]

# Parse the whole date column in one vectorised call instead of calling
# pd.to_datetime once per row inside iterrows(); the resulting keys are the
# same pandas Timestamps.
rating_dates = pd.to_datetime(df_player_attr['date'])

# Lookup table: player_api_id -> {Timestamp: overall_rating}.
# This replaces the string-keyed dict built earlier in the notebook.
dict_player_attr = defaultdict(dict)
for player_id, rating_date, rating in zip(
    df_player_attr['player_api_id'],
    rating_dates,
    df_player_attr['overall_rating'],
):
    dict_player_attr[player_id][rating_date] = rating

def mostRecentRating(dates_dict, given_date):
    """Return the rating recorded on the latest date not after ``given_date``.

    Parameters
    ----------
    dates_dict : dict
        Maps comparable date objects to ratings.
    given_date :
        Reference date, comparable with the keys of ``dates_dict``.

    Returns
    -------
    The rating stored under the greatest key that is ``<= given_date``, or
    ``float('nan')`` when no key qualifies (the original raised ``ValueError``
    from ``max()`` on an empty sequence, e.g. for a player with no rating
    before the match).
    """
    eligible_dates = [rating_date for rating_date in dates_dict if rating_date <= given_date]
    if not eligible_dates:
        return float("nan")
    return dates_dict[max(eligible_dates)]

hp = 'home_player_'
ap = 'away_player_'

def fill_ratings(row):
    """Return a copy of ``row`` with a ``<slot>_rating`` entry for all 22 players.

    For each of the 11 home and 11 away player slots, looks up that player's
    ratings in the module-level ``dict_player_attr`` and stores the value
    ``mostRecentRating`` returns for ``row['date']``.

    Parameters
    ----------
    row : pandas.Series
        One match, as passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    pandas.Series
        A new Series; the input row is left untouched because mutating the
        object handed in by ``apply`` is not supported by pandas.
    """
    filled = row.copy()
    match_date = row['date']
    for i in range(1, 12):
        # Home slot first, then away slot, matching the original column order.
        for side in (hp, ap):
            slot = side + str(i)
            # .get() avoids inserting an empty entry into the defaultdict for
            # a player id that has no ratings at all.
            player_ratings = dict_player_attr.get(row[slot], {})
            filled[slot + "_rating"] = mostRecentRating(player_ratings, match_date)
    return filled

# Convert date strings to pandas datetime objects.
# assign() returns a new frame, so nothing is written into the filtered slice
# that df_main_nn was created from (which is what triggers pandas'
# SettingWithCopyWarning) and re-running the cell is harmless.
df_main_nn = df_main_nn.assign(date=pd.to_datetime(df_main_nn['date']))

# Apply fill_ratings to each row in df_main_nn
df_main_nn = df_main_nn.apply(fill_ratings, axis=1)


In [None]:
# Inspect the last ten rows of df_main_nn.
df_main_nn.tail(10)

In [None]:
# Persist df_main_nn under data/, with today's date in the file name.
curr_date = date.today().isoformat()  # same YYYY-MM-DD text as strftime('%Y-%m-%d')
pickle_file = "".join(['sa-eda-', curr_date, '.pkl'])
pickle_file_path = "".join(['data/', pickle_file])
df_main_nn.to_pickle(pickle_file_path)
print('Saved dataframe into .pkl file')

In [None]:
# Close the SQLite connection opened at the start of the notebook.
conn.close()