In [1]:
import sqlite3
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt 
import seaborn as sns # Import seaborn

from datetime import datetime
from datetime import date
from dateutil import parser

from collections import defaultdict

import warnings
import time


## Acquire data

Our data comes from a database file in SQLite format. We import it into a Pandas DataFrame for preprocessing. 

In [2]:
# open connection 
conn = sqlite3.connect("data/database.sqlite")
cur = conn.cursor()

In [3]:
# function to execute queries
def executeQuery(cur, query):
    print("executing query: ")
    cur.execute(query)
    return cur.fetchall()

In [4]:
# list of all tables
q_all_tables = """SELECT name FROM sqlite_master
    WHERE type='table';"""
all_tables = executeQuery(cur, q_all_tables)


executing query: 


In [5]:
# read the Match table into Pandas DataFrame
q_matches = "SELECT * FROM MATCH;"
df_matches = pd.read_sql_query(q_matches, conn)


### Preprocessing Data
- we are dropping the columns that will not be needed for preliminary analysis

In [6]:
# do this only 1
# drop betting
df_matches = df_matches.drop(df_matches.iloc[:, 85:], axis=1)

#drop statistics
df_matches = df_matches.drop(df_matches.iloc[:, 77:], axis=1)
print(df_matches.info())
# drop X, Y positions
df_matches = df_matches.drop(df_matches.iloc[:, 11:55], axis=1)

# drop fifa_api_id and a couple other columns irrelevant
df_matches = df_matches.drop(df_matches.columns[[1, 2, 4]], axis=1)
print(df_matches.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25979 entries, 0 to 25978
Data columns (total 77 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                25979 non-null  int64  
 1   country_id        25979 non-null  int64  
 2   league_id         25979 non-null  int64  
 3   season            25979 non-null  object 
 4   stage             25979 non-null  int64  
 5   date              25979 non-null  object 
 6   match_api_id      25979 non-null  int64  
 7   home_team_api_id  25979 non-null  int64  
 8   away_team_api_id  25979 non-null  int64  
 9   home_team_goal    25979 non-null  int64  
 10  away_team_goal    25979 non-null  int64  
 11  home_player_X1    24158 non-null  float64
 12  home_player_X2    24158 non-null  float64
 13  home_player_X3    24147 non-null  float64
 14  home_player_X4    24147 non-null  float64
 15  home_player_X5    24147 non-null  float64
 16  home_player_X6    24147 non-null  float6

In [7]:

df_matches.head()
# print(df_matches.shape)


Unnamed: 0,id,season,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
0,1,2008/2009,2008-08-17 00:00:00,492473,9987,9993,1,1,,,...,,,,,,,,,,
1,2,2008/2009,2008-08-16 00:00:00,492474,10000,9994,0,0,,,...,,,,,,,,,,
2,3,2008/2009,2008-08-16 00:00:00,492475,9984,8635,0,3,,,...,,,,,,,,,,
3,4,2008/2009,2008-08-17 00:00:00,492476,9991,9998,5,0,,,...,,,,,,,,,,
4,5,2008/2009,2008-08-16 00:00:00,492477,7947,9985,1,3,,,...,,,,,,,,,,


In [8]:
# Drop player_fifa_api_id from player table
# read the Match table into Pandas DataFrame
q_player = "SELECT * FROM PLAYER;"
df_player = pd.read_sql_query(q_player, conn)

print(df_player.shape)
# df_player = df_player.set_index('player_api_id')
df_player = df_player.drop(['player_fifa_api_id', 'id'], axis=1)
print(df_player.shape)
df_player.tail()

(11060, 7)
(11060, 5)


Unnamed: 0,player_api_id,player_name,birthday,height,weight
11055,26357,Zoumana Camara,1979-04-03 00:00:00,182.88,168
11056,111182,Zsolt Laczko,1986-12-18 00:00:00,182.88,176
11057,36491,Zsolt Low,1979-04-29 00:00:00,180.34,154
11058,35506,Zurab Khizanishvili,1981-10-06 00:00:00,185.42,172
11059,39902,Zvjezdan Misimovic,1982-06-05 00:00:00,180.34,176


In [9]:
# import data from player attribute table

q_player_attr = "SELECT * FROM Player_Attributes;"
df_player_attr = pd.read_sql_query(q_player_attr, conn)

print(df_player_attr.shape)
df_player_attr = df_player_attr.loc[:, [ 'player_api_id', 'date', 'overall_rating']]

dict_player_attr = defaultdict(dict)
for index, row in df_player_attr.iterrows():
    dict_player_attr[row['player_api_id']][row['date']] = row['overall_rating']

print(len(dict_player_attr))
print(dict_player_attr[39902])
print(df_player_attr['player_api_id'].nunique())


(183978, 42)
11060
{'2012-08-31 00:00:00': 78.0, '2012-02-22 00:00:00': 81.0, '2011-08-30 00:00:00': 81.0, '2011-02-22 00:00:00': 81.0, '2010-08-30 00:00:00': 83.0, '2009-08-30 00:00:00': 83.0, '2009-02-22 00:00:00': 78.0, '2008-08-30 00:00:00': 77.0, '2007-08-30 00:00:00': 78.0, '2007-02-22 00:00:00': 80.0}
11060


In [10]:
def mostRecentRating(dates_dict, given_date_str):
    given_date = parser.parse(given_date_str).date()
    dates_dict_dt = {parser.parse(date_str).date(): value for date_str, value in dates_dict.items()}
    smaller_dates = {date: value for date, value in dates_dict_dt.items() if date <= given_date}
    highest_date = max(smaller_dates.keys())
    return dates_dict_dt[highest_date]

dates_list = ['2022-01-01', '2022-01-05', '2022-01-15', '2022-01-15', '2022-01-20', '2022-01-11']
dates_ratings = [80, 89, 32, 45, 11, 33]

all_dates = dict(zip(dates_list, dates_ratings))
# given date
given_date_str = '2022-01-12'
# print(all_dates)
# print(mostRecentRating(all_dates, given_date_str))
# print(mostRecentRating(dict_player_attr[39902], '2009-02-21'))


In [11]:
print(df_player.shape)

print(df_player.tail())
print(df_player.shape)


(11060, 5)
       player_api_id          player_name             birthday  height  weight
11055          26357       Zoumana Camara  1979-04-03 00:00:00  182.88     168
11056         111182         Zsolt Laczko  1986-12-18 00:00:00  182.88     176
11057          36491            Zsolt Low  1979-04-29 00:00:00  180.34     154
11058          35506  Zurab Khizanishvili  1981-10-06 00:00:00  185.42     172
11059          39902   Zvjezdan Misimovic  1982-06-05 00:00:00  180.34     176
(11060, 5)


In [12]:
# import team name
q_team = "SELECT * FROM Team"
df_team = pd.read_sql_query(q_team, conn)
df_team = df_team.loc[:, ['team_api_id', 'team_long_name', 'team_short_name']]

df_team.tail(10)


Unnamed: 0,team_api_id,team_long_name,team_short_name
289,10179,FC Sion,SIO
290,10199,FC Luzern,LUZ
291,9824,FC Vaduz,VAD
292,7955,Neuchâtel Xamax,XAM
293,10243,FC Zürich,ZUR
294,10190,FC St. Gallen,GAL
295,10191,FC Thun,THU
296,9777,Servette FC,SER
297,7730,FC Lausanne-Sports,LAU
298,7896,Lugano,LUG


In [13]:
# import Country but wont use them they dont add value to the model
q_country = "SELECT * FROM Country"
df_country = pd.read_sql_query(q_country, conn)
df_country.tail()


Unnamed: 0,id,name
6,15722,Poland
7,17642,Portugal
8,19694,Scotland
9,21518,Spain
10,24558,Switzerland


In [14]:
# import League but wont use them they dont add value to the model
q_league = "SELECT * FROM League"
df_league = pd.read_sql_query(q_league, conn)
df_league.tail()

Unnamed: 0,id,country_id,name
6,15722,15722,Poland Ekstraklasa
7,17642,17642,Portugal Liga ZON Sagres
8,19694,19694,Scotland Premier League
9,21518,21518,Spain LIGA BBVA
10,24558,24558,Switzerland Super League


In [15]:
# import Team_Attributes but wont use them they dont add value to the model
q_team_attr = "SELECT * FROM Team_Attributes"
df_team_attr = pd.read_sql_query(q_team_attr, conn)
df_team_attr.tail()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
1453,1454,15005,10000,2011-02-22 00:00:00,52,Balanced,,Little,52,Mixed,...,53,Normal,Organised,46,Medium,48,Press,53,Normal,Cover
1454,1455,15005,10000,2012-02-22 00:00:00,54,Balanced,,Little,51,Mixed,...,50,Normal,Organised,44,Medium,55,Press,53,Normal,Cover
1455,1456,15005,10000,2013-09-20 00:00:00,54,Balanced,,Little,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1456,1457,15005,10000,2014-09-19 00:00:00,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
1457,1458,15005,10000,2015-09-10 00:00:00,54,Balanced,42.0,Normal,51,Mixed,...,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover


## Consolidating features from Matches, Players, and other dataframes into a single DF
This is the basis for model building 

In [16]:
df_main = df_matches
df_main.tail()



Unnamed: 0,id,season,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
25974,25975,2015/2016,2015-09-22 00:00:00,1992091,10190,10191,1,0,42231.0,678384.0,...,563066.0,8800.0,67304.0,158253.0,133126.0,186524.0,93223.0,121115.0,232110.0,289732.0
25975,25976,2015/2016,2015-09-23 00:00:00,1992092,9824,10199,1,2,33272.0,41621.0,...,114792.0,150007.0,178119.0,27232.0,570830.0,260708.0,201704.0,36382.0,34082.0,95257.0
25976,25977,2015/2016,2015-09-23 00:00:00,1992093,9956,10179,2,0,157856.0,274779.0,...,67349.0,202663.0,32597.0,114794.0,188114.0,25840.0,482200.0,95230.0,451335.0,275122.0
25977,25978,2015/2016,2015-09-22 00:00:00,1992094,7896,10243,0,0,,8881.0,...,121080.0,197757.0,260964.0,231614.0,113235.0,41116.0,462608.0,42262.0,92252.0,194532.0
25978,25979,2015/2016,2015-09-23 00:00:00,1992095,10192,9931,4,3,274787.0,492132.0,...,95216.0,172768.0,22834.0,458806.0,207234.0,25772.0,40274.0,34035.0,41726.0,527103.0


## Feb 22, Join player name into main_df


In [17]:
print(df_main.shape)

(25979, 30)


In [18]:
# NB RUN THIS CODE ONLY ONCE or restart needed
pd.options.mode.chained_assignment = None

df_main = df_main.rename(columns={"id":"id_main"})
df_player = df_player.rename(columns={"id":"id_player"})


In [19]:

hp = 'home_player_'
for i in range(1, 12):
    hp_n = hp+str(i)
    df_main = df_main.merge(df_player, left_on=hp_n, right_on="player_api_id")
    df_main = df_main.rename(columns={"id_player": hp_n+"_id", "player_name": hp_n+"_name", "birthday": hp_n+"_birthday", "height":hp_n+"_height", "weight":hp_n+"_weight"})
    
ap = 'away_player_'
for i in range(1, 12):
    ap_n = ap+str(i)
    df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
    df_main = df_main.rename(columns={"id_player": ap_n+"_id", "player_name": ap_n+"_name", "birthday": ap_n+"_birthday", "height":ap_n+"_height", "weight":ap_n+"_weight"})
df_main.drop(['player_api_id_x', 'player_api_id_y'], axis=1)
# print(df_main.tail())  

  df_main = df_main.merge(df_player, left_on=hp_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=hp_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=hp_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=hp_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")
  df_main = df_main.merge(df_player, left_on=ap_n, right_on="player_api_id")


Unnamed: 0,id_main,season,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,...,away_player_9_height,away_player_9_weight,away_player_10_name,away_player_10_birthday,away_player_10_height,away_player_10_weight,away_player_11_name,away_player_11_birthday,away_player_11_height,away_player_11_weight
0,565,2010/2011,2010-11-13 00:00:00,838651,8342,9989,2,0,37990.0,36832.0,...,175.26,163,Wesley Sonck,1978-08-09 00:00:00,175.26,168,Mohamed El Gabbas,1987-07-21 00:00:00,177.80,154
1,673,2010/2011,2011-02-19 00:00:00,838822,10000,9989,1,1,37900.0,37886.0,...,187.96,187,Wesley Sonck,1978-08-09 00:00:00,175.26,168,Mohamed El Gabbas,1987-07-21 00:00:00,177.80,154
2,939,2011/2012,2011-08-14 00:00:00,1032714,8203,9989,2,1,39573.0,80184.0,...,177.80,176,Wesley Sonck,1978-08-09 00:00:00,175.26,168,Mohamed El Gabbas,1987-07-21 00:00:00,177.80,154
3,959,2011/2012,2011-08-27 00:00:00,1032726,9991,9989,1,0,37854.0,37440.0,...,177.80,165,Peter Kovacs,1978-02-07 00:00:00,198.12,214,Mohamed El Gabbas,1987-07-21 00:00:00,177.80,154
4,980,2011/2012,2011-09-17 00:00:00,1032747,9984,9989,0,0,36835.0,38342.0,...,177.80,165,Milos Maric,1982-03-05 00:00:00,177.80,176,Mohamed El Gabbas,1987-07-21 00:00:00,177.80,154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21369,25905,2015/2016,2016-04-20 00:00:00,1992191,9931,7896,3,0,156175.0,458806.0,...,172.72,143,Anastasios Donis,1996-08-29 00:00:00,177.80,163,Antonini Culina,1992-01-27 00:00:00,182.88,170
21370,25899,2015/2016,2016-04-09 00:00:00,1992185,10192,7896,7,0,274787.0,492132.0,...,170.18,139,Anastasios Donis,1996-08-29 00:00:00,177.80,163,Antonini Culina,1992-01-27 00:00:00,182.88,170
21371,25871,2015/2016,2016-02-28 00:00:00,1992157,10190,7896,3,3,42231.0,678384.0,...,182.88,170,Djordje Susnjar,1992-02-18 00:00:00,185.42,172,Matteo Tosetti,1992-02-15 00:00:00,177.80,154
21372,25843,2015/2016,2015-11-28 00:00:00,1992134,9824,7896,1,1,33272.0,358156.0,...,182.88,170,Djordje Susnjar,1992-02-18 00:00:00,185.42,172,Matteo Tosetti,1992-02-15 00:00:00,177.80,154


In [20]:
pd.options.display.max_columns = None

## Merging Overall_rating into main DataFrame

In [21]:
no_nans = df_main[~df_main.isnull().any(axis=1)]
print(no_nans.shape)
no_nans.tail()

(21374, 140)


Unnamed: 0,id_main,season,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,player_api_id_x,home_player_1_name,home_player_1_birthday,home_player_1_height,home_player_1_weight,player_api_id_y,home_player_2_name,home_player_2_birthday,home_player_2_height,home_player_2_weight,player_api_id_x.1,home_player_3_name,home_player_3_birthday,home_player_3_height,home_player_3_weight,player_api_id_y.1,home_player_4_name,home_player_4_birthday,home_player_4_height,home_player_4_weight,player_api_id_x.2,home_player_5_name,home_player_5_birthday,home_player_5_height,home_player_5_weight,player_api_id_y.2,home_player_6_name,home_player_6_birthday,home_player_6_height,home_player_6_weight,player_api_id_x.3,home_player_7_name,home_player_7_birthday,home_player_7_height,home_player_7_weight,player_api_id_y.3,home_player_8_name,home_player_8_birthday,home_player_8_height,home_player_8_weight,player_api_id_x.4,home_player_9_name,home_player_9_birthday,home_player_9_height,home_player_9_weight,player_api_id_y.4,home_player_10_name,home_player_10_birthday,home_player_10_height,home_player_10_weight,player_api_id_x.5,home_player_11_name,home_player_11_birthday,home_player_11_height,home_player_11_weight,player_api_id_y.5,away_player_1_name,away_player_1_birthday,away_player_1_height,away_player_1_weight,player_api_id_x.6,away_player_2_name,away_player_2_birthday,away_player_2_height,away_player_2_weight,player_api_id_y.6,away_player_3_name,away_player_3_birthday,away_player_3_height,away_player_3_weight,player_api_id_x.7,away_player_4_name,away_player_4_birthday,away_player_4_height,away_player_4_weight,player_api_id_y.7,away_player_5_name,away_player_5_birthday,away_player_5_height,away_player_5_weight,player_api_id_x.8,away_player_6_name,away_player_6_birthday,away_player_6_height,away_player_6_weight,player_api_id_y.8,away_player_7_name,away_player_7_birthday,away_player_7_height,away_player_7_weight,player_api_id_x.9,away_player_8_name,away_player_8_birthday,away_player_8_height,away_player_8_weight,player_api_id_y.9,away_player_9_name,away_player_9_birthday,away_player_9_height,away_player_9_weight,player_api_id_x.10,away_player_10_name,away_player_10_birthday,away_player_10_height,away_player_10_weight,player_api_id_y.10,away_player_11_name,away_player_11_birthday,away_player_11_height,away_player_11_weight
21369,25905,2015/2016,2016-04-20 00:00:00,1992191,9931,7896,3,0,156175.0,458806.0,22834.0,30492.0,181211.0,438780.0,207234.0,384376.0,34035.0,25794.0,527103.0,330458.0,282287.0,173534.0,41415.0,114212.0,491221.0,393337.0,8893.0,406283.0,614454.0,186948.0,156175,Tomas Vaclik,1989-03-29 00:00:00,187.96,187,458806,Naser Aliji,1993-12-27 00:00:00,177.8,161,22834,Marek Suchy,1988-03-29 00:00:00,182.88,168,30492,Walter Samuel,1978-03-23 00:00:00,182.88,179,181211,Adama Traore,1990-02-03 00:00:00,170.18,161,438780,Alexander Fransson,1994-04-02 00:00:00,180.34,163,207234,Taulant Xhaka,1991-03-28 00:00:00,172.72,159,384376,Renato Steffen,1991-11-03 00:00:00,170.18,150,34035,Matias Emilio Delgado,1982-12-15 00:00:00,182.88,174,25794,Davide Calla,1984-10-06 00:00:00,175.26,174,527103,Breel Embolo,1997-02-14 00:00:00,185.42,185,330458,Mirko Salvi,1994-02-14 00:00:00,187.96,176,282287,Frederic Veseli,1992-11-20 00:00:00,182.88,176,173534,Niko Datkovic,1993-04-21 00:00:00,190.5,190,41415,Orlando Urbano,1984-06-09 00:00:00,182.88,174,114212,Goran Jozinovic,1990-08-27 00:00:00,177.8,163,491221,Mario Piccinocchi,1995-02-21 00:00:00,172.72,148,393337,Jonathan Sabbatini,1988-03-31 00:00:00,175.26,157,8893,Antoine Rey,1986-08-25 00:00:00,167.64,148,406283,Ezgjan Alioski,1992-02-12 00:00:00,172.72,143,614454,Anastasios Donis,1996-08-29 00:00:00,177.8,163,186948,Antonini Culina,1992-01-27 00:00:00,182.88,170
21370,25899,2015/2016,2016-04-09 00:00:00,1992185,10192,7896,7,0,274787.0,492132.0,25815.0,36785.0,94553.0,147959.0,320184.0,119702.0,45174.0,302079.0,37554.0,41975.0,8881.0,173534.0,429986.0,282287.0,491221.0,393337.0,8893.0,178142.0,614454.0,186948.0,274787,Yvon Mvogo,1994-06-06 00:00:00,187.96,185,492132,Florent Hadergjonaj,1994-07-31 00:00:00,182.88,159,25815,Steve von Bergen,1983-06-10 00:00:00,182.88,174,36785,Alain Rochat,1983-02-01 00:00:00,182.88,172,94553,Jan Lecjaks,1990-08-09 00:00:00,185.42,179,147959,Yoric Ravet,1989-09-12 00:00:00,177.8,163,320184,Leonardo Bertone,1994-03-14 00:00:00,177.8,159,119702,Milan Gajic,1986-11-17 00:00:00,182.88,168,45174,Miralem Sulejmani,1988-12-05 00:00:00,177.8,170,302079,Yuya Kubo,1993-12-24 00:00:00,177.8,159,37554,Guillaume Hoarau,1984-03-05 00:00:00,193.04,176,41975,Alex Valentini,1988-04-05 00:00:00,185.42,176,8881,Marco Padalino,1983-12-08 00:00:00,177.8,168,173534,Niko Datkovic,1993-04-21 00:00:00,190.5,190,429986,Matias Malvino,1992-01-20 00:00:00,187.96,174,282287,Frederic Veseli,1992-11-20 00:00:00,182.88,176,491221,Mario Piccinocchi,1995-02-21 00:00:00,172.72,148,393337,Jonathan Sabbatini,1988-03-31 00:00:00,175.26,157,8893,Antoine Rey,1986-08-25 00:00:00,167.64,148,178142,Mattia Bottani,1991-05-24 00:00:00,170.18,139,614454,Anastasios Donis,1996-08-29 00:00:00,177.8,163,186948,Antonini Culina,1992-01-27 00:00:00,182.88,170
21371,25871,2015/2016,2016-02-28 00:00:00,1992157,10190,7896,3,3,42231.0,678384.0,638592.0,413155.0,210423.0,45780.0,566785.0,176298.0,119839.0,35831.0,278917.0,41975.0,282287.0,429986.0,41415.0,114212.0,8893.0,393337.0,340790.0,186948.0,198500.0,140490.0,42231,Daniel Lopar,1985-04-19 00:00:00,187.96,187,678384,Silvan Hefti,1997-10-25 00:00:00,182.88,172,638592,Roy Gelmi,1995-03-01 00:00:00,187.96,172,413155,Martin Angha,1994-01-22 00:00:00,187.96,174,210423,Florent Hanin,1990-02-04 00:00:00,177.8,165,45780,Mario Mutsch,1984-09-03 00:00:00,175.26,163,566785,Gianluca Gaudino,1996-11-11 00:00:00,175.26,143,176298,Marco Aratore,1991-06-04 00:00:00,177.8,134,119839,Danijel Aleksic,1991-04-30 00:00:00,182.88,165,35831,Albert Bunjaku,1983-11-29 00:00:00,177.8,165,278917,Edgar Salli,1992-08-17 00:00:00,162.56,141,41975,Alex Valentini,1988-04-05 00:00:00,185.42,176,282287,Frederic Veseli,1992-11-20 00:00:00,182.88,176,429986,Matias Malvino,1992-01-20 00:00:00,187.96,174,41415,Orlando Urbano,1984-06-09 00:00:00,182.88,174,114212,Goran Jozinovic,1990-08-27 00:00:00,177.8,163,8893,Antoine Rey,1986-08-25 00:00:00,167.64,148,393337,Jonathan Sabbatini,1988-03-31 00:00:00,175.26,157,340790,Domen Crnigoj,1995-11-18 00:00:00,185.42,176,186948,Antonini Culina,1992-01-27 00:00:00,182.88,170,198500,Djordje Susnjar,1992-02-18 00:00:00,185.42,172,140490,Matteo Tosetti,1992-02-15 00:00:00,177.8,154
21372,25843,2015/2016,2015-11-28 00:00:00,1992134,9824,7896,1,1,33272.0,358156.0,41621.0,257845.0,488297.0,114011.0,42237.0,56868.0,32343.0,301167.0,493418.0,41975.0,282287.0,173534.0,41415.0,114212.0,491221.0,393337.0,8893.0,186948.0,198500.0,140490.0,33272,Peter Jehle,1982-01-22 00:00:00,187.96,176,358156,Joel Untersee,1994-02-11 00:00:00,177.8,163,41621,Simone Grippo,1988-12-12 00:00:00,187.96,181,257845,Mario Buehler,1992-02-05 00:00:00,190.5,172,488297,Axel Borgmann,1994-07-08 00:00:00,177.8,165,114011,Stjepan Kukuruzovic,1989-06-07 00:00:00,180.34,168,42237,Philipp Muntwiler,1987-02-25 00:00:00,182.88,176,56868,Diego Ciccone,1987-07-21 00:00:00,172.72,163,32343,Markus Neumayr,1986-03-26 00:00:00,170.18,170,301167,Mauro Caballero,1994-10-08 00:00:00,175.26,143,493418,Albion Avdijaj,1994-01-12 00:00:00,190.5,176,41975,Alex Valentini,1988-04-05 00:00:00,185.42,176,282287,Frederic Veseli,1992-11-20 00:00:00,182.88,176,173534,Niko Datkovic,1993-04-21 00:00:00,190.5,190,41415,Orlando Urbano,1984-06-09 00:00:00,182.88,174,114212,Goran Jozinovic,1990-08-27 00:00:00,177.8,163,491221,Mario Piccinocchi,1995-02-21 00:00:00,172.72,148,393337,Jonathan Sabbatini,1988-03-31 00:00:00,175.26,157,8893,Antoine Rey,1986-08-25 00:00:00,167.64,148,186948,Antonini Culina,1992-01-27 00:00:00,182.88,170,198500,Djordje Susnjar,1992-02-18 00:00:00,185.42,172,140490,Matteo Tosetti,1992-02-15 00:00:00,177.8,154
21373,25934,2015/2016,2016-05-11 00:00:00,1992215,10243,7896,0,4,7621.0,121080.0,34268.0,115700.0,451982.0,198082.0,41116.0,113235.0,540230.0,656668.0,3517.0,330458.0,8881.0,173534.0,41415.0,282287.0,491221.0,393337.0,8893.0,178142.0,406283.0,614454.0,7621,Anthony Favre,1984-02-01 00:00:00,193.04,198,121080,Philippe Koch,1991-02-08 00:00:00,180.34,168,34268,Alain Nef,1982-02-06 00:00:00,190.5,194,115700,Leonardo Sanchez,1986-08-02 00:00:00,190.5,187,451982,Cedric Brunner,1994-02-17 00:00:00,180.34,163,198082,Oliver Buff,1992-08-03 00:00:00,175.26,150,41116,Gilles Yapi,1982-01-30 00:00:00,170.18,139,113235,Burim Kukeli,1984-01-16 00:00:00,180.34,159,540230,Artem Simonyan,1995-02-20 00:00:00,177.8,154,656668,Anto Grgic,1996-11-28 00:00:00,187.96,172,3517,Alexandr Kerzhakov,1982-11-27 00:00:00,175.26,168,330458,Mirko Salvi,1994-02-14 00:00:00,187.96,176,8881,Marco Padalino,1983-12-08 00:00:00,177.8,168,173534,Niko Datkovic,1993-04-21 00:00:00,190.5,190,41415,Orlando Urbano,1984-06-09 00:00:00,182.88,174,282287,Frederic Veseli,1992-11-20 00:00:00,182.88,176,491221,Mario Piccinocchi,1995-02-21 00:00:00,172.72,148,393337,Jonathan Sabbatini,1988-03-31 00:00:00,175.26,157,8893,Antoine Rey,1986-08-25 00:00:00,167.64,148,178142,Mattia Bottani,1991-05-24 00:00:00,170.18,139,406283,Ezgjan Alioski,1992-02-12 00:00:00,172.72,143,614454,Anastasios Donis,1996-08-29 00:00:00,177.8,163


## Most recent rating for each player on match

In [22]:
# dropped null values from the dataframe
df_main_nn = no_nans
pd.options.display.max_columns = None

In [23]:
# Ratings for home players
hp = 'home_player_'
ap = 'away_player_'
start_time = time.time()
for i in range(1, 12):
    hp_n = hp+str(i)
    ap_n = ap+str(i)
    hp_n_rating = hp_n+"_rating"
    ap_n_rating = ap_n+"_rating"
    for index, row in df_main_nn.iterrows():
        df_main_nn.at[index, hp_n_rating] = mostRecentRating(dict_player_attr[row[hp_n]], row['date'])
        df_main_nn.at[index, ap_n_rating] = mostRecentRating(dict_player_attr[row[ap_n]], row['date'])
  
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.4f} seconds")
# print(df_main_nn.tail())


Elapsed time: 624.0829 seconds


In [24]:
curr_date = date.today().strftime('%Y-%m-%d')
pickle_file = 'sa-preprocessing-' + curr_date + '.pkl'
pickle_file_path = 'data/' + pickle_file
df_main_nn.to_pickle(pickle_file_path)
print('Saved dataframe into .pkl file')

Saved dataframe into .pkl file
