In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path
from modules.utils.data_processing import get_match_label

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Create database connection

In [2]:
# Location of the SQLite file, relative to the directory the notebook is run from.
database_path = "datasets/database.sqlite"

# Open the connection that the queries in this notebook read from.
conn = sqlite3.connect(database=database_path)

### Read data from database

In [3]:
# Queries for the three tables this notebook loads.
# Country, League and Team tables don't seem to add any value: for the neural net,
# country_id is equivalent to the country name, and the same holds for
# league_id/league name and team_id/team name.
# TODO Player table may be beneficial due to some features (birthyear, weight, height), might want to add them later
match_data_sql = "select * from Match "
team_attributes_sql = "select * from Team_Attributes"
player_attributes_sql = "select player_api_id, overall_rating from Player_Attributes"

# Run the queries in order (matches, team attributes, player attributes) and
# bind each result to its own DataFrame.
match_data, team_attributes, player_attributes = (
    pd.read_sql(query, conn)
    for query in (match_data_sql, team_attributes_sql, player_attributes_sql)
)

### Create labels

In [4]:
# Call the project helper once per match row, then discard the match_api_id
# column that comes back with its result.
labels = (
    match_data
    .apply(get_match_label, axis="columns")
    .drop(columns="match_api_id")
)

# NOTE(review): assumes exactly one column is left in `labels` at this point;
# confirm against get_match_label.
match_data["label"] = labels

### Clean up some data and create new features

In [5]:
# make season numerical: the text before the first "/" parsed as an integer.
# Uses the vectorised .str accessor instead of a row-wise apply(lambda ...);
# the explicit "int64" keeps the dtype the same on every platform.
match_data['season'] = match_data["season"].str.split("/").str[0].astype("int64")

# create new 'month' field: the second "-"-separated part of `date`, as an integer
match_data['month'] = match_data["date"].str.split("-").str[1].astype("int64")

# TODO create fields which represent win/loss/draw percentage for each team (over the last x games/years)


### Remove unnecessary data

In [6]:
# shouldn't know these columns
# TODO there are probably many more useless columns
columns_to_drop = ["match_api_id", "home_team_goal", "away_team_goal", "goal",
                   "shoton", "shotoff", "foulcommit", "card", "cross", "corner",
                   "possession", "date"]

# home_player_X/Y and away_player_X/Y columns (numbers 1-11), not sure what they represent
player_xy_columns = [
    side + "_player_" + char + str(no)
    for no in range(1, 12)
    for char in ["X", "Y"]
    for side in ["home", "away"]
]

# Drop everything in a single call: DataFrame.drop returns a new frame each
# time, so dropping the 44 player X/Y columns one by one copied the whole
# table 44 times for no benefit.
match_data = match_data.drop(columns=columns_to_drop + player_xy_columns)

# TODO should probably drop rows with (a lot of) null values also (make sure to drop matching values from labels in this case)


### Join tables

In [7]:
# Keep one attribute row per team (drop_duplicates keeps the first occurrence)
# and discard rows without a team id; otherwise the joins below multiply the
# match rows and the dataset grows to 700k rows.
team_attributes = (
    team_attributes
    .dropna(subset=["team_api_id"])
    .drop_duplicates(subset=["team_api_id"])
)

# Two prefixed copies, so the home-side and away-side attribute columns get
# distinct names and the join keys line up with the match table's columns.
team_attributes_home = team_attributes.add_prefix("home_")
team_attributes_away = team_attributes.add_prefix("away_")

# Drop matches missing either team id, then left-join the home attributes
# followed by the away attributes.
match_data = (
    match_data
    .dropna(subset=["home_team_api_id", "away_team_api_id"])
    .merge(team_attributes_home, how="left",
           left_on="home_team_api_id", right_on="home_team_api_id")
    .merge(team_attributes_away, how="left",
           left_on="away_team_api_id", right_on="away_team_api_id")
)


# DISABLED CODE, kept for reference as comments rather than inside a string
# literal: a bare string as the last expression of a cell is echoed as the
# cell's output.
#
# # join match data with home and away players' attributes
# # commented out right now due to memory running out, should try this https://stackoverflow.com/questions/47386405/memoryerror-when-i-merge-two-pandas-data-frames
# match_data = match_data.dropna(subset=['home_player_1', 'home_player_2'])
#
# for i in range(1,3):
#
#     print(str(i) + "_1")
#     home_player_attributes_i = player_attributes.add_prefix("home_p_" + str(i) + "_")
#     print(str(i) + "_2")
#     home_player_attributes_i = home_player_attributes_i.dropna()
#     print(str(i) + "_3")
#     #away_player_attributes_i = player_attributes.add_prefix("away_p_" + str(i) + "_")
#     match_data = pd.merge(match_data, home_player_attributes_i, left_on="home_player_" + str(i), right_on="home_p_" + str(i) + "_player_api_id")
#     #match_data = pd.merge(match_data, away_player_attributes_i, left_on="away_player_" + str(i), right_on="away_p_" + str(i) + "_player_api_id")
#
#     # delete temp dataframe to conserve memory
#     del(home_player_attributes_i)

'\n# join match data with home and away players\' attributes\n# commented out right now due to memory running out, should try this https://stackoverflow.com/questions/47386405/memoryerror-when-i-merge-two-pandas-data-frames\nmatch_data = match_data.dropna(subset=[\'home_player_1\', \'home_player_2\'])\n\nfor i in range(1,3):\n    \n    print(str(i) + "_1")\n    home_player_attributes_i = player_attributes.add_prefix("home_p_" + str(i) + "_")\n    print(str(i) + "_2")\n    home_player_attributes_i = home_player_attributes_i.dropna()\n    print(str(i) + "_3")\n    #away_player_attributes_i = player_attributes.add_prefix("away_p_" + str(i) + "_")\n    match_data = pd.merge(match_data, home_player_attributes_i, left_on="home_player_" + str(i), right_on="home_p_" + str(i) + "_player_api_id")\n    #match_data = pd.merge(match_data, away_player_attributes_i, left_on="away_player_" + str(i), right_on="away_p_" + str(i) + "_player_api_id")\n    \n    # delete temp dataframe to conserve memory\n

### Select the columns to be used as features

In [8]:
# The feature set is assembled from its parts, in the order the columns should
# appear in the exported table.

# Match-level fields.
context_columns = ['season', 'month', 'stage']

# home_player_1..11 followed by away_player_1..11.
player_columns = [
    side + '_player_' + str(no)
    for side in ('home', 'away')
    for no in range(1, 12)
]

# Team attribute names; each one is taken once with the home_ prefix and once
# with the away_ prefix (all home columns first).
team_attribute_names = [
    'buildUpPlaySpeed',
    'buildUpPlayDribbling',
    'buildUpPlayPassing',
    'chanceCreationPassing',
    'chanceCreationCrossing',
    'chanceCreationShooting',
    'defencePressure',
    'defenceAggression',
    'defenceTeamWidth',
]
team_columns = [
    side + '_' + attribute
    for side in ('home', 'away')
    for attribute in team_attribute_names
]

# B365* / BW* columns from the Match table.
# NOTE(review): presumably bookmaker odds for home win / draw / away win - confirm.
odds_columns = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA']

feature_columns = context_columns + player_columns + team_columns + odds_columns + ['label']

match_data = match_data[feature_columns]

match_data.head()

Unnamed: 0,season,month,stage,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,...,away_defencePressure,away_defenceAggression,away_defenceTeamWidth,B365H,B365D,B365A,BWH,BWD,BWA,label
0,2008,8,1,,,,,,,,...,70.0,70.0,70.0,1.73,3.4,5.0,1.75,3.35,4.2,1.0
1,2008,8,1,,,,,,,,...,65.0,65.0,70.0,1.95,3.2,3.6,1.8,3.3,3.95,1.0
2,2008,8,1,,,,,,,,...,70.0,50.0,70.0,2.38,3.3,2.75,2.4,3.3,2.55,0.0
3,2008,8,1,,,,,,,,...,46.0,45.0,47.0,1.44,3.75,7.5,1.4,4.0,6.8,2.0
4,2008,8,1,,,,,,,,...,70.0,70.0,65.0,5.0,3.5,1.65,5.0,3.5,1.6,0.0


### Write data to csv

In [9]:
# Export the feature table. index=True is pandas' default, spelled out here:
# the row index is written as the first, unnamed column of the file.
match_data.to_csv(path_or_buf="datasets/data.csv", index=True)

In [10]:
# Display (rows, columns) of the final table as a quick sanity check.
match_data.shape

(25979, 50)