In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path
from modules.random_forest.helpers import get_match_label

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

### Create database connection

In [2]:
# Location of the SQLite dataset, relative to the notebook's working directory.
database_path = "datasets/database.sqlite"

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would only surface later as a confusing "no such table" error
# (and leave a stray file behind). Fail here with a clear message instead.
if not Path(database_path).is_file():
    raise FileNotFoundError("SQLite database not found: " + database_path)

# Connection used by the queries that load the tables into DataFrames.
conn = sqlite3.connect(database_path)

### Read data from database

In [3]:
# Country, League and Team tables are not loaded: they don't seem to add any value,
# since for the neural net country_id is equivalent to the country name, and likewise
# league_id/league name and team_id/team name.
# TODO Player table may be beneficial due to some features (birthyear, weight, height), might want to add them later

# All columns of the Match table.
match_data_sql = "select * from Match "
match_data = pd.read_sql(match_data_sql, conn)

# All columns of the Team_Attributes table.
team_attributes_sql = "select * from Team_Attributes"
team_attributes = pd.read_sql(team_attributes_sql, conn)

# Only the player id and overall rating from Player_Attributes.
player_attributes_sql = "select player_api_id, overall_rating from Player_Attributes"
player_attributes = pd.read_sql(player_attributes_sql, conn)

### Create labels

In [4]:
# Pass every Match row to get_match_label, then discard the match_api_id column
# from the result so that only the label column(s) remain.
# NOTE(review): labels follow the row order of match_data as loaded; the merges
# performed later in this notebook build a new frame, so confirm the rows still
# correspond before pairing labels with the joined data.
labels = match_data.apply(get_match_label, axis=1).drop("match_api_id", axis="columns")

### Clean up some data and create new features

In [5]:
def _season_start(season):
    """Return the part of a season string before the first "/" as an int."""
    return int(season.split("/")[0])


def _month_number(date):
    """Return the second "-"-separated field of a date string as an int."""
    return int(date.split("-")[1])


# Replace the textual season with a number (the part before the "/").
match_data["season"] = match_data["season"].map(_season_start)

# Add a numeric 'month' column taken from the date string.
match_data["month"] = match_data["date"].map(_month_number)

# TODO create fields which represent win/loss/draw percentage for each team (over the last x games/years)


### Remove unnecessary data

In [6]:
# Columns the model shouldn't know about.
# TODO there are probably many more useless columns
columns_to_drop = ["match_api_id", "home_team_goal", "away_team_goal", "goal",
                   "shoton", "shotoff", "foulcommit", "card", "cross", "corner",
                   "possession", "date"]

# home_player_X1..X11 / home_player_Y1..Y11 and the away_ equivalents are dropped
# too; not sure what they represent.
# NOTE(review): presumably per-player pitch coordinates -- confirm against the dataset docs.
player_position_columns = [
    side + "_player_" + axis + str(slot)
    for slot in range(1, 12)
    for axis in ("X", "Y")
    for side in ("home", "away")
]

# Drop everything in a single call: every DataFrame.drop returns a new frame, so
# dropping the 44 position columns one at a time copied the whole table 44 times.
# A single call also leaves match_data untouched if any column is missing.
match_data = match_data.drop(columns_to_drop + player_position_columns, axis="columns")

# TODO should probably drop rows with (a lot of) null values also (make sure to drop matching values from labels in this case)


### Join tables

In [7]:
# create 2 versions with home/away prefixes for joining purposes
team_attributes_home = team_attributes.add_prefix("home_")
team_attributes_away = team_attributes.add_prefix("away_")

# join match data with home and away team attributes (pd.merge defaults to an inner join)
# NOTE(review): if Team_Attributes holds more than one row per team_api_id, each match is
# repeated once per matching attribute row, and matches without attributes are dropped --
# TODO confirm the resulting row count and how it lines up with `labels`.
match_data = pd.merge(match_data, team_attributes_home, on="home_team_api_id")
match_data = pd.merge(match_data, team_attributes_away, on="away_team_api_id")

# join match data with home and away players' attributes
# commented out right now due to memory running out
# (kept as comments rather than a bare string literal, which would be echoed as the cell's output)
#
# for i in range(1,12):
#     home_player_attributes_i = player_attributes.add_prefix("home_p_" + str(i) + "_")
#     away_player_attributes_i = player_attributes.add_prefix("away_p_" + str(i) + "_")
#     match_data = pd.merge(match_data, home_player_attributes_i, left_on="home_player_" + str(i), right_on="home_p_" + str(i) + "_player_api_id")
#     match_data = pd.merge(match_data, away_player_attributes_i, left_on="away_player_" + str(i), right_on="away_p_" + str(i) + "_player_api_id")

### Write data to csv

In [9]:
# Persist the prepared match frame for later use.
# to_csv is called with its default settings, so the frame's index is written
# as the first (unnamed) column of the file.
output_csv = "datasets/data.csv"
match_data.to_csv(output_csv)