# DATA INTEGRATION
A notebook to integrate multiple data sources

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair: comparing the raw version strings is
# lexicographic, so e.g. "0.9" would wrongly count as newer than "0.20".
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# Common imports
import copy
import pandas as pd
import numpy as np
import os
# os.path.abspath('') resolves to the current working directory, so the project
# root is taken to be the parent of the directory the notebook is run from.
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [2]:
# Figures are written to <working directory>/images/<CHAPTER_ID>/
NOTE_ROOT_DIR = os.path.abspath('')
CHAPTER_ID = "03_data_integrating"
IMAGES_PATH = os.path.join(NOTE_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

## **1 - Load the Data**

### 1.1 - EPL Match

In [32]:
# Load EPL match data
df_match = pd.read_csv(os.path.join(PRJ_ROOT_DIR, "data", "tabular", "exploratory", "matches.csv"))
# Exclude season 2022/23, then renumber the rows 0..n-1. Later cells combine this
# frame by row label with merge results, which always carry a fresh RangeIndex,
# so leftover gaps in the index would misalign the rows.
df_match = df_match[df_match["season"] != "2022/23"].reset_index(drop=True)
df_match.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,,101537.0,57531.0,86934.0,54102.0,...,20664.0,,,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,,,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,,116643.0,41733.0,56979.0,,...,71738.0,90440.0,78607.0,57127.0,,,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,,,Manchester United,Chelsea,win,2019/20


In [57]:
# Player slot columns: every column except the last four
# (home/name, away/name, home_result, season)
all_pos = df_match.columns[:-4].tolist()
all_pos

['home/gk_0',
 'home/df_0',
 'home/df_1',
 'home/df_2',
 'home/df_3',
 'home/df_4',
 'home/mf_0',
 'home/mf_1',
 'home/mf_2',
 'home/mf_3',
 'home/mf_4',
 'home/fw_0',
 'home/fw_1',
 'home/fw_2',
 'away/gk_0',
 'away/df_0',
 'away/df_1',
 'away/df_2',
 'away/df_3',
 'away/mf_0',
 'away/mf_1',
 'away/mf_2',
 'away/mf_3',
 'away/mf_4',
 'away/fw_0',
 'away/fw_1',
 'away/fw_2']

In [13]:
# Position groups: one "<side>/<role>" prefix per side and role
position = [f"{side}/{role}"
            for side in ("home", "away")
            for role in ("gk", "df", "mf", "fw")]

In [33]:
# Unused slots are NaN in the raw data; mark them with the ghost player id (-1)
df_match = df_match.fillna(-1)
has_no_fifth_defender = df_match["home/df_4"] == -1
df_match[has_no_fifth_defender].head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,-1.0,101537.0,57531.0,86934.0,54102.0,...,20664.0,-1.0,-1.0,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,-1.0,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,71738.0,90440.0,78607.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,-1.0,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,-1.0,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,-1.0,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


### 1.2 - EPL Player

In [34]:
# Read the per-season player attribute table
players_csv = os.path.join(PRJ_ROOT_DIR, "data", "tabular", "exploratory", "players.csv")
df_player = pd.read_csv(players_csv)
df_player.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,id,name,season
0,0.413188,-2.238841,1.684397,-1.461445,-0.034132,-0.713697,-0.078653,-0.174808,0.151001,-0.5077,...,0.715804,0.912633,0.192937,0.762094,0.079435,1.502132,0.326795,54861.0,Christian Benteke,2018/19
1,-2.514528,-0.501798,-0.12844,0.198128,0.566048,-0.237443,-0.000778,0.018578,-0.020019,0.09228,...,-0.125432,0.04453,0.033725,-0.094612,0.014176,-0.032584,0.023206,213405.0,Filip Benkovic,2018/19
2,-2.514528,-0.501798,-0.12844,0.198128,0.566048,-0.237443,-0.000778,0.018578,-0.020019,0.09228,...,-0.125432,0.04453,0.033725,-0.094612,0.014176,-0.032584,0.023206,449926.0,Adrián Bernabé,2018/19
3,-2.514528,-0.501798,-0.12844,0.198128,0.566048,-0.237443,-0.000778,0.018578,-0.020019,0.09228,...,-0.125432,0.04453,0.033725,-0.094612,0.014176,-0.032584,0.023206,182436.0,Ben Woodburn,2018/19
4,1.384457,-1.455737,0.75161,0.381852,0.055696,-0.925664,-0.105172,-0.105222,0.117333,-0.13313,...,1.641439,1.091559,-0.876608,-0.08232,-0.063718,-0.059692,-0.61471,100649.0,Bernard,2018/19


## 2 - Integrate the Data

In [35]:
# Work on an independent copy so the integration steps never touch df_match
df_integrate = df_match.copy(deep=True)
df_integrate.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,-1.0,101537.0,57531.0,86934.0,54102.0,...,20664.0,-1.0,-1.0,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,-1.0,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,71738.0,90440.0,78607.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,-1.0,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,-1.0,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,-1.0,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


In [36]:
# Integrate critical performance index to corresponding player slots in match data
# For every position group, each slot's player id is replaced by one "critical"
# attribute looked up in df_player on (player id, season).
# players_to_performance maps a role substring to the df_player column used as its score.
players_to_performance = {"gk": "12", "df": "8", "mf": "6", "fw": "22"} # critical attributes
score_players_df = pd.DataFrame()
for pos in position:  #home/gk
    score_df = df_integrate.copy()
    # Keep only this group's slot columns plus the season, which is needed as a merge key
    score_df = score_df[[col for col in score_df.columns if pos in col]+["season"]]
    # Substring match of the role against pos selects the attribute column for this group
    score_col_idx = [col_id for (posit, col_id) in players_to_performance.items() if posit in pos]
    position_players_score = df_player[['id', 'season'] + score_col_idx]
    count=0  # number of slots in all_pos that belong to this group
    for i in range(len(all_pos)):
      if pos in all_pos[i]:
        count += 1
        # Each merge appends an 'id' column and a score column; the suffixes only
        # take effect when those names collide with columns left by an earlier merge
        score_df = score_df.merge(position_players_score, how='left', left_on=[all_pos[i], 'season'], right_on=['id', 'season'], suffixes=(f"_{i-1}", f"_{i}"))
    multiplier = 1
    if "fw" in pos:
      # Forward scores are negated.
      # NOTE(review): presumably a lower raw value of attribute "22" is better — confirm
      multiplier = -1
    # Column layout is now [slots..., season, id, score, id, score, ...], so every
    # second column starting at count+2 is a score; overwrite the slot ids with them
    score_df[score_df.columns[:count]] = multiplier * score_df[score_df.columns[count+2::2]]
    # Drop the merge helper columns, keeping only the slots and season
    score_df.drop(columns=score_df.columns[count+1:], inplace=True)
    score_players_df = pd.concat([score_players_df, score_df], axis=1)

# Every group contributed its own 'season' column; drop them all, then re-attach a
# single copy together with the team names and the match result
score_players_df.drop(columns='season', inplace=True)
score_players_df = pd.concat([score_players_df, df_integrate[["home/name", "away/name", "season", "home_result"]]], axis=1)
columns_titles = ["home/name", "away/name",
                  'home/gk_0', 'home/df_0', 'home/df_1', 'home/df_2', 'home/df_3',
                  'home/df_4', 'home/mf_0', 'home/mf_1', 'home/mf_2', 'home/mf_3',
                  'home/mf_4', 'home/fw_0', 'home/fw_1', 'home/fw_2', 'away/gk_0',
                  'away/df_0', 'away/df_1', 'away/df_2', 'away/df_3', 'away/mf_0',
                  'away/mf_1', 'away/mf_2', 'away/mf_3', 'away/mf_4', 'away/fw_0',
                  'away/fw_1', 'away/fw_2',
                  "home_result", "season"]
score_players_df = score_players_df.reindex(columns=columns_titles)
# Slots with no matching player record (e.g. the ghost id -1) have NaN scores after
# the left merges; give them a very low sentinel value
score_players_df = score_players_df.fillna(-1e10)
print(score_players_df.columns)
score_players_df.head()

Index(['home/name', 'away/name', 'home/gk_0', 'home/df_0', 'home/df_1',
       'home/df_2', 'home/df_3', 'home/df_4', 'home/mf_0', 'home/mf_1',
       'home/mf_2', 'home/mf_3', 'home/mf_4', 'home/fw_0', 'home/fw_1',
       'home/fw_2', 'away/gk_0', 'away/df_0', 'away/df_1', 'away/df_2',
       'away/df_3', 'away/mf_0', 'away/mf_1', 'away/mf_2', 'away/mf_3',
       'away/mf_4', 'away/fw_0', 'away/fw_1', 'away/fw_2', 'home_result',
       'season'],
      dtype='object')


Unnamed: 0,home/name,away/name,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,...,away/mf_0,away/mf_1,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home_result,season
0,West Ham United,Manchester City,3.210518,-0.465272,-0.967801,-2.77804,-0.103269,-10000000000.0,-0.17674,-0.124015,...,-0.206532,0.294083,0.618261,-10000000000.0,-10000000000.0,4.382388,1.037659,1.199605,lose,2019/20
1,Tottenham Hotspur,Aston Villa,3.335733,-0.760264,3.798503,-6.133436,-0.224305,-10000000000.0,-0.050453,-0.094294,...,-0.272153,0.010975,0.244853,-0.01426199,-0.08596517,0.467432,-10000000000.0,-10000000000.0,win,2019/20
2,Liverpool,Norwich City,-3.634743,-0.827935,-0.685514,-0.497304,-2.583876,-10000000000.0,-0.101755,-0.124557,...,0.149595,0.612742,-0.050682,-0.05327198,0.1818292,0.909154,-10000000000.0,-10000000000.0,win,2019/20
3,Burnley,Southampton,-4.891314,-0.608896,-2.97037,-1.385166,-0.303632,-10000000000.0,-0.15593,-0.88399,...,-0.046572,-0.121608,0.34666,0.03914802,-10000000000.0,4.28967,0.2996927,0.2529282,win,2019/20
4,Manchester United,Chelsea,-2.383543,3.500862,-1.310595,-0.396454,-0.414364,-10000000000.0,0.334281,0.084951,...,0.085926,-0.158398,-0.054101,-0.02514437,0.04228522,2.056236,-10000000000.0,-10000000000.0,win,2019/20


In [37]:
# Transpose the merged critical performance data: one column per match,
# one row per slot / metadata field
df_one_perf = score_players_df.transpose().copy()
df_one_perf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139
home/name,West Ham United,Tottenham Hotspur,Liverpool,Burnley,Manchester United,Leicester City,Bournemouth,Watford,Crystal Palace,Newcastle United,...,Burnley,Southampton,Arsenal,Crystal Palace,Wolverhampton Wanderers,West Ham United,Southampton,Manchester United,Liverpool,Newcastle United
away/name,Manchester City,Aston Villa,Norwich City,Southampton,Chelsea,Wolverhampton Wanderers,Sheffield United,Brighton and Hove Albion,Everton,Arsenal,...,Watford,Manchester City,Burnley,Liverpool,Southampton,Norwich City,Brentford,Wolverhampton Wanderers,Leeds United,Manchester City
home/gk_0,3.210518,3.335733,-3.634743,-4.891314,-2.383543,8.04015,2.103014,1.514494,-2.29334,-3.657916,...,-5.023994,-2.676367,-6.289262,-3.143894,-5.454997,13.323114,-2.676367,8.25548,-4.963912,-1.940451
home/df_0,-0.465272,-0.760264,-0.827935,-0.608896,3.500862,-0.644684,-1.345505,-2.232955,-0.743225,4.268416,...,3.693925,-0.063886,-0.453974,-2.502414,-4.224894,-0.540147,-0.510987,-0.145968,-3.264968,0.937648
home/df_1,-0.967801,3.798503,-0.685514,-2.97037,-1.310595,-5.243415,-0.724114,-0.139738,4.303382,-0.799511,...,2.298476,0.005619,1.737085,-2.320418,-1.099064,5.387269,4.759809,-0.674383,0.48432,8.985858


In [38]:
# Reorder the player ids inside every position group of every match so that the
# slots run from the highest to the lowest critical attribute (for merge ordering)
df_match_tranposed = df_match.T.copy() # [index: home/fw_0, column: 0, 1, 2]
for col in df_one_perf.columns:   # one column per match
    for pos in position:  # home/gk, home/fw
        pos_order = [row for row in df_match_tranposed.index if pos in row]
        perf_rows = [row for row in df_one_perf.index if pos in row]
        # The key ignores the ids themselves and returns the matching scores instead;
        # this relies on both frames listing the slots of a group in the same order.
        sorted_pos = df_match_tranposed.loc[pos_order, col].sort_values(
            ascending=False, ignore_index=False,
            key=lambda x: df_one_perf.loc[perf_rows, col].astype(float))
        # Relabel the sorted ids with the slot names in their canonical order
        re_cols = dict(zip(sorted_pos.index, pos_order))
        # Single .loc call: the chained form df[col].loc[rows] = ... may write to a
        # temporary object and leave the frame unchanged (always under copy-on-write).
        df_match_tranposed.loc[pos_order, col] = sorted_pos.rename(index=re_cols)
df_match_tranposed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139
home/gk_0,37096.0,37915.0,116535.0,98747.0,51940.0,17745.0,225321.0,9089.0,40836.0,67089.0,...,98747.0,40383.0,225321.0,40836.0,149065.0,37096.0,40383.0,51940.0,116535.0,67089.0
home/df_0,81012.0,55605.0,122798.0,39487.0,184667.0,37642.0,126184.0,40868.0,74230.0,106618.0,...,17761.0,450527.0,198869.0,174874.0,510362.0,164555.0,450527.0,214590.0,122798.0,101148.0
home/df_1,55459.0,158534.0,97032.0,68983.0,106760.0,172850.0,56917.0,41338.0,19188.0,119471.0,...,51927.0,171771.0,192895.0,209036.0,214048.0,219924.0,171771.0,76359.0,60914.0,58845.0
home/df_2,166640.0,38290.0,171287.0,51927.0,214590.0,218031.0,223911.0,54484.0,58786.0,101148.0,...,39487.0,244560.0,156074.0,244723.0,94147.0,55459.0,212721.0,106760.0,97032.0,114243.0
home/df_3,219924.0,173904.0,169187.0,17761.0,95658.0,111931.0,-1.0,60232.0,55494.0,-1.0,...,192290.0,158534.0,226597.0,55494.0,-1.0,60232.0,-1.0,90152.0,169187.0,56983.0


In [39]:
# Flip the reordered data back to one row per match
ordered_epl = df_match_tranposed.transpose().copy()
ordered_epl.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,81012.0,55459.0,166640.0,219924.0,-1.0,54102.0,86934.0,57531.0,101537.0,...,220566.0,-1.0,-1.0,103955.0,103025.0,205651.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,55605.0,158534.0,38290.0,173904.0,-1.0,157668.0,62974.0,231372.0,45268.0,...,148508.0,193488.0,122806.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,122798.0,97032.0,171287.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,193111.0,71738.0,90440.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,39487.0,68983.0,51927.0,17761.0,-1.0,40145.0,433154.0,60551.0,60586.0,...,40146.0,78056.0,-1.0,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,106760.0,214590.0,95658.0,-1.0,74208.0,195851.0,176297.0,109322.0,...,49579.0,88894.0,91651.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


In [40]:
# Add empty slots as players: one ghost record (id -1) per season whose
# attributes are all set to -100
all_pos_home = ['home/name', 'away/name', 'home/gk_0', 'home/df_0', 'home/df_1',
                'home/df_2', 'home/df_3', 'home/df_4', 'home/mf_0', 'home/mf_1',
                'home/mf_2', 'home/mf_3', 'home/mf_4', 'home/fw_0', 'home/fw_1',
                'home/fw_2']

all_season = df_match['season'].unique().tolist()
wno_players = df_player.drop(columns='name')
feature_cols = wno_players.columns[:-2]  # everything except the trailing 'id' and 'season'
null_records = [
    {**{feat: -100. for feat in feature_cols}, 'id': -1, 'season': season}
    for season in all_season
]
if null_records:
    # DataFrame.append was removed in pandas 2.0; build all ghost rows first and
    # concatenate once instead of growing the frame row by row.
    wno_players = pd.concat(
        [wno_players, pd.DataFrame(null_records, columns=wno_players.columns)],
        ignore_index=True,
    )

wno_players['id'] = wno_players['id'].astype(int)
wno_players.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,id,season
3937,-2.514528,-0.501798,-0.12844,0.198128,0.566048,-0.237443,-0.000778,0.018578,-0.020019,0.09228,...,0.084486,-0.125432,0.04453,0.033725,-0.094612,0.014176,-0.032584,0.023206,226956,2020/21
3938,1.318987,-3.134735,2.683336,-0.809731,-0.788048,0.326477,-0.025115,-0.42574,0.140882,-1.0107,...,-0.682221,0.248108,-0.126991,-0.606964,-0.868551,0.407927,-1.263675,-0.501533,220688,2021/22
3939,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1,2019/20
3940,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1,2020/21
3941,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1,2021/22


In [52]:
# Show the player slot columns. Read them from ordered_epl because copy_ordered_epl
# is only created in a later cell; the leading slot columns are the same in both.
ordered_epl.columns[:len(all_pos)]

Index(['home/gk_0', 'home/df_0', 'home/df_1', 'home/df_2', 'home/df_3',
       'home/df_4', 'home/mf_0', 'home/mf_1', 'home/mf_2', 'home/mf_3',
       'home/mf_4', 'home/fw_0', 'home/fw_1', 'home/fw_2', 'away/gk_0',
       'away/df_0', 'away/df_1', 'away/df_2', 'away/df_3', 'away/mf_0',
       'away/mf_1', 'away/mf_2', 'away/mf_3', 'away/mf_4', 'away/fw_0',
       'away/fw_1', 'away/fw_2'],
      dtype='object')

In [66]:
# Attach the full attribute vector of the player in every slot.
# The player columns are renamed to "<column>_<slot index>" before each merge so
# that the result does not depend on pandas' suffix handling for colliding names
# or on how many slots there are.
player_cols = [col for col in wno_players.columns if col != 'season']
copy_ordered_epl = ordered_epl.copy()
for i, slot in enumerate(all_pos):
    slot_players = wno_players.rename(columns={col: f"{col}_{i}" for col in player_cols})
    copy_ordered_epl = copy_ordered_epl.merge(slot_players, how='left',
                                              left_on=[slot, 'season'],
                                              right_on=[f"id_{i}", 'season'])
copy_ordered_epl.columns

Index(['home/gk_0', 'home/df_0', 'home/df_1', 'home/df_2', 'home/df_3',
       'home/df_4', 'home/mf_0', 'home/mf_1', 'home/mf_2', 'home/mf_3',
       ...
       '15_26', '16_26', '17_26', '18_26', '19_26', '20_26', '21_26', '22_26',
       '23_26', 'id_26'],
      dtype='object', length=706)

In [67]:
# Check whether any match row still contains a missing value after the merges
nulls_per_row = copy_ordered_epl.isnull().sum(axis=1)
# Positional indices of the rows with at least one missing value
null_col = [row_pos for row_pos, n_missing in enumerate(nulls_per_row) if n_missing > 0]
count_null = len(null_col)
count_null

0

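**NOTE:** Column `0_0` holds performance index $0$ of the player in slot $0$; in general, column `k_i` is the $k^{th}$ performance index of the player in the $i^{th}$ slot of `all_pos`.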

In [68]:
# Prepare data for machine learning: remove the first 29 columns (player slots and
# team names) and every column whose name contains 'id'
leading_cols = copy_ordered_epl.columns[:29]
without_leading = copy_ordered_epl.drop(columns=leading_cols)
id_like_cols = [name for name in without_leading.columns if 'id' in name]
dropped_copy_ordered_epl = without_leading.drop(columns=id_like_cols)
dropped_copy_ordered_epl.columns[:40]

Index(['home_result', 'season', '0_0', '1_0', '2_0', '3_0', '4_0', '5_0',
       '6_0', '7_0', '8_0', '9_0', '10_0', '11_0', '12_0', '13_0', '14_0',
       '15_0', '16_0', '17_0', '18_0', '19_0', '20_0', '21_0', '22_0', '23_0',
       '0_1', '1_1', '2_1', '3_1', '4_1', '5_1', '6_1', '7_1', '8_1', '9_1',
       '10_1', '11_1', '12_1', '13_1'],
      dtype='object')

In [69]:
# Number of match rows in the prepared data
dropped_copy_ordered_epl.shape[0]

1140

## 3 - Save the Work

In [73]:
# Helper to write a dataframe to disk as CSV
def save_df(df, path, extension="csv", index=False):
    """Write ``df`` to ``path`` as a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str or os.PathLike or file-like
        Destination passed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created first.
    extension : str
        Currently unused; kept so existing callers keep working.
    index : bool
        Whether to write the row index.
    """
    # Only filesystem paths have a parent directory; buffers are passed through as-is
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(path, index=index)

In [70]:
# The final dataset is an independent copy of the prepared frame
df_final_integrate = dropped_copy_ordered_epl.copy(deep=True)
df_final_integrate.head()

Unnamed: 0,home_result,season,0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,...,14_26,15_26,16_26,17_26,18_26,19_26,20_26,21_26,22_26,23_26
0,lose,2019/20,-0.833564,11.894702,9.797198,0.690392,-0.399289,-0.196324,-0.271997,1.064701,...,1.037913,0.891078,0.420644,0.077849,-0.069981,-0.705365,0.952756,1.854779,-1.037659,-0.282706
1,win,2019/20,-1.341095,8.727017,7.268857,0.476498,-0.217216,-0.160226,-0.007719,0.360472,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
2,win,2019/20,-0.645374,10.338912,8.351297,0.264416,0.377598,-0.239645,-0.172684,2.751132,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
3,win,2019/20,-0.455534,15.332285,12.749199,0.857845,-0.496018,-0.299477,-0.234882,0.315938,...,-0.111448,0.046474,0.630581,-1.904343,-0.748645,0.929946,1.764384,0.233507,-0.252928,1.42081
4,win,2019/20,-0.728257,12.33597,9.748855,0.095955,-0.098054,-0.104447,-1.154838,0.70488,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0


In [74]:
# Write the final dataset to <project root>/data/tabular/integrate/matches.csv
integrated_matches_csv = os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "matches.csv")
save_df(df_final_integrate, integrated_matches_csv)

In [72]:
# Keep a copy of the merged data that still includes the player id columns
df_match_with_player_id = copy_ordered_epl.copy(deep=True)
df_match_with_player_id.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,15_26,16_26,17_26,18_26,19_26,20_26,21_26,22_26,23_26,id_26
0,37096.0,81012.0,55459.0,166640.0,219924.0,-1.0,54102.0,86934.0,57531.0,101537.0,...,0.891078,0.420644,0.077849,-0.069981,-0.705365,0.952756,1.854779,-1.037659,-0.282706,205651
1,37915.0,55605.0,158534.0,38290.0,173904.0,-1.0,157668.0,62974.0,231372.0,45268.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1
2,116535.0,122798.0,97032.0,171287.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1
3,98747.0,39487.0,68983.0,51927.0,17761.0,-1.0,40145.0,433154.0,60551.0,60586.0,...,0.046474,0.630581,-1.904343,-0.748645,0.929946,1.764384,0.233507,-0.252928,1.42081,83283
4,51940.0,184667.0,106760.0,214590.0,95658.0,-1.0,74208.0,195851.0,176297.0,109322.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-1


In [None]:
# Save the final data
# NOTE(review): this writes df_final_integrate to the same matches.csv path a second
# time. Since it follows the df_match_with_player_id cell, it was presumably meant
# to save that frame to a different file — confirm the intended frame and path.
save_df(df_final_integrate, os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "matches.csv"))