# DATA INTEGRATION
A notebook to integrate multiple data sources

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import copy
import pandas as pd
import numpy as np
import os
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [2]:
# Figures are stored under <notebook directory>/images/<CHAPTER_ID>
CHAPTER_ID = "03_data_integrating"
NOTE_ROOT_DIR = os.path.abspath('')
IMAGES_PATH = os.path.join(NOTE_ROOT_DIR, "images", CHAPTER_ID)
# Create the folder up front; an already existing folder is not an error
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension, also handed to ``plt.savefig`` as the output format.
    resolution : int, default 300
        Handed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, dpi=resolution, format=fig_extension)

## **1 - Load the Data**

### 1.1 - EPL Match

In [3]:
# Load EPL match data
df_match = pd.read_csv(os.path.join(PRJ_ROOT_DIR, "data", "tabular", "exploratory", "matches.csv"))
# Exclude season 2022/23, then relabel the rows 0..n-1. Later cells concatenate
# this frame column-wise with merge results (which carry a fresh RangeIndex) and
# use the row labels as column labels after transposing, so gaps left by the
# filter would misalign matches.
df_match = df_match[df_match["season"] != "2022/23"].reset_index(drop=True)
df_match.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,,101537.0,57531.0,86934.0,54102.0,...,20664.0,,,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,,,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,,116643.0,41733.0,56979.0,,...,71738.0,90440.0,78607.0,57127.0,,,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,,,Manchester United,Chelsea,win,2019/20


In [4]:
# List positions: every column except the four trailing non-player columns
# (home/name, away/name, home_result, season in the frame shown above)
all_pos = df_match.columns[:-4].tolist()
all_pos

['home/gk_0',
 'home/df_0',
 'home/df_1',
 'home/df_2',
 'home/df_3',
 'home/df_4',
 'home/mf_0',
 'home/mf_1',
 'home/mf_2',
 'home/mf_3',
 'home/mf_4',
 'home/fw_0',
 'home/fw_1',
 'home/fw_2',
 'away/gk_0',
 'away/df_0',
 'away/df_1',
 'away/df_2',
 'away/df_3',
 'away/mf_0',
 'away/mf_1',
 'away/mf_2',
 'away/mf_3',
 'away/mf_4',
 'away/fw_0',
 'away/fw_1',
 'away/fw_2']

In [5]:
# List position types: one "<side>/<role>" prefix per line-up group,
# home groups first, roles in the order gk, df, mf, fw
position = [f"{side}/{role}" for side in ("home", "away") for role in ("gk", "df", "mf", "fw")]

In [6]:
# Fill empty positions with ghost id (-1); every NaN in the frame is replaced
df_match = df_match.fillna(value=-1)
# Preview matches whose fifth home defender slot is empty
df_match.loc[df_match["home/df_4"].eq(-1)].head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,-1.0,101537.0,57531.0,86934.0,54102.0,...,20664.0,-1.0,-1.0,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,-1.0,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,71738.0,90440.0,78607.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,-1.0,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,-1.0,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,-1.0,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


### 1.2 - EPL Player

In [8]:
# Load EPL player data (a per-player embedding, e.g. PCA/LDA and so on)
df_player = pd.read_csv(
    filepath_or_buffer=os.path.join(PRJ_ROOT_DIR, "data", "tabular", "exploratory",
                                    "players_lda.csv")  # <-- change to other embeddings if needed
)
df_player.head()

Unnamed: 0,0,1,2,id,name,season
0,1.687741,-0.155813,-2.664956,54861.0,Christian Benteke,2018/19
1,0.619076,0.274014,-0.376937,213405.0,Filip Benkovic,2018/19
2,0.619076,0.274014,-0.376937,449926.0,Adrián Bernabé,2018/19
3,0.619076,0.274014,-0.376937,182436.0,Ben Woodburn,2018/19
4,0.511864,-0.831793,1.503204,100649.0,Bernard,2018/19


## 2 - Integrate the Data

In [9]:
# Work on a deep copy so the integration steps cannot modify df_match
df_integrate = df_match.copy(deep=True)
df_integrate.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,55459.0,166640.0,219924.0,81012.0,-1.0,101537.0,57531.0,86934.0,54102.0,...,20664.0,-1.0,-1.0,103955.0,205651.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,38290.0,55605.0,173904.0,158534.0,-1.0,157668.0,62974.0,45268.0,231372.0,...,85242.0,148508.0,193488.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,171287.0,97032.0,122798.0,169187.0,-1.0,116643.0,41733.0,56979.0,-1.0,...,71738.0,90440.0,78607.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,68983.0,17761.0,51927.0,39487.0,-1.0,40145.0,60586.0,433154.0,60551.0,...,101178.0,213482.0,-1.0,84939.0,200439.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,184667.0,95658.0,106760.0,214590.0,-1.0,74208.0,176297.0,109322.0,156689.0,...,88894.0,49579.0,184341.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


In [11]:
# Integrate critical performance index to corresponding player slots in match data.
# Result: score_players_df, laid out like the match table, but each player-slot
# column holds the critical score of the player in that slot instead of the id.
# Maps a role to the name of the df_player column used as that role's score.
players_to_performance = {"gk": "1", "df": "0", "mf": "1", "fw": "0"} # critical attributes
score_players_df = pd.DataFrame()
for pos in position:  # e.g. "home/gk"
    # Keep only the slot columns of this position group, plus the season merge key
    score_df = df_integrate.copy()
    score_df = score_df[[col for col in score_df.columns if pos in col]+["season"]]
    # Score column(s) of df_player whose role name occurs in this group's prefix
    score_col_idx = [col_id for (posit, col_id) in players_to_performance.items() if posit in pos]
    position_players_score = df_player[['id', 'season'] + score_col_idx]
    # count = number of slots in this group; one left merge per slot appends
    # an id column and a score column to the right of the frame
    count=0
    for i in range(len(all_pos)):
      if pos in all_pos[i]:
        count += 1
        # Look the slot's player up by (id, season); suffixes disambiguate the
        # id/score columns when a previous merge already added the same names
        score_df = score_df.merge(position_players_score, how='left', left_on=[all_pos[i], 'season'], right_on=['id', 'season'], suffixes=(f"_{i-1}", f"_{i}"))                                   
    # Scores of defenders and midfielders are sign-flipped
    # NOTE(review): presumably because lower is better on that attribute - confirm
    multiplier = 1
    if "df" in pos or "mf" in pos:
      multiplier = -1
    # Overwrite the slot columns (first `count` columns) with the merged scores.
    # Relies on the merged layout: slot columns, season, then alternating
    # id/score columns, so every second column from count+2 is a score.
    score_df[score_df.columns[:count]] = multiplier * score_df[score_df.columns[count+2::2]]
    # Drop the helper id/score columns; slot columns and season remain
    score_df.drop(columns=score_df.columns[count+1:], inplace=True)
    score_players_df = pd.concat([score_players_df, score_df], axis=1)

# Each group contributed its own 'season' column; remove them all and re-attach
# a single copy together with the team names and the result
score_players_df.drop(columns='season', inplace=True)
score_players_df = pd.concat([score_players_df, df_integrate[["home/name", "away/name", "season", "home_result"]]], axis=1)
# Final column order: team names, player slots, then result and season
columns_titles = ["home/name", "away/name",
                  'home/gk_0', 'home/df_0', 'home/df_1', 'home/df_2', 'home/df_3',
                  'home/df_4', 'home/mf_0', 'home/mf_1', 'home/mf_2', 'home/mf_3',
                  'home/mf_4', 'home/fw_0', 'home/fw_1', 'home/fw_2', 'away/gk_0',
                  'away/df_0', 'away/df_1', 'away/df_2', 'away/df_3', 'away/mf_0',
                  'away/mf_1', 'away/mf_2', 'away/mf_3', 'away/mf_4', 'away/fw_0',
                  'away/fw_1', 'away/fw_2',
                  "home_result", "season"]
score_players_df = score_players_df.reindex(columns=columns_titles)
# Slots without a score are NaN here; replace them with a very low sentinel.
# NOTE(review): presumably these are the ghost id -1 and ids missing from
# df_player, and the sentinel is meant to rank them last - confirm
score_players_df = score_players_df.fillna(-1e10)
print(score_players_df.columns)
score_players_df.head()

Index(['home/name', 'away/name', 'home/gk_0', 'home/df_0', 'home/df_1',
       'home/df_2', 'home/df_3', 'home/df_4', 'home/mf_0', 'home/mf_1',
       'home/mf_2', 'home/mf_3', 'home/mf_4', 'home/fw_0', 'home/fw_1',
       'home/fw_2', 'away/gk_0', 'away/df_0', 'away/df_1', 'away/df_2',
       'away/df_3', 'away/mf_0', 'away/mf_1', 'away/mf_2', 'away/mf_3',
       'away/mf_4', 'away/fw_0', 'away/fw_1', 'away/fw_2', 'home_result',
       'season'],
      dtype='object')


Unnamed: 0,home/name,away/name,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,...,away/mf_0,away/mf_1,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home_result,season
0,West Ham United,Manchester City,7.035507,2.67005,1.873522,2.931753,2.707726,-10000000000.0,1.084006,0.99057,...,1.223387,0.985511,0.594925,-10000000000.0,-10000000000.0,2.059278,2.247774,-0.0253118,lose,2019/20
1,Tottenham Hotspur,Aston Villa,2.864098,1.452575,3.137167,2.72558,1.600187,-10000000000.0,0.868448,1.558343,...,0.92245,0.695674,1.287587,1.00255,0.5881123,1.990417,-10000000000.0,-10000000000.0,win,2019/20
2,Liverpool,Norwich City,6.476357,2.05365,2.259503,3.131397,3.610155,-10000000000.0,1.002026,0.480287,...,0.812558,-0.161368,0.379657,0.4893178,0.7190489,2.234902,-10000000000.0,-10000000000.0,win,2019/20
3,Burnley,Southampton,6.125389,2.026668,3.375562,3.103326,1.947219,-10000000000.0,0.869871,0.795341,...,0.418658,0.992302,0.433101,0.675289,-10000000000.0,1.383543,1.980681,1.08355,win,2019/20
4,Manchester United,Chelsea,7.081245,2.311709,2.441979,1.776189,3.169787,-10000000000.0,-0.558712,0.715908,...,0.646596,0.770839,0.925704,0.00759442,0.8385443,2.518209,-10000000000.0,-10000000000.0,win,2019/20


In [12]:
# Transpose the merged critical performance data: one column per match,
# one row per field of score_players_df
df_one_perf = score_players_df.transpose().copy()
df_one_perf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139
home/name,West Ham United,Tottenham Hotspur,Liverpool,Burnley,Manchester United,Leicester City,Bournemouth,Watford,Crystal Palace,Newcastle United,...,Burnley,Southampton,Arsenal,Crystal Palace,Wolverhampton Wanderers,West Ham United,Southampton,Manchester United,Liverpool,Newcastle United
away/name,Manchester City,Aston Villa,Norwich City,Southampton,Chelsea,Wolverhampton Wanderers,Sheffield United,Brighton and Hove Albion,Everton,Arsenal,...,Watford,Manchester City,Burnley,Liverpool,Southampton,Norwich City,Brentford,Wolverhampton Wanderers,Leeds United,Manchester City
home/gk_0,7.035507,2.864098,6.476357,6.125389,7.081245,7.327352,6.874712,8.483795,8.177021,7.716314,...,7.394647,3.979458,14.269425,6.547555,7.437687,7.664981,3.979458,6.279345,7.042481,6.347971
home/df_0,2.67005,1.452575,2.05365,2.026668,2.311709,2.563161,2.76889,2.667527,2.315755,1.947041,...,2.757991,2.209071,2.612025,2.50913,2.447501,3.110702,1.665236,1.267365,2.146291,2.044447
home/df_1,1.873522,3.137167,2.259503,3.375562,2.441979,4.509485,3.545527,2.988853,2.661029,1.920219,...,2.311603,1.988306,1.982329,3.200735,2.396435,2.291735,2.975121,2.359624,2.763736,1.981579


In [13]:
# Complete critical attributes to match data (for merge ordering):
# inside every position group of every match, reorder the player ids by
# descending critical performance score.
df_match_tranposed = df_match.T.copy() # [index: home/fw_0, column: 0, 1, 2]

# The row labels of a position group are the same for every match: look them up once.
match_rows_by_pos = {pos: [row for row in df_match_tranposed.index if pos in row] for pos in position}
score_rows_by_pos = {pos: [row for row in df_one_perf.index if pos in row] for pos in position}

for col in df_one_perf.columns:   # one column per match: 0, 1, ...
    for pos in position:  # home/gk, home/fw
        pos_order = match_rows_by_pos[pos]
        # Scores of this match's players in this group, listed slot by slot
        pos_scores = df_one_perf[col].loc[score_rows_by_pos[pos]].astype(float)
        # The key ignores the ids themselves and ranks them by the matching scores
        sorted_pos = df_match_tranposed[col].loc[pos_order].sort_values(
            ascending=False, ignore_index=False, key=lambda _ids: pos_scores)
        # Relabel the sorted ids with the slot names in their original order,
        # so the best-scoring player lands in slot 0 of the group
        re_cols = dict(zip(sorted_pos.index, pos_order))
        # Assign through a single .loc call: the chained form
        # `frame[col].loc[rows] = ...` may write to a temporary copy and is a
        # silent no-op under pandas Copy-on-Write.
        df_match_tranposed.loc[pos_order, col] = sorted_pos.rename(index=re_cols)
df_match_tranposed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139
home/gk_0,37096.0,37915.0,116535.0,98747.0,51940.0,17745.0,225321.0,9089.0,40836.0,67089.0,...,98747.0,40383.0,225321.0,40836.0,149065.0,37096.0,40383.0,51940.0,116535.0,67089.0
home/df_0,219924.0,55605.0,169187.0,17761.0,214590.0,218031.0,126184.0,41338.0,74230.0,101148.0,...,17761.0,171771.0,226597.0,244723.0,94147.0,55459.0,171771.0,214590.0,60914.0,58845.0
home/df_1,81012.0,173904.0,122798.0,51927.0,95658.0,111931.0,56917.0,60232.0,55494.0,106618.0,...,51927.0,450527.0,192895.0,209036.0,214048.0,164555.0,450527.0,90152.0,122798.0,101148.0
home/df_2,55459.0,158534.0,97032.0,68983.0,184667.0,37642.0,223911.0,54484.0,58786.0,119471.0,...,192290.0,158534.0,198869.0,55494.0,510362.0,60232.0,212721.0,106760.0,97032.0,56983.0
home/df_3,166640.0,38290.0,171287.0,39487.0,106760.0,172850.0,-1.0,40868.0,19188.0,-1.0,...,39487.0,244560.0,156074.0,174874.0,-1.0,219924.0,-1.0,76359.0,169187.0,114243.0


In [14]:
# Re-transpose the order merge data
ordered_epl = df_match_tranposed.T.copy()
ordered_epl.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,away/mf_2,away/mf_3,away/mf_4,away/fw_0,away/fw_1,away/fw_2,home/name,away/name,home_result,season
0,37096.0,219924.0,81012.0,55459.0,166640.0,-1.0,204480.0,101537.0,57531.0,86934.0,...,20664.0,-1.0,-1.0,205651.0,103955.0,103025.0,West Ham United,Manchester City,lose,2019/20
1,37915.0,55605.0,173904.0,158534.0,38290.0,-1.0,62974.0,231372.0,157668.0,45268.0,...,122806.0,114283.0,193488.0,213345.0,-1.0,-1.0,Tottenham Hotspur,Aston Villa,win,2019/20
2,116535.0,169187.0,122798.0,97032.0,171287.0,-1.0,56979.0,116643.0,41733.0,-1.0,...,90440.0,71738.0,195546.0,57127.0,-1.0,-1.0,Liverpool,Norwich City,win,2019/20
3,98747.0,17761.0,51927.0,68983.0,39487.0,-1.0,433154.0,60551.0,40145.0,60586.0,...,101178.0,40146.0,-1.0,200439.0,84939.0,83283.0,Burnley,Southampton,win,2019/20
4,51940.0,214590.0,95658.0,184667.0,106760.0,-1.0,109322.0,156689.0,176297.0,195851.0,...,91651.0,85955.0,49579.0,173879.0,-1.0,-1.0,Manchester United,Chelsea,win,2019/20


In [15]:
# Add empty slots as players: one placeholder "player" with id -1 per season,
# so that the ghost id used for empty slots finds a match when merging.
all_pos_home = ['home/name', 'away/name', 'home/gk_0', 'home/df_0', 'home/df_1',
                'home/df_2', 'home/df_3', 'home/df_4', 'home/mf_0', 'home/mf_1',
                'home/mf_2', 'home/mf_3', 'home/mf_4', 'home/fw_0', 'home/fw_1',
                'home/fw_2']

all_season = df_match['season'].unique().tolist() 
wno_players = df_player.drop(columns='name')
# Every column other than the keys is a feature and gets the sentinel value -100
feature_cols = [col for col in wno_players.columns if col not in ('id', 'season')]
null_records = [
    {'id': -1, 'season': season, **{feat: -100. for feat in feature_cols}}
    for season in all_season
]
if null_records:
    # DataFrame.append was removed in pandas 2.0; add all placeholder rows with
    # one concat instead of growing the frame row by row.
    wno_players = pd.concat(
        [wno_players, pd.DataFrame(null_records, columns=wno_players.columns)],
        ignore_index=True,
    )

# Ids are read as floats; cast them back to integers
wno_players['id'] = wno_players['id'].astype(int)
wno_players.tail()

Unnamed: 0,0,1,2,id,season
3937,0.619076,0.274014,-0.376937,226956,2020/21
3938,1.849693,-0.429686,0.001978,220688,2021/22
3939,-100.0,-100.0,-100.0,-1,2019/20
3940,-100.0,-100.0,-100.0,-1,2020/21
3941,-100.0,-100.0,-100.0,-1,2021/22


In [29]:
# Match data for ordering: for every player slot, attach the attributes of the
# player occupying it, looked up by (id, season) in wno_players.
copy_ordered_epl = ordered_epl.copy()
for i, slot in enumerate(all_pos):
    # Tag the player columns with the slot number before merging
    # ("0" -> "0_<i>", "id" -> "id_<i>"). This yields the same column names as
    # relying on merge suffixes plus a final rename, but works for any number
    # of slots instead of assuming the last slot is number 26.
    slot_players = wno_players.rename(
        columns={col: f"{col}_{i}" for col in wno_players.columns if col != 'season'})
    copy_ordered_epl = copy_ordered_epl.merge(slot_players, how='left', left_on=[slot, 'season'],
                                              right_on=[f"id_{i}", 'season'])
copy_ordered_epl

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,2_24,id_24,0_25,1_25,2_25,id_25,0_26,1_26,2_26,id_26
0,37096.0,219924.0,81012.0,55459.0,166640.0,-1.0,204480.0,101537.0,57531.0,86934.0,...,-1.588368,205651,2.059278,-1.289581,-0.214014,103955,-0.025312,-0.479952,1.471758,103025
1,37915.0,55605.0,173904.0,158534.0,38290.0,-1.0,62974.0,231372.0,157668.0,45268.0,...,-2.796785,213345,-100.000000,-100.000000,-100.000000,-1,-100.000000,-100.000000,-100.000000,-1
2,116535.0,169187.0,122798.0,97032.0,171287.0,-1.0,56979.0,116643.0,41733.0,-1.0,...,-0.524770,57127,-100.000000,-100.000000,-100.000000,-1,-100.000000,-100.000000,-100.000000,-1
3,98747.0,17761.0,51927.0,68983.0,39487.0,-1.0,433154.0,60551.0,40145.0,60586.0,...,-1.541976,200439,1.383543,-0.932270,0.070167,84939,1.083550,-0.641444,1.485667,83283
4,51940.0,214590.0,95658.0,184667.0,106760.0,-1.0,109322.0,156689.0,176297.0,195851.0,...,-3.471708,173879,-100.000000,-100.000000,-100.000000,-1,-100.000000,-100.000000,-100.000000,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,37096.0,55459.0,164555.0,60232.0,219924.0,-1.0,180151.0,86934.0,178186.0,204480.0,...,-1.226240,57127,-100.000000,-100.000000,-100.000000,-1,-100.000000,-100.000000,-100.000000,-1
1136,40383.0,171771.0,450527.0,212721.0,-1.0,-1.0,203389.0,240143.0,78056.0,244560.0,...,-3.622045,144485,1.180349,-0.571187,-1.254287,446008,-100.000000,-100.000000,-100.000000,-1
1137,51940.0,214590.0,90152.0,106760.0,76359.0,-1.0,195851.0,62398.0,209243.0,220688.0,...,-3.477027,102057,1.293585,-1.183664,2.243419,200600,1.281312,-0.590887,0.282728,222564
1138,116535.0,60914.0,122798.0,97032.0,169187.0,-1.0,206915.0,61558.0,116643.0,-1.0,...,-1.083433,200617,-100.000000,-100.000000,-100.000000,-1,-100.000000,-100.000000,-100.000000,-1


In [30]:
# Check whether any record has missing values
# null_col collects the positional numbers of the rows with at least one null
null_col = np.flatnonzero(copy_ordered_epl.isnull().any(axis=1).values).tolist()
count_null = len(null_col)
count_null

0

**NOTE:** Column `0_0` holds performance index $0$ of the player in position slot $0$; in general, column `k_i` holds performance index $k$ of the player in position slot $i$.

In [31]:
# Prepare data for machine learning: remove the first 29 columns, then every
# column whose name contains 'id'
dropped_copy_ordered_epl = (
    copy_ordered_epl
    .drop(columns=copy_ordered_epl.columns[:29])
    .pipe(lambda frame: frame.drop(columns=[name for name in frame.columns if 'id' in name]))
)
dropped_copy_ordered_epl.columns[:40] 

Index(['home_result', 'season', '0_0', '1_0', '2_0', '0_1', '1_1', '2_1',
       '0_2', '1_2', '2_2', '0_3', '1_3', '2_3', '0_4', '1_4', '2_4', '0_5',
       '1_5', '2_5', '0_6', '1_6', '2_6', '0_7', '1_7', '2_7', '0_8', '1_8',
       '2_8', '0_9', '1_9', '2_9', '0_10', '1_10', '2_10', '0_11', '1_11',
       '2_11', '0_12', '1_12'],
      dtype='object')

In [32]:
# Check number of instances (rows)
dropped_copy_ordered_epl.shape[0]

1140

## 3 - Save the Work

In [35]:
# Helper to save a dataframe as CSV
def save_df(df, path, extension="csv", index=False):
    """Write ``df`` to ``path`` as CSV, creating the parent directory if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str or os.PathLike (or any target accepted by ``DataFrame.to_csv``)
        Destination. For filesystem paths the parent directory is created
        first, so saving into a not-yet-existing folder does not fail.
    extension : str, default "csv"
        Unused; kept for backward compatibility. The file name is taken from
        ``path`` as given and the output is always CSV.
    index : bool, default False
        Whether to write the row index, forwarded to ``DataFrame.to_csv``.
    """
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=index)

In [36]:
# Define the final data: an independent copy of the ML-ready table
df_final_integrate = dropped_copy_ordered_epl.copy(deep=True)
df_final_integrate.head()

Unnamed: 0,home_result,season,0_0,1_0,2_0,0_1,1_1,2_1,0_2,1_2,...,2_23,0_24,1_24,2_24,0_25,1_25,2_25,0_26,1_26,2_26
0,lose,2019/20,1.286584,7.035507,1.259102,-2.931753,0.062317,-1.631287,-2.707726,-0.607117,...,-100.0,2.247774,-0.737455,-1.588368,2.059278,-1.289581,-0.214014,-0.025312,-0.479952,1.471758
1,win,2019/20,0.771112,2.864098,0.136227,-3.137167,-0.522932,-0.568367,-2.72558,-0.359102,...,-0.780148,1.990417,-0.727778,-2.796785,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
2,win,2019/20,1.081934,6.476357,1.060024,-3.610155,-0.617447,3.083772,-3.131397,-0.594232,...,2.718124,2.234902,-0.76416,-0.52477,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
3,win,2019/20,1.270547,6.125389,0.866984,-3.375562,-0.538387,-1.68244,-3.103326,0.157253,...,-100.0,1.980681,-0.619516,-1.541976,1.383543,-0.93227,0.070167,1.08355,-0.641444,1.485667
4,win,2019/20,1.446074,7.081245,1.423544,-3.169787,-0.7007,0.751141,-2.441979,-0.588787,...,0.878456,2.518209,-0.529214,-3.471708,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0


In [41]:
# Save the final data
save_df(
    df=df_final_integrate,
    path=os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "lda", "matches.csv"),
)

In [39]:
# Define matches with player id: an independent copy of the merged table,
# which still contains the player id columns
df_match_with_player_id = copy_ordered_epl.copy(deep=True)
df_match_with_player_id.head()

Unnamed: 0,home/gk_0,home/df_0,home/df_1,home/df_2,home/df_3,home/df_4,home/mf_0,home/mf_1,home/mf_2,home/mf_3,...,2_24,id_24,0_25,1_25,2_25,id_25,0_26,1_26,2_26,id_26
0,37096.0,219924.0,81012.0,55459.0,166640.0,-1.0,204480.0,101537.0,57531.0,86934.0,...,-1.588368,205651,2.059278,-1.289581,-0.214014,103955,-0.025312,-0.479952,1.471758,103025
1,37915.0,55605.0,173904.0,158534.0,38290.0,-1.0,62974.0,231372.0,157668.0,45268.0,...,-2.796785,213345,-100.0,-100.0,-100.0,-1,-100.0,-100.0,-100.0,-1
2,116535.0,169187.0,122798.0,97032.0,171287.0,-1.0,56979.0,116643.0,41733.0,-1.0,...,-0.52477,57127,-100.0,-100.0,-100.0,-1,-100.0,-100.0,-100.0,-1
3,98747.0,17761.0,51927.0,68983.0,39487.0,-1.0,433154.0,60551.0,40145.0,60586.0,...,-1.541976,200439,1.383543,-0.93227,0.070167,84939,1.08355,-0.641444,1.485667,83283
4,51940.0,214590.0,95658.0,184667.0,106760.0,-1.0,109322.0,156689.0,176297.0,195851.0,...,-3.471708,173879,-100.0,-100.0,-100.0,-1,-100.0,-100.0,-100.0,-1


In [42]:
# Save the matches together with their player ids
save_df(
    df=df_match_with_player_id,
    path=os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "lda", "matches_with_playerid.csv"),
)