# NOTEBOOK FOR DATA PREPARATION FOR MODEL A

## AUTHOR: MATHIAS ARE

In [19]:
#Imports
import numpy as np
import pandas as pd

In [20]:
#Read data in

matches = pd.read_csv("./data/match.csv")
players = pd.read_csv("./data/player.csv")
player_attributes = pd.read_csv("./data/player_attributes.csv")
len(matches)

25979

In [21]:

#Dropping unused columns
mat=matches.drop(columns=matches.columns[11:55])
ma = mat.drop(columns=mat.columns[33:])
ma=ma.drop(columns=ma.columns[1:9])

#Replacing goal scores with result column
# 1 = home_team_win, 0 = tie, -1 = away_team_win


ma["result"]=ma["home_team_goal"] - ma["away_team_goal"]
ma.loc[ma["result"]>0,"result"] = 1
ma.loc[ma["result"]<0,"result"] = -1
ma.loc[ma["result"]==0,"result"] = 0
ma.drop(columns=["home_team_goal","away_team_goal"])

#Remove na values
ma=ma.dropna()

#Remove na values and unused columns from player attributes dataframe
pa = player_attributes.dropna()
pa = pa[['id','overall_rating']]
print(len(ma))

#Show na values for each column of matches tabel to check if everything is correct
for i, sum in enumerate(ma.isna().sum()):
    print(ma.columns[i]+"\t"*4,sum)

21374
id				 0
home_team_goal				 0
away_team_goal				 0
home_player_1				 0
home_player_2				 0
home_player_3				 0
home_player_4				 0
home_player_5				 0
home_player_6				 0
home_player_7				 0
home_player_8				 0
home_player_9				 0
home_player_10				 0
home_player_11				 0
away_player_1				 0
away_player_2				 0
away_player_3				 0
away_player_4				 0
away_player_5				 0
away_player_6				 0
away_player_7				 0
away_player_8				 0
away_player_9				 0
away_player_10				 0
away_player_11				 0
result				 0


In [22]:
#Result distribution

ma.groupby("result")["id"].count()

result
-1    6166
 0    5398
 1    9810
Name: id, dtype: int64

In [23]:
#Find average values for each column in pa and save to dictionary

averages=dict()
pa = pa.select_dtypes(exclude=['object'])
pa=pa.drop(columns=["player_fifa_api_id","player_api_id"])
for col in pa.columns:
    averages[col]=((pa[col].mean()*100).round())/100

#Print results
for key in averages.keys():
    print(key + "    ",averages[key])

id     91995.89
overall_rating     68.64


In [24]:

#Merge first players attributes to the matches table.

new_df=pd.merge(ma,pa,left_on="home_player_1",right_on="id",how="left").drop(columns=["id_y"])
new_df["id"]=new_df["id_x"]
new_df=new_df.drop(columns=["id_x"])
for col in pa.columns[1:]:
    new_df["home_player_1 "+col]=new_df[col]
    new_df=new_df.drop(columns=[col])

#Merge rest of the players attributes to the matches table

for j in range(2,23):
    s= "home_player_" if j<12 else "away_player_"
    k = j if j<12 else j-11
    new_df=pd.merge(new_df,pa,left_on=(s+str(k)),right_on="id",how="left").drop(columns=["id_y"])
    new_df["id"]=new_df["id_x"]
    new_df=new_df.drop(columns=["id_x"])
    for col in pa.columns[1:]:
        new_df[s+str(k)+" "+col]=new_df[col]
        new_df=new_df.drop(columns=[col])



#Check for na values
for i, sum in enumerate(new_df.isna().sum()):
    print(new_df.columns[i]+"\t"*4,sum)

home_team_goal				 0
away_team_goal				 0
home_player_1				 0
home_player_2				 0
home_player_3				 0
home_player_4				 0
home_player_5				 0
home_player_6				 0
home_player_7				 0
home_player_8				 0
home_player_9				 0
home_player_10				 0
home_player_11				 0
away_player_1				 0
away_player_2				 0
away_player_3				 0
away_player_4				 0
away_player_5				 0
away_player_6				 0
away_player_7				 0
away_player_8				 0
away_player_9				 0
away_player_10				 0
away_player_11				 0
result				 0
home_player_1 overall_rating				 2590
home_player_2 overall_rating				 4460
home_player_3 overall_rating				 3686
home_player_4 overall_rating				 3773
home_player_5 overall_rating				 4686
home_player_6 overall_rating				 4231
home_player_7 overall_rating				 3829
home_player_8 overall_rating				 4468
home_player_9 overall_rating				 4677
home_player_10 overall_rating				 4396
home_player_11 overall_rating				 4180
away_player_1 overall_rating				 2607
away_player_2 overall_rating				 4444
away_

In [25]:
#Fill na values with average values from dictionary

for col in new_df.columns:
    splitter=col.split(' ')
    if(len(splitter)>1):
        original_name=splitter[1]
        print(averages[original_name])
        new_df[col]=new_df[col].fillna(averages[original_name])

68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64
68.64


In [26]:
#Check again for na values (there should not be any)

for i, sum in enumerate(new_df.isna().sum()):
    print(new_df.columns[i]+"\t"*4,sum)

home_team_goal				 0
away_team_goal				 0
home_player_1				 0
home_player_2				 0
home_player_3				 0
home_player_4				 0
home_player_5				 0
home_player_6				 0
home_player_7				 0
home_player_8				 0
home_player_9				 0
home_player_10				 0
home_player_11				 0
away_player_1				 0
away_player_2				 0
away_player_3				 0
away_player_4				 0
away_player_5				 0
away_player_6				 0
away_player_7				 0
away_player_8				 0
away_player_9				 0
away_player_10				 0
away_player_11				 0
result				 0
home_player_1 overall_rating				 0
home_player_2 overall_rating				 0
home_player_3 overall_rating				 0
home_player_4 overall_rating				 0
home_player_5 overall_rating				 0
home_player_6 overall_rating				 0
home_player_7 overall_rating				 0
home_player_8 overall_rating				 0
home_player_9 overall_rating				 0
home_player_10 overall_rating				 0
home_player_11 overall_rating				 0
away_player_1 overall_rating				 0
away_player_2 overall_rating				 0
away_player_3 overall_rating				 0
away_play

In [27]:
#Save dataframe to csv


new_df.to_csv("./data/data2.csv")

new_df

In [109]:

#Alternate version to merge dataframes
def append_player_attributes(df_m,df_p):
    first = True
    
    colsList=[]
    new_df=None
    for i in range(len(df_m)):
        hasNull=False
        row = df_m.iloc[i]
        row["result"]=convert_result(row)
        row=row.drop(labels=["home_team_goal","away_team_goal"])

        for j in range(1,23):

            s= "home_player_" if j<12 else "away_player_"

            k = j if j<12 else j-11

            Id = row[s+str(k)]

            cols = df_p.columns
            valuesCol = df_p.loc[df_p["id"]==Id].values
            if(len(valuesCol)==0):
                hasNull=True
                continue
            values=list(valuesCol[0])
            for i,col in enumerate(cols):
                row[s+str(k)+"_"+col]=values[i]
        if(first):
            first = False
            new_df=pd.DataFrame(columns=row.keys())
        if(not(hasNull)):
            new_df=new_df.append(row,ignore_index=True)
    return new_df
        






res = append_player_attributes(matches[matches["league_id"] == 1729],pa)

KeyboardInterrupt: 

In [106]:
#Check for na values
for i, sum in enumerate(res.isna().sum()):
    print(res.columns[i]+"\t"*4,sum)

id				 0
country_id				 0
league_id				 0
season				 0
stage				 0
date				 0
match_api_id				 0
home_team_api_id				 0
away_team_api_id				 0
home_player_X1				 0
home_player_X2				 0
home_player_X3				 0
home_player_X4				 0
home_player_X5				 0
home_player_X6				 0
home_player_X7				 0
home_player_X8				 0
home_player_X9				 0
home_player_X10				 0
home_player_X11				 0
away_player_X1				 0
away_player_X2				 0
away_player_X3				 0
away_player_X4				 0
away_player_X5				 0
away_player_X6				 0
away_player_X7				 0
away_player_X8				 0
away_player_X9				 0
away_player_X10				 0
away_player_X11				 0
home_player_Y1				 0
home_player_Y2				 0
home_player_Y3				 0
home_player_Y4				 0
home_player_Y5				 0
home_player_Y6				 0
home_player_Y7				 0
home_player_Y8				 0
home_player_Y9				 0
home_player_Y10				 0
home_player_Y11				 0
away_player_Y1				 0
away_player_Y2				 0
away_player_Y3				 0
away_player_Y4				 0
away_player_Y5				 0
away_player_Y6				 0
away_player_Y7				 0
away_player

In [84]:
#Check dtypes
res.astype({"id":"int32"}).dtypes

id                                int32
country_id                       object
league_id                        object
season                           object
stage                            object
                                  ...  
away_player_11_gk_diving         object
away_player_11_gk_handling       object
away_player_11_gk_kicking        object
away_player_11_gk_positioning    object
away_player_11_gk_reflexes       object
Length: 846, dtype: object

In [59]:
#Print cols

colList=list(res.columns)
for i,t in enumerate(res.dtypes):
    print(colList[i],t)

id object
country_id object
league_id object
season object
stage object
date object
match_api_id object
home_team_api_id object
away_team_api_id object
result object
home_player_1_overall_rating float64
home_player_1_potential float64
home_player_1_preferred_foot object
home_player_1_attacking_work_rate object
home_player_1_defensive_work_rate object
home_player_1_crossing float64
home_player_1_finishing float64
home_player_1_heading_accuracy float64
home_player_1_short_passing float64
home_player_1_volleys float64
home_player_1_dribbling float64
home_player_1_curve float64
home_player_1_free_kick_accuracy float64
home_player_1_long_passing float64
home_player_1_ball_control float64
home_player_1_acceleration float64
home_player_1_sprint_speed float64
home_player_1_agility float64
home_player_1_reactions float64
home_player_1_balance float64
home_player_1_shot_power float64
home_player_1_jumping float64
home_player_1_stamina float64
home_player_1_strength float64
home_player_1_long_sho