In [1]:
import pandas as pd
import numpy as np

## Read data

In [23]:
general    = pd.read_pickle('./data/general.p')
shoting    = pd.read_pickle('./data/shoting.p')
passing    = pd.read_pickle('./data/passing.p')
pass_types = pd.read_pickle('./data/pass_types.p')
possession = pd.read_pickle('./data/possession.p')
defense    = pd.read_pickle('./data/defense.p')
gk         = pd.read_pickle('./data/gk.p')
gk_adv     = pd.read_pickle('./data/gk_adv.p')

## Remove Empty Rows

In [26]:
def remove_empty_rows(df):
    df = df.loc[df[('Unnamed: 1_level_0', 'Player')] != 'Player']
    return df

In [36]:
general    = remove_empty_rows(general) 
shoting    = remove_empty_rows(shoting)
passing    = remove_empty_rows(passing)
pass_types = remove_empty_rows(pass_types)
possession = remove_empty_rows(possession)
defense    = remove_empty_rows(defense)
gk         = remove_empty_rows(gk)
gk_adv     = remove_empty_rows(gk_adv)

## Remove multiplicated columns 

In [44]:
cols_todrop = [('Unnamed: 0_level_0', 'Rk'),
 ('Unnamed: 2_level_0', 'Nation'),
 ('Unnamed: 3_level_0', 'Pos'),
 ('Unnamed: 4_level_0', 'Squad'),
 ('Unnamed: 5_level_0', 'Comp'),
 ('Unnamed: 6_level_0', 'Age'),
 ('Unnamed: 7_level_0', 'Born'),
 ('Unnamed: 8_level_0', '90s')]

In [55]:
shoting.columns

MultiIndex([( 'Unnamed: 1_level_0',  'Player'),
            (           'Standard',     'Gls'),
            (           'Standard',      'PK'),
            (           'Standard',   'PKatt'),
            (           'Standard',      'Sh'),
            (           'Standard',     'SoT'),
            (           'Standard',      'FK'),
            (           'Standard',    'SoT%'),
            (           'Standard',   'Sh/90'),
            (           'Standard',  'SoT/90'),
            (           'Standard',    'G/Sh'),
            (           'Standard',   'G/SoT'),
            (           'Expected',      'xG'),
            (           'Expected',    'npxG'),
            (           'Expected', 'npxG/Sh'),
            (           'Expected',    'G-xG'),
            (           'Expected', 'np:G-xG'),
            ('Unnamed: 25_level_0', 'Matches')],
           )

In [110]:
dataframes = [
shoting   ,
passing   ,
pass_types,
possession,
defense   ,
gk        ,
gk_adv
]

In [152]:
for df in dataframes:
    df.drop(labels=cols_todrop, axis=1, errors='ignore', inplace=True)
    df.drop(columns=[x for x in df.columns if 'Matches' in x], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


## Set player name as index 

In [109]:
colplayer = ( 'Unnamed: 1_level_0',  'Player')

In [111]:
for df in dataframes:
    df.set_index(colplayer, inplace=True)

In [127]:
general.set_index(colplayer, inplace=True)

In [132]:
t = pd.merge(general, shoting, left_index=True, right_index=True)

In [135]:
m2 = pd.merge(t, passing, left_index=True, right_index=True)

In [137]:
m3 = pd.merge(m2, pass_types, left_index=True, right_index=True)

In [138]:
m4 = pd.merge(m3, possession, left_index=True, right_index=True)

In [139]:
m5 = pd.merge(m4, defense, left_index=True, right_index=True)

In [141]:
result = m5

In [151]:
result.to_pickle('./data/merged_data.p')

In [149]:
mask_midfielder_attacking = [
        ('Standard', 'Gls'),
        ('Expected_y', 'npxG'),
        ('Expected_y', 'npxG/Sh')
        ('Unnamed: 24_level_0', 'xA'),
        ('Unnamed: 27_level_0', '1/3'),
        ('Total', 'PrgDist'),
        ('Total', 'Cmp%'),
        ('Dribbles', 'Succ%'),
        ('Unnamed: 28_level_0', 'Dispos'),
        ('Vs Dribbles', 'Past')
       ]

In [150]:
result[mask_midfielder_attacking].loc[result.index == 'Piotr Zieliński']

Unnamed: 0_level_0,Unnamed: 24_level_0,Unnamed: 27_level_0,Total,Total,Dribbles,Unnamed: 28_level_0,Vs Dribbles
Unnamed: 0_level_1,xA,1/3,PrgDist,Cmp%,Succ%,Dispos,Past
"(Unnamed: 1_level_0, Player)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Piotr Zieliński,2.0,214,6688,87.0,63.0,37,26


In [15]:
df = pd.read_pickle('merged_fieldplayers.p')

In [16]:
non_num_cols = [(   'Unnamed: 2_level_0',  'Nation'),
            (   'Unnamed: 3_level_0',     'Pos'),
            (   'Unnamed: 4_level_0',   'Squad'),
            (   'Unnamed: 5_level_0',    'Comp')]

In [17]:
for column in df.columns:
    if column not in non_num_cols:
        df[column] = pd.to_numeric(df[column])

In [19]:
positions = df[('Unnamed: 3_level_0', 'Pos')].str.split(',', expand=True)
df[('Position', 'Pos')] = positions[0]
df[('Position', 'Alt')] = positions[1]

In [24]:
df.reset_index(inplace.)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Playing Time,Playing Time,...,Pressures,Blocks,Blocks,Blocks,Blocks,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Position,Position
Unnamed: 0_level_1,Player,Rk,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Att 3rd,Blocks,Sh,ShSv,Pass,Int,Clr,Err,Pos,Alt
0,Aaron Connolly,521,ie IRL,FW,Brighton,eng Premier League,19.0,2000.0,17,9,...,96.0,6.0,1.0,0.0,5.0,0,1.0,0.0,FW,
1,Aaron Cresswell,555,eng ENG,DF,West Ham,eng Premier League,29.0,1989.0,23,23,...,25.0,52.0,9.0,0.0,43.0,26,71.0,2.0,DF,
2,Aaron Lennon,1332,eng ENG,MF,Burnley,eng Premier League,32.0,1987.0,16,4,...,39.0,8.0,2.0,0.0,6.0,7,6.0,0.0,MF,
3,Aaron Leya Iseka,1343,be BEL,FW,Toulouse,fr Ligue 1,21.0,1997.0,22,6,...,54.0,6.0,1.0,0.0,5.0,1,7.0,0.0,FW,
4,Aaron Mooy,1610,au AUS,"MF,FW",Brighton,eng Premier League,28.0,1990.0,22,20,...,95.0,37.0,5.0,1.0,32.0,16,20.0,0.0,MF,FW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11036,Łukasz Skorupski,2213,pl POL,GK,Bologna,it Serie A,28.0,1991.0,25,25,...,0.0,0.0,0.0,0.0,0.0,2,0.0,1.0,GK,
11037,Łukasz Teodorczyk,2330,pl POL,"DF,FW",Udinese,it Serie A,28.0,1991.0,7,0,...,9.0,0.0,0.0,0.0,0.0,0,1.0,0.0,DF,FW
11038,Šime Vrsaljko,2484,hr CRO,DF,Atlético Madrid,es La Liga,27.0,1992.0,5,4,...,8.0,12.0,3.0,0.0,9.0,8,12.0,0.0,DF,
11039,Žan Majer,1433,si SVN,MF,Lecce,it Serie A,27.0,1992.0,20,18,...,47.0,38.0,8.0,1.0,30.0,22,23.0,1.0,MF,


In [25]:
df[('info', 'Player')] = df.index