In [154]:
import pandas as pd
import numpy as np

In [155]:
players_df = pd.read_csv('data/laliga_player_stats_english.csv')

In [156]:
display(players_df.head())

Unnamed: 0,Team,Position,Shirt number,Name,Minutes played,Games played,Percentage of games played,Full games played,Percentage of full games played,Games started,...,Corners,Tackles.1,Duels,Man-to-man duels,Aerial duels,Passes,Short passes,Long passes,Through balls,Goals scored per attempt
0,Athletic Club,Goalkeeper,,Hodei Oleaga,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
1,Athletic Club,Goalkeeper,1.0,A. Remiro,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
2,Athletic Club,Goalkeeper,13.0,Herrerín,2.79,31,82.00%,31,82.00%,31,...,0,0,25,6,19,887.0,128.0,759,1,0
3,Athletic Club,Goalkeeper,25.0,Unai Simón,630.0,7,18.00%,7,18.00%,7,...,0,0,3,2,1,155.0,49.0,106,0,0
4,Athletic Club,Defender,3.0,Núñez,1.063,12,32.00%,11,29.00%,11,...,0,15,107,38,69,536.0,457.0,78,1,0


In [157]:
players_df.shape

(556, 62)

In [158]:
players_df.size

34472

# Limpieza

Antes de nada cambiamos el nombre de las columnas para poder trabajar con ellos sin problemas.

In [160]:
for e in players_df.columns:
    players_df = players_df.rename(columns={e:e.lower().replace(' ', '_')})

In [161]:
players_df.head()

Unnamed: 0,team,position,shirt_number,name,minutes_played,games_played,percentage_of_games_played,full_games_played,percentage_of_full_games_played,games_started,...,corners,tackles.1,duels,man-to-man_duels,aerial_duels,passes,short_passes,long_passes,through_balls,goals_scored_per_attempt
0,Athletic Club,Goalkeeper,,Hodei Oleaga,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
1,Athletic Club,Goalkeeper,1.0,A. Remiro,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
2,Athletic Club,Goalkeeper,13.0,Herrerín,2.79,31,82.00%,31,82.00%,31,...,0,0,25,6,19,887.0,128.0,759,1,0
3,Athletic Club,Goalkeeper,25.0,Unai Simón,630.0,7,18.00%,7,18.00%,7,...,0,0,3,2,1,155.0,49.0,106,0,0
4,Athletic Club,Defender,3.0,Núñez,1.063,12,32.00%,11,29.00%,11,...,0,15,107,38,69,536.0,457.0,78,1,0


Limpiamos la columna 'shirt_number' (única en la que encontramos valores NaN).

Sustituimos los NaN en 'Shirt Number' por "unknown".

Realmente no sería complicado rellenar estos valores con los dorsales correspondientes (tal y como haremos más adelante con los nombres repetidos), pero en principio esta columna no vamos a utilizarla y tampoco es algo necesario en este momento (aunque sería lo ideal). El proceso que deberíamos seguir lo veremos en el cambio de nombres de jugadores (estos sí que los cambiaremos porque las utilizaremos a la hora de construir las relaciones).

In [162]:
players_df.shirt_number.fillna("unknown",inplace=True)

In [164]:
duplicate = list(np.where(players_df["shirt_number"] == 'unknown')) 
duplicate # Estos serían los valores a rellenar con el dorsal correspondiente.

[array([  0,  11,  24,  29,  75,  84, 101, 119, 166, 179, 180, 181, 198,
        233, 254, 255, 265, 287, 311, 320, 321, 339, 349, 367, 370, 392,
        398, 412, 532, 540])]

Comenzamos por la columna 'Team'.

In [166]:
players_df['team'].unique()

array(['Athletic Club', 'Atlético de Madrid', 'CD Leganés', 'D. Alavés',
       'FC Barcelona', 'Getafe CF', 'Girona FC', 'Levante UD',
       'R. Valladolid CF', 'Rayo Vallecano', 'RC Celta', 'RCD Espanyol',
       'Real Betis', 'Real Madrid', 'Real Sociedad', 'SD Eibar',
       'SD Huesca', 'Sevilla FC', 'Valencia CF', 'Villarreal CF'],
      dtype=object)

In [167]:
players_df['team'] = players_df['team'].apply(lambda x: x.replace(' ', '_')) # Sustituimos los espacios por "_"

In [170]:
players_df['team'] = players_df['team'].apply(lambda x: x.replace('.', '')) # ELiminamos los puntos.

In [172]:
players_df['team'].unique()

array(['Athletic_Club', 'Atlético_de_Madrid', 'CD_Leganés', 'D_Alavés',
       'FC_Barcelona', 'Getafe_CF', 'Girona_FC', 'Levante_UD',
       'R_Valladolid_CF', 'Rayo_Vallecano', 'RC_Celta', 'RCD_Espanyol',
       'Real_Betis', 'Real_Madrid', 'Real_Sociedad', 'SD_Eibar',
       'SD_Huesca', 'Sevilla_FC', 'Valencia_CF', 'Villarreal_CF'],
      dtype=object)

Identificamos un problema con los nombres de los jugadores en nuestro DataFrame.

In [173]:
len(players_df.name)

556

In [174]:
len(players_df.name.unique())

547

Observamos que existen nombres repetidos (el número de valores es mayor al de valores únicos). En concreto, se repite un total de 9 valores (nombres de jugadores).

Esto será un problema a la hora de vincular esta tabla con la de valores de mercado por jugadores, así que trataremos de limpiar los datos y arreglar este problema.

In [175]:
# Primero averiguamos cuáles son los valores repetidos.

pl = players_df.copy()  # Generemos una copia del DataFrame (para mantener el original intacto).
p = pl.groupby('name').team.nunique().reset_index() # Agrupamos por 'Name' y en la columna 'Team' incluimos el número de valores en esta columna (una vez hecho el groupby).

In [177]:
print(p.loc[p['team'] > 1]) # Seleccionamos aquellos registros con un valor de 'Team' mayor que 1 (aquellos que se repiten).

          name  team
68       Borja     2
76       Bruno     2
248    Joaquín     2
262   Juanfran     2
280       Koke     2
361      Nacho     2
434    Rodrigo     2
467  Sergio A.     2
512    Vázquez     2


In [178]:
duplicate = list(np.where(players_df["name"] == 'Borja')) 
duplicate # Nos devuelve el index del registro donde 'Name' = 'Borja'.

[array([114, 236])]

In [179]:
# Observamos que el 'Name' para los index consultados coincide con 'Borja'

print(players_df["name"][114], players_df["name"][236])

Borja Borja


In [180]:
# Consultamos el equipo de cada Borja para poder buscar su apellido y cambiar el valor.

print(players_df["team"][114], players_df["team"][236])

D_Alavés R_Valladolid_CF


Cambiaremos el nombre por nombre + apellido.

Lo haremos "a mano" buscando el apellido del jugador en función del equipo en el que juega.

En este primer caso:

   - Borja del Alavés [114] = Borja Bastón
   - Borja del Valladolid [236] = Borja Fernández
   
Cabe destacar que aprovecharemos para incluir el valor (nombre del jugador) tal y como lo tenemos en la tabla de market_values (ahorramos futuros procesos de limpieza).

In [181]:
players_df["name"][114] = 'Borja Bastón'
players_df["name"][236] = 'Borja Fernández'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][114] = 'Borja Bastón'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][236] = 'Borja Fernández'


In [182]:
print(players_df["name"][114], players_df["name"][236])

Borja Bastón Borja Fernández


Seguimos este mismo proceso para el resto de valores repetidos.

Para los "Bruno".

In [183]:
duplicate = list(np.where(players_df["name"] == 'Bruno')) 
duplicate 

[array([147, 548])]

In [184]:
print(players_df["team"][147], players_df["team"][548])

Getafe_CF Villarreal_CF


In [185]:
players_df["name"][147] = 'Bruno González'
players_df["name"][548] = 'Bruno Soriano'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][147] = 'Bruno González'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][548] = 'Bruno Soriano'


In [186]:
print(players_df["name"][147], players_df["name"][548])

Bruno González Bruno Soriano


Para los "Joaquín".

In [187]:
duplicate = list(np.where(players_df["name"] == 'Joaquín')) 
duplicate 

[array([223, 354])]

In [188]:
print(players_df["team"][223], players_df["team"][354])

R_Valladolid_CF Real_Betis


In [189]:
players_df["name"][223] = 'Joaquín Fernández'
players_df["name"][354] = 'Joaquín'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][223] = 'Joaquín Fernández'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][354] = 'Joaquín'


Para los "Juanfran".

In [190]:
duplicate = list(np.where(players_df["name"] == 'Juanfran')) 
duplicate 

[array([37, 65])]

In [191]:
print(players_df["team"][37], players_df["team"][65])

Atlético_de_Madrid CD_Leganés


In [192]:
players_df["name"][37] = 'Juanfran Torres'
players_df["name"][65] = 'Juanfran'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][37] = 'Juanfran Torres'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][65] = 'Juanfran'


Para los "Koke".

In [193]:
duplicate = list(np.where(players_df["name"] == 'Koke')) 
duplicate 

[array([ 44, 195])]

In [194]:
print(players_df["team"][44], players_df["team"][195])

Atlético_de_Madrid Levante_UD


In [195]:
players_df["name"][44] = 'Koke'
players_df["name"][195] = 'Koke Vegas'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][44] = 'Koke'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][195] = 'Koke Vegas'


Para los "Nacho".

In [196]:
duplicate = list(np.where(players_df["name"] == 'Nacho')) 
duplicate 

[array([229, 375])]

In [197]:
print(players_df["team"][229], players_df["team"][375])

R_Valladolid_CF Real_Madrid


In [198]:
players_df["name"][229] = 'Nacho Martínez'
players_df["name"][375] = 'Nacho Fernández'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][229] = 'Nacho Martínez'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][375] = 'Nacho Fernández'


Para los "Rodrigo".

In [199]:
duplicate = list(np.where(players_df["name"] == 'Rodrigo')) 
duplicate 

[array([ 46, 525])]

In [200]:
print(players_df["team"][46], players_df["team"][525])

Atlético_de_Madrid Valencia_CF


In [201]:
players_df["name"][46] = 'Rodri'
players_df["name"][525] = 'Rodrigo'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][46] = 'Rodri'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][525] = 'Rodrigo'


Para los "Sergio A.".

In [202]:
duplicate = list(np.where(players_df["name"] == 'Sergio A.')) 
duplicate 

[array([284, 441])]

In [203]:
print(players_df["team"][284], players_df["team"][441])

RC_Celta SD_Eibar


In [204]:
players_df["name"][284] = 'Sergio Álvarez'
players_df["name"][441] = 'Sergio Álvarez Díaz' # Ojo que este no coincide con el nombre en market_values (allí es 'Sergio Álvarez'; lo cambiamos para que sea valor único).

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][284] = 'Sergio Álvarez'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][441] = 'Sergio Álvarez Díaz' # Ojo que este no coincide con el nombre en market_values (allí es 'Sergio Álvarez'; lo cambiamos para que sea valor único).


Para los "Vázquez".

In [205]:
duplicate = list(np.where(players_df["name"] == 'Vázquez')) 
duplicate 

[array([293, 498])]

In [206]:
print(players_df["team"][293], players_df["team"][498])

RC_Celta Sevilla_FC


In [207]:
players_df["name"][293] = 'Kevin Vázquez'
players_df["name"][498] = 'Franco Vázquez'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][293] = 'Kevin Vázquez'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  players_df["name"][498] = 'Franco Vázquez'


Comprobamos que ya no quedan valores duplicados en esta columna.

In [209]:
len(players_df.name)

556

In [210]:
len(players_df.name.unique())

556

### Resultado Final

In [227]:
players_df.head(10)

Unnamed: 0,team,position,shirt_number,name,minutes_played,games_played,percentage_of_games_played,full_games_played,percentage_of_full_games_played,games_started,...,corners,tackles.1,duels,man-to-man_duels,aerial_duels,passes,short_passes,long_passes,through_balls,goals_scored_per_attempt
0,Athletic_Club,Goalkeeper,unknown,Hodei Oleaga,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
1,Athletic_Club,Goalkeeper,1.0,A. Remiro,0.0,0,0.00%,0,0.00%,0,...,0,0,0,0,0,0.0,0.0,0,0,0
2,Athletic_Club,Goalkeeper,13.0,Herrerín,2.79,31,82.00%,31,82.00%,31,...,0,0,25,6,19,887.0,128.0,759,1,0
3,Athletic_Club,Goalkeeper,25.0,Unai Simón,630.0,7,18.00%,7,18.00%,7,...,0,0,3,2,1,155.0,49.0,106,0,0
4,Athletic_Club,Defender,3.0,Núñez,1.063,12,32.00%,11,29.00%,11,...,0,15,107,38,69,536.0,457.0,78,1,0
5,Athletic_Club,Defender,4.0,I. Martínez,2.903,33,87.00%,32,84.00%,33,...,0,53,329,162,167,1.44,1.074,361,5,0
6,Athletic_Club,Defender,5.0,Yeray,2.614,30,79.00%,28,74.00%,30,...,0,55,296,126,170,1.284,1.051,233,0,0
7,Athletic_Club,Defender,6.0,San José,1.716,33,87.00%,15,39.00%,16,...,0,42,243,141,102,707.0,637.0,64,6,0
8,Athletic_Club,Defender,12.0,Yuri B.,3.12,35,92.00%,33,87.00%,35,...,2,48,373,254,119,1.367,1.273,92,2,1
9,Athletic_Club,Defender,15.0,I. Lekue,210.0,4,11.00%,2,5.00%,2,...,2,5,29,20,9,85.0,75.0,10,0,0


### Exportamos a csv.

In [225]:
players_df.to_csv(r'data/players_stats.csv', index=False)