In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

Verificamos que el archivo sqlite existe con este comando en Databricks:
>
```dbutils.fs.ls("/FileStore/tables/futbol.sqlite")```

# 1. Lectura de los datos
Con el archivo `futbol.sqlite` importamos la tabla `Player_Attributes` en un dataframe de pandas para analizar los datos, aquí usamos spark en un notebook creado por *databricks*, en jupyter se podría usar la librería `sqlite3` para cargar los datos

In [0]:
file_path = "dbfs:/FileStore/tables/futbol.sqlite"
copy_path = "file:/tmp/futbol.sqlite"

# Copiar el archivo sqlite temporalmente si no existe ya una copia
if not dbutils.fs.cp(file_path, copy_path, True):
      dbutils.fs.cp(file_path, copy_path)

# DataFrame de Spark
futbol_df = spark.read.format('jdbc') \
      .options(driver='org.sqlite.JDBC', dbtable='Player_Attributes',
      url=f'jdbc:sqlite:{copy_path}').load()

# Spark DataFrame a Pandas
futbol_pd = futbol_df.toPandas()

In [0]:
display(futbol_pd.head(10))

id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
6,189615,155782,2016-04-21 00:00:00,74.0,76.0,left,high,medium,80.0,53.0,58.0,71.0,40.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,62.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
7,189615,155782,2016-04-07 00:00:00,74.0,76.0,left,high,medium,80.0,53.0,58.0,71.0,32.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,60.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
8,189615,155782,2016-01-07 00:00:00,73.0,75.0,left,high,medium,79.0,52.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,59.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
9,189615,155782,2015-12-24 00:00:00,73.0,75.0,left,high,medium,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
10,189615,155782,2015-12-17 00:00:00,73.0,75.0,left,high,medium,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0



# 2. Preparación y transformación de datos

In [0]:
futbol_shape = futbol_pd.shape
print("Shape Player_Attributes:", futbol_shape)

Shape Player_Attributes: (183978, 42)



El tamaño de los datos almacenados son de casi 184 mil datos, a este punto es necesario verificar si el dataset necesita de una limpieza, es decir eliminar valores nulos o que sean ceros, tomar en cuenta que valores categoricos en caso de existir tomar para la limpieza, etc

In [0]:
# Para empezar con la limpieza, eliminamos todos los valores nulos que se encuentren en el dataframe
futbol_pd = futbol_pd.dropna(how='any')
print(f'Antes: {futbol_shape} vs Después: {futbol_pd.shape}')

Antes: (183978, 42) vs Después: (180354, 42)


In [0]:
display(futbol_pd.dtypes)

id                       int32
player_fifa_api_id       int32
player_api_id            int32
date                    object
overall_rating         float64
potential              float64
preferred_foot          object
attacking_work_rate     object
defensive_work_rate     object
crossing               float64
finishing              float64
heading_accuracy       float64
short_passing          float64
volleys                float64
dribbling              float64
curve                  float64
free_kick_accuracy     float64
long_passing           float64
ball_control           float64
acceleration           float64
sprint_speed           float64
agility                float64
reactions              float64
balance                float64
shot_power             float64
jumping                float64
stamina                float64
strength               float64
long_shots             float64
aggression             float64
interceptions          float64
positioning            float64
vision  

Revisamos las columnas que no son de tipo numerico (categoricas), en este caso las que son object para determinar si son de utilidad

In [0]:
def print_unique_values(data_frame, dtype='object'):
    for column in data_frame.columns:
        if data_frame[column].dtype == dtype:
            print(f'{column}: {list(data_frame[column].unique())}')

print_unique_values(futbol_pd)

date: ['2016-02-18 00:00:00', '2015-11-19 00:00:00', '2015-09-21 00:00:00', '2015-03-20 00:00:00', '2007-02-22 00:00:00', '2016-04-21 00:00:00', '2016-04-07 00:00:00', '2016-01-07 00:00:00', '2015-12-24 00:00:00', '2015-12-17 00:00:00', '2015-10-16 00:00:00', '2015-09-25 00:00:00', '2015-01-09 00:00:00', '2014-12-05 00:00:00', '2014-11-07 00:00:00', '2014-09-18 00:00:00', '2014-05-02 00:00:00', '2014-04-04 00:00:00', '2014-03-14 00:00:00', '2013-12-13 00:00:00', '2013-11-08 00:00:00', '2013-10-04 00:00:00', '2013-09-20 00:00:00', '2013-05-03 00:00:00', '2013-03-22 00:00:00', '2013-03-15 00:00:00', '2013-02-22 00:00:00', '2013-02-15 00:00:00', '2012-08-31 00:00:00', '2012-02-22 00:00:00', '2011-08-30 00:00:00', '2010-08-30 00:00:00', '2010-02-22 00:00:00', '2009-08-30 00:00:00', '2009-02-22 00:00:00', '2008-08-30 00:00:00', '2015-10-09 00:00:00', '2014-12-12 00:00:00', '2014-04-18 00:00:00', '2014-01-31 00:00:00', '2013-11-29 00:00:00', '2013-05-31 00:00:00', '2013-04-26 00:00:00', '201

Observamos que `date` no es de utilidad, `attacking_work_rate` y `defensive_work_rate` tienen valores basura.
>
Teniendo en cuenta esto decidimos eliminar la columna `date` ya que es irrelevante.
>
Para attacking y defensive eliminamos los datos basura que no sean igual a 'high', 'medium' y 'low'

In [0]:
# Eliminamos la columna date
if 'date' in futbol_pd.columns:
    futbol_pd = futbol_pd.drop('date', axis=1)

# Marcamos en un array los valores validos para `attacking` y `defensive`
valid_attack_defense_values = ['high', 'medium', 'low']

# Tomamos en cuenta las filas que contengan los valores validos de `attacking` y `defensive`
futbol_pd = futbol_pd[(futbol_pd['attacking_work_rate'].isin(valid_attack_defense_values))]
futbol_pd = futbol_pd[(futbol_pd['defensive_work_rate'].isin(valid_attack_defense_values))]
display(futbol_pd.head(10))

id,player_fifa_api_id,player_api_id,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
1,218353,505942,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,218353,505942,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
3,218353,505942,62.0,66.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
4,218353,505942,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
5,218353,505942,61.0,65.0,right,medium,medium,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
6,189615,155782,74.0,76.0,left,high,medium,80.0,53.0,58.0,71.0,40.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,62.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
7,189615,155782,74.0,76.0,left,high,medium,80.0,53.0,58.0,71.0,32.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,60.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
8,189615,155782,73.0,75.0,left,high,medium,79.0,52.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,59.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
9,189615,155782,73.0,75.0,left,high,medium,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0
10,189615,155782,73.0,75.0,left,high,medium,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0


Verificamos que los valores validos para `attacking` y `defensive` sean los asignados como validos.
>
# Datos despues de dejar las columnas que tengan los valores para `attacking` y `defensive`

In [0]:
print(futbol_pd['attacking_work_rate'].unique())
print(futbol_pd['defensive_work_rate'].unique())
print(f'Antes: {futbol_shape} vs Después: {futbol_pd.shape}')

['medium' 'high' 'low']
['medium' 'high' 'low']
Antes: (183978, 42) vs Después: (176161, 41)


Una ves hecha la eliminación de datos basura, decidimos utilizar one-hot para dividir en columnas a tipo numericas que son categoricas, en este caso por cada categorica en `defensive`, `attacking` y `preferred_foot` añadiremos columnas rellenando con 0 las que no corresponden y con 1 las que sí

In [0]:
# Realizando one-hot-encoding de las columnas categóricas
futbol_pd = pd.get_dummies(futbol_pd, columns=['preferred_foot', 'attacking_work_rate', 'defensive_work_rate'])
display(futbol_pd.head(10))

  Unable to convert the field preferred_foot_left. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Unsupported type in conversion from Arrow: uint8
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


id,player_fifa_api_id,player_api_id,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,preferred_foot_left,preferred_foot_right,attacking_work_rate_high,attacking_work_rate_low,attacking_work_rate_medium,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium
1,218353,505942,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0,0,1,0,0,1,0,0,1
2,218353,505942,67.0,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0,0,1,0,0,1,0,0,1
3,218353,505942,62.0,66.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0,0,1,0,0,1,0,0,1
4,218353,505942,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0,0,1,0,0,1,0,0,1
5,218353,505942,61.0,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0,0,1,0,0,1,0,0,1
6,189615,155782,74.0,76.0,80.0,53.0,58.0,71.0,40.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,62.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,1,0,1,0,0,0,0,1
7,189615,155782,74.0,76.0,80.0,53.0,58.0,71.0,32.0,73.0,70.0,69.0,68.0,71.0,79.0,78.0,78.0,67.0,90.0,71.0,85.0,79.0,56.0,60.0,68.0,67.0,60.0,66.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,1,0,1,0,0,0,0,1
8,189615,155782,73.0,75.0,79.0,52.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,59.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,1,0,1,0,0,0,0,1
9,189615,155782,73.0,75.0,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,1,0,1,0,0,0,0,1
10,189615,155782,73.0,75.0,79.0,51.0,57.0,70.0,29.0,71.0,68.0,69.0,68.0,70.0,79.0,78.0,78.0,67.0,90.0,71.0,84.0,79.0,56.0,58.0,67.0,66.0,58.0,65.0,59.0,76.0,75.0,78.0,14.0,7.0,9.0,9.0,12.0,1,0,1,0,0,0,0,1


Una ves convertidas las variables categoricas a numericas, es hora de verificar la correlacion respecto a Penalties utilizando Pearson

In [0]:
# No es necesario eliminar los valores con 0, aunque haremos la verificación para entender los datos:
def print_zeros_values(data_frame):
    for column in data_frame.columns:
        print (f'{column}: {sum(data_frame[column] == 0)}')

print_zeros_values(futbol_pd)

id: 0
player_fifa_api_id: 0
player_api_id: 0
overall_rating: 0
potential: 0
crossing: 0
finishing: 0
heading_accuracy: 0
short_passing: 0
volleys: 0
dribbling: 0
curve: 0
free_kick_accuracy: 0
long_passing: 0
ball_control: 0
acceleration: 0
sprint_speed: 0
agility: 0
reactions: 0
balance: 0
shot_power: 0
jumping: 0
stamina: 0
strength: 0
long_shots: 0
aggression: 0
interceptions: 0
positioning: 0
vision: 0
penalties: 0
marking: 0
standing_tackle: 0
sliding_tackle: 0
gk_diving: 0
gk_handling: 0
gk_kicking: 0
gk_positioning: 0
gk_reflexes: 0
preferred_foot_left: 133102
preferred_foot_right: 43059
attacking_work_rate_high: 133410
attacking_work_rate_low: 167622
attacking_work_rate_medium: 51290
defensive_work_rate_high: 149189
defensive_work_rate_low: 157736
defensive_work_rate_medium: 45397



No existen valores con 0 en columnas anteriormente de tipo númericas, a excepción donde utilizamos one-hot para las columnas categoricas donde es normal existan valores con 0.
>
Teniendo en cuenta que tenemos 46 parametros despues de utilizar one-hot y dejar todos los datos a valores numericos, procederemos a quitar todas aquellos parametros que no tengan una correlacion moderada para arriba, siguiendo la siguiente tabla como referencia:

| Valor absoluto de $r$ | Fuerza de la relación |
|-----------------------|-----------------------|
| $$ r <0,25 $$           | Sin relación          |
| $$ 0,25 <r <0,5 $$      | Relación débil        |
| $$ 0,5 <r <0,75 $$      | Relación moderada     |
| $$ r> 0,75 $$           | Relación fuerte       |

Fuente: [Statologos. (2021). ¿Qué se considera una correlación «débil»? Statologos.](https://statologos.com/que-es-una-correlacion-debil/#google_vignette)

De esta manera verificamos las correlaciones en base al valor que se busca predecir `penalties`:

In [0]:
# Tomamos en cuenta las columnas que tienen una correlacion moderado con respecto a `penalties`, escogeremos las primeras 10 relevantes arriba de 0.5
correlation_penalties = futbol_pd.corr(method='pearson')['penalties']

# Moviendo los datos de correlación en forma ascendente
correlation_penalties = correlation_penalties[np.argsort(correlation_penalties, axis=0)[::-1]]

# Mostramos las 11 primeras correlaciones
display(correlation_penalties.head(11))

penalties             1.000000
positioning           0.754793
finishing             0.728901
long_shots            0.716380
volleys               0.715863
ball_control          0.685799
shot_power            0.682513
free_kick_accuracy    0.670293
vision                0.666163
dribbling             0.665066
curve                 0.651492
Name: penalties, dtype: float64

En caso de no tener más de 10 parámetros relevantes podríamos haber eliminado datos atípicios para verificar si la correlación mejoraba

In [0]:
futbol_pd_top_10 = futbol_pd[correlation_penalties.sort_values(ascending=False).head(11).keys()]
display(futbol_pd_top_10.head(10))

penalties,positioning,finishing,long_shots,volleys,ball_control,shot_power,free_kick_accuracy,vision,dribbling,curve
48.0,45.0,44.0,35.0,44.0,49.0,55.0,39.0,54.0,51.0,45.0
48.0,45.0,44.0,35.0,44.0,49.0,55.0,39.0,54.0,51.0,45.0
48.0,45.0,44.0,35.0,44.0,49.0,55.0,39.0,54.0,51.0,45.0
47.0,44.0,43.0,34.0,43.0,48.0,54.0,38.0,53.0,50.0,44.0
47.0,44.0,43.0,34.0,43.0,48.0,54.0,38.0,53.0,50.0,44.0
59.0,60.0,53.0,62.0,40.0,71.0,71.0,69.0,66.0,73.0,70.0
59.0,60.0,53.0,60.0,32.0,71.0,71.0,69.0,66.0,73.0,70.0
59.0,58.0,52.0,59.0,29.0,70.0,71.0,69.0,65.0,71.0,68.0
59.0,58.0,51.0,58.0,29.0,70.0,71.0,69.0,65.0,71.0,68.0
59.0,58.0,51.0,58.0,29.0,70.0,71.0,69.0,65.0,71.0,68.0


In [0]:
print(f'Antes: {futbol_shape} vs Después: {futbol_pd_top_10.shape}')

Antes: (183978, 42) vs Después: (176161, 11)


Una ves hecha la limpieza definitiva de los datos, necesitamos normalizar nuestro dataset, para esto usaremos la libreria `StandardScaler` de sklearn

In [0]:
scaler = StandardScaler()
futbol_normalized_array = scaler.fit_transform(futbol_pd_top_10)
futbol_pd_top_10_normalized = pd.DataFrame(futbol_normalized_array, columns=futbol_pd_top_10.columns)

display(futbol_pd_top_10_normalized.head(10))

penalties,positioning,finishing,long_shots,volleys,ball_control,shot_power,free_kick_accuracy,vision,dribbling,curve
-0.4462080000429676,-0.5821508605443947,-0.3177860308118029,-1.004306298615822,-0.3053714028973665,-0.9554141927645208,-0.4306887725168158,-0.5847890607093886,-0.2564311792213669,-0.4715488241267285,-0.4432013705785032
-0.4462080000429676,-0.5821508605443947,-0.3177860308118029,-1.004306298615822,-0.3053714028973665,-0.9554141927645208,-0.4306887725168158,-0.5847890607093886,-0.2564311792213669,-0.4715488241267285,-0.4432013705785032
-0.4462080000429676,-0.5821508605443947,-0.3177860308118029,-1.004306298615822,-0.3053714028973665,-0.9554141927645208,-0.4306887725168158,-0.5847890607093886,-0.2564311792213669,-0.4715488241267285,-0.4432013705785032
-0.5104930755414098,-0.6363098842483361,-0.3703281299744539,-1.0587096704118175,-0.3601201505255816,-1.0211970615302368,-0.492677161611234,-0.6408939986657455,-0.3223914758142396,-0.5278972889525612,-0.4979409557572213
-0.5104930755414098,-0.6363098842483361,-0.3703281299744539,-1.0587096704118175,-0.3601201505255816,-1.0211970615302368,-0.492677161611234,-0.6408939986657455,-0.3223914758142396,-0.5278972889525612,-0.4979409557572213
0.2609278304398966,0.2302344950147255,0.1550928616520552,0.4645847398760554,-0.5243663934102272,0.49180892008123,0.5611254529938736,1.098359077981316,0.5350923798931053,0.7681174020415887,0.9252882588894492
0.2609278304398966,0.2302344950147255,0.1550928616520552,0.3557779962840645,-0.9623563744359483,0.49180892008123,0.5611254529938736,1.098359077981316,0.5350923798931053,0.7681174020415887,0.9252882588894492
0.2609278304398966,0.1219164476068428,0.1025507624894043,0.301374624488069,-1.126602617320594,0.426026051315514,0.5611254529938736,1.098359077981316,0.4691320833002326,0.6554204723899235,0.815809088532013
0.2609278304398966,0.1219164476068428,0.0500086633267534,0.2469712526920736,-1.126602617320594,0.426026051315514,0.5611254529938736,1.098359077981316,0.4691320833002326,0.6554204723899235,0.815809088532013
0.2609278304398966,0.1219164476068428,0.0500086633267534,0.2469712526920736,-1.126602617320594,0.426026051315514,0.5611254529938736,1.098359077981316,0.4691320833002326,0.6554204723899235,0.815809088532013
