# Regresión con scikit-learn

# Dataset: European Soccer Database from Kaggle

**Descripción del Conjunto de Datos:**
- +25.000 partidos
- +10.000 jugadores
- 11 Países
- Temporadas desde 2008 hasta 2016
- Atributos de los jugadores obtenidos del juego FIFA de EA Sports
- Alineación con coordenadas (X, Y)
- Estadísticas detalladas de cada partido


**Fuente:** <a href="https://www.kaggle.com">Kaggle</a>

**Más información:**
<a href="https://www.kaggle.com/hugomathien/soccer"> European Soccer Database</a>

# Importar librerías

In [1]:
%matplotlib inline
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import minmax_scale
from math import sqrt

# Importar datos

In [2]:
# Crear una Conexión a la Base de Datos
cnx = sqlite3.connect('../datasets/soccer/database.sqlite')
# Importar los Datos de la Base de Datos
df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)

In [3]:
df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [4]:
df.shape

(183978, 42)

In [5]:
df.columns

Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')

# Definir las variables que usaremos

In [6]:
features = [
       'potential', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes']

# Definir la variable objetivo

In [7]:
target = ['overall_rating']

# Limpiar los datos

In [8]:
df = df.dropna()

# Extraer los valores de las variables

In [9]:
X = df[features]
X.shape

(180354, 34)

# Extraer los valores del objetivo

In [10]:
y = df[target]
y.shape

(180354, 1)

# Consultar una muestra de las variables

In [11]:
X.iloc[2]

potential             66.0
crossing              49.0
finishing             44.0
heading_accuracy      71.0
short_passing         61.0
volleys               44.0
dribbling             51.0
curve                 45.0
free_kick_accuracy    39.0
long_passing          64.0
ball_control          49.0
acceleration          60.0
sprint_speed          64.0
agility               59.0
reactions             47.0
balance               65.0
shot_power            55.0
jumping               58.0
stamina               54.0
strength              76.0
long_shots            35.0
aggression            63.0
interceptions         41.0
positioning           45.0
vision                54.0
penalties             48.0
marking               65.0
standing_tackle       66.0
sliding_tackle        69.0
gk_diving              6.0
gk_handling           11.0
gk_kicking            10.0
gk_positioning         8.0
gk_reflexes            8.0
Name: 2, dtype: float64

# Consultar algunos valores del objetivo

In [12]:
y.head(7)

Unnamed: 0,overall_rating
0,67.0
1,67.0
2,62.0
3,61.0
4,61.0
5,74.0
6,74.0


In [13]:
y.tail()

Unnamed: 0,overall_rating
183973,83.0
183974,78.0
183975,77.0
183976,78.0
183977,80.0


# Dividir el conjunto de datos en Entrenamiento y Evaluación

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=324)

# Regresión Lineal
## La regresión líneal se usa para ajustar una línea a los datos
<img src="../images/linear-regression.png" align="middle" style="width:550px;height:360px"/>

## Crear el modelo con los datos de entrenamiento

In [15]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## Ejecutar el modelo con los datos de evaluación

In [16]:
y_prediction = regressor.predict(X_test)
y_prediction

array([[66.51284879],
       [79.77234615],
       [66.57371825],
       ...,
       [69.23780133],
       [64.58351696],
       [73.6881185 ]])

## Calcular estadísticas básicas de los datos de test

In [17]:
y_test.describe()

Unnamed: 0,overall_rating
count,59517.0
mean,68.635818
std,7.041297
min,33.0
25%,64.0
50%,69.0
75%,73.0
max,94.0


## Evaluar la precisión del módelo

In [18]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))

In [19]:
print(RMSE)

2.805303046855209


# Decision Tree Regression
## El arbol de decisión se usa para ajustar una curva a los datos
<img src="../images/decision-tree-regression.png" align="middle" style="width:550px;height:360px"/>

## Crear el modelo con los datos de entrenamiento

In [20]:
regressor = DecisionTreeRegressor(max_depth=10)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## Ejecutar el modelo con los datos de evaluación

In [21]:
y_prediction = regressor.predict(X_test)
y_prediction

array([62.73623188, 84.        , 62.08435374, ..., 71.44178082,
       62.43518519, 75.26771654])

## Calcular estadísticas básicas de los datos de test

In [22]:
y_test.describe()

Unnamed: 0,overall_rating
count,59517.0
mean,68.635818
std,7.041297
min,33.0
25%,64.0
50%,69.0
75%,73.0
max,94.0


## Evaluar la precisión del modelo

In [23]:
RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))
print(RMSE)

2.2337339035451005
