# Regresión usando scikit-learn
<br><br><br>
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Scikit_learn_logo_small.svg/245px-Scikit_learn_logo_small.svg.png">
<br><br><br>
Usaremos una base de datos de fútbol europeo, la cual tiene más de 25,000 partidos y más de 10,000 jugadores para las temporadas de fútbol profesional europeo de 2008 a 2016.

In [None]:
# importamos las librerias
import sqlite3
import pandas as pd 
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
import numpy as np

<br><br>
#### Creamos una conexión a la base de datos y leemos la tabla "Player_Attributes"

In [None]:
# Open the SQLite database and load the whole "Player_Attributes" table into a
# DataFrame. The connection is closed in a `finally` clause so the database
# file is released even if the query raises.
cnx = sqlite3.connect('european_soccer/database.sqlite')
try:
    df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)
finally:
    cnx.close()

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# List every column name available in the table, to choose the features from.
df.columns

#### De todas las columnas determinaremos cuáles usaremos como "features" para alimentar nuestro modelo

In [None]:

# Names of the columns used as model inputs.
# The order of this list fixes the column order of X (X = df[features]).
features = [
    'potential',
    'crossing',
    'finishing',
    'heading_accuracy',
    'short_passing',
    'volleys',
    'dribbling',
    'curve',
    'free_kick_accuracy',
    'long_passing',
    'ball_control',
    'acceleration',
    'sprint_speed',
    'agility',
    'reactions',
    'balance',
    'shot_power',
    'jumping',
    'stamina',
    'strength',
    'long_shots',
    'aggression',
    'interceptions',
    'positioning',
    'vision',
    'penalties',
    'marking',
    'standing_tackle',
    'sliding_tackle',
    # Columns carrying the "gk_" prefix.
    'gk_diving',
    'gk_handling',
    'gk_kicking',
    'gk_positioning',
    'gk_reflexes',
]

<br><br>
#### Seleccionamos el target 

In [None]:
# Column the model learns to predict. It is kept inside a list so that
# `df[target]` selects a one-column DataFrame rather than a Series.
target = [
    'overall_rating',
]

<br><br>
#### Limpiamos los datos eliminando filas con valores nan

In [None]:
# Drop only the rows that are missing a value the model actually needs
# (any feature column or the target). A bare dropna() would also discard rows
# whose only gaps are in columns that never reach the model, throwing away
# usable training data.
df = df.dropna(subset=features + target)

<br><br>
#### Separamos las features en X y el target en y 

In [None]:
# Split the table into the model inputs (X) and the value to predict (y).
# Both selections use a list of column names, so both are DataFrames.
X = df.loc[:, features]
y = df.loc[:, target]

<br><br>
#### Miramos cómo luce una fila típica

In [None]:
# Inspect a single sample (positional row 3); iloc shows one row in a
# convenient, vertical layout.
X.iloc[3, :]

In [None]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

In [None]:
# Scatter of a single feature ("reactions") against the target to eyeball how
# strongly the two are related. The low alpha keeps dense regions readable
# when many points overlap.
fig, ax = plt.subplots()
ax.scatter(X['reactions'], y, color='darkgreen', label="Data", alpha=.1)
# Label the axes and add a title so the figure can be read on its own.
# y is a one-column DataFrame, so its only column name is the target's name.
ax.set(
    xlabel='reactions',
    ylabel=y.columns[0],
    title=f"reactions vs {y.columns[0]}",
)
plt.show()

<p style="font-family: Arial; font-size:1.75em;color:purple; font-weight:bold">
    <br>
Separamos los datos en Training y Test Datasets
<br><br></p>

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=324,
)

In [None]:
# Fit the scaler on the training split only, then apply it to that same split.
# NOTE(review): X_train is overwritten here, so re-running this cell refits
# the scaler on already-scaled data -- restart and run all after any edit.
scale = preprocessing.StandardScaler().fit(X_train)
X_train = scale.transform(X_train)

<br><br>
#### Creamos una instancia del modelo "LinearRegression" de Scikit Learn

In [None]:
# Linear regression estimator created with its default constructor arguments.
regressor = LinearRegression()

<br><br>
#### Ajustamos el modelo a los datos de entrenamiento

In [None]:
# Fit the linear model on the scaled training features and their targets.
regressor.fit(X=X_train, y=y_train)

<br><br>
#### Llevamos a cabo una predicción utilizando el set de testeo que reservamos para tal fin

In [None]:
# Apply the scaler that was fitted on the training split to the held-out features.
# NOTE(review): X_test is overwritten in place, so running this cell twice
# scales it twice and silently corrupts the predictions -- run it exactly once
# per kernel session.
X_test = scale.transform(X_test)
# Predict the target for the held-out rows.
y_prediction = regressor.predict(X_test)
# Residuals: predicted value minus actual value.
y_result = y_prediction - y_test
# Displayed as the cell output: dimensions of the prediction array.
y_prediction.shape

In [None]:

# Root-mean-squared error of the predictions on the held-out set.
mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
RMSE = sqrt(mse)

# Displayed as the cell output: the estimator's own score on the held-out data.
regressor.score(X_test, y_test)


In [None]:
# Show the root-mean-squared error computed in the previous cell.
print(RMSE)

In [None]:
# Fitted coefficients of the linear model. The model was trained on
# standardized inputs (see the StandardScaler cell), so each value is relative
# to the scaled feature, not to its raw units.
# NOTE(review): presumably one coefficient per entry of `features`, in the
# same order -- confirm against the array's shape.
regressor.coef_