# Ejercicio
- Con el dataset del titanic:
    - Haz un EDA de los datos
    - Crea algunas columnas nuevas que creas que pueden ser interesantes
    - Separa los datos en features y target
    - Separa los datos en train y test
    - Haz un preprocesado de los datos
        - Trata los NaNs
        - Trata los outliers
        - Codifica las variables categóricas
        - Realiza transformaciones que consideres interesantes
        - Escala los datos
    - En un bucle `for` entrena todos los modelos de clasificación que conoces y encuentra cuál es el mejor.
    - Valida las métricas del mejor modelo mediante las técnicas **Hold Out**, **K Folds** y **Leave One Out**.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Modelo
from sklearn.linear_model import LinearRegression

# Herramientas de preprocesamiento
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, TargetEncoder, OneHotEncoder

# Split
from sklearn.model_selection import train_test_split

# Métricas
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

df = pd.read_csv("../Data/titanic.csv")
df.head(2)

Unnamed: 0,PassengerId,Name,Sex,Age,Pclass,Ticket,Fare,Sibsp,Parch,Embarked,Embark_Town,Survived
0,1,"Braund, Mr. Owen Harris",male,22.0,3,A/5 21171,7.25,1,0,C,Cherbourg,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,PC 17599,71.2833,0,0,S,Southampton,1


In [2]:
df.describe(include='all').T
# Con .T invertimos las cabeceras de las columnas y las ponemos en las filas 
# y ya nos da toda la informacion de las columnas(mediana, media, quartiles, desviacion,etc)

# El parametro "y" sera la columna "Survived"

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,890.0,,,,445.5,257.065167,1.0,223.25,445.5,667.75,890.0
Name,890.0,890.0,"Braund, Mr. Owen Harris",1.0,,,,,,,
Sex,890.0,2.0,male,576.0,,,,,,,
Age,713.0,,,,29.695891,14.536439,0.42,20.0,28.0,38.0,80.0
Pclass,890.0,,,,2.307865,0.83622,1.0,2.0,3.0,3.0,3.0
Ticket,890.0,680.0,CA. 2343,7.0,,,,,,,
Fare,890.0,,,,32.231685,49.714597,0.0,7.925,14.4542,31.0,512.3292
Sibsp,890.0,,,,0.522472,1.103247,0.0,0.0,0.0,1.0,8.0
Parch,890.0,,,,0.382022,0.806409,0.0,0.0,0.0,0.0,6.0
Embarked,888.0,3.0,S,643.0,,,,,,,


### EDA
* Revisar columnas y quitar nulos, huecos vacios , etc y convertir a numericos para 
poder utilizarlos en los metodos de prediccion

In [3]:
print(df.describe())

       PassengerId         Age      Pclass        Fare       Sibsp  \
count   890.000000  713.000000  890.000000  890.000000  890.000000   
mean    445.500000   29.695891    2.307865   32.231685    0.522472   
std     257.065167   14.536439    0.836220   49.714597    1.103247   
min       1.000000    0.420000    1.000000    0.000000    0.000000   
25%     223.250000   20.000000    2.000000    7.925000    0.000000   
50%     445.500000   28.000000    3.000000   14.454200    0.000000   
75%     667.750000   38.000000    3.000000   31.000000    1.000000   
max     890.000000   80.000000    3.000000  512.329200    8.000000   

            Parch    Survived  
count  890.000000  890.000000  
mean     0.382022    0.384270  
std      0.806409    0.486696  
min      0.000000    0.000000  
25%      0.000000    0.000000  
50%      0.000000    0.000000  
75%      0.000000    1.000000  
max      6.000000    1.000000  


In [4]:
# Vemos la columnas
df.columns

Index(['PassengerId', 'Name', 'Sex', 'Age', 'Pclass', 'Ticket', 'Fare',
       'Sibsp', 'Parch', 'Embarked', 'Embark_Town', 'Survived'],
      dtype='object')

In [5]:
# Vemos la cantidad de nulos
df.isnull().sum()

PassengerId      0
Name             0
Sex              0
Age            177
Pclass           0
Ticket           0
Fare             0
Sibsp            0
Parch            0
Embarked         2
Embark_Town      2
Survived         0
dtype: int64

In [6]:
# Calculamos la mediana de la edad y rellenamos los nulos
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [7]:
# Comprobamos que y ano hay nulos en la columna "Age"
df['Age'].isnull().sum()

0

In [9]:
# Hacemos lo mismo con la columna "Embarked" y "Embark_town"
Embarked_median = df['Embarked'].median()
df['Embarked'] = df['Embarked'].fillna(Embarked_median)

Embark_town_median = df['Embark_town'].median()
df['Embark_town'] = df['Embark_town'].fillna(Embark_town_median)

TypeError: Cannot convert ['C' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S'
 'C' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'C' 'Q' 'S' 'C' 'S' 'C'
 'S' 'S' 'C' 'S' 'S' 'C' 'C' 'Q' 'S' 'Q' 'Q' 'C' 'S' 'S' 'S' 'C' 'S' 'C'
 'S' 'S' 'C' 'S' 'S' 'C' nan 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'Q' 'S' 'C' 'S' 'S' 'C' 'S' 'Q' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'C' 'Q'
 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'Q' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'Q' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'Q' 'S' 'Q'
 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'Q' 'C' 'S' 'S' 'S' 'S' 'Q' 'C' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'C' 'C' 'S' 'C' 'S' 'Q' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'Q' 'S' 'Q' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S'
 'Q' 'S' 'C' 'C' 'S' 'S' 'C' 'C' 'S' 'S' 'C' 'Q' 'Q' 'S' 'Q' 'S' 'S' 'C'
 'C' 'C' 'C' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'Q' 'S' 'S'
 'C' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'Q' 'Q' 'S'
 'C' 'C' 'S' 'Q' 'S' 'C' 'C' 'Q' 'C' 'C' 'S' 'S' 'C' 'S' 'C' 'S' 'C' 'C'
 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'Q' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S'
 'S' 'C' 'C' 'S' 'C' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q'
 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'Q' 'Q' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'C' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'C'
 'C' 'C' 'Q' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'C' 'S' 'S' 'S' 'C' 'S' 'C' 'S'
 'S' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'S' 'C' 'S' 'Q' 'C' 'S' 'S' 'C' 'C' 'S'
 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S'
 'S' 'C' 'S' 'S' 'C' 'S' 'C' 'C' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'Q' 'S'
 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'Q'
 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'Q' 'S' 'S' 'Q' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'C' 'C' 'S' 'C' 'S'
 'S' 'S' 'S' 'S' 'Q' 'Q' 'S' 'S' 'Q' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'Q' 'C' 'S' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'C' 'S' 'S' 'S' 'Q' 'C' 'S' 'C' 'S' 'C'
 'Q' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'Q' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'C' 'S'
 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'Q' 'Q' 'S' 'S' 'S' 'S' 'C' 'S'
 'S' 'Q' 'S' 'Q' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'C' 'Q' 'S' 'S'
 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S'
 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'Q' 'S' 'C' 'Q'
 nan 'C' 'S' 'C' 'S' 'S' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'C' 'C' 'S' 'S' 'S'
 'C' 'S' 'C' 'S' 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'S' 'S'
 'S' 'C' 'S' 'S' 'S' 'S' 'S' 'S' 'S' 'C' 'C' 'S' 'S' 'S' 'C' 'S' 'S' 'S'
 'S' 'S' 'Q' 'S' 'S' 'S' 'C' 'Q'] to numeric