Es el momento de realizar el ajuste de vuestro modelo; en este caso tendréis que usar el csv que guardasteis ayer después de todo el preprocesamiento. Los objetivos de esta lección son:

Realizar el ajuste o ajustes de los modelos.

Sacad la matriz de confusión de vuestro modelo e identificad cuáles son los verdaderos positivos, verdaderos negativos, falsos positivos y falsos negativos.

In [1]:

# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


#  Modelado y matriz de confusión
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


#  Warning handling
# ------------------------------------------------------------------------------
import warnings
# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [2]:
# Load the first dataset, using the first CSV column as the row index.
# NOTE(review): judging by the file name this is the standardized, encoded and
# balanced version of the data — confirm against the preprocessing notebook.
df_esta = pd.read_csv(
    filepath_or_buffer="data/adults_esta_enco_balanceo.csv",
    index_col=0,
)
# Preview the first five rows (the default for head).
df_esta.head(n=5)

Unnamed: 0,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,RowNumber,CreditScore.1,Age.1,Tenure.1,Balance.1,NumOfProducts.1,HasCrCard,IsActiveMember,EstimatedSalary.1,gender_map,France,Germany,Spain
0,0,1.246488,-1.518201,0.68713,1.276364,-0.911583,-0.471235,4428,771,23,7,156123.73,1,1,0,72990.62,0,1,0,0
1,1,0.667069,0.198164,1.032908,-0.324886,0.807737,-0.123595,191,715,41,8,56214.85,2,0,0,92982.61,0,1,0,0
2,1,-1.733383,0.198164,-1.387538,0.670711,-0.911583,1.096511,3939,483,41,1,118334.44,1,0,0,163147.99,1,0,1,0
3,1,-0.191713,0.484225,-0.695982,0.918482,-0.911583,-1.138686,3304,632,44,3,133793.89,1,1,1,34607.14,0,1,0,0
4,0,-0.150326,-0.469311,-0.695982,-1.225848,0.807737,-0.962203,1767,636,34,3,0.0,2,1,1,44756.25,0,1,0,0


In [3]:
# Load the second dataset, using the first CSV column as the row index.
# NOTE(review): judging by the file name this is the encoded-only version of
# the data — confirm against the preprocessing notebook.
df_enco = pd.read_csv(
    filepath_or_buffer="data/adults_enco.csv",
    index_col=0,
)
# Preview the first five rows (the default for head).
df_enco.head(n=5)

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,gender_map,France,Germany,Spain
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,619,42,2,0.0,1,1,1,101348.88,1,0,1,0,0
2,608,41,1,83807.86,1,0,1,112542.58,0,0,0,0,1
3,502,42,8,159660.8,3,1,0,113931.57,1,0,1,0,0
4,699,39,1,0.0,2,0,0,93826.63,0,0,1,0,0
5,850,43,2,125510.82,1,1,1,79084.1,0,0,0,0,1


In [19]:
# DataFrame has no .reshape method; convert to a NumPy array first.
# The result is a single-column 2-D array holding every cell of df_enco
# (target column included), read row by row.
df_enco.to_numpy().reshape(-1, 1)

In [4]:
# Separate the predictors (every column except the target) from the target.
# NOTE(review): the df_esta preview shows RowNumber and unscaled ".1" duplicate
# columns still among the predictors — confirm they are meant to be features.
X1 = df_esta.drop(columns=["Exited"])
y1 = df_esta.loc[:, "Exited"]

In [5]:
# Preview the first five predictor rows (the default for head).
X1.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,RowNumber,CreditScore.1,Age.1,Tenure.1,Balance.1,NumOfProducts.1,HasCrCard,IsActiveMember,EstimatedSalary.1,gender_map,France,Germany,Spain
0,1.246488,-1.518201,0.68713,1.276364,-0.911583,-0.471235,4428,771,23,7,156123.73,1,1,0,72990.62,0,1,0,0
1,0.667069,0.198164,1.032908,-0.324886,0.807737,-0.123595,191,715,41,8,56214.85,2,0,0,92982.61,0,1,0,0
2,-1.733383,0.198164,-1.387538,0.670711,-0.911583,1.096511,3939,483,41,1,118334.44,1,0,0,163147.99,1,0,1,0
3,-0.191713,0.484225,-0.695982,0.918482,-0.911583,-1.138686,3304,632,44,3,133793.89,1,1,1,34607.14,0,1,0,0
4,-0.150326,-0.469311,-0.695982,-1.225848,0.807737,-0.962203,1767,636,34,3,0.0,2,1,1,44756.25,0,1,0,0


In [6]:
# Preview the first five target values (the default for head).
y1.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Exited, dtype: int64

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Define the logistic regression (n_jobs=-1 requests all available cores;
# the solver may run up to 1000 iterations).
log_reg_esta = LogisticRegression(max_iter=1000, n_jobs=-1)

# Fit the model on the training split
log_reg_esta.fit(X=x_train1, y=y_train1)

# Predictions for the training split
y_pred_train_esta = log_reg_esta.predict(X=x_train1)

# Predictions for the test split
y_pred_test_esta = log_reg_esta.predict(X=x_test1)

In [9]:
# Pair actual and predicted labels for each split, tagging every row with the
# name of the split it belongs to.
train_df_esta = pd.DataFrame(
    {
        'Real': y_train1,
        'Predicted': y_pred_train_esta,
        'Set': ['Train'] * len(y_train1),
    }
)
test_df_esta = pd.DataFrame(
    {
        'Real': y_test1,
        'Predicted': y_pred_test_esta,
        'Set': ['Test'] * len(y_test1),
    }
)
# Stack both frames row-wise (concat's default axis) and preview the top rows.
resultados = pd.concat([train_df_esta, test_df_esta])
resultados.head(n=5)

Unnamed: 0,Real,Predicted,Set
4097,1,0,Train
6486,1,1,Train
3307,0,1,Train
7466,1,1,Train
4414,0,1,Train


In [10]:
# Separate the predictors (every column except the target) from the target.
X2 = df_enco.drop(columns=["Exited"])
y2 = df_enco.loc[:, "Exited"]

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Second model: logistic regression on the df_enco features
# (n_jobs=-1 requests all available cores; up to 1000 solver iterations).
log_reg = LogisticRegression(n_jobs=-1, max_iter=1000)

# Fit the model on the df_enco training split
log_reg.fit(x_train2, y_train2)

# Predict with the estimator fitted just above. This cell used to call
# log_reg_esta, which was fitted on df_esta's column set and therefore
# rejected these frames with a feature-name ValueError.
y_pred_train = log_reg.predict(x_train2)

# Predictions for the df_enco test split
y_pred_test = log_reg.predict(x_test2)
