In [113]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

from sklearn.pipeline import Pipeline

from feature_engine.imputation import(
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder
)

from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

In [114]:
import my_preprocessors as mypp #nuestra libraria

In [115]:
pd.pandas.set_option('display.max_columns', None) #habilitamos despliegue maximo de columnas

In [116]:
data = pd.read_csv('Marqueting.csv', encoding='latin-1')
data.head()

Unnamed: 0,MIS,Cumpleaños,NivelEducativo,EstadoCivil,Ingreso,CantNiños,CantAdolecente,InicioCliente,AlimentosFrescos,Vinos,Frutas,Carnes,Pescado,Dulces,Oro,ComprasOfertas,ComprasWeb,ComprasCatalogo,ComprasTienda,VisitasWeb,AceptaCompra1,AceptaCompra2,AceptaCompra3,AceptaCompra4,AceptaCompra5,Quejas,CostoContacto,CostoIngresos,Respuestas,TotalProductos,TotalCompras,Edad,Antiguedad,TieneNiños,TieneAdolecentes
0,5524,1957,Graduation,Single,58138.0,0,0,4/09/2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1,1675,32,65,10,0,0
1,2174,1954,Graduation,Single,46344.0,1,1,8/03/2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0,65,11,68,8,1,1
2,4141,1965,Graduation,Together,71613.0,0,0,21/08/2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0,802,25,57,9,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,10/02/2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0,79,14,38,8,1,0
4,5324,1981,PhD,Married,58293.0,1,0,19/01/2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0,516,24,41,8,1,0


In [117]:
data.isnull().sum()

MIS                  0
Cumpleaños           0
NivelEducativo       0
EstadoCivil          0
Ingreso             24
CantNiños            0
CantAdolecente       0
InicioCliente        0
AlimentosFrescos     0
Vinos                0
Frutas               0
Carnes               0
Pescado              0
Dulces               0
Oro                  0
ComprasOfertas       0
ComprasWeb           0
ComprasCatalogo      0
ComprasTienda        0
VisitasWeb           0
AceptaCompra1        0
AceptaCompra2        0
AceptaCompra3        0
AceptaCompra4        0
AceptaCompra5        0
Quejas               0
CostoContacto        0
CostoIngresos        0
Respuestas           0
TotalProductos       0
TotalCompras         0
Edad                 0
Antiguedad           0
TieneNiños           0
TieneAdolecentes     0
dtype: int64

In [118]:
X_train, X_test, y_train, y_test = train_test_split(
        data.drop(['MIS', 'Respuestas'], axis=1),
        data['Respuestas'],
        test_size=0.1,
        random_state=2022)

X_train.shape, X_test.shape

((2016, 33), (224, 33))

In [119]:
X_train.head()

Unnamed: 0,Cumpleaños,NivelEducativo,EstadoCivil,Ingreso,CantNiños,CantAdolecente,InicioCliente,AlimentosFrescos,Vinos,Frutas,Carnes,Pescado,Dulces,Oro,ComprasOfertas,ComprasWeb,ComprasCatalogo,ComprasTienda,VisitasWeb,AceptaCompra1,AceptaCompra2,AceptaCompra3,AceptaCompra4,AceptaCompra5,Quejas,CostoContacto,CostoIngresos,TotalProductos,TotalCompras,Edad,Antiguedad,TieneNiños,TieneAdolecentes
876,1971,Master,Divorced,42835.0,1,1,30/06/2013,64,379,4,93,12,9,98,7,6,6,4,6,0,0,0,0,0,0,3,11,659,29,51,9,1,1
912,1963,PhD,Widow,52278.0,0,1,25/01/2013,24,953,0,71,0,0,174,6,10,5,10,8,0,0,0,0,0,0,3,11,1222,39,59,9,0,1
683,1955,Graduation,Together,38946.0,0,1,24/10/2013,84,116,6,82,6,6,41,2,3,1,6,5,0,0,0,0,0,0,3,11,341,17,67,9,0,1
2080,1973,Graduation,Married,27803.0,1,0,26/08/2012,40,8,26,46,38,9,49,2,3,0,4,8,0,0,0,0,0,0,3,11,216,17,49,10,1,0
467,1970,Graduation,Widow,31880.0,1,0,31/10/2012,13,4,1,5,2,0,3,1,1,0,2,8,0,0,0,0,0,0,3,11,28,12,52,10,1,0


In [120]:
### Transformación al Target
#y_train = np.log(y_train)
#y_test = np.log(y_test)

In [121]:
#Variables categoricas con NA
CATEGORICAL_VARS_WITH_NA_FREQUENT = []

#Variable categoricas con NA pero indicador de Missing
CATEGORICAL_VARS_WITH_NA_MISSING = []

#Variables para hacer mapeo categorico por codificación ordinal
NivelEducativo_VARS = ['NivelEducativo']

EstadoCivil_VARS = ['EstadoCivil']

#Variables numéricas con NA
NUMERICAL_VARS_WITH_NA = ['Ingreso']

DROP_FEATURES = ['MIS', 'InicioCliente','Cumpleaños']

NUMERICALS_LOG_VARS = ['ComprasTienda','VisitasWeb','Ingreso']

NUMERICALS_YEJ_VARS = ['AlimentosFrescos','Vinos','Carnes','Oro','ComprasTienda','TotalProductos','Edad']

#Variables para binarización por sesgo fuerte
BINARIZE_VARS = ['AceptaCompra1','AceptaCompra2','AceptaCompra3','AceptaCompra4','AceptaCompra5','Quejas','TieneNiños','TieneAdolecentes']

#Variables categoricas a codificar sin ordinalidad
CATEGORICAL_VARS = ['NivelEducativo' ]

TOTAL_PRODUCTOS = ['AlimentosFrescos','Vinos','Frutas','Carnes','Dulces','Oro']

VACIO_MAPPINGS = ['']
#Mapeos de variables categoricas
NivelEducativo_MAPPINGS = {'Graduation':1, 'PhD':2, 'Master':3, 'Basic':4, '2n Cycle':5, 'Missing':0, 'NA':0, 'NaN':0}

EstadoCivil_MAPPINGS = {'Married':1, 'Single':2, 'Together':3, 'Divorced':4, 'Widow':5, 'Absurd':6, 'Alone':7, 'YOLO':8, 'Missing':0, 'NA':0, 'NaN':0}

#Variables seleccionadas según análisis de Lasso
FEATURES = [
    'Ingreso','EstadoCivil', 'AlimentosFrescos', 'Frutas', 'Carnes', 'Pescado', 'Oro',
       'ComprasCatalogo', 'ComprasTienda', 'VisitasWeb', 'AceptaCompra1',
       'AceptaCompra2', 'AceptaCompra3', 'AceptaCompra4', 'AceptaCompra5',
       'TotalCompras', 'Antiguedad', 'TieneNiños', 'TieneAdolecentes','Quejas','NivelEducativo','Vinos','TotalProductos','Edad',
]

In [122]:
#Selección de variables para entrenamiento
X_train = X_train[FEATURES]
X_train.head()

Unnamed: 0,Ingreso,EstadoCivil,AlimentosFrescos,Frutas,Carnes,Pescado,Oro,ComprasCatalogo,ComprasTienda,VisitasWeb,AceptaCompra1,AceptaCompra2,AceptaCompra3,AceptaCompra4,AceptaCompra5,TotalCompras,Antiguedad,TieneNiños,TieneAdolecentes,Quejas,NivelEducativo,Vinos,TotalProductos,Edad
876,42835.0,Divorced,64,4,93,12,98,6,4,6,0,0,0,0,0,29,9,1,1,0,Master,379,659,51
912,52278.0,Widow,24,0,71,0,174,5,10,8,0,0,0,0,0,39,9,0,1,0,PhD,953,1222,59
683,38946.0,Together,84,6,82,6,41,1,6,5,0,0,0,0,0,17,9,0,1,0,Graduation,116,341,67
2080,27803.0,Married,40,26,46,38,49,0,4,8,0,0,0,0,0,17,10,1,0,0,Graduation,8,216,49
467,31880.0,Widow,13,1,5,2,3,0,2,8,0,0,0,0,0,12,10,1,0,0,Graduation,4,28,52


#### Meachine Learing PipeLine

In [132]:
housePrice_pipeline = Pipeline ([
    #==================== IMPUTACIONES ============#
    #1. Faltantes en variables numericas
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),
    
    #2. Imputación de mediana para variables categoricas
    ('mean_imputacion',MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)),
    
    #============= TRANSFORMACIÓN DE VARIABLES NUMÉRICAS =============
    
    #3. Transformación logaritmica
    #('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    #4. Binarización de Variables con Sesgo Fuerte
    ('binarizer', SklearnTransformerWrapper(
        transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),
    
    #=============== CODIFICACION DE VARIABLES CATEGORICAS ORDINALES ==============
    ('mapper_NivelEducativo', mypp.Mapper(variables=NivelEducativo_VARS, mappings=NivelEducativo_MAPPINGS)),
    
    ('mapper_EstadoCivil', mypp.Mapper(variables=EstadoCivil_VARS, mappings=EstadoCivil_MAPPINGS)),
    
    #=========== SCALER ==============
    ('scaler', MinMaxScaler()),
    
    #=========== ENTRENAMIENTO DEL MODELO ============
    #('Lasso', Lasso(alpha=0.01, random_state=2022)),

])

In [133]:
housePrice_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_indicator',
                 AddMissingIndicator(variables=['Ingreso'])),
                ('mean_imputacion',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['Ingreso'])),
                ('binarizer',
                 SklearnTransformerWrapper(transformer=Binarizer(threshold=0),
                                           variables=['AceptaCompra1',
                                                      'AceptaCompra2',
                                                      'AceptaCompra3',
                                                      'AceptaCompra4',
                                                      'AceptaCompra5', 'Quejas',
                                                      'TieneNiñ...
                ('mapper_NivelEducativo',
                 Mapper(mappings={'2n Cycle': 5, 'Basic': 4, 'Graduation': 1,
                                  'Master': 3, 'Missing': 0, 'NA': 0, 'NaN': 0,
  

In [134]:
#Seleccionamos variables para predicción
X_test = X_test[FEATURES]

In [135]:
#preds = housePrice_pipeline.predict(X_test)

In [137]:
housePrice_pipeline.transform(X_test)

array([[0.05122448, 0.14285714, 0.94949495, ..., 0.04656577, 0.30097087,
        0.        ],
       [0.02146823, 0.        , 0.6969697 , ..., 0.06325184, 0.15533981,
        0.        ],
       [0.10237827, 0.28571429, 0.50505051, ..., 0.41870392, 0.42718447,
        0.        ],
       ...,
       [0.07887676, 0.28571429, 0.7979798 , ..., 0.1024447 , 0.2815534 ,
        0.        ],
       [0.05409092, 0.14285714, 0.82828283, ..., 0.06092355, 0.19417476,
        0.        ],
       [0.05275245, 0.28571429, 0.88888889, ..., 0.03686457, 0.40776699,
        0.        ]])

In [138]:
from sklearn.metrics import mean_squared_error 

In [139]:
X_test

Unnamed: 0,Ingreso,EstadoCivil,AlimentosFrescos,Frutas,Carnes,Pescado,Oro,ComprasCatalogo,ComprasTienda,VisitasWeb,AceptaCompra1,AceptaCompra2,AceptaCompra3,AceptaCompra4,AceptaCompra5,TotalCompras,Antiguedad,TieneNiños,TieneAdolecentes,Quejas,NivelEducativo,Vinos,TotalProductos,Edad
616,35791.0,Single,94,0,5,0,3,0,3,8,0,0,0,0,0,14,9,1,1,0,Master,27,129,57
2133,16005.0,Married,69,3,2,20,47,1,2,8,0,0,0,0,0,16,10,1,0,0,Basic,1,172,42
2117,69805.0,Together,50,71,174,13,20,8,11,2,0,0,0,0,0,29,8,0,1,0,PhD,750,1088,70
1149,26576.0,Single,40,0,8,0,9,0,2,9,1,0,0,0,0,13,10,1,0,0,Graduation,10,67,36
596,23910.0,Divorced,80,12,18,7,13,0,3,7,0,0,0,0,0,13,10,1,0,0,Graduation,16,147,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,54450.0,Together,0,0,171,8,32,2,8,8,0,0,0,0,0,39,10,1,1,0,Graduation,454,684,66
567,29298.0,Together,60,0,2,2,1,0,2,5,0,0,0,0,0,9,9,1,1,0,Master,6,71,71
89,54178.0,Together,79,9,39,4,7,2,5,2,0,0,0,0,0,12,8,0,1,0,Graduation,135,273,55
1522,37697.0,Single,82,6,21,11,8,1,3,6,0,0,0,0,0,13,8,1,0,0,Graduation,34,166,46


In [140]:
#Guardamos pipeline
joblib.dump(housePrice_pipeline, 'housePrice_pipeline.pkl')

['housePrice_pipeline.pkl']

In [141]:
X_train

Unnamed: 0,Ingreso,EstadoCivil,AlimentosFrescos,Frutas,Carnes,Pescado,Oro,ComprasCatalogo,ComprasTienda,VisitasWeb,AceptaCompra1,AceptaCompra2,AceptaCompra3,AceptaCompra4,AceptaCompra5,TotalCompras,Antiguedad,TieneNiños,TieneAdolecentes,Quejas,NivelEducativo,Vinos,TotalProductos,Edad
876,42835.0,Divorced,64,4,93,12,98,6,4,6,0,0,0,0,0,29,9,1,1,0,Master,379,659,51
912,52278.0,Widow,24,0,71,0,174,5,10,8,0,0,0,0,0,39,9,0,1,0,PhD,953,1222,59
683,38946.0,Together,84,6,82,6,41,1,6,5,0,0,0,0,0,17,9,0,1,0,Graduation,116,341,67
2080,27803.0,Married,40,26,46,38,49,0,4,8,0,0,0,0,0,17,10,1,0,0,Graduation,8,216,49
467,31880.0,Widow,13,1,5,2,3,0,2,8,0,0,0,0,0,12,10,1,0,0,Graduation,4,28,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,80398.0,Married,92,51,936,207,26,8,12,3,0,0,1,0,0,29,10,0,0,0,Graduation,342,1689,56
624,72635.0,Married,54,22,323,104,107,8,6,3,0,0,0,1,0,24,9,0,0,0,2n Cycle,390,1035,67
173,53537.0,Together,17,0,6,0,6,1,3,5,0,0,0,0,0,13,8,1,1,0,PhD,81,110,63
1244,27683.0,Together,90,9,121,12,45,2,4,8,0,0,0,0,0,24,10,1,0,0,PhD,152,441,44


In [142]:
mean_squared_error(np.exp(y_test),np.exp(preds),squared=False)

0.5926761989450151