In [17]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import myPreprocessors as mypp #nuestra librerías de transformaciones.


import joblib

In [18]:
datos = pd.read_excel("dataset-servicios.xlsx")

## Imputación de datos

### Extraemos las  columnas NAN

In [19]:
cols_con_na = [col for col in datos.columns if (datos[col].isnull().mean() > 0)]
cols_con_na

['cod_cli',
 'locali',
 'per_aut',
 'cti_pro_a',
 'tit_pol',
 'ben_ser',
 'lug_asi',
 'cod_zon',
 'marca',
 'color',
 'año',
 'placa',
 'inciso',
 'tel_cas',
 'tel_ofi',
 'cel_con']

In [20]:
datos[cols_con_na].dtypes

cod_cli       object
locali        object
per_aut       object
cti_pro_a    float64
tit_pol       object
ben_ser       object
lug_asi       object
cod_zon      float64
marca         object
color         object
año          float64
placa         object
inciso       float64
tel_cas      float64
tel_ofi      float64
cel_con       object
dtype: object

In [21]:
cols_rescatables = [col for col in datos.columns if (datos[col].isnull().mean() <=0.05)]
print(cols_rescatables)

['num_exp', 'cod_est', 'num_pol', 'cod_com', 'nom_com', 'cod_enc', 'cti_pro_a', 'tit_pol', 'ben_ser', 'lug_asi', 'num_lin', 'cod_pro', 'nom_pro', 'mes', 'fec_ser', 'hor_ser', 'cod_zon', 'cod_pre', 'des_pre', 'cod_dep', 'nom_dep', 'cti_pro_b', 'nti_pro', 'estimado', 'Sin IVA', 'xcontacto', 'let_tip', 'let_cat', 'cod_are', 'fechor', 'cel_con', 'num']


In [22]:
data_rescatable = datos[cols_rescatables]

In [23]:
cols_con_na_rescatable = [col for col in data_rescatable.columns if (data_rescatable[col].isnull().mean() > 0)]
cols_con_na_rescatable

['cti_pro_a', 'tit_pol', 'ben_ser', 'lug_asi', 'cod_zon', 'cel_con']

In [24]:
data_rescatable_con_na = datos[cols_con_na_rescatable]
data_rescatable_con_na.dtypes

cti_pro_a    float64
tit_pol       object
ben_ser       object
lug_asi       object
cod_zon      float64
cel_con       object
dtype: object

#### 2. Train Test Split para Entrenamiento y Prueba

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    datos.drop(['marca'], axis=1),
    datos['estimado'],
    test_size=0.3,
    random_state=2022)

#### 3. Configuración del Pipeline

In [26]:
#imputación de variables categóricas @
CATEGORICAL_VARS_WITH_NA_FREQUENT = [ ]

#imputación de variables categoricas con indicador de Faltante (Missing) @
CATEGORICAL_VARS_WITH_NA_MISSING = ['tit_pol','ben_ser','lug_asi', 'cel_con']
    
#Imputación de variables numéricas @
NUMERICAL_VARS_WITH_NA = ['cti_pro_a', 'cod_zon']

#Variables temporales
TEMPORAL_VARS = ['YearRemodAdd']

#Año de Referencia
REF_VAR = []

#Variables para Binarización por sesgo
BINARIZE_VARS = []

#Variables que eliminaremos @
DROP_FEATURES = ["cod_cli", "locali", "per_aut",  
"marca", "color", "año",  "placa inciso", "tel_cas", "tel_ofi"]

#Variables para transfomraicón logarítmica
NUMERICAL_LOG_VARS = []

#Variables para codificación ordinal.
QUAL_VARS = []


#variables especiales
EXPOSURE_VARS = []

FINISH_VARS = []

GARAGE_VARS = []

FENCE_VARS = []

#Variables para codificación por frecuencia (no ordinal)
CATEGORICAL_VARS = []

#Mapeo para varibels categótricas para calidad.
QUAL_MAPPINGS = {}

EXPOSURE_MAPPINGS = {}

FINISH_MAPPINGS = {}

GARAGE_MAPPINGS = {}

#Variables a utilzar en el entrenamiento
FEATURES = [
   'num_exp', 
   'cod_est',
    'num_pol',
    'cod_com',
    'nom_com', 
    'cod_enc',
    'cti_pro_a', 
    'tit_pol', 
    'ben_ser',
    'lug_asi',
    'num_lin',
    'cod_pro', 
    'nom_pro', 
    'mes', 
    'fec_ser',
    'hor_ser',
    'cod_zon',
    'cod_pre',
    'des_pre',
    'cod_dep', 
    'nom_dep', 
    'cti_pro_b', 
    'nti_pro', 
    'estimado', 
    'Sin IVA',
    'xcontacto',
    'let_tip',
    'let_cat',
    'cod_are', 
    'fechor', 
    'cel_con', 
    'num'
]

### 4. Selecionamos variables de interés

In [27]:
X_train= X_train[FEATURES]

### 5. Tratar la variable target

In [28]:
y_train = np.log(y_train)
y_test = np.log(y_test)

TypeError: loop of ufunc does not support argument 0 of type str which has no callable log method

### 6. Construcción de pipeline

In [None]:
housePrice_pipeline_v01072023 = Pipeline([
    
    #=========== IMPUTACIONES ===============
    
    #1. Imputaciín de variables categóricas con indicador de faltante
    ('missing_imputation',
        CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    #2. Imputación de varaibles categóticas basada en frecuencia
    ('frequent_imputation',
        CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    #3. Indicador faltane en variables numericas para imputación
    ('missing_indicator_numeric',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #4. Imputación de variables numéricas
    ('mean_imputation',
        MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #========== VARIABLES TEMPORALES ================
    
    #5. tratamiento de variables temporales (diferencias)
    ('elapsed_time', mypp.TremporalVariableTransformer(
        variables=TEMPORAL_VARS, reference_variable=REF_VAR)
    ),
    
    #6. Drop de variables.
    ('drop_time_features',
        DropFeatures(features_to_drop=DROP_FEATURES)
    ),
    
    
    #============= CODIFICACIÓN DE VARIABLES CATEGORICAS ORINALES ==================
    ('quality_mapper',
        mypp.Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS)
    ),
    
    ('exposure_mapper',
       mypp.Mapper(variables=EXPOSURE_VARS, mappings=EXPOSURE_MAPPINGS)
    ),
    
    ('garage_mapper',
        mypp.Mapper(variables=GARAGE_VARS, mappings=GARAGE_MAPPINGS)
    ),
    
    ('finish_mapper',
        mypp.Mapper(variables=FINISH_VARS, mappings=FINISH_MAPPINGS)
    ),
    
    #============= CODIFICACIÓN DE VARIABLES CATEGORICAS NOMINALES ==================
    ('rare_label_encoder',
        RareLabelEncoder(n_categories=1, tol=0.01, variables=CATEGORICAL_VARS)
    ),
    
    ('categorical_encoder',
        OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)
    ),
    
    #=============== TRANSFORMACIÓN DE VARIABLES CONTINUAS ============
    ('log_transformer',
        LogTransformer(variables=NUMERICAL_LOG_VARS)
    ),
    
    ('binarizer_transform',
        SklearnTransformerWrapper(transformer=Binarizer(threshold=0), variables=BINARIZE_VARS)
    ),
    
     #=============== SCALER ============
    ('scaler',
        MinMaxScaler()
    ),
    
    ('modelo_lasso', 
         Lasso(alpha=0.01, random_state=2022)
    )
])

In [None]:
housePrice_pipeline_v01072023.fit(X_train, y_train)

ValueError: The list of variables is empty.