In [149]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import myPreprocessors as mypp # librerías de transformaciones.

import joblib

In [191]:
dataTrain = pd.read_csv("HR_employee_attrition_2.csv")
dataTrain.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,Yes,Travel_Rarely,1102.0,Sales,1,2,Life Sciences,1.0,1.0,...,1,80,0,8,0,1,6,4,0,5
1,49.0,No,Travel_Frequently,279.0,Research & Development,8,1,Life Sciences,1.0,2.0,...,4,80,1,10,3,3,10,7,1,7
2,37.0,Yes,Travel_Rarely,1373.0,Research & Development,2,2,Other,1.0,4.0,...,2,80,0,7,3,3,0,0,0,0
3,33.0,No,Travel_Frequently,1392.0,Research & Development,3,4,Life Sciences,1.0,5.0,...,3,80,0,8,3,3,8,7,3,0
4,,No,Travel_Rarely,591.0,,2,1,Medical,1.0,7.0,...,4,80,1,6,3,3,2,2,2,2


In [192]:
dataTrain.dtypes

Age                         float64
Attrition                    object
BusinessTravel               object
DailyRate                   float64
Department                   object
DistanceFromHome              int64
Education                     int64
EducationField               object
EmployeeCount               float64
EmployeeNumber              float64
EnvironmentSatisfaction     float64
Gender                       object
HourlyRate                  float64
JobInvolvement              float64
JobLevel                    float64
JobRole                      object
JobSatisfaction               int64
MaritalStatus                object
MonthlyIncome                 int64
MonthlyRate                 float64
NumCompaniesWorked          float64
Over18                       object
OverTime                     object
PercentSalaryHike             int64
PerformanceRating             int64
RelationshipSatisfaction      int64
StandardHours                 int64
StockOptionLevel            

In [65]:
#dataTrain['MSSubClass'] = dataTrain['MSSubClass'].astype('O')
#dataTrain['GarageCars'] = dataTrain['GarageCars'].astype('O')
#dataTrain['BsmtFullBath'] = dataTrain['BsmtFullBath'].astype('O')

#### 2. Train Test Split para Entrenamiento y Prueba

In [193]:
X_train, X_test, y_train, y_test = train_test_split(
    dataTrain,
    dataTrain['Age'],
    test_size=0.3,
    random_state=2022)

#### 3. Configuración del Pipeline

In [194]:
#imputación de variables categóricas
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'MaritalStatus', 'OverTime', 'Gender', 'Over18']


#Imputación de variables numéricas
NUMERICAL_VARS_WITH_NA = ['Age', 'DailyRate', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'MonthlyRate', 'NumCompaniesWorked' ]



#Variables que eliminaremos
#DROP_FEATURES = ["YrSold"]



#Variables para codificación por frecuencia (no ordinal)
CATEGORICAL_VARS = ['Attrition', 'BusinessTravel', 'Department', 'EducationField',
                                     'JobRole', 'Gender', 'MaritalStatus', 'Over18', 'OverTime']


#Variables a utilzar en el entrenamiento
FEATURES = ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'MaritalStatus', 'Over18','Age', 'DailyRate', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime']


In [195]:
X_train = X_train[FEATURES]

In [196]:
y_train = np.log(y_train)
y_test = np.log(y_test)

#### 4. Construcción del Pipeline

In [209]:
from feature_engine.encoding import OneHotEncoder

attritions_pipeline = Pipeline([
    
    #=========== IMPUTACIONES ===============
    
    #2. Imputación de varaibles categóticas basada en frecuencia
    ('frequent_imputation',
        CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    
    #3. Indicador faltane en variables numericas para imputación
    ('missing_indicator_numeric',
        AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #4. Imputación de variables numéricas
    ('mean_imputation',
        MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    

    
    #============= CODIFICACIÓN DE VARIABLES CATEGORICAS NOMINALES ==================
    ('rare_label_encoder',
        RareLabelEncoder(n_categories=1, tol=0.01, variables=CATEGORICAL_VARS)
    ),
    
    
 #   ('modelo_lasso', 
  #       Lasso(alpha=0.01, random_state=2022)
   # )
    
    
])

In [210]:
attritions_pipeline.fit(X_train, y_train)



Pipeline(steps=[('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['Attrition', 'BusinessTravel',
                                               'Department', 'EducationField',
                                               'MaritalStatus', 'OverTime',
                                               'Gender', 'Over18'])),
                ('missing_indicator_numeric',
                 AddMissingIndicator(variables=['Age', 'DailyRate',
                                                'EmployeeCount',
                                                'EmployeeNumber',
                                                'EnvironmentSatisfaction',
                                                'HourlyR...
                                   variables=['Age', 'DailyRate',
                                              'EmployeeCount', 'EmployeeNumber',
                                              'EnvironmentSatisf

In [211]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 651 to 893
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Attrition                1013 non-null   object 
 1   BusinessTravel           990 non-null    object 
 2   Department               728 non-null    object 
 3   EducationField           1024 non-null   object 
 4   Gender                   652 non-null    object 
 5   MaritalStatus            1004 non-null   object 
 6   Over18                   896 non-null    object 
 7   Age                      1023 non-null   float64
 8   DailyRate                998 non-null    float64
 9   EmployeeCount            939 non-null    float64
 10  EmployeeNumber           1019 non-null   float64
 11  EnvironmentSatisfaction  997 non-null    float64
 12  HourlyRate               1019 non-null   float64
 13  JobInvolvement           1020 non-null   float64
 14  JobLevel               

In [212]:
X_test = X_test[FEATURES]

In [None]:
preds = attritions_pipeline.predict(X_test)

In [None]:
rmseTest = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(preds)))
rmseTest

In [17]:
np.min(np.exp(y_train)), np.max(np.exp(y_train))

(34900.00000000001, 754999.9999999999)

In [18]:
rmseTest / (np.max(np.exp(y_train)) - np.min(np.exp(y_train)))

0.06067434691313879

In [19]:
#guardamos pipeline para uso en producción.
joblib.dump(attritions_pipeline, 'attritions_pipeline.pkl')

['housePrice_pipeline_v112022.pkl']

In [20]:
joblib.dump(FEATURES, 'FEATURES.pkl')

['FEATURES.pkl']