# Employee Attrition Prediction Pipeline

In [2]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# ========== NEW IMPORTS ========
# Additions with respect to notebook 02-Predicting-Survival-Titanic-Solution

# pipeline
from sklearn.pipeline import Pipeline

# for the preprocessors
from sklearn.base import BaseEstimator, TransformerMixin

# for imputation
from feature_engine.imputation import (
    CategoricalImputer,
    AddMissingIndicator,
    MeanMedianImputer)

# for encoding categorical variables
from feature_engine.encoding import (
    RareLabelEncoder,
    OneHotEncoder
)

In [3]:
# read the attrition dataset from a CSV file
# NOTE(review): the path is relative, so the file must sit in the
# notebook's working directory
data = pd.read_csv(filepath_or_buffer='attrition.csv')

# preview the first five rows
data.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,attrition
0,0.83,0.51,4,215,3,0,0,product_mng,high,0
1,0.44,0.51,2,150,3,0,0,support,low,1
2,0.57,0.78,3,134,3,0,0,RandD,medium,0
3,0.87,0.48,4,264,3,0,0,IT,medium,0
4,0.97,0.91,2,112,5,0,0,IT,low,0


In [4]:
# treat literal question marks as missing values (NaN)

data = data.replace(to_replace='?', value=np.nan)

## Configuration

In [5]:
# lists of variables to be used in the pipeline's transformers

# name of the column the model predicts
target = 'attrition'

# every column except the target is a candidate predictor; deriving both
# lists from this one keeps the target out of either list regardless of
# its dtype
feature_columns = [c for c in data.columns if c != target]

# non-object columns are handled as numerical predictors
NUMERICAL_VARIABLES = [c for c in feature_columns if data[c].dtype != 'O']

# object-dtype columns are handled as categorical predictors
CATEGORICAL_VARIABLES = [c for c in feature_columns if data[c].dtype == 'O']

## Separate data into train and test

In [7]:
# hold out part of the rows for testing; the predictors are every column
# except the one named by `target`, so the split follows the configuration
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=[target]),  # predictors
    data[target],  # target
    test_size=0.2,  # fraction of observations placed in the test set
    random_state=0)  # seed to ensure reproducibility

# show the (rows, columns) of each partition
X_train.shape, X_test.shape

((11999, 9), (3000, 9))

## Preprocessors

* Impute missing values in categorical features with a placeholder string
* Add a binary missing indicator to numerical variables with missing data
* Fill nulls in original numerical variables with the median
* Group rare Categories
* Perform One Hot encoding
* Scale features with Standard Scaler
* Fit Logistic Regression model

In [9]:
# build the pipeline; steps are applied in the order listed
attrition_pipe = Pipeline(steps=[

    # ----- imputation -----
    # fill gaps in the categorical variables using the 'missing' method
    (
        'categorical_imputation',
        CategoricalImputer(
            variables=CATEGORICAL_VARIABLES,
            imputation_method='missing',
        ),
    ),

    # flag missing values in the numerical variables with indicator columns
    (
        'missing_indicator',
        AddMissingIndicator(variables=NUMERICAL_VARIABLES),
    ),

    # then fill the numerical variables themselves with the median
    (
        'median_imputation',
        MeanMedianImputer(
            variables=NUMERICAL_VARIABLES,
            imputation_method='median',
        ),
    ),

    # ----- categorical encoding -----
    # categories seen in fewer than 5% of the observations (tol=0.05)
    # are grouped into a single 'Rare' category
    # NOTE(review): n_categories=1 presumably lets the grouping apply to
    # every categorical variable - confirm against the feature_engine docs
    (
        'rare_label_encoder',
        RareLabelEncoder(
            variables=CATEGORICAL_VARIABLES,
            tol=0.05,
            n_categories=1,
        ),
    ),

    # one hot encode each categorical variable into k-1 dummy variables
    (
        'categorical_encoder',
        OneHotEncoder(
            variables=CATEGORICAL_VARIABLES,
            drop_last=True,
        ),
    ),

    # ----- scaling -----
    ('scaler', StandardScaler()),

    # ----- model -----
    # C is the inverse regularisation strength in scikit-learn, so this
    # small value regularises heavily
    ('Logit', LogisticRegression(random_state=0, C=0.0005)),
])

In [10]:
# fit every pipeline step, ending with the model, on the training partition
attrition_pipe.fit(X=X_train, y=y_train)

## Make predictions and evaluate model performance

Determine:

* `roc_auc`
* `accuracy`

In [12]:
# score the fitted pipeline on the train set first and then the test set,
# using one code path so both partitions are evaluated identically
for split_name, X_split, y_split in (
        ('train', X_train, y_train),
        ('test', X_test, y_test)):

    # predicted class labels, and the second column of predict_proba
    # (the probability of class 1 for a 0/1 target)
    class_ = attrition_pipe.predict(X_split)
    pred = attrition_pipe.predict_proba(X_split)[:, 1]

    # determine roc_auc and accuracy
    print('{} roc-auc: {}'.format(split_name, roc_auc_score(y_split, pred)))
    print('{} accuracy: {}'.format(split_name, accuracy_score(y_split, class_)))
    print()

train roc-auc: 0.8063670284582709
train accuracy: 0.7816484707058922

test roc-auc: 0.8135549106783616
test accuracy: 0.7913333333333333



In [13]:
# display the numerical predictors passed to the pipeline's transformers
NUMERICAL_VARIABLES

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years']

In [14]:
# display the categorical predictors passed to the pipeline's transformers
CATEGORICAL_VARIABLES

['dept', 'salary']