In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
data_path = '../data/'
df = pd.read_csv(os.path.join(data_path, 'WA_Fn-UseC_-HR-Employee-Attrition.csv'))
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


There are 35 features in total in the dataset about. 
Let's focus on a few of them:
- Age
- Attrition
- Department
- DistanceFromHome
- Education
- EnvironmentSatisfaction
- Gender
- JobSatisfaction
- MaritalStatus
- MonthlyIncome
- OverTime
- PerformanceRating
- RelationshipSatisfaction
- TotalWorkingYears
- YearsAtCompany

In this lab, we will use attrition as our label, to try to predict the attrition status accroding to other attributes. 


In [3]:
to_keep = {'Age', 'Attrition', 'Department','DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'Gender', 'JobSatisfaction', 'MaritalStatus',
           'MonthlyIncome', 'OverTime', 'PerformanceRating', 'RelationshipSatisfaction','TotalWorkingYears','YearsAtCompany'}
to_drop = set(df.columns)-to_keep
df.drop(to_drop, axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
OverTime                    1470 non-null object
PerformanceRating           1470 non-null int64
RelationshipSatisfaction    1470 non-null int64
TotalWorkingYears           1470 non-null int64
YearsAtCompany              1470 non-null int64
dtypes: int64(10), object(5)
memory usage: 172.3+ KB


# Preprocessing

It's good that we don't have any null value. Let's one hot encode the Attrition, Department, Gender, MaritalStatus and Overtime. 

In [4]:
to_convert = ['Education','EnvironmentSatisfaction','JobSatisfaction',
            'PerformanceRating','RelationshipSatisfaction']
for col in to_convert:
    df[col] = df[col].astype(np.str)
    

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

to_encode = {'Attrition', 'Department','Gender','MaritalStatus','OverTime','Education','EnvironmentSatisfaction','JobSatisfaction',
            'PerformanceRating','RelationshipSatisfaction'}
encoders = dict()

for col in to_encode:
    if col=="attrition":
        tmp = LabelEncoder()
        df[col] = tmp.fit_transform(df[col])
    else:
        encoders[col] = LabelEncoder()
        df[col+'_int'] = encoders[col].fit_transform(df[col])
    

Then, let's scale the numeric features. 

In [6]:
categorical_features =list(to_encode)
categorical_features = [x+'_int' for x in categorical_features]
numerics = set(df.columns) - to_encode
numerics = list(numerics - set(categorical_features))

for atr in numerics:
    df[atr] = df[atr].astype(np.float)    
    ss = StandardScaler()
    df[atr] = ss.fit_transform(df[atr].values.reshape(-1, 1))

In [7]:
numerics

['MonthlyIncome',
 'Age',
 'TotalWorkingYears',
 'YearsAtCompany',
 'DistanceFromHome']

In [8]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 25 columns):
Age                             1470 non-null float64
Attrition                       1470 non-null object
Department                      1470 non-null object
DistanceFromHome                1470 non-null float64
Education                       1470 non-null object
EnvironmentSatisfaction         1470 non-null object
Gender                          1470 non-null object
JobSatisfaction                 1470 non-null object
MaritalStatus                   1470 non-null object
MonthlyIncome                   1470 non-null float64
OverTime                        1470 non-null object
PerformanceRating               1470 non-null object
RelationshipSatisfaction        1470 non-null object
TotalWorkingYears               1470 non-null float64
YearsAtCompany                  1470 non-null float64
OverTime_int                    1470 non-null int64
Attrition_int                   1470 non-

In [9]:
categorical_features =list(to_encode)
categorical_features = [x+'_int' for x in categorical_features]
numerics = list(numerics)
feature_columns = categorical_features + numerics

In [10]:
from sklearn.model_selection import train_test_split

# stratified 90/10 train/test split`
df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.Attrition)

X_train = ss.fit_transform(df_train[feature_columns].values).astype(np.float32)
X_test = ss.fit_transform(df_test[feature_columns].values).astype(np.float32)

y_train = df_train['Attrition_int'].values.astype(np.int)
y_test = df_test['Attrition_int'].values.astype(np.int)

print('train', X_train.shape, 'test', X_test.shape)

train (1323, 15) test (147, 15)


# Initial test (Categorical not encoded)

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Input
from keras.layers import Embedding, Flatten, Merge, concatenate
from keras.models import Model
from sklearn.preprocessing import OneHotEncoder

Using TensorFlow backend.


In [12]:
inputs = Input(shape=(X_train.shape[1],))

In [13]:
x = Dense(units=10, activation='relu')(inputs)
predictions = Dense(1,activation='sigmoid')(x)

In [14]:
model = Model(inputs=inputs, outputs=predictions)

In [15]:
model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                160       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________


In [16]:
from sklearn import metrics as mt
yhat = np.round(model.predict(X_test))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

[[78 45]
 [18  6]] 0.571428571429


# Sparse Encoded categorical features test only

In [17]:
df_train

Unnamed: 0,Age,Attrition,Department,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,JobSatisfaction,MaritalStatus,MonthlyIncome,...,OverTime_int,Attrition_int,RelationshipSatisfaction_int,MaritalStatus_int,Department_int,Education_int,EnvironmentSatisfaction_int,JobSatisfaction_int,PerformanceRating_int,Gender_int
659,-0.977174,No,Sales,-0.517332,4,1,Male,4,Single,-0.338889,...,0,0,1,2,2,3,0,3,0,1
631,0.774856,No,Research & Development,-0.147150,4,1,Male,4,Married,-0.782969,...,1,0,2,1,1,3,0,3,1,1
137,0.227347,No,Sales,-0.640727,4,4,Female,3,Married,-0.127685,...,0,0,2,1,2,3,3,2,0,0
1211,0.008343,No,Sales,-1.010909,4,3,Male,4,Divorced,0.642125,...,0,0,2,0,2,3,2,3,0,1
80,-0.758170,No,Research & Development,-1.010909,1,4,Male,4,Married,-0.292569,...,1,0,2,1,1,0,3,3,0,1
375,1.322365,No,Research & Development,-0.270544,3,2,Male,3,Single,0.948094,...,0,0,2,2,1,2,1,2,1,1
643,0.555852,No,Research & Development,-0.764121,3,3,Female,4,Married,-0.270258,...,1,0,1,1,1,2,2,3,0,0
781,-1.196177,No,Research & Development,-1.010909,2,1,Male,1,Married,-0.541381,...,0,0,0,1,1,1,0,0,0,1
448,0.336849,No,Research & Development,-0.393938,3,2,Female,3,Single,1.430846,...,0,0,2,2,1,2,1,2,0,0
1145,-0.101159,No,Research & Development,0.346427,4,3,Female,3,Married,-0.390946,...,1,0,1,1,1,3,2,2,0,0


In [18]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(df_train[categorical_features].values)
X_test_ohe = ohe.transform(df_test[categorical_features].values)

In [19]:
inputs = Input(shape=(X_train_ohe.shape[1],),sparse=True)

x = Dense(units=10, activation='relu')(inputs)
predictions = Dense(1,activation='sigmoid')(x)

model = Model(inputs=inputs, outputs=predictions)

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit(X_train_ohe,y_train, epochs=10, batch_size=50, verbose=0)

yhat = np.round(model.predict(X_test_ohe))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

[[123   0]
 [ 19   5]] 0.87074829932


# Dense, Sparce and Combined

In [20]:

ohe = OneHotEncoder()
X_train_ohe = ohe.fit_transform(df_train[categorical_features].values)
X_test_ohe = ohe.transform(df_test[categorical_features].values)


X_train_num =  df_train[numerics].values
X_test_num = df_test[numerics].values


inputsSparse = Input(shape=(X_train_ohe.shape[1],),sparse=True)
xSparse = Dense(units=10, activation='relu')(inputsSparse)


inputsDense = Input(shape=(X_train_num.shape[1],),sparse=False)
xDense = Dense(units=10, activation='relu')(inputsDense)

x = concatenate([xSparse, xDense])
predictions = Dense(1,activation='sigmoid')(x)

# This creates a model that includes
# the Input layer and Dense layers
model = Model(inputs=[inputsSparse,inputsDense], outputs=predictions)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 31)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 5)            0                                            
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 10)           320         input_3[0][0]                    
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 10)           60          input_4[0][0]                    
__________________________________________________________________________________________________
concatenat

In [21]:
model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit([X_train_ohe,X_train_num],y_train, epochs=10, batch_size=50, verbose=0)

yhat = np.round(model.predict([X_test_ohe,X_test_num]))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

[[120   3]
 [ 23   1]] 0.823129251701


# Dense Embeddings and Deep Model

In [22]:
# we need to create separate sequential models for each embedding
embed_branches = []
X_ints_train = []
X_ints_test = []
all_inputs = []
all_branch_outputs = []

for col in categorical_features:
    # encode as ints for the embedding
    X_ints_train.append( df_train[col].values )
    X_ints_test.append( df_test[col].values )
    
    # get the number of categories
    print(X_ints_train[-1])
    N = max(X_ints_train[-1]+1) # same as the max(df_train[col])
    
    # create embedding branch from the number of categories
    inputs = Input(shape=(1,),dtype='int32')
    all_inputs.append(inputs)
    x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
    x = Flatten()(x)
    all_branch_outputs.append(x)

# also get a dense branch of the numeric features
all_inputs.append(Input(shape=(X_train_num.shape[1],),sparse=False))
x = Dense(units=20, activation='relu')(all_inputs[-1])
all_branch_outputs.append( Dense(units=10,activation='relu')(x) )

# merge the branches together
final_branch = concatenate(all_branch_outputs)
final_branch = Dense(units=1,activation='sigmoid')(final_branch)

model = Model(inputs=all_inputs, outputs=final_branch)

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit(X_ints_train + [X_train_num],
        y_train, epochs=10, batch_size=32, verbose=1)

[0 1 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[1 2 2 ..., 2 0 1]
[2 1 1 ..., 0 1 1]
[2 1 2 ..., 1 2 2]
[3 3 3 ..., 4 1 2]
[0 0 3 ..., 1 0 2]
[3 3 2 ..., 0 2 0]
[0 1 0 ..., 0 1 0]
[1 1 0 ..., 0 0 1]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a6a72c1e10>

In [23]:
yhat = np.round(model.predict(X_ints_test + [X_test_num]))
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))

[[123   0]
 [ 24   0]] 0.836734693878


# Wide: Adding Crossed Columns

In [24]:
categorical_features

['OverTime_int',
 'Attrition_int',
 'RelationshipSatisfaction_int',
 'MaritalStatus_int',
 'Department_int',
 'Education_int',
 'EnvironmentSatisfaction_int',
 'JobSatisfaction_int',
 'PerformanceRating_int',
 'Gender_int']

In [25]:
cross_columns = [['Gender','MaritalStatus'],
                    ['Education', 'JobSatisfaction'],['Department','PerformanceRating'],
                    ['Education', 'JobSatisfaction','RelationshipSatisfaction'],['Department','OverTime'],
                ]

In [26]:
# we need to create separate sequential models for each embedding
embed_branches = []
X_ints_train = []
X_ints_test = []
all_inputs = []
all_branch_outputs = []

for cols in cross_columns:
    # encode as ints for the embedding
    enc = LabelEncoder()
    # create crossed labels
    X_crossed_train = df_train[cols].apply(lambda x: '_'.join(x), axis=1)
    X_crossed_test = df_test[cols].apply(lambda x: '_'.join(x), axis=1)
    
    enc.fit(np.hstack((X_crossed_train.values,  X_crossed_test.values)))
    X_crossed_train = enc.transform(X_crossed_train)
    X_crossed_test = enc.transform(X_crossed_test)
    X_ints_train.append( X_crossed_train )
    X_ints_test.append( X_crossed_test )
    
    # get the number of categories
    N = max(X_ints_train[-1]+1) # same as the max(df_train[col])
    
    # create embedding branch from the number of categories
    inputs = Input(shape=(1,),dtype='int32')
    all_inputs.append(inputs)
    x = Embedding(input_dim=N, output_dim=int(np.sqrt(N)), input_length=1)(inputs)
    x = Flatten()(x)
    all_branch_outputs.append(x)
    
# merge the branches together
final_branch = concatenate(all_branch_outputs)
final_branch = Dense(units=1,activation='sigmoid')(final_branch)

model = Model(inputs=all_inputs, outputs=final_branch)

model.compile(optimizer='sgd',
              loss='mean_squared_error',
              metrics=['accuracy'])

model.fit(X_ints_train,
        y_train, epochs=10, batch_size=32, verbose=1)

      Gender MaritalStatus
659     Male        Single
631     Male       Married
137   Female       Married
1211    Male      Divorced
80      Male       Married
375     Male        Single
643   Female       Married
781     Male       Married
448   Female        Single
1145  Female       Married
953     Male       Married
822     Male        Single
1377    Male       Married
718     Male       Married
583   Female       Married
667   Female      Divorced
975     Male        Single
266     Male       Married
634     Male       Married
970   Female       Married
819     Male       Married
1398    Male      Divorced
498     Male       Married
787     Male       Married
457     Male        Single
1016  Female        Single
363   Female        Single
1435    Male        Single
1038    Male      Divorced
404     Male      Divorced
...      ...           ...
1160  Female      Divorced
81      Male        Single
1281    Male        Single
770     Male      Divorced
1400    Male       Married
3

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a6a7cbac18>

In [27]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323 entries, 659 to 768
Data columns (total 25 columns):
Age                             1323 non-null float64
Attrition                       1323 non-null object
Department                      1323 non-null object
DistanceFromHome                1323 non-null float64
Education                       1323 non-null object
EnvironmentSatisfaction         1323 non-null object
Gender                          1323 non-null object
JobSatisfaction                 1323 non-null object
MaritalStatus                   1323 non-null object
MonthlyIncome                   1323 non-null float64
OverTime                        1323 non-null object
PerformanceRating               1323 non-null object
RelationshipSatisfaction        1323 non-null object
TotalWorkingYears               1323 non-null float64
YearsAtCompany                  1323 non-null float64
OverTime_int                    1323 non-null int64
Attrition_int                   1323 non