In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os

In [7]:
data_path = '../data/'
# Resolve the CSV location first so the read call itself stays short.
attrition_csv = os.path.join(data_path, 'WA_Fn-UseC_-HR-Employee-Attrition.csv')
df = pd.read_csv(attrition_csv)
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


There are 35 features in total in the dataset.
Let's focus on a few of them:
- Age
- Attrition
- Department
- DistanceFromHome
- Education
- EnvironmentSatisfaction
- Gender
- JobSatisfaction
- MaritalStatus
- MonthlyIncome
- OverTime
- PerformanceRating
- RelationshipSatisfaction
- TotalWorkingYears
- YearsAtCompany

In this lab, we will use Attrition as our label and try to predict an employee's attrition status from the other attributes.


In [8]:
# Columns retained for the lab; every other column of the raw table is discarded.
to_keep = {
    'Age', 'Attrition', 'Department', 'DistanceFromHome', 'Education',
    'EnvironmentSatisfaction', 'Gender', 'JobSatisfaction', 'MaritalStatus',
    'MonthlyIncome', 'OverTime', 'PerformanceRating',
    'RelationshipSatisfaction', 'TotalWorkingYears', 'YearsAtCompany',
}
# Whatever is present in the frame but not whitelisted above.
to_drop = {col for col in df.columns if col not in to_keep}
# Remove the unwanted columns from `df` itself (the frame is modified in place).
df.drop(labels=list(to_drop), axis=1, inplace=True)
# Summarise the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
OverTime                    1470 non-null object
PerformanceRating           1470 non-null int64
RelationshipSatisfaction    1470 non-null int64
TotalWorkingYears           1470 non-null int64
YearsAtCompany              1470 non-null int64
dtypes: int64(10), object(5)
memory usage: 172.3+ KB


It's good that we don't have any null values. Let's label-encode the categorical columns Attrition, Department, Gender, MaritalStatus and OverTime, i.e. map each category to an integer code.

In [20]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# String-valued columns that are replaced by integer codes.
to_encode = {'Attrition', 'Department', 'Gender', 'MaritalStatus', 'OverTime'}

# One fitted LabelEncoder per column, stored under the column name so the
# integer codes can be mapped back to the original labels afterwards.
encoders = {}
for atr in to_encode:
    encoder = LabelEncoder()
    df[atr] = encoder.fit_transform(df[atr])
    encoders[atr] = encoder

Then, let's scale the numeric features. 

In [23]:
# Every column that was not label-encoded is treated as a numeric feature.
numerics = set(df.columns) - to_encode
for atr in numerics:
    # `np.float` was a deprecated alias of the builtin float (deprecated in
    # NumPy 1.20, removed in 1.24, where it raises AttributeError).
    # `np.float64` is the same dtype and works on every NumPy version.
    df[atr] = df[atr].astype(np.float64)
    # A fresh scaler per column, so each feature is standardised independently.
    ss = StandardScaler()
    # The scaler expects a 2-D (n_samples, 1) input; flatten its output back
    # to 1-D before writing it into the column.
    df[atr] = ss.fit_transform(df[atr].values.reshape(-1, 1)).ravel()

In [25]:
# Re-inspect the frame after encoding and scaling to confirm the resulting dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 15 columns):
Age                         1470 non-null float64
Attrition                   1470 non-null int64
Department                  1470 non-null int64
DistanceFromHome            1470 non-null float64
Education                   1470 non-null float64
EnvironmentSatisfaction     1470 non-null float64
Gender                      1470 non-null int64
JobSatisfaction             1470 non-null float64
MaritalStatus               1470 non-null int64
MonthlyIncome               1470 non-null float64
OverTime                    1470 non-null int64
PerformanceRating           1470 non-null float64
RelationshipSatisfaction    1470 non-null float64
TotalWorkingYears           1470 non-null float64
YearsAtCompany              1470 non-null float64
dtypes: float64(10), int64(5)
memory usage: 172.3 KB


In [26]:
# Combinations of column names, one tuple per group: six pairs followed by
# five triples. NOTE(review): how these groups are consumed is not visible in
# this part of the notebook — confirm against the cells that follow.
grouped_features = [
    # --- pairs ---
    ('Age', 'MaritalStatus'),
    ('Age', 'Gender'),
    ('Department', 'MonthlyIncome'),
    ('Department', 'YearsAtCompany'),
    ('Age', 'TotalWorkingYears'),
    ('Education', 'JobSatisfaction'),
    # --- triples ---
    ('Age', 'Gender', 'DistanceFromHome'),
    ('Department', 'PerformanceRating', 'MonthlyIncome'),
    ('Education', 'JobSatisfaction', 'RelationshipSatisfaction'),
    ('Department', 'OverTime', 'YearsAtCompany'),
    ('Age', 'TotalWorkingYears', 'YearsAtCompany'),
]