In [25]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
import os
import warnings

In [26]:
# Read the raw employee-performance workbook into a DataFrame.
source_workbook = 'INX_Future_Inc_Employee_Performance_CDS_Project2_Data_V1.8.xls'
employee = pd.read_excel(source_workbook)
# Note: this silences every warning category for all cells executed after this point.
warnings.filterwarnings(action='ignore')
# Preview the first five rows.
employee.head(n=5)

Unnamed: 0,EmpNumber,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,E1001000,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,...,4,10,2,2,10,7,0,8,No,3
1,E1001006,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,...,4,20,2,3,7,7,1,7,No,3
2,E1001007,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,...,3,20,2,3,18,13,1,12,No,4
3,E1001009,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,...,2,23,2,2,21,6,12,6,No,3
4,E1001010,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,...,4,10,1,3,2,2,2,2,No,3


## Preprocessing started

In [27]:
# Row counts per department, followed by row counts per performance rating.
for column_name in ('EmpDepartment', 'PerformanceRating'):
    print(employee[column_name].value_counts())

Sales                     373
Development               361
Research & Development    343
Human Resources            54
Finance                    49
Data Science               20
Name: EmpDepartment, dtype: int64
3    874
2    194
4    132
Name: PerformanceRating, dtype: int64


In [28]:
# EmpNumber does not make sense as a predictor, so remove the column in place.
employee.drop(columns=['EmpNumber'], inplace=True)

In [29]:
# Preview the first five rows of the current frame.
employee.head(n=5)

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating
0,32,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,4,...,4,10,2,2,10,7,0,8,No,3
1,47,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,4,...,4,20,2,3,7,7,1,7,No,3
2,40,Male,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,4,...,3,20,2,3,18,13,1,12,No,4
3,41,Male,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,2,...,2,23,2,2,21,6,12,6,No,3
4,60,Male,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,1,...,4,10,1,3,2,2,2,2,No,3


In [30]:
# One-hot encode Gender; drop_first=True drops the first category level so only
# one indicator column remains.
# dtype=int is pinned explicitly: pandas >= 2.0 returns boolean dummy columns by
# default, which would put True/False (instead of 1/0) into the frame and the
# exported CSV.
Gender = pd.get_dummies(employee['Gender'], drop_first=True, dtype=int)
# Append the indicator column(s) to the frame; rows are aligned on the index.
employee = pd.concat([employee, Gender], axis=1)

In [31]:
# Remove the original categorical Gender column in place.
employee.drop(columns=['Gender'], inplace=True)

In [32]:
# Preview the first five rows of the current frame.
employee.head(n=5)

Unnamed: 0,Age,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,...,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating,Male
0,32,Marketing,Single,Sales,Sales Executive,Travel_Rarely,10,3,4,55,...,10,2,2,10,7,0,8,No,3,1
1,47,Marketing,Single,Sales,Sales Executive,Travel_Rarely,14,4,4,42,...,20,2,3,7,7,1,7,No,3,1
2,40,Life Sciences,Married,Sales,Sales Executive,Travel_Frequently,5,4,4,48,...,20,2,3,18,13,1,12,No,4,1
3,41,Human Resources,Divorced,Human Resources,Manager,Travel_Rarely,10,4,2,73,...,23,2,2,21,6,12,6,No,3,1
4,60,Marketing,Single,Sales,Sales Executive,Travel_Rarely,16,4,1,84,...,10,1,3,2,2,2,2,No,3,1


In [33]:
# Integer codes for the EducationBackground and MaritalStatus categories.
education_codes = {
    'Life Sciences': 0,
    'Medical': 1,
    'Marketing': 2,
    'Technical Degree': 3,
    'Human Resources': 4,
    'Other': 5,
}
marital_status_codes = {
    'Single': 0,
    'Married': 1,
    'Divorced': 2,
}

# Assign the mapped column back to the frame instead of using chained
# assignment (`employee.col[mask] = value`): chained assignment triggers
# SettingWithCopyWarning and does not modify the frame when pandas
# Copy-on-Write is active.
# Values without an entry in the mapping are passed through unchanged, so the
# cell is safe to re-run on already-encoded data.
employee['EducationBackground'] = employee['EducationBackground'].map(
    lambda value: education_codes.get(value, value)
)
employee['MaritalStatus'] = employee['MaritalStatus'].map(
    lambda value: marital_status_codes.get(value, value)
)

In [34]:
# Integer codes for the EmpDepartment categories.
department_codes = {
    'Sales': 1,
    'Development': 2,
    'Research & Development': 3,
    'Human Resources': 4,
    'Finance': 5,
    'Data Science': 6,
}

# Assign the mapped column back to the frame instead of using chained
# assignment (`employee.col[mask] = value`), which triggers
# SettingWithCopyWarning and does not modify the frame when pandas
# Copy-on-Write is active. Values without an entry in the mapping are passed
# through unchanged, so re-running the cell is harmless.
employee['EmpDepartment'] = employee['EmpDepartment'].map(
    lambda value: department_codes.get(value, value)
)

In [35]:
from sklearn.preprocessing import LabelEncoder

In [36]:
# Integer-encode the EmpJobRole labels with a LabelEncoder.
encoding = LabelEncoder()
job_role_codes = encoding.fit_transform(employee['EmpJobRole'])
# The codes are wrapped in a DataFrame without column names, so the single
# column is labelled 0 (the integer), not 'EmpJobRole'.
# NOTE(review): consider giving this column an explicit name; left as-is here
# because anything reading the exported CSV may rely on the current label.
EmpJobRole = pd.DataFrame(data=job_role_codes)

In [37]:
# Remove the original text EmpJobRole column in place.
employee.drop(columns=['EmpJobRole'], inplace=True)

In [38]:
# Append the EmpJobRole frame column-wise; rows are aligned on the index.
employee = pd.concat(objs=[employee, EmpJobRole], axis='columns')

In [39]:
# Integer codes for the BusinessTravelFrequency categories.
travel_frequency_codes = {
    'Non-Travel': 0,
    'Travel_Rarely': 1,
    'Travel_Frequently': 2,
}

# Assign the mapped column back to the frame instead of using chained
# assignment (`employee.col[mask] = value`), which triggers
# SettingWithCopyWarning and does not modify the frame when pandas
# Copy-on-Write is active. Values without an entry in the mapping are passed
# through unchanged, so re-running the cell is harmless.
employee['BusinessTravelFrequency'] = employee['BusinessTravelFrequency'].map(
    lambda value: travel_frequency_codes.get(value, value)
)

In [40]:
# Binary codes for the OverTime flag.
overtime_codes = {'No': 0, 'Yes': 1}

# Assign the mapped column back to the frame instead of using chained
# assignment (`employee.col[mask] = value`), which triggers
# SettingWithCopyWarning and does not modify the frame when pandas
# Copy-on-Write is active. Values without an entry in the mapping are passed
# through unchanged, so re-running the cell is harmless.
employee['OverTime'] = employee['OverTime'].map(
    lambda value: overtime_codes.get(value, value)
)

In [41]:
# Binary codes for the Attrition flag.
attrition_codes = {'No': 0, 'Yes': 1}

# Assign the mapped column back to the frame instead of using chained
# assignment (`employee.col[mask] = value`), which triggers
# SettingWithCopyWarning and does not modify the frame when pandas
# Copy-on-Write is active. Values without an entry in the mapping are passed
# through unchanged, so re-running the cell is harmless.
employee['Attrition'] = employee['Attrition'].map(
    lambda value: attrition_codes.get(value, value)
)

## Preprocessing ended

In [42]:
# Preview the first five rows of the current frame.
employee.head(n=5)

Unnamed: 0,Age,EducationBackground,MaritalStatus,EmpDepartment,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,EmpHourlyRate,EmpJobInvolvement,...,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,PerformanceRating,Male,0
0,32,2,0,1,1,10,3,4,55,3,...,2,2,10,7,0,8,0,3,1,13
1,47,2,0,1,1,14,4,4,42,3,...,2,3,7,7,1,7,0,3,1,13
2,40,0,1,1,2,5,4,4,48,2,...,2,3,18,13,1,12,0,4,1,13
3,41,4,2,4,1,10,4,2,73,2,...,2,2,21,6,12,6,0,3,1,8
4,60,2,0,1,1,16,4,1,84,3,...,1,3,2,2,2,2,0,3,1,13


In [43]:
# Write the frame to CSV; with the default settings the row index is written
# as the first column.
output_csv = 'preprocessed.csv'
employee.to_csv(output_csv)