In [365]:
import pandas as pd
import xlrd
import scipy
import numpy as np
from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing

## Load and Clean Dataset

In [366]:
file = 'CaseStudy2data.xlsx'

# Load spreadsheet
xl = pd.ExcelFile(file)

# Load a sheet into a DataFrame by name: df1
attrition_df = xl.parse('HR-employee-attrition Data')

#drop unneccessary columns with constant values
attrition_column = attrition_df["Attrition"]
attrition_df.drop("Attrition", axis=1, inplace=True)
attrition_df.insert(0, "Attrition", attrition_column)
attrition_df.drop('EmployeeCount', axis=1, inplace=True)
attrition_df.drop('StandardHours', axis=1, inplace=True)
attrition_df.drop('Over18', axis=1, inplace=True)

#convert attrition variable to binary
attrition_df = attrition_df.replace("Yes", 1)
attrition_df = attrition_df.replace("No", 0)

# code dummy variables, this is necessary to use sklearn
attrition_df = pd.get_dummies(attrition_df, columns=["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus"], prefix=["BusinessTravel", "Department", "EduationField", "Gender", "JobRole", "MaritalStatus"])
attrition_df

Unnamed: 0,Attrition,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EduationField_Human Resources,EduationField_Life Sciences,EduationField_Marketing,EduationField_Medical,EduationField_Other,EduationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,1,41,1102,1,2,1,2,94,3,2,4,5993,19479,8,1,11,3,1,0,8,0,1,6,4,0,5,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
1,0,49,279,8,1,2,3,61,2,2,2,5130,24907,1,0,23,4,4,1,10,3,3,10,7,1,7,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0
2,1,37,1373,2,2,4,4,92,2,1,3,2090,2396,6,1,15,3,2,0,7,3,3,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,0,33,1392,3,4,5,4,56,3,1,3,2909,23159,1,1,11,3,3,0,8,3,3,8,7,3,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,27,591,2,1,7,1,40,3,1,2,3468,16632,9,0,12,3,4,1,6,3,3,2,2,2,2,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0
5,0,32,1005,2,2,8,4,79,3,1,4,3068,11864,0,0,13,3,3,0,8,2,2,7,7,3,6,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
6,0,59,1324,3,3,10,3,81,4,1,1,2670,9964,4,1,20,4,1,3,12,3,2,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0
7,0,30,1358,24,1,11,4,67,3,1,3,2693,13335,1,0,22,4,2,1,1,2,3,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
8,0,38,216,23,3,12,4,44,2,3,3,9526,8787,0,0,21,4,2,0,10,2,3,9,7,1,8,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
9,0,36,1299,27,3,13,3,94,3,2,3,5237,16577,6,0,13,3,2,2,17,3,2,7,7,7,7,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0


## Correlation Matrix to Identify Covariance

In [367]:
attrition_df.corr().abs()

Unnamed: 0,Attrition,Age,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EduationField_Human Resources,EduationField_Life Sciences,EduationField_Marketing,EduationField_Medical,EduationField_Other,EduationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
Attrition,1.0,0.159205,0.056652,0.077924,0.031373,0.010577,0.103369,0.006846,0.130016,0.169105,0.103481,0.15984,0.01517,0.043494,0.246118,0.013478,0.002889,0.045872,0.137145,0.171063,0.059478,0.063939,0.134392,0.160545,0.033019,0.156199,0.074457,0.115143,0.049538,0.016832,0.085293,0.080855,0.036466,0.032703,0.055781,0.046999,0.017898,0.069355,0.029453,0.029453,0.078696,0.036215,0.09829,0.083316,0.082994,0.08887,0.00036,0.019774,0.157234,0.087716,0.090984,0.175419
Age,0.159205,1.0,0.010661,0.001686,0.208034,0.010145,0.010146,0.024287,0.02982,0.509604,0.004892,0.497855,0.028051,0.299635,0.028062,0.003634,0.001904,0.053535,0.03751,0.680381,0.019621,0.02149,0.311309,0.212901,0.216513,0.202089,0.011215,0.024743,0.028791,0.020523,0.017883,0.027549,0.001696,0.016824,0.038162,0.006354,0.041466,0.027604,0.036311,0.036311,0.098825,0.029856,0.143176,0.294248,0.049726,0.185891,0.146518,0.002001,0.175785,0.03312,0.083919,0.119185
DailyRate,0.056652,0.010661,1.0,0.004985,0.016806,0.05099,0.018355,0.023381,0.046135,0.002966,0.030571,0.007707,0.032182,0.038153,0.009135,0.022704,0.000473,0.007846,0.042143,0.014515,0.002453,0.037848,0.034055,0.009932,0.033229,0.026363,0.012096,0.011776,0.002078,0.026726,0.014871,0.003616,0.043144,0.004028,0.064449,0.034202,0.003893,0.030869,0.011716,0.011716,0.040141,0.021156,0.006728,0.013224,0.005302,2.1e-05,0.002624,0.000513,0.005375,0.03708,0.040035,0.075835
DistanceFromHome,0.077924,0.001686,0.004985,1.0,0.021042,0.032916,0.016075,0.031131,0.008783,0.005303,0.003669,0.017014,0.027473,0.029251,0.025514,0.040235,0.02711,0.006557,0.044872,0.004628,0.036942,0.026556,0.009508,0.018845,0.010029,0.014406,0.023605,0.005081,0.020116,0.012901,0.008117,0.014085,0.002624,0.024499,0.039294,0.013486,0.007969,0.014802,0.001851,0.001851,0.022916,0.024089,0.012369,0.03919,0.011848,0.022351,0.010986,0.030761,0.015994,0.00544,0.030232,0.027445
Education,0.031373,0.208034,0.016806,0.021042,1.0,0.04207,0.027128,0.016775,0.042438,0.101589,0.011296,0.094961,0.026084,0.126317,0.020322,0.011111,0.024539,0.009118,0.018422,0.14828,0.0251,0.009819,0.069114,0.060236,0.054254,0.069065,0.004524,0.008292,0.004126,0.011435,0.018604,0.014215,0.026479,0.013184,0.072405,0.072335,0.038043,0.026742,0.016547,0.016547,0.02427,0.005295,0.063566,0.028453,0.00529,0.049694,0.000709,0.053398,0.091465,0.002439,0.001865,0.004168
EmployeeNumber,0.010577,0.010145,0.05099,0.032916,0.04207,1.0,0.017621,0.035179,0.006888,0.018519,0.046247,0.014829,0.012648,0.001251,0.024037,0.012944,0.020359,0.069861,0.062227,0.014365,0.023603,0.010309,0.01124,0.008416,0.009019,0.009197,0.022272,0.00798,0.007976,0.063431,0.041923,0.015441,0.035345,0.000609,0.014487,0.008689,0.010432,0.005938,0.022556,0.022556,0.025945,0.067287,0.019722,0.035058,0.01435,0.013983,0.017686,0.023263,0.006255,0.025149,0.053933,0.035189
EnvironmentSatisfaction,0.103369,0.010146,0.018355,0.016075,0.027128,0.017621,1.0,0.049857,0.008278,0.001212,0.006784,0.006259,0.0376,0.012594,0.070132,0.031701,0.029548,0.007665,0.003432,0.002693,0.019359,0.027627,0.001458,0.018007,0.016194,0.004999,0.003568,0.012624,0.008496,0.007597,0.027976,0.025606,0.006898,0.024526,0.000479,0.021299,0.064602,0.027713,0.000508,0.000508,0.01409,0.022014,0.001533,0.01073,0.059178,0.048689,0.00194,0.024421,0.002949,0.016439,0.02218,0.009035
HourlyRate,0.006846,0.024287,0.023381,0.031131,0.016775,0.035179,0.049857,1.0,0.042861,0.027853,0.071335,0.015794,0.015297,0.022157,0.007782,0.009062,0.002172,0.00133,0.050263,0.002334,0.008548,0.004607,0.019582,0.024106,0.026716,0.020123,0.016994,0.018819,0.027541,0.016551,0.018686,0.012047,0.03367,0.038759,0.004452,0.020418,0.042163,0.011283,0.000478,0.000478,0.014599,0.016189,0.018028,0.012659,0.014394,0.025128,0.020034,0.011886,0.018703,0.00615,0.036432,0.033436
JobInvolvement,0.130016,0.02982,0.046135,0.008783,0.042438,0.006888,0.008278,0.042861,1.0,0.01263,0.021476,0.015271,0.016322,0.015012,0.003507,0.017205,0.029071,0.034297,0.021523,0.005533,0.015338,0.014617,0.021355,0.008717,0.024184,0.025976,0.045779,0.004424,0.026714,0.004789,0.023187,0.026107,0.002079,0.003228,0.018657,0.017103,0.011895,0.004519,0.01796,0.01796,0.001272,0.004952,0.022724,0.017112,0.021939,0.0152,0.047604,0.011413,0.027282,0.016815,0.028324,0.045253
JobLevel,0.169105,0.509604,0.002966,0.005303,0.101589,0.018519,0.001212,0.027853,0.01263,1.0,0.001944,0.9503,0.039563,0.142501,0.000544,0.03473,0.021222,0.021642,0.013984,0.782208,0.018191,0.037818,0.534739,0.389447,0.353885,0.375281,0.007295,0.021557,0.023433,0.006157,0.10783,0.114307,0.010409,0.008431,0.092698,0.014114,0.016724,0.054707,0.039403,0.039403,0.115704,0.100922,0.344608,0.552744,0.114896,0.414319,0.387788,0.12749,0.216559,0.037087,0.050547,0.087072


Based off looking at the correlation matrix above, there are not any values that stand out as extremly correlated (0.8). Thus no variables will be removed.

## Chi-Square Univariate Selection

In [368]:
np.set_printoptions(precision=3)

values = attrition_df.values

X = values[:,1:(len(attrition_df.columns))]
Y = values[:,0]
number_of_featuers_to_select = 6

# feature extraction
test = SelectKBest(score_func=chi2, k=number_of_featuers_to_select).fit(X, Y)
test.transform(X)

# summarize scores
idxs = np.argsort(test.scores_)[:number_of_featuers_to_select]
scores = test.scores_[idxs]

# pvalues
idxs = np.argsort(test.pvalues_)[:number_of_featuers_to_select]
pvalues = test.pvalues_[idxs]

# feature names
idxs_selected = test.get_support(indices=True)
features = np.asarray(attrition_df.columns[idxs_selected])

# output features with their pvalues and score
print(list(zip(features, pvalues, scores)))


[('Age', 0.0, 0.0001523054445610236), ('JobSatisfaction', 3.2879327626280303e-262, 0.0005060644425394486), ('MonthlyIncome', 4.92392440516756e-210, 0.2350268575624977), ('StockOptionLevel', 4.149261818618412e-52, 0.39862638208752976), ('WorkLifeBalance', 9.247065655715055e-33, 0.4317790898209484), ('YearsAtCompany', 2.2057282778744752e-27, 0.4446056989443588)]


Based of Chi-Square analysis it seems that Age, JobSatisfaction, MonthlyIncome, StockOptionLevel, and WorkLifeBalance are the top factors the affect Attrition rate.