# Perform Logistic Regression for Attrition Analysis

In [1]:
import pandas as pd


In [2]:
# Load the employee data from a CSV file located in the working directory.
dataset = pd.read_csv("general_data.csv")
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
# Count the missing values in each column to see which ones need imputation.
dataset.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [5]:
# Inspect column dtypes: `object` columns hold text and need encoding before modelling.
dataset.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeID                   int64
Gender                      object
JobLevel                     int64
JobRole                     object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [4]:
# Mean of NumCompaniesWorked (missing entries are skipped by default), as a reference for imputation.
dataset['NumCompaniesWorked'].mean()

2.6948303347756775

In [6]:
# Impute the missing NumCompaniesWorked entries with 2.0.
# NOTE(review): 2.0 is hard-coded; the column mean displayed earlier is ~2.69 —
# confirm which statistic was intended (possibly the median).
# The filled Series is assigned back rather than calling fillna(..., inplace=True)
# on the column selection: under pandas Copy-on-Write (the default from pandas 3.0)
# the in-place call acts on a temporary object and leaves `dataset` unchanged.
dataset['NumCompaniesWorked'] = dataset['NumCompaniesWorked'].fillna(2.0)

In [9]:
# Impute the missing TotalWorkingYears entries with 11.0.
# NOTE(review): 11.0 is hard-coded and no displayed cell derives it — confirm
# where the value comes from (e.g. a rounded column mean).
# The filled Series is assigned back rather than calling fillna(..., inplace=True)
# on the column selection: under pandas Copy-on-Write (the default from pandas 3.0)
# the in-place call acts on a temporary object and leaves `dataset` unchanged.
dataset['TotalWorkingYears'] = dataset['TotalWorkingYears'].fillna(11.0)

In [10]:
# Re-check the per-column missing-value counts after imputation; every count should now be 0.
dataset.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [11]:
# Convert the text Gender column to integer codes so it can be used as a numeric predictor.
# NOTE(review): LabelEncoder numbers classes in sorted order, so presumably
# Female -> 0 and Male -> 1 — confirm via label_encoder.classes_.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
dataset['Gender'] = label_encoder.fit_transform(dataset['Gender'])

In [12]:
# List the available column names before choosing the model's predictors.
dataset.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [20]:
# Target: replace the Attrition labels with integer codes and keep them as dep_y.
# The same label_encoder object is re-fitted here, so it now holds the Attrition classes.
# NOTE(review): presumably No -> 0 and Yes -> 1 — confirm via label_encoder.classes_.
dataset['Attrition'] = label_encoder.fit_transform(dataset['Attrition'])
dep_y = dataset['Attrition']

# Predictors: the numeric employee attributes plus the encoded Gender column.
indi_x = dataset[[
    'Age',
    'DistanceFromHome',
    'Education',
    'Gender',
    'NumCompaniesWorked',
    'MonthlyIncome',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsSinceLastPromotion',
]]

In [21]:
# Display the encoded target to confirm it is now numeric.
dep_y

0       0
1       1
2       0
3       0
4       0
       ..
4405    0
4406    0
4407    0
4408    0
4409    0
Name: Attrition, Length: 4410, dtype: int32

In [22]:
# Fit a logistic regression of Attrition on the selected predictors with statsmodels.
import statsmodels.api as sm
# add_constant adds an intercept column (reported as `const` in the summary table).
x1 = sm.add_constant(indi_x)
logistic = sm.Logit(dep_y,x1)
# fit() estimates the coefficients by maximum likelihood and prints the optimizer status.
result = logistic.fit()
# Show the coefficient table with standard errors, z-statistics and p-values.
result.summary()

Optimization terminated successfully.
         Current function value: 0.410829
         Iterations 7


0,1,2,3
Dep. Variable:,Attrition,No. Observations:,4410.0
Model:,Logit,Df Residuals:,4398.0
Method:,MLE,Df Model:,11.0
Date:,"Tue, 11 Aug 2020",Pseudo R-squ.:,0.06988
Time:,12:33:03,Log-Likelihood:,-1811.8
converged:,True,LL-Null:,-1947.9
Covariance Type:,nonrobust,LLR p-value:,6.076e-52

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.3834,0.333,1.151,0.250,-0.270,1.036
Age,-0.0343,0.007,-5.144,0.000,-0.047,-0.021
DistanceFromHome,-0.0030,0.005,-0.569,0.569,-0.013,0.007
Education,-0.0559,0.041,-1.356,0.175,-0.137,0.025
Gender,0.0701,0.087,0.804,0.422,-0.101,0.241
NumCompaniesWorked,0.1091,0.018,6.084,0.000,0.074,0.144
MonthlyIncome,-2.374e-06,9.32e-07,-2.546,0.011,-4.2e-06,-5.46e-07
PercentSalaryHike,0.0130,0.011,1.139,0.255,-0.009,0.035
TotalWorkingYears,-0.0540,0.012,-4.629,0.000,-0.077,-0.031


___Here Age, NumCompaniesWorked, MonthlyIncome, TotalWorkingYears, TrainingTimesLastYear, YearsAtCompany and YearsSinceLastPromotion all have p-values (P>|z|) below 0.05, so these variables are statistically significant predictors of attrition.___