In [1]:
import numpy as np
import pandas as pd

import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Data Description

**HR Job satisfaction/Attrition**
**Education:**
1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'

**EnvironmentSatisfaction:**
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

**JobInvolvement:**
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

**JobSatisfaction:**
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

**PerformanceRating:**
1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'

**RelationshipSatisfaction:**
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

**WorkLifeBalance:**
1 'Bad' 2 'Good' 3 'Better' 4 'Best'

Acknowledgements
https://www.ibm.com/communities/analytics/watson-analytics-blog/watson-analytics-use-case-for-hr-retaining-valuable-employees/

In [2]:
df = pd.read_csv('../../Datasets/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [5]:
df.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

In [6]:
df['Attrition'].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [7]:
df['JobSatisfaction'].value_counts()

4    459
3    442
1    289
2    280
Name: JobSatisfaction, dtype: int64

In [8]:
overview = pd.crosstab(df['JobSatisfaction'], df['Attrition'], margins=True).T
overview

JobSatisfaction,1,2,3,4,All
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,223,234,369,407,1233
Yes,66,46,73,52,237
All,289,280,442,459,1470


---------------
## Chi Square
**The Chi-Square test is a statistical procedure used by researchers to examine the differences between categorical variables in the same population. The Chi-Square test is most useful when analyzing cross tabulations of survey response data.**
<br/><br/>https://www.alchemer.com/resources/blog/introduction-to-chi-square-test-and-when-to-use-it/


\begin{equation}
X^2 = \sum{\frac{\displaystyle(O_i - E_i)^2} {E_i}} \\
X^2	=	\text{Chi squared}\\
O_i	=	\text{Observation value} \\
E_i	=	\text{Expected value} \\
\end{equation}

### Implement Formula:
**1- Calculate O and E values:**

In [9]:
# select yes/no values in each row 
observations = np.append(overview.iloc[0][:-1].values, overview.iloc[1][:-1].values)
observations

array([223, 234, 369, 407,  66,  46,  73,  52], dtype=int64)

In [10]:
# sum of yes/no attritions
row_sum = overview.iloc[0:2, 4].values
row_sum

array([1233,  237], dtype=int64)

In [11]:
overview.iloc[2, :-1].values

array([289, 280, 442, 459], dtype=int64)

In [12]:
# overview.loc['All', 'All']
total = overview.iloc[-1,-1]


In [13]:
# cacluate expected values for each element (probability of occurance for each element)
expected = []
for attr in range(2):
    for val in overview.iloc[2, :-1].values:
        exp = val * row_sum[attr] / total
        print(f'{val} * {row_sum[attr]} / {total} = {exp}')
        expected.append(exp)
        

289 * 1233 / 1470 = 242.4061224489796
280 * 1233 / 1470 = 234.85714285714286
442 * 1233 / 1470 = 370.7387755102041
459 * 1233 / 1470 = 384.99795918367346
289 * 237 / 1470 = 46.593877551020405
280 * 237 / 1470 = 45.142857142857146
442 * 237 / 1470 = 71.26122448979592
459 * 237 / 1470 = 74.00204081632653


**2- Putting values in the formula:**

In [14]:
# it is possible based on numpy abbilities:
chi2 = ((observations - expected) ** 2 / expected).sum()
print(f'Chi2 value = {chi2} \n\nfor evaluation must check with chi2 statistic table')

Chi2 value = 17.505077010348 

for evaluation must check with chi2 statistic table


#### DOF: Degree of Freedom
**The number of independent pieces of information used to calculate the statistic is called the degrees of freedom.**<br/>
**<u>Variables which are free to vary in a dataset</u>**



\begin{equation}
D_f = N-1 \\
D_f	=	\text{Degree of Freedom} \\
N	=	\text{Sample Size} \\
\end{equation}

In [15]:
# dof = (row nums - 1) * (col nums -1) [of course without margins]
dof = (len(row_sum) - 1 ) * (len(overview.iloc[2,:-1]) - 1)
dof

3

In [16]:
# Total job satisfactions for each group (Horizontal margin) 
overview.iloc[2,:-1]

JobSatisfaction
1    289
2    280
3    442
4    459
Name: All, dtype: int64

### Chi2 with python libraries
**Formula: P-value= 1 - CDF** <br/>
(Cumulative Distribution Function)

In [17]:
pvalue = 1 - stats.chi2.cdf(chi2, dof)
print('p-value:', pvalue)

p-value: 0.000556300451038716


In [18]:
overview.iloc[:2,:-1]

JobSatisfaction,1,2,3,4
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,223,234,369,407
Yes,66,46,73,52


In [19]:
#1
chisq, pvalue, dof, exp_freq = stats.chi2_contingency(observed=overview.iloc[:2, :-1], correction=True)
print(f'Chi2:{chisq},\np-value:{pvalue},\nDegree of Freedom:{dof}\n')
print('Expected Values:', exp_freq)

Chi2:17.505077010348,
p-value:0.0005563004510387556,
Degree of Freedom:3

Expected Values: [[242.40612245 234.85714286 370.73877551 384.99795918]
 [ 46.59387755  45.14285714  71.26122449  74.00204082]]


In [20]:
# 2
chisq, pvalue = stats.chisquare(f_obs=observations, f_exp=expected, ddof=dof)
print(f'Chi2:{chisq},\np-value:{pvalue}')

Chi2:17.505077010348,
p-value:0.001541482143508004


### Interpration:
**based on p-value and thresholds,Job Satisfaction has significant effect on Attrition**

-----------
### Note:
**Difference between chi2 and correlation:**

* 1- Chi 2 is a statistical model and represent probabilities to.
* 2- Correlation shows linear relations
    * 2-1- If there is no linear relationship between two variables, this method does not work.
    * 2-2- Data must be numerical and continues to use correlation

In [21]:
df.corr()

  df.corr()


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,0.010661,-0.001686,0.208034,,-0.010145,0.010146,0.024287,0.02982,0.509604,...,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
DailyRate,0.010661,1.0,-0.004985,-0.016806,,-0.05099,0.018355,0.023381,0.046135,0.002966,...,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
DistanceFromHome,-0.001686,-0.004985,1.0,0.021042,,0.032916,-0.016075,0.031131,0.008783,0.005303,...,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.016806,0.021042,1.0,,0.04207,-0.027128,0.016775,0.042438,0.101589,...,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EmployeeCount,,,,,,,,,,,...,,,,,,,,,,
EmployeeNumber,-0.010145,-0.05099,0.032916,0.04207,,1.0,0.017621,0.035179,-0.006888,-0.018519,...,-0.069861,,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
EnvironmentSatisfaction,0.010146,0.018355,-0.016075,-0.027128,,0.017621,1.0,-0.049857,-0.008278,0.001212,...,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,0.023381,0.031131,0.016775,,0.035179,-0.049857,1.0,0.042861,-0.027853,...,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
JobInvolvement,0.02982,0.046135,0.008783,0.042438,,-0.006888,-0.008278,0.042861,1.0,-0.01263,...,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
JobLevel,0.509604,0.002966,0.005303,0.101589,,-0.018519,0.001212,-0.027853,-0.01263,1.0,...,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281
