In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pydataset import data
from scipy import stats

# Chi2 Test for Independence

- used when you have two categorical variables

- comparing them to see if they are independent or if there is an association

- Compare two categorical variables and help answer questions like:
    - is whether r not a customer churns independent of their subscriptions
    - are doctors less likely to smoke?
    - is the attrition higher for employees who travel more vs those who travel less

#### Observed Categorical frequency/distribution
**compared against**
#### Expected Categorical frequency/distribution

### Chi2 = ((observed value - expected value)^2) / expected value

- $H_0$: There is no association between two categorical values (ie: they are independent)
- $H_a$: There is an association between two categorical values (ie: they are dependent)

#### 100 people surveyed for their prefence of phone brand. Is the choice of brand independent of the gender?
##### observed Frequencies: 
- 30 females like apple
- 10 females like samsung
- 20 males like apple
- 40 makes like samsung

##### expected Frequencies: 
- calculated assuming the $H_0$ is true

Degrees of freedom = (num_cols - 1) * (num_rows-1)

### Example #1 : 
is there any association between being a smoker and the time of day they come to restaurant

In [8]:
df =data('tips')

In [9]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [19]:
#this will give you the OBSERVED frequency
observed = pd.crosstab(df.smoker, df.time)
observed

time,Dinner,Lunch
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,106,45
Yes,70,23


In [11]:
#set alpha
alpha = 0.01

In [15]:
#chi2 contingency returns 4 different values
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

(0.5053733928754354,
 0.4771485672079724,
 1,
 array([[108.91803279,  42.08196721],
        [ 67.08196721,  25.91803279]]))

In [16]:
## make it easier to read
print('Observed\n')
print(observed.values)
print('---\nExpected\n')
print(expected.astype(int))
print('---\n')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed

[[106  45]
 [ 70  23]]
---
Expected

[[108  42]
 [ 67  25]]
---

chi^2 = 0.5054
p     = 0.4771


In [18]:
if p< alpha:
    print('We reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

We fail to reject the null hypothesis


**if p value is low = reject the null**

______

### Mini exercise:
Is attrition independent from Department?

In [20]:
# get your data
df = pd.read_csv("https://gist.githubusercontent.com/ryanorsinger/6ba2dd985c9aa92f5598fc0f7c359f6a/raw/b20a508cee46e6ac69eb1e228b167d6f42d665d8/attrition.csv")

In [21]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


### Step 1: Form the null and alternate hypothesis
- $H_0$: There is no association between attrition and department (independence)
- $H_a$: There is an association between attrition and department 

### Step 2: look at # of categories in Attrition column

In [28]:
df.Attrition.value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

### Step 3: # look at # of categories in business travel

In [29]:
df.Department.value_counts()

Research & Development    961
Sales                     446
Human Resources            63
Name: Department, dtype: int64

### Step 4: get observed frequencies

In [22]:
observed = pd.crosstab(df.Attrition, df.Department)
observed

Department,Human Resources,Research & Development,Sales
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,51,828,354
Yes,12,133,92


### Step 5: use chi2 test

In [34]:
chi2, p, degf, expected = stats.chi2_contingency(observed)
chi2, p, degf, expected

(10.79600732241067,
 0.004525606574479633,
 2,
 array([[ 52.84285714, 806.06326531, 374.09387755],
        [ 10.15714286, 154.93673469,  71.90612245]]))

### Step 6: make it easier to read

In [33]:
print('Observed\n')
print(observed.values)
print('---\nExpected\n')
print(expected.astype(int))
print('---\n')
print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

Observed

[[ 51 828 354]
 [ 12 133  92]]
---
Expected

[[ 52 806 374]
 [ 10 154  71]]
---

chi^2 = 10.7960
p     = 0.0045


### Step 7: set alpha

In [36]:
alpha = 0.01

### Step 8: come to conculsion

In [32]:
p < alpha

True

In [35]:
if p< alpha:
    print('We reject the null hypothesis')
else:
    print('We fail to reject the null hypothesis')

We reject the null hypothesis
