In [1]:
import numpy as np
import pandas as pd
from scipy.stats import *
import seaborn as sns

In [38]:
df = sns.load_dataset('tips')

In [39]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Manual calculation step by step:

In [None]:
# Null hypothesis: there is no association between sex and smoker column
# Alternate hypothesis: there is association between sex and smoker column

In [77]:
# cross tab creation:

ct = pd.crosstab(df['sex'],df['smoker'],margins=True)
ct

smoker,Yes,No,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,60,97,157
Female,33,54,87
All,93,151,244


In [78]:
expected = np.zeros((ct.shape[0]-1, ct.shape[1]-1))
expected

array([[0., 0.],
       [0., 0.]])

In [79]:
# finding expected value

for i in range(ct.shape[0]-1):
    for j in range(ct.shape[1]-1):
        expected[i][j] = ct.iloc[-1,j]*ct.iloc[i,-1]/ct.iloc[-1,-1]

In [80]:
expected

array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]])

In [81]:
observed = ct.iloc[0:ct.shape[0]-1,0:ct.shape[1]-1].values

In [82]:
observed

array([[60, 97],
       [33, 54]], dtype=int64)

In [83]:
chi2_statistics = np.sum([((o-e)**2)/e for o,e in zip(observed,expected)])

In [84]:
chi2_statistics

0.001934818536627623

In [85]:
# degree of freedom:

nrows = ct.shape[0]-1
ncols = ct.shape[1]-1
degree_freedom = (nrows-1)*(ncols-1)
print(f"Degree of freedom: {degree_freedom}")

Degree of freedom: 1


In [86]:
# critical value:
confidence_level = 0.95
alpha = 1-confidence_level

critical_value = chi2.ppf(q=1-alpha, df = degree_freedom)
print(f"Critical value: {critical_value}")

Critical value: 3.841458820694124


In [87]:
if chi2_statistics<critical_value:
    print('We accept the Null Hypothesis')
else:
    print("We reject Null Hypothesis")

We accept the Null Hypothesis


In [88]:
# p-value calculation:

p_value = 1-chi2.cdf(chi2_statistics, df = degree_freedom)
print(f"P value: {p_value}")

P value: 0.964915107315732


In [89]:
if p_value>alpha:
    print('We accept the Null Hypothesis')
else:
    print("We reject Null Hypothesis")

We accept the Null Hypothesis


# from scipy library:

In [92]:
table = chi2_contingency(observed)
print(table)

Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))


In [93]:
print(f"Statistics: {table[0]}")
print(f"p_value: {table[1]}")
print(f"degree of freedom: {table[2]}")

Statistics: 0.0
p_value: 1.0
degree of freedom: 1
