## Chi-Square Test
The chi-square test of independence is a statistical method used to determine whether there is a statistically significant association between two categorical variables.

In [1]:
# Importing the scipy.stats module from the SciPy library
import scipy.stats as stats

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np

In [3]:
# Load the 'tips' example dataset through seaborn; the result is used below as a pandas DataFrame.
# NOTE(review): load_dataset presumably downloads the data from seaborn's online
# repository unless it is already cached locally — confirm before running offline.
dataset = sns.load_dataset('tips')

In [4]:
# Preview the first rows to see which columns are available and what they contain.
dataset.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Cross-tabulate the two categorical columns: rows are the 'sex' categories,
# columns are the 'smoker' categories, and each cell counts the records with
# that combination. This contingency table is the input to the chi-square test.
dataset_table = pd.crosstab(index=dataset['sex'], columns=dataset['smoker'])
dataset_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [6]:
# Show the contingency table's counts as a plain NumPy array (labels dropped).
dataset_table.to_numpy()

array([[60, 97],
       [33, 54]], dtype=int64)

In [7]:
# Keep the observed counts as a NumPy array for the manual calculation below.
observed_value = dataset_table.to_numpy()
print("observed values : \n", observed_value)

observed values : 
 [[60 97]
 [33 54]]


In [8]:
# Perform the chi-square test of independence on the contingency table.
# The result bundles the test statistic, the p-value, the degrees of freedom
# and the expected frequencies; we keep the whole result in 'value'.
# NOTE: when the table has one degree of freedom (2x2, as here), chi2_contingency
# applies Yates' continuity correction by default (correction=True). That is why
# the statistic and p-value displayed for this cell differ from the uncorrected
# Pearson statistic computed by hand in the later cells; the expected
# frequencies are the same either way. Pass correction=False to obtain the
# uncorrected statistic.
value = stats.chi2_contingency(dataset_table)
value

Chi2ContingencyResult(statistic=0.0, pvalue=1.0, dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))

In [9]:
# Expected frequencies under independence, read by field name rather than by
# tuple position (same array as value[3] on the Chi2ContingencyResult object).
value.expected_freq

array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]])

In [10]:
# Pull the expected frequencies out of the test result by field name instead of
# the positional index 3 (same array, but self-describing), then display them
# next to the observed counts for comparison.
expected_value = value.expected_freq
expected_value, observed_value

(array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]),
 array([[60, 97],
        [33, 54]], dtype=int64))

In [11]:
# Size of the contingency table. DataFrame.shape is (number of rows, number of
# columns), so it can be unpacked directly; this avoids slicing the table just
# to count its entries and also works when the table is empty.
no_of_rows, no_of_columns = dataset_table.shape


In [12]:
# Degrees of freedom for a chi-square test of independence on an r x c
# contingency table: (r - 1) * (c - 1).
# For this 2 x 2 table that is 1, the same 'dof' reported by chi2_contingency.
dof = (no_of_rows - 1) * (no_of_columns - 1)
dof

1

In [13]:
# Significance level (alpha): the probability of rejecting H0 when it is in
# fact true that we are willing to tolerate.
# 0.05 is the conventional choice and corresponds to a 95% confidence level.
alpha = 0.05

In [14]:
from scipy.stats import chi2

In [15]:
# Per-cell chi-square contributions, summed down each column.
#
# For every cell the contribution is (observed - expected)**2 / expected.
# Both arrays are 2-D (rows = 'sex', columns = 'smoker'), so the arithmetic is
# done element-wise and .sum(axis=0) adds the rows together, leaving one
# partial sum per column.
#
# NOTE: this is not yet the chi-square statistic itself; the per-column
# partial sums still have to be added together to obtain it.
chi_square = ((observed_value - expected_value) ** 2.0 / expected_value).sum(axis=0)
chi_square

array([0.00119737, 0.00073745])

In [16]:
# Add up all per-column partial sums to obtain the chi-square statistic.
# Summing the whole array (rather than adding elements 0 and 1 by hand) gives
# the same value for this two-column table and keeps working for contingency
# tables with any number of columns.
chi_square_statistic = np.sum(chi_square)
chi_square_statistic

0.001934818536627623

In [17]:
# Critical value: the point on the chi-square distribution with 'dof' degrees
# of freedom whose right-tail area equals alpha.
# The percent point function (inverse CDF) expects a cumulative probability,
# which is why it is evaluated at 1 - alpha rather than at alpha.
critical_value = chi2.ppf(1 - alpha, df=dof)
critical_value

3.841458820694124

In [18]:
# p-value: the probability, under H0, of a chi-square statistic at least as
# large as the one observed, i.e. the right-tail area of the chi-square
# distribution with 'dof' degrees of freedom.
# chi2.sf is the survival function (1 - CDF). Calling it directly is
# mathematically the same as 1 - chi2.cdf(...), but it avoids the loss of
# precision caused by the subtraction when the CDF is very close to 1
# (that is, for very small p-values).
p_value = chi2.sf(chi_square_statistic, df=dof)
p_value

0.964915107315732

In [19]:
# Summary of the test settings and results, printed as one "label value" line each.
for label, result in (
    ('significant value alpha:', alpha),
    ('degree of freedom:', dof),
    ('critical value:', critical_value),
    ('p_value:', p_value),
):
    print(label, result)

significant value alpha: 0.05
degree of freedom: 1
critical value: 3.841458820694124
p_value: 0.964915107315732


In [20]:
# Decision rule based on the critical value.
# H0: the two categorical variables are independent (no relationship).
# H0 is rejected when the chi-square statistic reaches or exceeds the critical
# value; otherwise it is retained. Retaining H0 means the data give no
# significant evidence of a relationship at the chosen alpha.
print(
    "Reject H0, There is a relationship between the two categorical variables"
    if chi_square_statistic >= critical_value
    else "Retain H0, there is no relationship between the two categorical variables"
)


Retain H0, there is no relationship between the two categorical variables


In [21]:
# Decision rule based on the p-value (equivalent to the critical-value rule).
# H0: the two categorical variables are independent (no relationship).
# H0 is rejected when the p-value is at most alpha; otherwise it is retained.
# Retaining H0 means the data give no significant evidence of a relationship
# at the chosen alpha.
print(
    "Reject H0, There is a relationship between the two categorical variables"
    if p_value <= alpha
    else "Retain H0, there is no relationship between the two categorical variables"
)


Retain H0, there is no relationship between the two categorical variables
