# CHI-SQUARE TEST BINARY OUTCOME

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from math import sqrt,exp,log

df = pd.read_csv('../data/cleaned_data.csv',index_col='Record_Number')

## Remove rows where answer to Q!a is 'Not sure'

In [2]:
df = df[df.Q1a != 'Not sure'] 

In [3]:
from scipy.stats import chi2_contingency
# Defining a function to perform the Chi-Square Test and interpret results
def perform_chi_square_test(data, col1, col2):
    # Creating a contingency table
    contingency_table = pd.crosstab(data[col1], data[col2])

    # Performing the Chi-Square Test
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Interpreting the result
    significant = p < 0.05  # 5% significance level
    return chi2, p, significant, contingency_table

## Assumptions
- Both variables are categorical
- All observations are independent
- Cells in the contingency table are mutually exclusive
- Expected value of cells should be 5 or greater in at least 80% of cells

### X and Y are binary

In [4]:
binary = ['GCCSA','Asthma_moderate','Asthma_severe','Cardiovascular','COPD_moderate','COPD_severe','Dementia_Alzheimer','Diabetes',
          'Disabilities','Hypertension','Liver','Neurodevelopmental','Obesity','Other_Chronic_Respiratory','Renal_Kidney','Autoimmune',
          'Cancer','Chronic_Infections','Congenital_Acquired_Heart_Disease','Sickle','Solid_Organ','Previously_Admitted_COVID19',
          'HIGH_RISKr1','HIGH_RISKr4','HIGH_RISKr5','HIGH_RISKr6']

print('Significant Tests\n')
for i in range(len(binary)):
    test = perform_chi_square_test(df, binary[i], 'Q1a')
    if test[2] == True:
        print(test[3],'\n')
        print (f'Test statistic = {test[0]}')
        print (f'p-value = {test[1]}\n\n')
        print('---------------------------')

Significant Tests

Q1a            No  Yes
Hypertension          
False         251  490
True           34  117 

Test statistic = 6.927681820516224
p-value = 0.008487162690235075


---------------------------
Q1a            No  Yes
Renal_Kidney          
False         283  586
True            2   21 

Test statistic = 4.825651052212233
p-value = 0.028039273701622864


---------------------------
Q1a      No  Yes
Cancer          
False   280  577
True      5   30 

Test statistic = 4.416996137312998
p-value = 0.03558262396455648


---------------------------
Q1a                           No  Yes
Previously_Admitted_COVID19          
False                        280  567
True                           5   40 

Test statistic = 8.483519552364525
p-value = 0.00358378095526773


---------------------------
Q1a           No  Yes
HIGH_RISKr1          
False        266  507
True          19  100 

Test statistic = 15.299509597199714
p-value = 9.174032466723262e-05


---------------------------

#### People with Hypertension, Cancer, HIGH_RISKr1, HIGH_RISKr5 & HIGH_RISKr6 or people previously admitted With COVID-19 are more likely to seek treatment from the doctor if tested positive to COVID-19. 
NB: Renal Kidney breaks one of the assumptions as less than 80% of the cells have 5 or more occurences.

### Odds Ratio for X and Y Binary

a is True/Yes

b is True/No

c is False/Yes

d is False/No

#### Hypertension, Q1a

In [5]:
a = 117 
b = 34
c = 490
d = 251

In [6]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People with hypertension are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')


People with hypertension are 1.76 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (1.35, 2.17).



#### Cancer, Q1a

In [7]:
a = 30 
b = 5
c = 577
d = 280

In [8]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People with cancer are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')

People with cancer are 2.91 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (1.95, 3.87).



#### Previously Admitted With COVID19, Q1a

In [9]:
a = 40 
b = 5
c = 567
d = 280

In [10]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People previously admitted with Covid-19 are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')

People previously admitted with Covid-19 are 3.95 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (3.01, 4.89).



#### HIGH_RISKr1, Q1a

In [11]:
a = 100 
b = 19
c = 507
d = 266

In [12]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People with HIGH_RISKr1 are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')

People with HIGH_RISKr1 are 2.76 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (2.25, 3.27).



#### HIGH_RISKr5, Q1a

In [13]:
a = 53 
b = 12
c = 554
d = 273

In [14]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People with HIGH_RISKr5 are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')

People with HIGH_RISKr5 are 2.18 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (1.53, 2.82).



#### HIGH_RISKr6, Q1a

In [15]:
a = 40 
b = 5
c = 567
d = 280

In [16]:
odds_ratio = (a*d) / (b*c)
se = sqrt((1/a)+(1/b)+(1/c)+(1/d))
ci_lower = round(exp(log(odds_ratio - (1.96 * se))),2)
ci_upper = round(exp(log(odds_ratio + (1.96 * se))),2)
print(f'People with HIGH_RISKr6 are {round(odds_ratio,2)} times more likely to seek treatment from a doctor if testing positve to COVID-19.')
print(f'Confidence interval is ({ci_lower}, {ci_upper}).\n')

People with HIGH_RISKr6 are 3.95 times more likely to seek treatment from a doctor if testing positve to COVID-19.
Confidence interval is (3.01, 4.89).

