# Pearson's chi-squared test

In [None]:
# import libraries
import numpy as np 
import pandas as pd
import scipy
from scipy.stats import chi2

In [None]:
# Observed 2x2 contingency table (rows: exposure, columns: outcome).
# The counts are stored as floats on purpose: it avoids datatype issues
# when cells are later written with pd.DataFrame.at.
ar = np.array([[12, 12], [11, 32]], dtype=float)
df = pd.DataFrame(
    ar,
    index=["Exposed", "Unexposed"],
    columns=["Disease", "No Disease"],
)
df

In [None]:
# Contingency table extended with the marginal totals and the grand total.
# Work on a copy so the observed table `df` stays untouched.
df2 = df.copy()

# Bottom row: per-column sums of the observed counts.
df2.loc["Column_Total"] = df2.sum(axis=0, numeric_only=True)

# Right-most column: per-row sums; because the "Column_Total" row already
# exists, its row sum becomes the grand total in the bottom-right corner.
df2 = df2.assign(Row_Total=df2.sum(axis=1, numeric_only=True))
df2

In [None]:
# Grand total N: bottom-right corner of the table with margins.
n = df2.at["Column_Total", "Row_Total"]

# Marginal totals without the margin entries themselves
# (the last row / last column of df2 hold the totals).
row_totals = df2["Row_Total"].iloc[:-1]
col_totals = df2.loc["Column_Total"].iloc[:-1]

# Expected counts under independence: E_ij = (row total_i * column total_j) / N.
# Computed as one outer product so the cell works for any r x c table, and
# deliberately NOT rounded: rounding would only throw away precision and make
# the hand-computed statistic drift from SciPy's result.
exp = pd.DataFrame(
    np.outer(row_totals, col_totals) / n,
    index=row_totals.index,
    columns=col_totals.index,
)
exp

In [None]:
# Pearson's chi-squared statistic: sum over all cells of (O - E)^2 / E.
cell_contributions = (df - exp) ** 2 / exp
tstat = cell_contributions.to_numpy().sum()
tstat

In [None]:
# Degrees of freedom of an r x c contingency table: (c - 1) * (r - 1).
n_rows, n_cols = df.shape
dof = (n_cols - 1) * (n_rows - 1)
dof

In [None]:
# p-value = upper-tail probability P(X >= tstat) of the chi-squared
# distribution with `dof` degrees of freedom.
# chi2.sf is the survival function (1 - cdf); calling it directly avoids the
# floating-point cancellation of `1 - chi2.cdf(...)` when the p-value is tiny.
pval = chi2.sf(tstat, dof)
pval

In [None]:
from scipy.stats import chi2_contingency  # SciPy's built-in implementation

# Cross-check the manual result. correction=False switches Yates' continuity
# correction off so the numbers are comparable with the plain Pearson test.
# The call yields (statistic, p-value, degrees of freedom, expected counts).
(
    tstat_scipy,
    pval_scipy,
    ddof_scipy,
    exp_scipy,
) = chi2_contingency(df, correction=False)

print(f"Chi-squared test statistic without Yates correction (Scipy): {tstat_scipy!s}")
print(f"P-value without Yates correction (Scipy): {pval_scipy!s}")

# Chi-squared test with Yates correction:

In statistics, Yates's correction for continuity (or Yates's chi-squared test) is used in certain situations when testing for independence in a contingency table. It aims at correcting the error introduced by assuming that the discrete probabilities of frequencies in the table can be approximated by a continuous distribution (chi-squared). In some cases, Yates's correction may adjust too far, and so its current use is limited. 

All of the steps above stay the same, except that the test statistic is now computed with the following adjusted formula:

$$ \chi_{Yates}^{2}=  \sum_{i} \frac{  \big(  \big|O_i-E_i\big|-0.5  \big)^{2} }{E_i}$$


In [None]:
# Re-display the observed counts for reference.
df 

In [None]:
# Re-display the expected counts that the Yates-corrected statistic reuses.
exp

In [None]:
# Degrees of freedom are unaffected by the continuity correction:
# still (number of columns - 1) * (number of rows - 1).
dof = (df.shape[1] - 1) * (df.shape[0] - 1)
dof

In [None]:
# Yates' continuity correction: shrink every absolute deviation |O - E| by 0.5
# before squaring.
# The shrunken deviation is floored at zero: if |O - E| is already below 0.5,
# subtracting the full 0.5 would overshoot the expected value and *inflate*
# the statistic. Capping the correction at |O - E| matches what
# scipy.stats.chi2_contingency(correction=True) does.
abs_deviation = np.abs(df - exp)
corrected_deviation = (abs_deviation - 0.5).clip(lower=0.0)
tstat_yates = np.sum((corrected_deviation ** 2 / exp).values)
print("Chi-squared test statistic with Yates correction: " + str(tstat_yates))

# Upper-tail probability via the survival function; more accurate than
# `1 - chi2.cdf(...)` for small p-values.
# NOTE: this rebinds `pval`, replacing the uncorrected p-value from above.
pval = chi2.sf(tstat_yates, dof)
print("P-value with Yates correction: " + str(pval))

In [None]:
from scipy.stats import chi2_contingency

# Same cross-check as before, this time with correction=True so SciPy
# applies Yates' continuity correction.
# Note that this rebinds the *_scipy names from the uncorrected run.
(
    tstat_scipy,
    pval_scipy,
    ddof_scipy,
    exp_scipy,
) = chi2_contingency(df, correction=True)

print(f"Chi-squared test statistic with Yates correction (Scipy): {tstat_scipy!s}")
print(f"P-value with Yates correction (Scipy): {pval_scipy!s}")

### (Fisher's exact test)
Out of curiosity, Fisher's exact test would give us the following p-value: 

In [None]:
import scipy.stats as stats

# Fisher's exact test on the same observed 2x2 table.
# The two-sided alternative is SciPy's default; it is spelled out here only
# for readability. Only the p-value is displayed.
oddsratio, pvalue_fisher = stats.fisher_exact(df, alternative="two-sided")
pvalue_fisher

## Try it 

In [None]:
# Exercise data: observed counts by gender (rows) and preference (columns).
# Floats are used on purpose to avoid datatype issues with pd.DataFrame.at.
ar = np.array([[209, 280], [225, 248]], dtype=float)
df1 = pd.DataFrame(
    ar,
    index=["Men", "Women"],
    columns=["Beach", "Cruise"],
)
df1

In [None]:
# YOUR CODE HERE

In [None]:
# Exercise data: observed counts by gender (rows) and pet preference (columns).
# Floats are used on purpose to avoid datatype issues with pd.DataFrame.at.
# NOTE: `df2` was used earlier for the table with marginal totals; this
# assignment replaces it, so re-running earlier cells afterwards will see
# this table instead.
ar = np.array([[207, 282], [231, 242]], dtype=float)
df2 = pd.DataFrame(
    ar,
    index=["Men", "Women"],
    columns=["Cat", "Dog"],
)
df2

In [None]:
# YOUR CODE HERE

### Implement the test for the free-throw pairs example

In [None]:
# Observed counts for Larry Bird's pairs of (basketball) free throws.
# Columns 0, 1, 2 presumably give the number of shots made in a pair - TODO confirm.
ar = np.array([[5, 82, 251]], dtype=float)
df3 = pd.DataFrame(ar, columns=[0, 1, 2])
df3

In [None]:
# YOUR CODE HERE