In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

### Chi-Squared Goodness-Of-Fit Test

The chi-squared goodness-of-fit test is an analog of the one-sample t-test for categorical variables: it tests whether the distribution of sample categorical data matches an expected distribution.

In [2]:
# Synthetic demographic samples, written as (label, head-count) pairs and then
# expanded to one row per person so that crosstab can tally them.
national_makeup = [("white", 100000), ("hispanic", 60000), ("black", 50000),
                   ("asian", 15000), ("other", 35000)]
minnesota_makeup = [("white", 600), ("hispanic", 300), ("black", 250),
                    ("asian", 75), ("other", 150)]

national = pd.DataFrame(
    [label for label, head_count in national_makeup for _ in range(head_count)]
)
minnesota = pd.DataFrame(
    [label for label, head_count in minnesota_makeup for _ in range(head_count)]
)

# Frequency tables with a single "count" column; the lone, unnamed column of
# each sample frame is addressed as 0.
national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

# Same five lines of output as printing each item separately.
print("National", national_table, " ", "Minnesota", minnesota_table, sep="\n")

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600


In [3]:
# Observed counts come straight from the Minnesota frequency table.
observed = minnesota_table

# Scale the national category proportions up to the Minnesota sample size
# to get the counts we would expect if Minnesota matched the nation.
national_ratios = national_table / len(national)
sample_size = len(minnesota)
expected = national_ratios * sample_size

# Chi-squared statistic: sum over categories of (O - E)^2 / E.
# Summing a one-column frame yields a one-element Series.
squared_deviation = (observed - expected) ** 2
chi_squared_stat = (squared_deviation / expected).sum()

print(chi_squared_stat)

col_0
count    18.194805
dtype: float64


In [4]:
# Degrees of freedom = number of categories - 1, taken from the table
# rather than hard-coded (5 categories -> 4).
dof = len(minnesota_table) - 1

# Upper-tail cutoff: the value below which 95% of the chi-squared
# distribution with `dof` degrees of freedom lies.
crit = stats.chi2.ppf(q=0.95, df=dof)

print("Critical value")
print(crit)

# Survival function = 1 - CDF, evaluated directly so that very small
# p-values are not lost to floating-point cancellation.
p_value = stats.chi2.sf(x=chi_squared_stat, df=dof)
print("P value")
print(p_value)

Critical value
9.48772903678
P value
[ 0.00113047]


Since the chi-squared statistic exceeds the critical value, we'd reject the null hypothesis that the two distributions are the same.

Chi-squared can be calculated automatically using stats.chisquare()

In [5]:
# One-call goodness-of-fit test: takes the observed and expected counts and
# returns the test statistic together with its p-value.
goodness_of_fit = stats.chisquare(f_obs=observed, f_exp=expected)
goodness_of_fit

Power_divergenceResult(statistic=array([ 18.19480519]), pvalue=array([ 0.00113047]))

### Chi-Squared Test of Independence

The chi-squared test of independence tests whether two categorical variables are independent.

In [6]:

np.random.seed(101)   # Seed the legacy global RNG so the sample is reproducible

# Simulate 1000 voters. Race and party are drawn by two separate calls at
# fixed probabilities, so the two variables are generated independently.
voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
                              p=[0.05, 0.15, 0.25, 0.05, 0.5],
                              size=1000)

voter_party = np.random.choice(a=["democrat", "independent", "republican"],
                               p=[0.4, 0.2, 0.4],
                               size=1000)

voters = pd.DataFrame({"race": voter_race,
                       "party": voter_party})

# margins=True appends an "All" row and an "All" column holding the totals.
# Rename those two labels by name (instead of overwriting every label by
# position) so the category labels always stay attached to their own counts,
# then clear the axis names so the table carries plain labels only.
voter_tab = (
    pd.crosstab(voters["race"], voters["party"], margins=True)
    .rename(index={"All": "col_totals"}, columns={"All": "row_totals"})
    .rename_axis(index=None, columns=None)
)

# Table without the totals row/column, kept for later use.
# (The original used DataFrame.ix, which was removed in pandas 1.0.)
observed = voter_tab.drop(index="col_totals", columns="row_totals")
voter_tab

Unnamed: 0,democrat,independent,republican,row_totals
asian,23,9,25,57
black,48,28,57,133
hispanic,112,51,107,270
other,19,12,20,51
white,192,110,187,489
col_totals,394,210,396,1000


In [7]:
# Expected count for each cell under independence:
#   (row total * column total) / grand total
# Totals are looked up by label, and the grand total is read from the table
# instead of being hard-coded. (DataFrame.ix was removed in pandas 1.0.)
row_totals = voter_tab.loc[observed.index, "row_totals"]
col_totals = voter_tab.loc["col_totals", observed.columns]
grand_total = voter_tab.loc["col_totals", "row_totals"]

# Reuse the labels of `observed` so the two tables align cell-for-cell.
expected = pd.DataFrame(np.outer(row_totals, col_totals) / grand_total,
                        index=observed.index,
                        columns=observed.columns)

expected

Unnamed: 0,democrat,independent,republican
asian,22.458,11.97,22.572
black,52.402,27.93,52.668
hispanic,106.38,56.7,106.92
other,20.094,10.71,20.196
white,192.666,102.69,193.644


In [8]:
# Contribution of every cell to the statistic: (O - E)^2 / E.
cell_contributions = (observed - expected).pow(2) / expected

# Add up each column first, then the column sums, to get a single number.
chi_squared_stat = cell_contributions.sum(axis=0).sum()

print(chi_squared_stat)

3.57488812573


In [9]:
# Degrees of freedom = (rows - 1) * (columns - 1), computed from the observed
# table rather than hard-coded (5x3 table -> 4 * 2 = 8).*
n_rows, n_cols = observed.shape
dof = (n_rows - 1) * (n_cols - 1)

# Upper-tail cutoff: the value below which 95% of the chi-squared
# distribution with `dof` degrees of freedom lies.
crit = stats.chi2.ppf(q=0.95, df=dof)

print("Critical value")
print(crit)

# Survival function = 1 - CDF, evaluated directly so that very small
# p-values are not lost to floating-point cancellation.
p_value = stats.chi2.sf(x=chi_squared_stat, df=dof)
print("P value")
print(p_value)

Critical value
15.5073130559
P value
0.893300505478


*Note: The degrees of freedom for a test of independence equal the product of (number of categories − 1) for each variable, i.e. df = (rows − 1) × (columns − 1). In this case we have a 5x3 table, so df = 4 × 2 = 8.

As with the goodness-of-fit test, we can use scipy to conduct a test of independence quickly. 

In [10]:
# One-call test of independence on the observed contingency table
# (the table without the totals row and column).
independence_test = stats.chi2_contingency(observed)
independence_test

(3.5748881257309497,
 0.8933005054783083,
 8,
 array([[  22.458,   11.97 ,   22.572],
        [  52.402,   27.93 ,   52.668],
        [ 106.38 ,   56.7  ,  106.92 ],
        [  20.094,   10.71 ,   20.196],
        [ 192.666,  102.69 ,  193.644]]))

The output shows the chi-square statistic, the p-value and the degrees of freedom followed by the expected counts.

Chi-squared tests provide a way to investigate differences in the distributions of categorical variables with the same categories and the dependence between categorical variables.