In [18]:
import pandas as pd
import numpy as np
import scipy.stats as stats


In [19]:
## Raw counts for the contingency table.
## Each list holds [Female, Male, column total] for one table column.
R1 = [60, 40, 100]       # Highschool
R2 = [54, 44, 98]        # Bachelors
R3 = [46, 53, 99]        # Masters
R4 = [41, 57, 98]        # Ph.d
R5 = [201, 194, 395]     # row totals: [Female, Male, grand total]

In [20]:
# Assemble the table: one column per education level plus the row totals;
# each list supplies the values of its column.
df = pd.DataFrame({
    'Highschool': R1,
    'Bachelors': R2,
    'Masters': R3,
    'Ph.d': R4,
    'Row_Total': R5,
})

In [21]:
# Label the rows: the two gender categories, then the row of column totals.
df.index = ['Female', 'Male', 'Col_Total']

In [22]:
# Display the assembled table, marginal totals included.
df

Unnamed: 0,Bachelors,Highschool,Masters,Ph.d,Row_Total
Female,54,60,46,41,201
Male,44,40,53,57,194
Col_Total,98,100,99,98,395


In [23]:
# Observed counts only: the first two rows (Female, Male) and the first four
# columns (the education levels), i.e. the table without its totals, kept
# for later use.
# .iloc replaces the deprecated .ix indexer (removed in pandas 1.0); the
# selection stays positional, which is how .ix treated these integer slices
# on a string-labelled frame.
observed = df.iloc[0:2, 0:4]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [24]:
# Display the observed counts (the table without the totals row and column).
observed

Unnamed: 0,Bachelors,Highschool,Masters,Ph.d
Female,54,60,46,41
Male,44,40,53,57


In [25]:
## To get the expected count for a cell, multiply the row total for that cell
## by the column total for that cell and then divide by the total number of
## observations.

# Total number of observations (sample size), read from the grand-total cell
# of the table rather than hard-coded.
n = int(df.loc['Col_Total', 'Row_Total'])

# The outer product of the two row totals (Female, Male) with the four column
# totals yields every row-total * column-total pair in one step.
# .loc/.iloc replace the deprecated .ix indexer (removed in pandas 1.0).
expected = np.outer(df['Row_Total'].iloc[0:2], df.loc['Col_Total'].iloc[0:4]) / n

In [26]:
# Wrap the expected counts in a DataFrame carrying the same row and column
# labels as `observed`. The expected array was built positionally from the
# same table, so its order already matches `observed`.
# Taking the labels from `observed` (instead of typing them out in
# alphabetical order) matters because pandas arithmetic aligns on labels:
# on pandas versions that keep dict insertion order, a hand-written
# alphabetical list would attach the wrong names and corrupt the later
# `observed - expected` step.
expected = pd.DataFrame(expected, index=observed.index, columns=observed.columns)

In [27]:
# Display the expected counts.
expected

Unnamed: 0,Bachelors,Highschool,Masters,Ph.d
Female,49.868354,50.886076,50.377215,49.868354
Male,48.131646,49.113924,48.622785,48.131646


In [28]:
# Chi-square statistic: (observed - expected)^2 / expected, summed over every cell.
# .sum() is applied twice: the first call reduces each column to its total,
# the second adds those column totals together, giving the sum of the whole
# 2D table.
chi_squared_stat = observed.sub(expected).pow(2).div(expected).sum().sum()

In [29]:
# Display the chi-square statistic.
chi_squared_stat

8.006066246262538

In [35]:
# Degrees of freedom for a test of independence on an r x c table of
# observed counts: (r - 1) * (c - 1).
# "number of columns - 1" is only correct when the table has exactly two
# rows, so both dimensions are used here. Dedicated names are used so that
# `n`, which holds the sample size, is not overwritten.
n_rows, n_cols = observed.shape
DoF = (n_rows - 1) * (n_cols - 1)

In [36]:
# Display the degrees of freedom.
DoF

3

In [31]:
# The critical value and the p-value.

# Critical value: the 0.95 quantile of the chi-square distribution with DoF
# degrees of freedom, i.e. the rejection threshold at a 5% significance level.
critical = stats.chi2.ppf(q=0.95, df=DoF)

# p-value: upper-tail probability of the observed statistic. sf() is the
# survival function (1 - cdf) evaluated directly, which avoids the loss of
# precision that subtracting a cdf close to 1 causes for small p-values.
p_value = stats.chi2.sf(chi_squared_stat, df=DoF)

In [32]:
# Print the test results and the decision.

print('Results:')
print('\u03A7\u00b2 = {:.3f}'.format(chi_squared_stat))
print('Critical Value= {:.2f}'.format(critical))
print('P value = {:.2f}'.format(p_value))

# Decision rule: reject the null hypothesis when the statistic exceeds the
# critical value. The statistic must be compared with `critical`, not with
# `p_value`: a p-value is a probability, so that comparison would contradict
# the printed messages and select the "reject" branch for almost any statistic.
if chi_squared_stat > critical:
    print('Conclusion: \n Since \u03A7\u00b2 > Critical Value, we reject the null hypothesis. \n Hence, education level depends on gender at a 5% level of significance.')
else:
    # A non-significant result does not prove independence, so the null
    # hypothesis is "not rejected" rather than "accepted".
    print('Conclusion: \n Since \u03A7\u00b2 \u2264 Critical Value, we fail to reject the null hypothesis. \n i.e, there is no evidence that education level depends on gender at a 5% level of significance.')

Results:
Χ² = 8.006
Critical Value= 7.81
P value = 0.05
Conclusion: 
 Since Χ² > Critical Value, we reject the null hypothesis. 
 Hence, education level depends on gender at a 5% level of significance.


In [33]:
## scipy can conduct the test of independence quickly: given a frequency table
## of observed counts, stats.chi2_contingency() runs the test automatically.
test = stats.chi2_contingency(observed)

In [34]:
# Display the value returned by stats.chi2_contingency(). In the recorded
# output it is a 4-tuple whose first, third and fourth items match the
# chi-square statistic, degrees of freedom and expected counts computed by hand.
test

(8.006066246262538,
 0.045886500891747214,
 3,
 array([[49.86835443, 50.88607595, 50.37721519, 49.86835443],
        [48.13164557, 49.11392405, 48.62278481, 48.13164557]]))