In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
%matplotlib inline

## chi-squared goodness of fit
t检验判断样本均值是否有总体均值不同，卡方goodness-of-fit 是相当与categorical变量的t检验，它检验categorical数据样本分布与期望分布是否一致
$$sum(\frac{(observed-expected)^2}{expected})$$
卡方检验通常有两种：
* 适配性或适度性(goodness or fit)
* 独立性(independence)

In [3]:
national = pd.DataFrame(['white'] * 100000 + ['hispanic'] * 60000 + ['black'] * 50000 + ['asian'] * 15000 + ['other'] * 35000)
minnesota = pd.DataFrame(['white'] * 600 + ['hispantic'] * 300 + ['black'] * 250 + ['asian'] * 75 + ['other'] * 150)
national_table = pd.crosstab(index=national[0], columns='count')
minnesota_table = pd.crosstab(index=minnesota[0], columns='count')
print('National')
print(national_table)
print('*'*50)
print('Minnesota')
print(minnesota_table)

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
**************************************************
Minnesota
col_0      count
0               
asian         75
black        250
hispantic    300
other        150
white        600


In [5]:
national[0]

0         white
1         white
2         white
3         white
4         white
5         white
6         white
7         white
8         white
9         white
10        white
11        white
12        white
13        white
14        white
15        white
16        white
17        white
18        white
19        white
20        white
21        white
22        white
23        white
24        white
25        white
26        white
27        white
28        white
29        white
          ...  
259970    other
259971    other
259972    other
259973    other
259974    other
259975    other
259976    other
259977    other
259978    other
259979    other
259980    other
259981    other
259982    other
259983    other
259984    other
259985    other
259986    other
259987    other
259988    other
259989    other
259990    other
259991    other
259992    other
259993    other
259994    other
259995    other
259996    other
259997    other
259998    other
259999    other
Name: 0, Length: 260000,

In [6]:
observed = minnesota_table
national_ratio = national_table / len(national)   # 全部的比率
expected = national_ratio * len(minnesota)      # 期待的值
chi_squared_stat = (((observed - expected)**2)/expected).sum()
print(chi_squared_stat)

col_0
count    17.250749
dtype: float64


In [7]:
expected

col_0,count
0,Unnamed: 1_level_1
asian,79.326923
black,264.423077
hispanic,317.307692
other,185.096154
white,528.846154


> 在以前我学习的时候，求出这个值之后，需要我们进行查表来获取相应的值，但是在scipy中为我们提供了相应的方法。

In [8]:
crit = stats.chi2.ppf(q=0.95,      # 95% confidence
                      df=4)         # DF = number of variable categories - 1
print('Critical value')
print(crit)
p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,    # p-value
                            df=4)
print("P value")
print(p_value)

Critical value
9.48772903678
P value
[ 0.00172769]


> * 在使用scipy进行卡方检测的时候，我们可以使用 **`stats.chisquare`** 直接进行操作。
> * 返回的结果中的 **`statistic`** 参数表示的是卡方得出的值
> * 返回的结果中参数 **`pvalue`** 是我们需要使用的p的值。

In [9]:
stats.chisquare(f_obs=observed,     # Array of observed counts
               f_exp=expected)      # Array of expected counts

Power_divergenceResult(statistic=array([ 18.19480519]), pvalue=array([ 0.00113047]))

## chi-square independence test

独立性是指知道一个变量不会告诉你任何关于另一个变量的信息。比如你是几月出生的不会决定你的工资水平。卡方独立性检验就是用于判断两个分类变量s会否独立。

> * 在使用 **`random.choice`** 的时候，参数 **`a`** 是我们要取的集合，而参数 **`p`** 是我们提供的每个集合的元素的对应的概率。
> * 在使用 **`crosstab`** 的时候，我们如果设置了参数 **`margins`** 为True的话，crosstab会为我们求和。

In [19]:
np.random.seed(2016)
# race
voter_race = np.random.choice(a=['asian', 'black', 'hispanic', 'other', 'white'], 
                              p=[0.05, 0.15, 0.25, 0.05, 0.5], size=1000)
# party
voter_party = np.random.choice(a=['democrat', 'independent', 'republican'], 
                              p=[0.4, 0.2, 0.4], size=1000)
voters = pd.DataFrame({'race': voter_race, 'party': voter_party})
voter_tab = pd.crosstab(voter_race, voter_party, margins=True)
voter_tab.columns = ['democrat', 'independent', 'republican', 'row_totals']    # 设置列名
voter_tab.index = ['asian', 'black', 'hispanic', 'other', 'white', 'col_totals']   # 设置row的名称
observed = voter_tab.ix[0:5, 0:3]      # 不包含行和列的汇总值
voter_tab

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


Unnamed: 0,democrat,independent,republican,row_totals
asian,18,4,16,38
black,56,36,58,150
hispanic,100,61,96,257
other,29,11,32,72
white,197,94,192,483
col_totals,400,206,394,1000


In [21]:
voter_tab.loc['col_totals'][0:3]

democrat       400
independent    206
republican     394
Name: col_totals, dtype: int64

In [23]:
voter_tab['row_totals'][0:5]

asian        38
black       150
hispanic    257
other        72
white       483
Name: row_totals, dtype: int64

In [22]:
# np.outer 是表示求外积
np.outer?

In [24]:
np.outer(voter_tab['row_totals'][0:5], voter_tab.ix['col_totals'][0:3])

array([[ 15200,   7828,  14972],
       [ 60000,  30900,  59100],
       [102800,  52942, 101258],
       [ 28800,  14832,  28368],
       [193200,  99498, 190302]])

In [25]:
expected = np.outer(voter_tab['row_totals'][0:5], 
                    voter_tab.ix['col_totals'][0:3]) / 1000
expected = pd.DataFrame(expected)
expected.columns = ['democrat', 'independent', 'republican']
expected.index = ['asian', 'black', 'hispanic', 'other', 'white']
expected

Unnamed: 0,democrat,independent,republican
asian,15.2,7.828,14.972
black,60.0,30.9,59.1
hispanic,102.8,52.942,101.258
other,28.8,14.832,28.368
white,193.2,99.498,190.302


In [27]:
chi_squared_stat = (((observed - expected)**2)/expected).sum().sum()
print(chi_squared_stat)

7.01309736929


In [29]:
crit = stats.chi2.ppf(q=0.95,      # critical value for 95% confidence
                    df=8)         # (5-1)*(3-1)
print('Critical value')
print(crit)
print('*'*50)
p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,
                            df=8)
print('P value')
print(p_value)

Critical value
15.5073130559
**************************************************
P value
0.535220220178


> * 在scipy中进行卡方的独立性检验是通过 **`stats.chi2_contingency`** 进行的。
> * 这个方法的结果分别是：chi-square statistic, p-value ,degrees of freedom,expected counts.

In [30]:
stats.chi2_contingency(observed=observed)

(7.0130973692853118,
 0.53522022017829307,
 8,
 array([[  15.2  ,    7.828,   14.972],
        [  60.   ,   30.9  ,   59.1  ],
        [ 102.8  ,   52.942,  101.258],
        [  28.8  ,   14.832,   28.368],
        [ 193.2  ,   99.498,  190.302]]))