In [1]:
import numpy as np
import matplotlib.pylab as plt
import scipy.stats as st
import seaborn as sns
import pandas as pd

In [2]:
import statsmodels.api as sm

# Образование и вера

General Social Survey — ежегодный опрос населения США, записывается около 5000 факторов. Из опроса 2008 года выбраны данные о вере и образовании 2000 опрошенных. Есть ли связь между степенью веры и уровнем образования?

**2008 General Social Survey, National Opinion Research center (Agresti A. Categorical Data Analysis. — Hoboken: John Wiley & Sons, 2013, табл. 3.2)**

In [3]:
data= pd.read_csv('./fe.txt', delimiter='\t')

In [4]:
data.head()

Unnamed: 0,Highest degree,don’t believe,no way to find out,some higher power,believe sometimes,believe but doubts,know God exists
0,Less than high school,9,8,27,8,47,236
1,High school or junior college,23,39,88,49,179,706
2,Bachelor or graduate,28,48,89,19,104,293


## Pearson $\chi^2$

In [5]:
values = data.values[:, 1:].astype(np.float)

In [6]:
values

array([[  9.,   8.,  27.,   8.,  47., 236.],
       [ 23.,  39.,  88.,  49., 179., 706.],
       [ 28.,  48.,  89.,  19., 104., 293.]])

In [7]:
st.chi2_contingency(values)

(76.1483261658133,
 2.842572791195739e-12,
 10,
 array([[ 10.05  ,  15.9125,  34.17  ,  12.73  ,  55.275 , 206.8625],
        [ 32.52  ,  51.49  , 110.568 ,  41.192 , 178.86  , 669.37  ],
        [ 17.43  ,  27.5975,  59.262 ,  22.078 ,  95.865 , 358.7675]]))

## G-test

In [8]:
st.chi2_contingency(values, lambda_ = "log-likelihood")

(73.18790888970611,
 1.0705703476205307e-11,
 10,
 array([[ 10.05  ,  15.9125,  34.17  ,  12.73  ,  55.275 , 206.8625],
        [ 32.52  ,  51.49  , 110.568 ,  41.192 , 178.86  , 669.37  ],
        [ 17.43  ,  27.5975,  59.262 ,  22.078 ,  95.865 , 358.7675]]))

## Коэффициент V Крамера

In [27]:
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = st.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    r, k = confusion_matrix.shape
    return np.sqrt(chi2/(n*(min(k,r)-1))) ### Ваш код

In [28]:
stat = cramers_corrected_stat(values)
stat

0.1379749308441694

## Корреляция между порядковыми переенными

In [29]:
n = 0
pc = 0
pd = 0
pt = 0
for i in range(3):
    for j in range(6):
        for k in range(3):
            for z in range(6):
                current = values[i, j]*values[k, z]
                n+=current
                if i==k or j == z:
                    pt+=current### Ваш код
                elif i>k and j>z:
                    pc+=current### Ваш код
                else:
                    pd+=current### Ваш код
gamma = (pc-pd)/(pc+pd)### Ваш код
print(gamma)


-0.6241487279280523


## Встречаемость через длину

In [12]:
new_data = []
for i in range(values.shape[0]):
    for j in range(values.shape[1]):
        new_data.extend([[i,j]]*int(values[i,j]))
new_data = np.array(new_data)
new_data.shape

(2000, 2)

In [13]:
st.pearsonr(new_data[:,0], new_data[:, 1])[0]

-0.16203759162368087

In [14]:
st.kendalltau(new_data[:,0], new_data[:,1])[0]

-0.1475371344671059

In [15]:
st.spearmanr(new_data[:,0], new_data[:,1])[0]

-0.16436204271249027

In [16]:
data.values

array([['Less than high school', 9, 8, 27, 8, 47, 236],
       ['High school or junior college', 23, 39, 88, 49, 179, 706],
       ['Bachelor or graduate', 28, 48, 89, 19, 104, 293]], dtype=object)

## Через statmodels

In [17]:
table = sm.stats.Table(data.values[:, 1:])

In [18]:
table.table_orig

array([[9, 8, 27, 8, 47, 236],
       [23, 39, 88, 49, 179, 706],
       [28, 48, 89, 19, 104, 293]], dtype=object)

In [19]:
chi2_test = table.test_nominal_association()

In [20]:
chi2_test.statistic

76.14832616581332

In [21]:
chi2_test.pvalue

2.842615032250251e-12

## Корреляция Мэтьюся
$$MCC = \frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}$$

In [25]:
for i in range(1, 3):
    print('#'*30)
    print (data.values[i, 0])
    for j in range(1, 6):
        a = values[i:, j:].sum()
        b = values[i:, :j].sum()
        c = values[:i, j:].sum()
        d = values[:i, :j].sum()
        C = (a*d-b*c)/np.sqrt((a+b)*(a+c)*(b+d)*(c+d))### Ваш код
        
        print('{}: {}'.format(data.columns[j+1], C))


##############################
High school or junior college
no way to find out: -0.008241633202217229
some higher power: -0.044881318567683896
believe sometimes: -0.05628603186189359
believe but doubts: -0.06771190361644595
know God exists: -0.08027652478563879
##############################
Bachelor or graduate
no way to find out: -0.06824160441477072
some higher power: -0.12757423096478482
believe sometimes: -0.17422602433945117
believe but doubts: -0.15385669222212642
know God exists: -0.14903829337062885
