In [29]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.interpolate import interp1d
from scipy.special import expit
from fractions import Fraction
from empiricaldist import Pmf, Cdf
import matplotlib.pyplot as plt
from collections import Counter
import statsmodels.formula.api as smfa

In [2]:
from data.utils import prob, conditional

In [3]:
# Load the GSS extract from ./data, resolved against the current working directory.
gss = pd.read_csv(Path.cwd().joinpath('data', 'gss_bayes.csv'))
gss.head()  # preview the first rows

Unnamed: 0,caseid,year,age,sex,polviews,partyid,indus10
0,1,1974,21.0,1,4.0,2.0,4970.0
1,2,1974,41.0,1,5.0,0.0,9160.0
2,5,1974,58.0,2,6.0,1.0,2670.0
3,6,1974,30.0,1,5.0,4.0,6870.0
4,7,1974,48.0,1,5.0,4.0,7860.0


In [4]:
# Boolean mask of respondents whose industry code (indus10) equals 6870.
# NOTE(review): 6870 is presumably the banking industry code - confirm against the codebook.
banker = gss['indus10'].eq(6870)
# Number of matches, fraction of matches, and the same fraction obtained through prob().
banker.sum(), banker.mean(), prob(banker)

(728, 0.014769730168391155, 0.014769730168391155)

In [19]:
# Masks for the two values of the sex column.
# NOTE(review): 1 = male and 2 = female follows the variable names - confirm against the codebook.
male = gss['sex'].eq(1)
female = gss['sex'].eq(2)
prob(male), prob(female)

(0.46214242239805237, 0.5378575776019476)

In [6]:
# Respondents whose polviews value is 3 or lower are labelled liberal.
liberal = gss['polviews'].le(3)
prob(liberal)

0.27374721038750255

In [7]:
# Respondents whose partyid value is 0 or 1 are labelled democrat.
democrat = gss['partyid'].isin([0, 1])
prob(democrat)

0.3662609048488537

In [8]:
# The conjunction is written in both orders to show that the joint probability is the same.
(
    prob(banker & democrat),
    prob(democrat & banker),
)

(0.004686548995739501, 0.004686548995739501)

In [9]:
# Keep only the democrat flags of liberal respondents; the fraction of True values
# among them is the probability of democrat given liberal.
selected = democrat.loc[liberal]
prob(selected)

0.5206403320240125

In [10]:
# Fraction of female respondents among those flagged as banker.
prob(female.loc[banker])

0.7706043956043956

In [12]:
# Probability of liberal given female.
# NOTE(review): conditional() comes from data.utils and is not shown here; it is
# presumably conditional(proposition, given) - confirm the argument order.
conditional(liberal, female)

0.27581004111500884

In [13]:
# Probability of female given that the respondent is both liberal and democrat.
# NOTE(review): assumes conditional(proposition, given) argument order - confirm in data.utils.
conditional(female, liberal & democrat)

0.576085409252669

In [14]:
# Probability of being both female and liberal, given banker.
# NOTE(review): assumes conditional(proposition, given) argument order - confirm in data.utils.
conditional(female & liberal, banker)

0.17307692307692307

In [16]:
# Definition of conditional probability: P(female | banker) = P(female & banker) / P(banker).
# The two sides are computed along different floating-point paths, so they are compared
# with a tolerance; an exact `==` can turn False from rounding alone.
bool(np.isclose(prob(female & banker) / prob(banker), conditional(female, banker)))

True

In [17]:
# Product rule: P(female & banker) = P(female | banker) * P(banker).
# Compared with a tolerance because exact float equality can fail from rounding alone.
bool(np.isclose(prob(female & banker), conditional(female, banker) * prob(banker)))

True

In [18]:
# Bayes's theorem: P(liberal | banker) = P(liberal) * P(banker | liberal) / P(banker).
# Compared with a tolerance because exact float equality can fail from rounding alone.
bool(
    np.isclose(
        conditional(liberal, banker),
        prob(liberal) * conditional(banker, liberal) / prob(banker),
    )
)

True

In [21]:
# Law of total probability (additive form): P(banker) = P(male & banker) + P(female & banker).
# Compared with a tolerance because the sum of two floats need not reproduce the
# directly computed value bit-for-bit.
bool(np.isclose(prob(banker), prob(male & banker) + prob(female & banker)))

True

In [25]:
# Law of total probability computed two ways. The results differ in the last digit
# because of floating-point rounding, so an exact `==` between them would be False.
(
    prob(male) * conditional(banker, male) + prob(female) * conditional(banker, female),
    prob(male & banker) + prob(female & banker),
)

(0.014769730168391153, 0.014769730168391155)

In [26]:
# Keep the polviews column under a short name.
B = gss['polviews']
# Number of respondents at each polviews value, ordered by value.
B.value_counts().sort_index()

1.0     1442
2.0     5808
3.0     6243
4.0    18943
5.0     7940
6.0     7319
7.0     1595
Name: polviews, dtype: int64

In [28]:
# Law of total probability summed over polviews values 1 through 7,
# shown next to the directly computed probability of banker.
(
    sum(prob(B == view) * conditional(banker, B == view) for view in range(1, 8)),
    prob(banker),
)

(0.014769730168391157, 0.014769730168391155)