In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.interpolate import interp1d
from scipy.special import expit
from fractions import Fraction
from empiricaldist import Pmf, Cdf
import matplotlib.pyplot as plt
from collections import Counter
import statsmodels.formula.api as smfa

In [2]:
from data.utils import prob, conditional

In [3]:
# Read the GSS extract from the data/ folder under the current working
# directory, then preview the first rows.
gss_csv = Path.cwd().joinpath('data', 'gss_bayes.csv')
gss = pd.read_csv(gss_csv)
gss.head()

Unnamed: 0,caseid,year,age,sex,polviews,partyid,indus10
0,1,1974,21.0,1,4.0,2.0,4970.0
1,2,1974,41.0,1,5.0,0.0,9160.0
2,5,1974,58.0,2,6.0,1.0,2670.0
3,6,1974,30.0,1,5.0,4.0,6870.0
4,7,1974,48.0,1,5.0,4.0,7860.0


In [4]:
# Boolean mask of respondents whose industry code (indus10) is 6870.
# NOTE(review): 6870 is presumably the banking industry code -- confirm against the codebook.
banker = gss['indus10'].eq(6870)
# Preview of the mask, number of True rows, and fraction of True rows.
(banker.head(), banker.sum(), banker.mean())

(0    False
 1    False
 2    False
 3     True
 4    False
 Name: indus10, dtype: bool,
 728,
 0.014769730168391155)

In [5]:
# Boolean masks for the two codes of the sex column (1 and 2).
male = gss['sex'].eq(1)
female = gss['sex'].eq(2)
# Note: prob did not behave as expected at first because the utils file
# contained another function that was also named prob; that has been fixed.
(prob(male), prob(female))

(0.46214242239805237, 0.5378575776019476)

In [6]:
# liberal: polviews of 3 or lower; democrat: partyid equal to 0 or 1.
liberal = gss['polviews'].le(3)
democrat = gss['partyid'].isin([0, 1])
(prob(liberal), prob(democrat))

(0.27374721038750255, 0.3662609048488537)

In [8]:
# Conjunction is commutative: swapping the operands of & gives the same probability.
banker_and_democrat = prob(banker & democrat)
democrat_and_banker = prob(democrat & banker)
banker_and_democrat, democrat_and_banker

(0.004686548995739501, 0.004686548995739501)

In [12]:
# Keep only the rows of `democrat` where `liberal` is True, so prob(selected)
# is the probability of democrat among liberals.
selected = democrat.loc[liberal]
(selected, prob(selected))

(8        False
 17        True
 18       False
 24        True
 28       False
          ...  
 49267    False
 49271     True
 49278    False
 49279     True
 49285     True
 Name: partyid, Length: 13493, dtype: bool,
 0.5206403320240125)

In [13]:
# Restrict `female` to the banker rows, then take the probability of the subset.
female_among_bankers = female.loc[banker]
prob(female_among_bankers)

0.7706043956043956

In [16]:
# Three conditional probabilities, displayed together as one tuple.
# conditional(X, Y) is called with the proposition first and the condition second.
liberal_given_female = conditional(liberal, female)
female_given_liberal_democrat = conditional(female, liberal & democrat)
liberal_female_given_banker = conditional(liberal & female, banker)
liberal_given_female, female_given_liberal_democrat, liberal_female_given_banker

(0.27581004111500884, 0.576085409252669, 0.17307692307692307)

In [17]:
# P(A|B) = P(A and B) / P(B)
# The two sides are floats computed along different paths, so compare them
# with a tolerance (np.isclose) rather than exact ==, which can report False
# purely because of rounding in the last digit.
bool(np.isclose(conditional(female, banker), prob(female & banker) / prob(banker)))

True

In [20]:
# P(A and B) = P(A|B) * P(B)
# Multiplying two floats may not reproduce the directly computed value bit for
# bit, so check the identity with a tolerance (np.isclose) instead of exact ==.
bool(np.isclose(prob(liberal & democrat), conditional(liberal, democrat) * prob(democrat)))

True

In [21]:
# Bayes's theorem: P(A|B) = P(B|A) * P(A) / P(B)
# The right-hand side chains a multiplication and a division on floats, so
# check the identity with a tolerance (np.isclose) instead of exact ==.
bool(np.isclose(
    conditional(liberal, banker),
    conditional(banker, liberal) * prob(liberal) / prob(banker),
))

True

In [22]:
# Law of total probability
# P(A) = P(B1 and A) + P(B2 and A), provided the Bi are MECE
# (mutually exclusive and collectively exhaustive).
# The left side is a sum of two floats, so check the identity with a
# tolerance (np.isclose) instead of exact ==, which is sensitive to rounding.
bool(np.isclose(prob(male & banker) + prob(female & banker), prob(banker)))

True

In [23]:
# P(A) = ∑ P(Bi) * P(A|Bi)
# The two results are displayed side by side rather than compared with ==:
# floating-point arithmetic makes them differ slightly, so == would give False.
via_conditionals = prob(male) * conditional(banker, male) + prob(female) * conditional(banker, female)
via_conjunctions = prob(male & banker) + prob(female & banker)
via_conditionals, via_conjunctions

(0.014769730168391153, 0.014769730168391155)

In [24]:
# B is the polviews column; show how many respondents fall at each value,
# ordered by the value itself rather than by frequency.
B = gss.loc[:, 'polviews']
polviews_counts = B.value_counts()
polviews_counts.sort_index()

1.0     1442
2.0     5808
3.0     6243
4.0    18943
5.0     7940
6.0     7319
7.0     1595
Name: polviews, dtype: int64

In [25]:
# Total probability over the polviews values 1..7, using sum() with a
# generator expression; shown next to prob(banker) for comparison.
polviews_levels = range(1, 8)
weighted_terms = (prob(B == level) * conditional(banker, B == level) for level in polviews_levels)
sum(weighted_terms), prob(banker)

(0.014769730168391157, 0.014769730168391155)