A simple definition of probability: a probability is a fraction of a finite set. This definition will be refined later.

In [2]:
# Load the General Social Survey data
from os.path import basename, exists

def download(url):
    """Fetch `url` into the current directory unless it is already there.

    The local file name is the last path component of `url`. When a file
    with that name already exists nothing is fetched and nothing is printed;
    otherwise the file is retrieved and a confirmation line is printed.
    """
    target = basename(url)
    if exists(target):
        return

    from urllib.request import urlretrieve
    saved_path, _headers = urlretrieve(url, target)
    print('Download ' + saved_path)

download('https://github.com/AllenDowney/ThinkBayes2/raw/master/data/gss_bayes.csv')

Download gss_bayes.csv


In [3]:
import pandas as pd

# Read the survey CSV; index_col=0 makes the file's first column the row index.
gss = pd.read_csv('gss_bayes.csv', index_col=0)
gss.head()

Unnamed: 0_level_0,year,age,sex,polviews,partyid,indus10
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1974,21.0,1,4.0,2.0,4970.0
2,1974,41.0,1,5.0,0.0,9160.0
5,1974,58.0,2,6.0,1.0,2670.0
6,1974,30.0,1,5.0,4.0,6870.0
7,1974,48.0,1,5.0,4.0,7860.0


In [5]:
# Boolean Series: True for respondents whose industry code (indus10) equals 6870.
# NOTE(review): 6870 is presumably the industry code for banking -- confirm against the GSS codebook.
banker = (gss['indus10'] == 6870)
banker.head()

caseid
1    False
2    False
5    False
6     True
7    False
Name: indus10, dtype: bool

In [6]:
banker.sum()

728

In [7]:
banker.mean() # probability randomly chosen person in the dataset is a banker

0.014769730168391155

In [8]:
def prob(A):
    """Return the probability of the proposition `A`.

    The result is `A.mean()`. For a Boolean Series that is the fraction of
    entries that are True, which this notebook uses as the probability.
    """
    fraction = A.mean()
    return fraction

In [9]:
prob(banker)

0.014769730168391155

In [17]:
female = (gss['sex'] == 2)

In [18]:
prob(female)

0.5378575776019476

The values of polviews are on a seven-point scale:

1    Extremely liberal
2    Liberal
3    Slightly liberal
4    Moderate
5    Slightly conservative
6    Conservative
7    Extremely conservative

In [23]:
liberal = (gss['polviews'] <= 3)

In [24]:
prob(liberal)

0.27374721038750255

The values of partyid are encoded like this:

0    Strong democrat
1    Not strong democrat
2    Independent, near democrat
3    Independent
4    Independent, near republican
5    Not strong republican
6    Strong republican
7    Other party

In [19]:
democrat = (gss['partyid'] <= 1)

In [20]:
prob(democrat)

0.3662609048488537

In [21]:
prob(banker & democrat)

0.004686548995739501

In [25]:
#Conditional Probability
#What is the probability that a respondent is a Democrat, given that they are liberal?
selected = democrat[liberal]

In [26]:
prob(selected)

0.5206403320240125

In [27]:
#What is the probability that a respondent is female, given that they are a banker?
selected = female[banker]
prob(selected)

0.7706043956043956

In [30]:
def conditional(proposition, given):
    """Return the probability of `proposition` conditional on `given`.

    `given` is used to index `proposition`, and `prob` is applied to the
    selected entries only.
    """
    restricted = proposition[given]
    return prob(restricted)

In [31]:
conditional(liberal, given=female)

0.27581004111500884

In [32]:
#conditional probability is not commutative
conditional(female, given=liberal)

0.5419106203216483

In [33]:
#condition and conjunction
#probability a respondent is female, given that they are a liberal Democrat
conditional(female, given=liberal & democrat)

0.576085409252669

In [36]:
# Probability that a respondent is female and liberal, given that they are a banker
conditional(female & liberal, given=banker)

0.17307692307692307

In [37]:
# Theorem 1: use a conjunction to compute a conditional probability.
# What fraction of bankers are female?
female[banker].mean()

0.7706043956043956

In [38]:
conditional(female, given=banker)

0.7706043956043956

In [39]:
#Theorem 1 P(A|B) = P(A & B)/P(B)
prob(female & banker) / prob(banker)

0.7706043956043956

In [40]:
#Theorem 2, Using a conditional probability to compute a conjunction. P(A & B) = P(B) * P(A|B)
prob(liberal & democrat) # compute directly

0.1425238385067965

In [41]:
# Theorem 2 check: P(liberal & democrat) = P(democrat) * P(liberal | democrat).
# The condition is passed by keyword, matching the other calls, so the
# (non-commutative) argument order cannot be swapped by accident.
prob(democrat) * conditional(liberal, given=democrat)

0.1425238385067965

In [42]:
#Theorem 3: Using conditional(A, B) to compute conditional(B, A). P(A|B) = (P(A) * P(B|A)) / P(B) , Bayes Theorem
conditional(liberal, given=banker) #compute directly

0.2239010989010989

In [44]:
prob(liberal) * conditional(banker, given=liberal) / prob(banker) #bayes theorem

0.2239010989010989

In [45]:
#law of total probability
prob(banker)

0.014769730168391155

In [46]:
male = (gss['sex'] == 1)

In [47]:
prob(male & banker) + prob(female & banker)

0.014769730168391155

In [48]:
prob(male)*conditional(banker, given=male) + prob(female)*conditional(banker, given=female)

0.014769730168391153

In [50]:
B = gss['polviews']
B.value_counts().sort_index()

polviews
1.0     1442
2.0     5808
3.0     6243
4.0    18943
5.0     7940
6.0     7319
7.0     1595
Name: count, dtype: int64

In [51]:
# One term of the law of total probability: P(B == 4) * P(banker | B == 4).
# polviews value 4 is "Moderate", so this is the probability of being a moderate banker.
# The condition is passed by keyword, matching the other conditional() calls.
i = 4
prob(B==i) * conditional(banker, given=(B==i))

0.005822682085615744

In [52]:
# Law of total probability: sum P(B == i) * P(banker | B == i) over polviews values 1 through 7.
# Written as a generator expression; the condition is passed by keyword for consistency.
sum(prob(B==i) * conditional(banker, given=(B==i)) for i in range(1,8))

0.014769730168391157

In [53]:
prob(female & banker)

0.011381618989653074

In [55]:
prob(liberal & female & banker)

0.002556299452221546

In [56]:
prob(liberal & female & banker & democrat)

0.0012375735443294787

In [57]:
conditional(liberal, given=democrat)

0.3891320002215698

In [58]:
conditional(democrat, given=liberal)

0.5206403320240125

In [61]:
young = gss['age'] < 30
prob(young)

0.19435991073240008

In [62]:
old = gss['age'] >= 65
prob(old)

0.17328058429701765

In [64]:
conservative = gss['polviews'] >= 5
prob(conservative)

0.3419354838709677

In [65]:
#What is the probability that a randomly chosen respondent is a young liberal?
prob(young & liberal)

0.06579427875836884

In [66]:
#What is the probability that a young person is liberal?
conditional(liberal, given=young)

0.338517745302714

In [67]:
#What fraction of respondents are old conservatives?
prob(old & conservative)

0.06701156421180766

In [68]:
#What fraction of conservatives are old?
conditional(old, given=conservative)

0.19597721609113564