In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy.special import expit
from scipy import stats

## Causal Graphs

In [2]:
# Simulate a collections scenario in which `debt` drives both the chance of
# being called and the amount paid.
np.random.seed(42)

n = 1000

# Each customer's debt is one of three levels, sampled with equal probability.
debt = np.random.choice([100, 500, 1000], size=n)

# Probability of a call grows linearly with debt, reaching 1 at the largest debt.
call_probability = debt / debt.max()
call = np.random.binomial(1, call_probability)

# Expected payment: +20 when called, -0.03 per unit of debt; noise has sd 5.
expected_payment = 40 + 20 * call - 0.03 * debt
payment = np.random.normal(expected_payment, 5)

data = pd.DataFrame({"debt": debt, "call": call, "payment": payment})

In [3]:
# Pairwise Pearson correlation matrix of every column in `data`.
data.corr(method="pearson")

Unnamed: 0,debt,call,payment
debt,1.0,0.729144,-0.395212
call,0.729144,1.0,0.225433
payment,-0.395212,0.225433,1.0


## Marginal Independence

In [4]:
# Simulate binary variables where smoking raises the chance of both carrying
# a lighter and having lung cancer, and lung cancer raises the chance of
# cancer death.
np.random.seed(42)

n = 10000

smoker = np.random.binomial(1, 0.2, size=n)

# Both `lighter` and `lung_cancer` depend only on `smoker`.
p_lighter = 0.1 + 0.8 * smoker
lighter = np.random.binomial(1, p_lighter)

p_lung_cancer = 0.1 + 0.7 * smoker
lung_cancer = np.random.binomial(1, p_lung_cancer)

# `cancer_death` depends on `smoker` only through `lung_cancer`.
p_cancer_death = 0.1 + 0.5 * lung_cancer
cancer_death = np.random.binomial(1, p_cancer_death)

data = pd.DataFrame({
    "smoker": smoker,
    "lighter": lighter,
    "lung_cancer": lung_cancer,
    "cancer_death": cancer_death,
})

In [5]:
# Peek at the first five rows of `data`.
data.head(n=5)

Unnamed: 0,smoker,lighter,lung_cancer,cancer_death
0,0,0,0,0
1,1,1,1,1
2,0,0,0,1
3,0,0,0,0
4,0,0,0,0


In [6]:
# Number of rows for each observed combination of the four binary variables.
smoking_columns = ["smoker", "lighter", "lung_cancer", "cancer_death"]
data.groupby(smoking_columns).size().to_frame("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
smoker,lighter,lung_cancer,cancer_death,Unnamed: 4_level_1
0,0,0,0,5818
0,0,0,1,653
0,0,1,0,288
0,0,1,1,444
0,1,0,0,691
0,1,0,1,68
0,1,1,0,31
0,1,1,1,45
1,0,0,0,27
1,0,1,0,58


In [7]:
# Unconditional pairwise Pearson correlations across all columns of `data`.
data.corr(method="pearson")

Unnamed: 0,smoker,lighter,lung_cancer,cancer_death
smoker,1.0,0.724489,0.656842,0.32855
lighter,0.724489,1.0,0.466769,0.225772
lung_cancer,0.656842,0.466769,1.0,0.514679
cancer_death,0.32855,0.225772,0.514679,1.0


In [8]:
# Pearson correlation and its p-value between smoking and cancer death
# over the whole sample.
stats.pearsonr(data.smoker, data.cancer_death)

(0.32854951851980524, 2.473440835815302e-250)

## Conditional Independence: Conditioning on Causes

In [9]:
# Correlations among rows with lung_cancer == 1. Because `lung_cancer` is
# constant in this slice, its own row and column come out as NaN.
data[data["lung_cancer"] == 1].corr()

Unnamed: 0,smoker,lighter,lung_cancer,cancer_death
smoker,1.0,0.787903,,-0.012698
lighter,0.787903,1.0,,-0.023876
lung_cancer,,,,
cancer_death,-0.012698,-0.023876,,1.0


In [10]:
# Filter once, then test smoker vs. cancer_death among rows with lung cancer.
with_lung_cancer = data.query("lung_cancer==1")
stats.pearsonr(with_lung_cancer["smoker"], with_lung_cancer["cancer_death"])

(-0.01269796852809334, 0.5350314336765932)

### Exercise
Is carrying a lighter associated with lung cancer among smokers? What about among non-smokers? And what about the group of smokers who died from lung cancer?

## Conditional Independence: Conditioning on Effects

In [12]:
# Simulate binary variables where parental college education feeds into the
# child's college attendance, white-collar employment, and high income.
np.random.seed(42)

n = 10000

college_parents = np.random.binomial(1, 0.6, size=n)

p_college = 0.1 + 0.6 * college_parents
college = np.random.binomial(1, p_college)

# White-collar work depends on both parental education and own college.
p_white_collar = 0.1 + 0.4 * college_parents + 0.3 * college
white_collar = np.random.binomial(1, p_white_collar)

# `college` does not enter this probability directly; it reaches income
# only through `white_collar`.
p_high_income = 0.05 + 0.3 * college_parents + 0.5 * white_collar
high_income = np.random.binomial(1, p_high_income)

data = pd.DataFrame({
    "college_parents": college_parents,
    "college": college,
    "white_collar": white_collar,
    "high_income": high_income,
})

In [13]:
# Number of rows for each observed combination of the four binary variables.
education_columns = ["college_parents", "college", "white_collar", "high_income"]
data.groupby(education_columns).size().to_frame("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
college_parents,college,white_collar,high_income,Unnamed: 4_level_1
0,0,0,0,2992
0,0,0,1,173
0,0,1,0,136
0,0,1,1,201
0,1,0,0,229
0,1,0,1,9
0,1,1,0,61
0,1,1,1,91
1,0,0,0,635
1,0,0,1,329


In [14]:
# Unconditional pairwise Pearson correlations across all columns of `data`.
data.corr(method="pearson")

Unnamed: 0,college_parents,college,white_collar,high_income
college_parents,1.0,0.580337,0.561998,0.566458
college,0.580337,1.0,0.530236,0.433949
white_collar,0.561998,0.530236,1.0,0.673496
high_income,0.566458,0.433949,0.673496,1.0


In [15]:
# Pearson correlation and its p-value between college attendance and high
# income over the whole sample.
stats.pearsonr(data.college, data.high_income)

(0.43394874175991693, 0.0)

In [16]:
# Filter once to rows with college_parents == 1, then test college vs. high_income.
with_college_parents = data[data["college_parents"] == 1]
stats.pearsonr(with_college_parents["college"], with_college_parents["high_income"])

(0.16268610207349257, 1.6984422397843882e-37)

In [17]:
# Condition on both parental college education and white-collar employment,
# then test college vs. high_income within that slice.
query = "college_parents==1 & white_collar==1"
white_collar_with_college_parents = data.query(query)
stats.pearsonr(
    white_collar_with_college_parents["college"],
    white_collar_with_college_parents["high_income"],
)

(0.0001615529204817643, 0.9915657831040383)