# Homework Week 07

Material from Chapter 12 (Monsters and Mixtures)

In [2]:
import numpy as np
import pandas as pd
import arviz as az
import matplotlib.pyplot as plt
import pymc3 as pm
import scipy.stats as stats
from scipy.special import expit as logistic
import theano.tensor as tt

import warnings
# Silence FutureWarning messages (the original note attributes these to ArviZ).
warnings.simplefilter("ignore", FutureWarning)

# Seed handed to pm.sample so that sampling runs are repeatable.
RANDOM_SEED = 8927

In [3]:
# The data file uses ";" as its field separator, so it is passed explicitly.
d = pd.read_csv("Data/Trolley.csv", delimiter=";")
d.head()

Unnamed: 0,case,response,order,id,age,male,edu,action,intention,contact,story,action2
0,cfaqu,4,2,96;434,14,0,Middle School,0,0,1,aqu,1
1,cfbur,3,31,96;434,14,0,Middle School,0,0,1,bur,1
2,cfrub,4,16,96;434,14,0,Middle School,0,0,1,rub,1
3,cibox,3,32,96;434,14,0,Middle School,0,1,1,box,1
4,cibur,3,4,96;434,14,0,Middle School,0,1,1,bur,1


## Q1

>In the Trolley data—data(Trolley)—we saw how education level (modeled as
an ordered category) is associated with responses. Is this association causal? One
plausible confound is that education is also associated with age, through a causal
process: People are older when they finish school than when they begin it.
Reconsider the Trolley data in this light. Draw a DAG that represents hypothetical
causal relationships among response, education, and age. Which statistical model
or models do you need to evaluate the causal influence of education on responses?
Fit these models to the trolley data. What do you conclude about the causal relationships
among these three variables?

### Setup data

Use the same process as in Chapter 12 to convert the education labels into ordered integer indexes.

In [8]:
# Education labels listed from lowest to highest; the position in this list
# defines the rank of each level in the ordered categorical.
edu_levels = [
    "Elementary School",
    "Middle School",
    "Some High School",
    "High School Graduate",
    "Some College",
    "Bachelor's Degree",
    "Master's Degree",
    "Graduate Degree",
]

d["edu_new"] = pd.Categorical(d["edu"].values, categories=edu_levels, ordered=True)

In [9]:
# Swap each education label for its integer category code (0 for the first
# declared category, counting upward in declaration order).
# The dtype guard makes this cell safe to re-run: once the column already holds
# integer codes the `.cat` accessor no longer exists, so the conversion is skipped
# instead of raising AttributeError.
if isinstance(d["edu_new"].dtype, pd.CategoricalDtype):
    d["edu_new"] = d["edu_new"].cat.codes

# `.cat.codes` marks missing or undeclared labels with -1; fail loudly here rather
# than let -1 be used as an array index when education is looked up in the model.
assert (d["edu_new"] >= 0).all(), "edu contains a label that is not one of the declared categories"

### Standardize age

In [10]:
# Z-score age: centre on the sample mean and scale by the sample standard deviation.
age_mean = d["age"].mean()
age_sd = d["age"].std()
d["age_s"] = (d["age"] - age_mean) / age_sd

In [11]:
# Short aliases for the columns used in the models below.
R = d["response"].values - 1  # shift responses so the lowest category is 0
A = d["action"].values
I = d["intention"].values
C = d["contact"].values
E = d["edu_new"].values  # integer education codes, used as an index in the models
Age = d["age_s"].values  # standardised age

In [13]:
# Ordered-logistic model of the response on action, contact, intention,
# education (as a cumulative sum of Dirichlet increments) and standardised age.
with pm.Model() as q1_a:
    # Six cutpoints, constrained to be increasing by the ordered transform.
    kappa = pm.Normal(
        "kappa",
        mu=0.0,
        sigma=1.5,
        shape=6,
        transform=pm.distributions.transforms.ordered,
        testval=np.arange(6),  # increasing start values, as the ordered transform requires
    )

    # Slopes on the linear predictor, all sharing a Normal(0, 0.5) prior.
    bA = pm.Normal("bA", mu=0.0, sigma=0.5)
    bC = pm.Normal("bC", mu=0.0, sigma=0.5)
    bI = pm.Normal("bI", mu=0.0, sigma=0.5)
    bE = pm.Normal("bE", mu=0.0, sigma=0.5)
    bAge = pm.Normal("bAge", mu=0.0, sigma=0.5)

    # Seven increments between adjacent education levels; a zero is prepended so
    # that education code 0 contributes nothing, then the increments are accumulated.
    delta = pm.Dirichlet("delta", a=np.repeat(2.0, 7), shape=7)
    delta_j = tt.concatenate([tt.zeros(1), delta])
    delta_j_cumulative = tt.cumsum(delta_j)

    # Linear predictor: bE scales the accumulated education increments for each row.
    phi = (
        bE * delta_j_cumulative[E]
        + bA * A
        + bC * C
        + bI * I
        + bAge * Age
    )

    resp_obs = pm.OrderedLogistic("resp_obs", eta=phi, cutpoints=kappa, observed=R)

    trace1_a = pm.sample(
        draws=2000,
        tune=2000,
        target_accept=0.9,
        random_seed=RANDOM_SEED,
    )

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [delta, bAge, bE, bI, bC, bA, kappa]


Sampling 2 chains for 2_000 tune and 2_000 draw iterations (4_000 + 4_000 draws total) took 1003 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [16]:
# Posterior summary table for the Q1 model, rounded to two decimals.
with q1_a:
    res1_a = az.summary(data=trace1_a, round_to=2)
res1_a

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
bA,-0.7,0.04,-0.77,-0.62,0.0,0.0,3063.98,2814.4,1.0
bC,-0.95,0.05,-1.04,-0.85,0.0,0.0,2691.66,2801.46,1.0
bI,-0.71,0.04,-0.78,-0.65,0.0,0.0,4085.28,2341.82,1.0
bE,0.23,0.11,0.03,0.43,0.01,0.0,690.8,387.1,1.0
bAge,-0.1,0.02,-0.14,-0.06,0.0,0.0,1269.57,1382.0,1.0
kappa[0],-2.68,0.09,-2.83,-2.5,0.0,0.0,687.91,472.76,1.0
kappa[1],-1.99,0.09,-2.14,-1.83,0.0,0.0,688.6,477.66,1.0
kappa[2],-1.41,0.09,-1.56,-1.26,0.0,0.0,697.11,446.41,1.0
kappa[3],-0.39,0.09,-0.53,-0.24,0.0,0.0,698.6,432.6,1.0
kappa[4],0.28,0.09,0.14,0.45,0.0,0.0,669.24,459.81,1.0


### Interpretation

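With age in the model, the education coefficient `bE` is positive (mean 0.23, 94% HDI 0.03 to 0.43) and `bAge` is negative (mean -0.10, 94% HDI -0.14 to -0.06). Age sucks up some of the association that would otherwise be attributed to education.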

## Q2

> Consider one more variable in the Trolley data: Gender. Suppose that gender
might influence education as well as response directly. Draw the DAG now that
includes response, education, age, and gender.
Using only the DAG, is it possible that the inferences from Problem 1 are confounded
by gender? If so, define any additional models you need to infer the causal
influence of education on response. What do you conclude?

In [17]:
# Gender index taken from the `male` column; used to pick an entry of a length-2 intercept vector.
gid = d["male"].values

In [18]:
# Ordered-logistic model from Q1 extended with a gender-specific intercept.
with pm.Model() as q2_a:
    # Six cutpoints, constrained to be increasing by the ordered transform.
    kappa = pm.Normal(  # used to be cutpoints
        "kappa",
        0.0,
        1.5,
        transform=pm.distributions.transforms.ordered,
        shape=6,
        testval=np.arange(6),
    )

    # Slopes on the linear predictor, all sharing a Normal(0, 0.5) prior.
    bA = pm.Normal("bA", 0.0, 0.5)
    bC = pm.Normal("bC", 0.0, 0.5)
    bI = pm.Normal("bI", 0.0, 0.5)
    bE = pm.Normal("bE", 0.0, 0.5)
    bAge = pm.Normal("bAge", 0.0, 0.5)

    # One intercept per value of `gid` (index 0 and index 1).
    a_g = pm.Normal("a_g", 0, 0.5, shape=2)
    # The ordered-logistic likelihood depends on the cutpoints and the linear
    # predictor only through their difference, so adding a constant to both
    # entries of a_g can be absorbed by shifting every kappa. The two intercepts
    # are therefore pinned down individually only by their priors; the quantity
    # the data inform is their contrast, recorded here for reporting.
    # Index 1 minus index 0 of `gid` (presumably male minus female, given that
    # gid comes from the `male` column -- confirm the 0/1 coding).
    a_g_diff = pm.Deterministic("a_g_diff", a_g[1] - a_g[0])

    # Seven increments between adjacent education levels; a zero is prepended so
    # that education code 0 contributes nothing, then the increments are accumulated.
    delta = pm.Dirichlet("delta", np.repeat(2.0, 7), shape=7)
    delta_j = tt.concatenate([tt.zeros(1), delta])
    delta_j_cumulative = tt.cumsum(delta_j)

    # Linear predictor: Q1 terms plus the gender intercept selected per row.
    phi = bE * delta_j_cumulative[E] + bA * A + bC * C + bI * I + bAge * Age + a_g[gid]

    resp_obs = pm.OrderedLogistic("resp_obs", phi, kappa, observed=R)

    trace2_a = pm.sample(2000, tune=2000, target_accept=0.9, random_seed=RANDOM_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [delta, a_g, bAge, bE, bI, bC, bA, kappa]


Sampling 2 chains for 2_000 tune and 2_000 draw iterations (4_000 + 4_000 draws total) took 2851 seconds.


In [19]:
# Posterior summary for the Q2 model, restricted to the slopes and gender intercepts.
q2_vars = ["bA", "bC", "bI", "bE", "bAge", "a_g"]
with q2_a:
    res = az.summary(data=trace2_a, var_names=q2_vars, round_to=2)
res

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
bA,-0.71,0.04,-0.78,-0.63,0.0,0.0,4360.48,3221.47,1.0
bC,-0.96,0.05,-1.06,-0.87,0.0,0.0,4871.81,2945.9,1.0
bI,-0.72,0.04,-0.78,-0.65,0.0,0.0,4577.65,3271.35,1.0
bE,0.01,0.17,-0.3,0.29,0.0,0.0,1474.77,3086.15,1.0
bAge,-0.07,0.02,-0.11,-0.02,0.0,0.0,2522.83,2783.85,1.0
a_g[0],-0.05,0.31,-0.63,0.52,0.01,0.01,1338.57,1820.12,1.0
a_g[1],0.51,0.31,-0.08,1.06,0.01,0.01,1322.72,1991.94,1.0


### Interpretation
