# do-operator

## Causal network

In [1]:
import networkx as nx
from pybbn.pptc import create_pptc_model

d = nx.DiGraph()
d.add_nodes_from(['drug', 'gender', 'recovery'])
d.add_edges_from([('gender', 'drug'), ('gender', 'recovery'), ('drug', 'recovery')])

p = {
    'gender': {
        'columns': ['gender', '__p__'],
        'data': [
            ['male', 0.51], ['female', 0.49]
        ]
    },
    'drug': {
        'columns': ['gender', 'drug', '__p__'],
        'data': [
            ['female', 'no', 0.24],
            ['female', 'yes', 0.76],
            ['male', 'no', 0.76],
            ['male', 'yes', 0.24]
        ]
    },
    'recovery': {
        'columns': ['gender', 'drug', 'recovery', '__p__'],
        'data': [
            ['female', 'no', 'no', 0.31],
            ['female', 'no', 'yes', 0.69],
            ['female', 'yes', 'no', 0.27],
            ['female', 'yes', 'yes', 0.73],
            ['male', 'no', 'no', 0.13],
            ['male', 'no', 'yes', 0.87],
            ['male', 'yes', 'no', 0.07],
            ['male', 'yes', 'yes', 0.93]
        ]
    }
}

m = create_pptc_model(d, p)

### Marginal probabilities

In [2]:
q = m.query()

In [3]:
q['gender']

Unnamed: 0,gender,__p__
0,female,0.49
1,male,0.51


In [4]:
q['drug']

Unnamed: 0,drug,__p__
0,no,0.5052
1,yes,0.4948


In [5]:
q['recovery']

Unnamed: 0,recovery,__p__
0,no,0.19596
1,yes,0.80404


### Interventional probabilities

In [6]:
from pybbn.doop import do

p_Y_do_X_1 = do(Y=['recovery'], y=['yes'], X=['drug'], x=['yes'], Z=['gender'], model=m)
p_Y_do_X_0 = do(Y=['recovery'], y=['yes'], X=['drug'], x=['no'], Z=['gender'], model=m)

print(f'P(Y=yes | do(X=yes)) = {p_Y_do_X_1:.5f}')
print(f'P(Y=yes | do(X=no)) = {p_Y_do_X_0:.5f}')
print(f'ACE = P(Y=yes | do(X=yes)) - P(Y=yes | do(X=no)) = {p_Y_do_X_1 - p_Y_do_X_0:.5f}')

P(Y=yes | do(X=yes)) = 0.83200
P(Y=yes | do(X=no)) = 0.78180
ACE = P(Y=yes | do(X=yes)) - P(Y=yes | do(X=no)) = 0.05020


## Sampling

In [7]:
from pybbn.sampling import sample
import pandas as pd

df = sample(m, max_samples=1_000)
df.shape

(1000, 3)

In [8]:
df.head()

Unnamed: 0,gender,drug,recovery
0,male,no,no
1,male,yes,yes
2,female,yes,yes
3,male,yes,yes
4,female,yes,yes


In [9]:
m.query()['gender'].set_index(['gender'])['__p__'].to_dict()

{'female': 0.49, 'male': 0.51}

In [10]:
(df['gender'].value_counts() / df.shape[0]).to_dict()

{'female': 0.518, 'male': 0.482}

In [11]:
df['drug'].value_counts() / df.shape[0]

drug
no     0.501
yes    0.499
Name: count, dtype: float64

In [12]:
df['recovery'].value_counts() / df.shape[0]

recovery
yes    0.793
no     0.207
Name: count, dtype: float64

## Structural casual model

The functional form is written as follows.

- $G = e_G$
- $D = f_D (G) + e_D$
- $R = f_R (G, D) + e_R$

With algebraic manipulations, we can isolate $U = \{e_G, e_D, e_R\}$ as follows.

- $e_G = G$
- $e_D = D - f_D(G)$
- $e_R = R - f_R(G, D)$

In [13]:
Xy = df.assign(
    gender=lambda d: d['gender'].map({'male': 1, 'female': 0}),
    drug=lambda d: d['drug'].map({'yes': 1, 'no': 0}),
    recovery=lambda d: d['recovery'].map({'yes': 1, 'no': 0})
)
Xy.shape

(1000, 3)

In [14]:
from sklearn.ensemble import RandomForestClassifier

X, y = Xy[['gender']], Xy['drug']
m_d = RandomForestClassifier(class_weight='balanced', random_state=37).fit(X, y)

X, y = Xy[['gender', 'drug']], Xy['recovery']
m_r = RandomForestClassifier(class_weight='balanced', random_state=37).fit(X, y)

- Male, taken drug and recovered.
- What is probability of recovered had patient not taken drug given male, taken drug and recovered?

In [15]:
g = 1
d = 1
r = 1

Abduction

In [16]:
e_g = g
e_g

1

In [17]:
e_d = (d - m_d.predict_proba(pd.DataFrame([[g]], columns=['gender']))[:,1])[0]
e_d

0.5014892194192951

In [18]:
e_r = (r - m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1])[0]
e_r

0.4571883977928637

Action

In [19]:
d = 0

Prediction

In [20]:
r_d = m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1][0]
r_d

0.47898707520802136

In [21]:
r_d + e_r

0.9361754730008851

In [22]:
['female', 'no', 'no', 0.31],
['female', 'no', 'yes', 0.69],
['female', 'yes', 'no', 0.27],
['female', 'yes', 'yes', 0.73],
['male', 'no', 'no', 0.13],
['male', 'no', 'yes', 0.87],
['male', 'yes', 'no', 0.07],
['male', 'yes', 'yes', 0.93]

['male', 'yes', 'yes', 0.93]

In [23]:
m_r.predict_proba(pd.DataFrame([[0, 0], [0, 1], [1, 0], [1, 1]], columns=['gender', 'drug']))

array([[0.49269755, 0.50730245],
       [0.52313748, 0.47686252],
       [0.52101292, 0.47898708],
       [0.4571884 , 0.5428116 ]])

In [24]:
g = 1
d = 0
r = 0

e_g = g
e_d = (d - m_d.predict_proba(pd.DataFrame([[g]], columns=['gender']))[:,1])[0]
e_r = (r - m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1])[0]

d = 1
r_d = m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1][0]

r_d, e_r, r_d + e_r

(0.5428116022071363, -0.47898707520802136, 0.06382452699911495)

In [25]:
g = 0
d = 1
r = 1

e_g = g
e_d = (d - m_d.predict_proba(pd.DataFrame([[g]], columns=['gender']))[:,1])[0]
e_r = (r - m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1])[0]

d = 0
r_d = m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1][0]

r_d, e_r, r_d + e_r

(0.5073024487921536, 0.5231374773914773, 1.030439926183631)

In [26]:
g = 0
d = 0
r = 1

e_g = g
e_d = (d - m_d.predict_proba(pd.DataFrame([[g]], columns=['gender']))[:,1])[0]
e_r = (r - m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1])[0]

d = 1
r_d = m_r.predict_proba(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[:,1][0]

r_d, e_r, r_d + e_r

(0.4768625226085227, 0.4926975512078464, 0.9695600738163691)

In [27]:
from sklearn.linear_model import LinearRegression

X, y = Xy[['gender']], Xy['drug']
m_d = LinearRegression()
m_d.fit(X, y)

X, y = Xy[['gender', 'drug']], Xy['recovery']
m_r = LinearRegression().fit(X, y)

In [28]:
g = 1
d = 1
r = 1

e_g = g
e_d = (d - m_d.predict(pd.DataFrame([[g]], columns=['gender'])))[0]
e_r = (r - m_r.predict(pd.DataFrame([[g, d]], columns=['gender', 'drug'])))[0]

d = 0
r_d = m_r.predict(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[0]

r_d, e_r, r_d + e_r

(0.7942583962022296, 0.19259864516488523, 0.9868570413671148)

In [29]:
g = 1
d = 0
r = 1

e_g = g
e_d = (d - m_d.predict(pd.DataFrame([[g]], columns=['gender'])))[0]
e_r = (r - m_r.predict(pd.DataFrame([[g, d]], columns=['gender', 'drug'])))[0]

d = 1
r_d = m_r.predict(pd.DataFrame([[g, d]], columns=['gender', 'drug']))[0]

r_d, e_r, r_d + e_r

(0.8074013548351148, 0.2057416037977704, 1.0131429586328853)