# Estimating counterfactuals


## Data simulation

In [1]:
import networkx as nx
from pybbn.probabilistic import create_reasoning_model

def get_model():
    """Assemble the gender/drug/recovery reasoning model.

    The graph has three nodes with edges gender -> drug, gender -> recovery
    and drug -> recovery. Every node gets a table whose ``__p__`` column holds
    the probability attached to that row's combination of values.

    Returns:
        The object produced by ``create_reasoning_model`` for this graph and
        these tables.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(['drug', 'gender', 'recovery'])
    graph.add_edges_from([
        ('gender', 'drug'),
        ('gender', 'recovery'),
        ('drug', 'recovery'),
    ])

    # P(gender)
    gender_table = {
        'columns': ['gender', '__p__'],
        'data': [['male', 0.51], ['female', 0.49]],
    }

    # P(drug | gender)
    drug_table = {
        'columns': ['gender', 'drug', '__p__'],
        'data': [
            ['female', 'no', 0.24],
            ['female', 'yes', 0.76],
            ['male', 'no', 0.76],
            ['male', 'yes', 0.24],
        ],
    }

    # P(recovery | gender, drug)
    recovery_table = {
        'columns': ['gender', 'drug', 'recovery', '__p__'],
        'data': [
            ['female', 'no', 'no', 0.90],
            ['female', 'no', 'yes', 0.10],
            ['female', 'yes', 'no', 0.27],
            ['female', 'yes', 'yes', 0.73],
            ['male', 'no', 'no', 0.99],
            ['male', 'no', 'yes', 0.01],
            ['male', 'yes', 'no', 0.07],
            ['male', 'yes', 'yes', 0.93],
        ],
    }

    # Same key order as before: gender, drug, recovery.
    tables = {
        'gender': gender_table,
        'drug': drug_table,
        'recovery': recovery_table,
    }
    return create_reasoning_model(graph, tables)

# Build the model once; later cells query it and draw samples from this instance.
model = get_model()

In [2]:
# Query the model with no arguments; the result is indexed by node name in the cells below.
q = model.pquery()

In [3]:
# Distribution over gender returned by the query.
q['gender']

Unnamed: 0,gender,__p__
0,female,0.49
1,male,0.51


In [4]:
# Distribution over drug returned by the query.
q['drug']

Unnamed: 0,drug,__p__
0,no,0.5052
1,yes,0.4948


In [5]:
# Distribution over recovery returned by the query.
q['recovery']

Unnamed: 0,recovery,__p__
0,no,0.59868
1,yes,0.40132


In [6]:
import numpy as np

# Number of rows requested from the model's sampler.
N = 10_000

# Draw the sample, then recode each categorical column as 0/1:
# male/yes -> 1, female/no -> 0.
Xy = (
    model
    .sample(max_samples=N)
    .assign(
        gender=lambda frame: frame['gender'].map({'male': 1, 'female': 0}),
        drug=lambda frame: frame['drug'].map({'yes': 1, 'no': 0}),
        recovery=lambda frame: frame['recovery'].map({'yes': 1, 'no': 0}),
    )
)
Xy.shape

(10000, 3)

In [7]:
# Preview the first rows of the 0/1-encoded sample.
Xy.head()

Unnamed: 0,gender,drug,recovery
0,1,0,0
1,1,1,1
2,0,1,0
3,1,1,1
4,0,1,1


In [8]:
# Summary statistics; because every column is 0/1-encoded, each mean is the share of 1s.
Xy.describe()

Unnamed: 0,gender,drug,recovery
count,10000.0,10000.0,10000.0
mean,0.5077,0.4947,0.4007
std,0.499966,0.499997,0.490065
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,1.0,0.0,0.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


## Prediction model

In [9]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Features: encoded gender and drug. Target: encoded recovery.
X_p = Xy[['gender', 'drug']]
y_p = Xy['recovery']

# Kept as a single assignment so the cell produces no displayed output.
p_model = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    n_jobs=-1,
    random_state=37,
).fit(X_p, y_p)

In [10]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Score the positive class (column 1 of predict_proba) once and reuse it for
# both metrics instead of running the forest over X_p twice.
# These are in-sample metrics: X_p / y_p are the same data p_model was fit on.
# NOTE(review): an ROC AUC close to 0.5 means the classifier barely separates
# the classes. If that is what this cell shows, check that the sampled rows
# keep the dependence of recovery on gender and drug defined in get_model.
p_scores = p_model.predict_proba(X_p)[:, 1]

roc_auc_score(y_p, p_scores), average_precision_score(y_p, p_scores)

(0.504534031072188, 0.4034913610586378)

## Counterfactual 1

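- Factual (f): the patient did not take the pill and did not recover
- Counterfactual (cf): the patient takes the pill
- 16% chance of recovery had the patient taken the pill (male)
- 67% chance of recovery had the patient taken the pill (female)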

In [20]:
import pandas as pd

def to_df(v, c):
    """Wrap one observation in a single-row DataFrame.

    Args:
        v: Values for the row, one per column.
        c: Column labels, in the same order as ``v``.

    Returns:
        A ``pd.DataFrame`` with exactly one row.
    """
    single_row = [v]
    return pd.DataFrame(data=single_row, columns=c)

# Factual observation: gender=1 (male), drug=0 (not taken), recovery=0 (did not recover).
g = 1
d = 0
r = 0

# Abduction: observed outcome minus the predicted class probabilities for the factual inputs.
a_pred = r - p_model.predict_proba(to_df([g, d], ['gender', 'drug']))[0]
# Action + prediction: class probabilities for the same gender with drug set to 1.
p_pred = p_model.predict_proba(to_df([g, 1], ['gender', 'drug']))[0]

# NOTE(review): r is a scalar subtracted from both class columns, so the entries of
# a_pred + p_pred sum to 2*r rather than 1 — confirm this is the intended residual.
a_pred, p_pred, a_pred + p_pred

(array([-0.50435072, -0.49564928]),
 array([0.49951073, 0.50048927]),
 array([-0.00483998,  0.00483998]))

In [21]:
# Factual observation: gender=0 (female), drug=0 (not taken), recovery=0 (did not recover).
g = 0
d = 0
r = 0

# Abduction: observed outcome minus the predicted class probabilities for the factual inputs.
a_pred = r - p_model.predict_proba(to_df([g, d], ['gender', 'drug']))[0]
# Action + prediction: class probabilities for the same gender with drug set to 1.
p_pred = p_model.predict_proba(to_df([g, 1], ['gender', 'drug']))[0]

# NOTE(review): r is a scalar subtracted from both class columns, so the entries of
# a_pred + p_pred sum to 2*r rather than 1 — confirm this is the intended residual.
a_pred, p_pred, a_pred + p_pred

(array([-0.49351408, -0.50648592]),
 array([0.50449366, 0.49550634]),
 array([ 0.01097958, -0.01097958]))

In [23]:
# Factual observation: gender=0 (female), drug=1 (taken), recovery=1 (recovered).
g = 0
d = 1
r = 1

# Abduction: observed outcome minus the predicted class probabilities for the factual inputs.
a_pred = r - p_model.predict_proba(to_df([g, d], ['gender', 'drug']))[0]
# Action + prediction: class probabilities for the same gender with drug set to 0
# (the reverse intervention: what if the drug had not been taken).
p_pred = p_model.predict_proba(to_df([g, 0], ['gender', 'drug']))[0]

# NOTE(review): r is a scalar subtracted from both class columns, so the entries of
# a_pred + p_pred sum to 2*r rather than 1 — confirm this is the intended residual.
a_pred, p_pred, a_pred + p_pred

(array([0.49550634, 0.50449366]),
 array([0.49351408, 0.50648592]),
 array([0.98902042, 1.01097958]))