# Estimating counterfactuals

## Data simulation

In [5]:
import networkx as nx
from pybbn.probabilistic import create_reasoning_model

def get_model():
    d = nx.DiGraph()
    d.add_nodes_from(['drug', 'gender', 'recovery'])
    d.add_edges_from([('gender', 'drug'), ('gender', 'recovery'), ('drug', 'recovery')])
    
    p = {
        'gender': {
            'columns': ['gender', '__p__'],
            'data': [
                ['male', 0.51], ['female', 0.49]
            ]
        },
        'drug': {
            'columns': ['gender', 'drug', '__p__'],
            'data': [
                ['female', 'no', 0.24],
                ['female', 'yes', 0.76],
                ['male', 'no', 0.76],
                ['male', 'yes', 0.24]
            ]
        },
        'recovery': {
            'columns': ['gender', 'drug', 'recovery', '__p__'],
            'data': [
                ['female', 'no', 'no', 0.90],
                ['female', 'no', 'yes', 0.10],
                ['female', 'yes', 'no', 0.27],
                ['female', 'yes', 'yes', 0.73],
                ['male', 'no', 'no', 0.99],
                ['male', 'no', 'yes', 0.01],
                ['male', 'yes', 'no', 0.07],
                ['male', 'yes', 'yes', 0.93]
            ]
        }
    }
    
    model = create_reasoning_model(d, p)
    return model

model = get_model()

In [10]:
import numpy as np

N = 10_000
Xy = model.sample(max_samples=N) \
    .assign(
        gender=lambda d: d['gender'].map({'male': 1, 'female': 0}),
        drug=lambda d: d['drug'].map({'yes': 1, 'no': 0}),
        recovery=lambda d: d['recovery'].map({'yes': 1, 'no': 0})
    )
Xy.shape

(10000, 3)

In [11]:
Xy.head()

Unnamed: 0,gender,drug,recovery
0,1,0,0
1,1,1,1
2,0,1,0
3,1,1,1
4,0,1,1


## Prediction models

In [54]:
from sklearn.ensemble import RandomForestClassifier

X, y = Xy[['gender', 'drug']], Xy['recovery']
p_model = RandomForestClassifier(n_jobs=-1, random_state=37) \
    .fit(X, y)

In [55]:
from sklearn.preprocessing import OneHotEncoder

u = OneHotEncoder(sparse_output=False).fit_transform(Xy[['recovery']]) - p_model.predict_proba(X)
u = [_u[_y] for _u, _y in zip(u, y)]
u = np.array(u)
u

array([0.39675612, 0.59693957, 0.39464661, ..., 0.40852612, 0.59693957,
       0.60535339])

In [56]:
X, y = Xy[['gender', 'drug']].assign(noise=u), Xy['recovery']
p_model = RandomForestClassifier(n_jobs=-1, random_state=37) \
    .fit(X, y)

## Abduction models

In [57]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

X, y = Xy[['gender', 'drug', 'recovery']], u
u_model = RandomForestRegressor(n_jobs=-1, random_state=37) \
    .fit(X, y)

## Counterfactual 1

- f: did not take pill, did not recover
- cf: take pill
- 16% chance recovery had taken pill (male)
- 67% chance recovery had taken pill (female)

In [65]:
import pandas as pd

def to_df(v, c):
    return pd.DataFrame([v], columns=c)

u = u_model.predict(to_df([1, 1, 1], ['gender', 'drug', 'recovery']))[0]
p = p_model.predict_proba(to_df([1, 0, u], ['gender', 'drug', 'noise']))

u, p

(0.5969395680230405, array([[0., 1.]]))

In [67]:
u = u_model.predict(to_df([0, 0, 1], ['gender', 'drug', 'recovery']))[0]
p = p_model.predict_proba(to_df([1, 1, u], ['gender', 'drug', 'noise']))

u, p

(0.5914738763863162, array([[0., 1.]]))