# Estimating counterfactuals

## Reasoning model

In [1]:
from pybbn.probabilistic import create_reasoning_model

def get_model():
    """Build the three-node reasoning model over gender, drug and recovery.

    The structure has ``gender`` pointing at both ``drug`` and ``recovery``
    and ``drug`` pointing at ``recovery``.  Each node's probability table is
    given in "split" form: a ``columns`` list (conditioning columns, the node
    itself, then ``__p__``) and a ``data`` list of rows.

    Returns:
        Whatever ``create_reasoning_model`` builds from the structure and
        the probability tables.
    """
    structure = {
        'nodes': ['drug', 'gender', 'recovery'],
        'edges': [
            ('gender', 'drug'),
            ('gender', 'recovery'),
            ('drug', 'recovery'),
        ],
    }

    # P(gender)
    gender_table = {
        'columns': ['gender', '__p__'],
        'data': [['male', 0.51], ['female', 0.49]],
    }

    # P(drug | gender)
    drug_table = {
        'columns': ['gender', 'drug', '__p__'],
        'data': [
            ['female', 'no', 0.24],
            ['female', 'yes', 0.76],
            ['male', 'no', 0.76],
            ['male', 'yes', 0.24],
        ],
    }

    # P(recovery | gender, drug)
    recovery_table = {
        'columns': ['gender', 'drug', 'recovery', '__p__'],
        'data': [
            ['female', 'no', 'no', 0.90],
            ['female', 'no', 'yes', 0.10],
            ['female', 'yes', 'no', 0.27],
            ['female', 'yes', 'yes', 0.73],
            ['male', 'no', 'no', 0.99],
            ['male', 'no', 'yes', 0.01],
            ['male', 'yes', 'no', 0.07],
            ['male', 'yes', 'yes', 0.93],
        ],
    }

    potentials = {
        'gender': gender_table,
        'drug': drug_table,
        'recovery': recovery_table,
    }

    return create_reasoning_model(structure, potentials)

# Build the reference model once; the SCM cell below passes it to get_scm.
model = get_model()

## Build SCM

In [2]:
import itertools
import pandas as pd
import numpy as np

def learn_hidden_node(n_clusters, model, N=10_000, y='recovery'):
    """Sample from ``model`` and attach a hidden-state column ``__u__``.

    Note that nothing is fitted here: every sampled row receives a label
    drawn uniformly at random from ``c0 .. c{k-1}`` via ``np.random.randint``,
    so results vary between runs unless the global NumPy seed is fixed.

    Args:
        n_clusters: Requested number of hidden states, or ``'auto'`` to use
            the rounded mean of the number of distinct sampled rows and the
            number of distinct values of ``y``.
        model: Object exposing ``sample(max_samples=...)`` that returns a
            pandas DataFrame containing column ``y``.
        N: Value forwarded to ``model.sample`` as ``max_samples``.
        y: Name of the outcome column used to bound the number of states.
            Defaults to ``'recovery'``, the column the original
            implementation was hard-wired to.

    Returns:
        The sampled DataFrame with an extra ``__u__`` column of string
        labels (``'c0'``, ``'c1'``, ...).
    """
    Xy = model.sample(max_samples=N)

    # Both bounds are needed by the 'auto' rule and by the clamps below,
    # so compute each of them exactly once.
    n_records = Xy.drop_duplicates().shape[0]
    n_outcomes = Xy[y].unique().shape[0]

    if n_clusters == 'auto':
        n_clusters = int(np.round((n_records + n_outcomes) / 2.0))

    # Never use more states than there are distinct rows ...
    n_clusters = min(n_clusters, n_records)
    # ... and never fewer than there are distinct outcome values.
    n_clusters = max(n_clusters, n_outcomes)

    labels = np.random.randint(0, n_clusters, size=Xy.shape[0])

    return Xy.assign(**{'__u__': [f'c{label}' for label in labels]})
    
def get_cpt_with_parents(y, parents, df):
    """Estimate the conditional probability table of ``y`` given ``parents``.

    Probabilities are relative frequencies counted in ``df``.  Rows are
    emitted for every combination of observed parent values (sorted per
    column, in ``itertools.product`` order) crossed with every observed
    value of ``y`` (sorted).  A parent combination that never occurs in
    ``df`` gets probability ``0`` for every value of ``y``.

    Values are matched with ``==`` on the columns themselves rather than
    through a generated ``df.query`` string, so numeric values and strings
    containing quote characters are counted correctly.

    Args:
        y: Name of the child column.
        parents: Non-empty sequence of parent column names.
        df: pandas DataFrame holding the samples.

    Returns:
        dict with ``'columns'`` (``parents + [y, '__p__']``) and ``'data'``
        (one ``[*parent_values, y_value, probability]`` list per row).
    """
    parents = list(parents)
    pa_domains = [sorted(df[pa].unique()) for pa in parents]
    y_domain = sorted(df[y].unique())

    # One boolean mask per value of y, computed once and reused for every
    # parent combination.
    y_masks = [(df[y] == y_v).to_numpy() for y_v in y_domain]

    data = []
    for pa_v in itertools.product(*pa_domains):
        # Rows matching this exact parent configuration.
        pa_mask = np.ones(len(df), dtype=bool)
        for pa, value in zip(parents, pa_v):
            pa_mask &= (df[pa] == value).to_numpy()
        n_pa = int(pa_mask.sum())

        for y_v, y_mask in zip(y_domain, y_masks):
            n_joint = int((pa_mask & y_mask).sum())
            # Unseen parent configuration: keep the row, with probability 0.
            p = n_joint / n_pa if n_pa != 0 else 0
            data.append(list(pa_v) + [y_v, p])

    return {
        'columns': parents + [y, '__p__'],
        'data': data
    }

def get_cpt_without_parents(y, df):
    """Estimate the marginal distribution of column ``y`` from ``df``.

    Each observed value's count is divided by the total number of rows in
    ``df``; rows are ordered by value.

    Returns:
        dict with ``'columns'`` (``[y, '__p__']``) and ``'data'`` (one
        ``[value, probability]`` list per observed value).
    """
    counts = df[y].value_counts().sort_index()
    total_rows = df.shape[0]
    probs = counts / total_rows

    rows = [list(pair) for pair in zip(probs.index, probs.values)]

    return {'columns': [y, '__p__'], 'data': rows}

def get_cpt(y, parents, df):
    """Estimate the probability table of ``y`` from the samples in ``df``.

    Delegates to ``get_cpt_with_parents`` when ``parents`` is non-empty and
    to ``get_cpt_without_parents`` otherwise.
    """
    has_parents = len(parents) > 0
    if has_parents:
        return get_cpt_with_parents(y, parents, df)
    return get_cpt_without_parents(y, df)

def get_scm_parameters(y, parents, df, model):
    """Assemble the probability tables for the SCM.

    Args:
        y: Name of the outcome node.
        parents: List of the outcome's parent node names.
        df: Samples that include the hidden ``__u__`` column.
        model: Source model; each parent's table is copied from
            ``model.node_potentials``.

    Returns:
        dict mapping node name to a split-form table: one entry per parent
        (taken from ``model``), one for ``__u__`` (marginal estimated from
        ``df``) and one for ``y`` (conditioned on ``parents + ['__u__']``,
        estimated from ``df``).
    """
    tables = {}

    # Parent tables are reused unchanged from the source model.
    for pa in parents:
        tables[pa] = model.node_potentials[pa].to_dict('split', index=False)

    # The hidden node and the outcome are re-estimated from the samples.
    tables['__u__'] = get_cpt('__u__', [], df)
    tables[y] = get_cpt(y, parents + ['__u__'], df)

    return tables

def get_scm_graph(y, parents, model):
    """Derive the SCM graph from a copy of ``model.d``.

    Every node other than ``y`` and its ``parents`` is removed from the
    copy, then a hidden node ``__u__`` is added with an edge into ``y``.
    ``model.d`` itself is not modified.

    Returns:
        The trimmed and extended copy of the graph.
    """
    graph = model.d.copy()

    keep = {y, *parents}
    # Materialise the set of nodes to drop before mutating the graph.
    for node in set(graph.nodes()) - keep:
        graph.remove_node(node)

    graph.add_node('__u__')
    graph.add_edge('__u__', y)

    return graph

def get_scm(y, parents, n_states, model, n_samples=10_000):
    """Build a reasoning model for ``y`` with an added hidden node ``__u__``.

    Args:
        y: Name of the outcome node.
        parents: List of the outcome's parent node names.
        n_states: Number of hidden states, or ``'auto'``; passed to
            ``learn_hidden_node``.
        model: Source reasoning model to sample from and to copy the graph
            and parent tables from.
        n_samples: Sample size requested from ``model``.  Previously this
            argument was accepted but ignored; it is now forwarded to
            ``learn_hidden_node``.

    Returns:
        The model created by ``create_reasoning_model`` from the SCM graph
        and parameters.
    """
    Xy = learn_hidden_node(n_states, model, N=n_samples)
    d = get_scm_graph(y, parents, model)
    p = get_scm_parameters(y, parents, Xy, model)
    return create_reasoning_model(d, p)

# SCM for `recovery` with parents `gender` and `drug`; 'auto' lets
# learn_hidden_node choose the number of hidden `__u__` states.
scm_model = get_scm('recovery', ['gender', 'drug'], 'auto', model)

## Abduction

In [3]:
# Abduction: condition on the observed (factual) case and query the hidden node.
_e = scm_model.get_observation_evidences({'gender': 'male', 'drug': 'yes', 'recovery': 'yes'})
_posterior = scm_model.pquery(['__u__'], evidences=_e)['__u__']

# Add a small constant to every state's probability, then renormalise so
# the column sums to one again.
_smoothed = _posterior['__p__'] + 0.001
u = _posterior.assign(**{'__p__': _smoothed / _smoothed.sum()})
u

Unnamed: 0,__u__,__p__
0,c0,0.195225
1,c1,0.214358
2,c2,0.197877
3,c3,0.198219
4,c4,0.19432


## Prediction

In [4]:
# Prediction: switch to the counterfactual setting (same gender, drug = 'no').
_e = scm_model.get_observation_evidences({'gender': 'male', 'drug': 'no'})
# Reuse the distribution over `__u__` obtained in the abduction cell as the
# evidence for the hidden node.
# NOTE(review): presumably pquery accepts a probability table as evidence — confirm.
_e['__u__'] = u

# Distribution of `recovery` under the counterfactual evidence.
scm_model.pquery(['recovery'], evidences=_e)['recovery']

Unnamed: 0,recovery,__p__
0,no,0.603551
1,yes,0.396449
