In [88]:
from pybbn.probabilistic import create_reasoning_model

def get_model():
    d = {
        'nodes': ['drug', 'gender', 'recovery'],
        'edges': [('gender', 'drug'), ('gender', 'recovery'), ('drug', 'recovery')]
    }
    
    p = {
        'gender': {
            'columns': ['gender', '__p__'],
            'data': [
                ['male', 0.10], ['female', 0.80]
            ]
        },
        'drug': {
            'columns': ['gender', 'drug', '__p__'],
            'data': [
                ['female', 'no', 0.10],
                ['female', 'yes', 0.90],
                ['male', 'no', 0.10],
                ['male', 'yes', 0.90]
            ]
        },
        'recovery': {
            'columns': ['gender', 'drug', 'recovery', '__p__'],
            'data': [
                ['female', 'no', 'no', 0.90],
                ['female', 'no', 'yes', 0.10],
                ['female', 'yes', 'no', 0.10],
                ['female', 'yes', 'yes', 0.90],
                ['male', 'no', 'no', 0.99],
                ['male', 'no', 'yes', 0.01],
                ['male', 'yes', 'no', 0.07],
                ['male', 'yes', 'yes', 0.93]
            ]
        }
    }
    
    model = create_reasoning_model(d, p)
    return model

model = get_model()

In [89]:
N = 10_000
Xy = model.sample(max_samples=N)
Xy.shape

(10000, 3)

In [90]:
Xy.head()

Unnamed: 0,gender,drug,recovery
0,male,yes,yes
1,female,yes,yes
2,female,yes,yes
3,female,yes,yes
4,female,yes,yes


In [91]:
import pandas as pd

X = pd.get_dummies(Xy[['gender', 'drug']]).map(int)
X = Xy[['gender', 'drug']] \
    .assign(gender=lambda d: d['gender'].map({'male': 1, 'female': 0})) \
    .assign(drug=lambda d: d['drug'].map({'yes': 1, 'no': 0}))

y = Xy['recovery'].map({'yes': 1, 'no': 0})

X.shape, y.shape

((10000, 2), (10000,))

In [92]:
X.head()

Unnamed: 0,gender,drug
0,1,1
1,0,1
2,0,1
3,0,1
4,0,1


In [96]:
y.value_counts()

recovery
1    8152
0    1848
Name: count, dtype: int64

In [94]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# c_model = RandomForestClassifier(n_jobs=-1, random_state=37).fit(X, y)
c_model = LogisticRegression(solver='saga', random_state=37).fit(X, y)
# c_model = AdaBoostClassifier(n_estimators=1_000, random_state=37).fit(X, y)

y_pred = c_model.predict_proba(X)[:,1]
roc_auc_score(y, y_pred)

0.5055604101083738

In [35]:
import numpy as np

_Xy = pd.DataFrame({'y_true': y, 'y_pred': y_pred}) \
    .assign(diff=lambda d: d['y_true'] - d['y_pred']) \
    .assign(abs_diff=lambda d: np.abs(d['diff']))
_Xy.shape

(10000, 4)

In [36]:
_Xy.head()

Unnamed: 0,y_true,y_pred,diff,abs_diff
0,0,0.396756,-0.396756,0.396756
1,1,0.40306,0.59694,0.59694
2,0,0.394647,-0.394647,0.394647
3,1,0.40306,0.59694,0.59694
4,1,0.394647,0.605353,0.605353


In [66]:
_X = Xy[['gender', 'drug', 'recovery']] \
    .assign(gender=lambda d: d['gender'].map({'male': 1, 'female': 0})) \
    .assign(drug=lambda d: d['drug'].map({'yes': 1, 'no': 0})) \
    .assign(recovery=lambda d: d['recovery'].map({'yes': 1, 'no': 0}))
_y = _Xy['abs_diff']

_X.shape, _y.shape

((10000, 3), (10000,))

In [67]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

r_model = RandomForestRegressor(n_jobs=-1, random_state=37).fit(_X, _y)
_y_pred = r_model.predict(_X)

mean_absolute_error(_y, _y_pred)

2.0604074002505966e-15

In [73]:
_e = _X.drop_duplicates().sort_values(['gender', 'drug', 'recovery']).reset_index(drop=True).iloc[[7]]
_e

Unnamed: 0,gender,drug,recovery
7,1,1,1


In [76]:
_X \
    .drop_duplicates() \
    .sort_values(['gender', 'drug', 'recovery']) \
    .reset_index(drop=True) \
    .assign(u=r_model.predict(_X.drop_duplicates().sort_values(['gender', 'drug', 'recovery']).reset_index(drop=True)))

Unnamed: 0,gender,drug,recovery,u
0,0,0,0,0.408526
1,0,0,1,0.591474
2,0,1,0,0.394647
3,0,1,1,0.605353
4,1,0,0,0.396756
5,1,0,1,0.603244
6,1,1,0,0.40306
7,1,1,1,0.59694


In [50]:
c_model.predict_proba(X.drop_duplicates().reset_index(drop=True).iloc[[1]])

array([[0.59693957, 0.40306043]])

In [56]:
Xy.drop_duplicates().reset_index(drop=True) \
    .assign(gender=lambda d: d['gender'].map({'male': 1, 'female': 0})) \
    .assign(drug=lambda d: d['drug'].map({'yes': 1, 'no': 0})) \
    .assign(recovery=lambda d: d['recovery'].map({'yes': 1, 'no': 0})) \
    .iloc[[1]]

Unnamed: 0,gender,drug,recovery
1,1,1,1


In [62]:
r_model.predict(_Xy[['y_true']].drop_duplicates().reset_index(drop=True).iloc[[0]])

array([0.40078615])

In [63]:
r_model.predict(_Xy[['y_true']].drop_duplicates().reset_index(drop=True).iloc[[1]])

array([0.59909056])