In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

np.random.seed(37)

g_df = pd.DataFrame([
    ['male', 0.51], 
    ['female', 0.49]], columns=['gender', '__p__'])

d_df = pd.DataFrame([
    ['female', 'no', 0.24],
    ['female', 'yes', 0.76],
    ['male', 'no', 0.76],
    ['male', 'yes', 0.24]], columns=['gender', 'drug', '__p__'])

r_df = pd.DataFrame([
    ['female', 'no', 'no', 0.90],
    ['female', 'no', 'yes', 0.10],
    ['female', 'yes', 'no', 0.27],
    ['female', 'yes', 'yes', 0.73],
    ['male', 'no', 'no', 0.99],
    ['male', 'no', 'yes', 0.01],
    ['male', 'yes', 'no', 0.07],
    ['male', 'yes', 'yes', 0.93]], columns=['gender', 'drug', 'recovery', '__p__'])

In [2]:
from itertools import cycle

def get_cpt(y, df):
    n = df[y].unique().shape[0]
    fields = df.columns.drop([y, '__p__']).tolist() + [y]
    df = df.sort_values(fields).reset_index(drop=True)

    cpt = np.array([df.iloc[i:i+2].__p__.values for i in range(0, df.shape[0], n)])
    return cpt

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def add_noise(p, u):
    b = np.log(p)
    n = u.rvs(size=p.shape)
    s = b + n
    return softmax(s)

def wiggle(p, u, max_samples=100):
    s = [add_noise(p, u) for _ in range(max_samples)]
    m = np.mean(s, axis=0)
    return m

def wiggle_cpt(cpt, u, max_samples=100):
    return np.ravel(np.array([wiggle(_cpt, u, max_samples) for _cpt in cpt]))

def create_potential(y, df, u_i, wpt):
    wf = df.assign(**{
        f'__u_{y}__': f'u{u_i}',
        '__p__': wpt
    })

    fields = [f'__u_{y}__'] + df.columns.tolist()
    return wf[fields]

def create_potential_with_hidden(y, df, u=[norm(0.05, 2)], max_samples=100):
    cpt = get_cpt(y, df)
    wpt = [wiggle_cpt(cpt, _u, max_samples) for _u, _ in zip(u, [1, 2])]
    pot = pd.concat([create_potential(y, df, _u, _wpt) for _u, _wpt in enumerate(wpt)]) \
        .reset_index(drop=True)
    return pot

In [3]:
create_potential_with_hidden('gender', g_df)

Unnamed: 0,__u_gender__,gender,__p__
0,u0,male,0.500352
1,u0,female,0.499648


In [4]:
create_potential_with_hidden('drug', d_df)

Unnamed: 0,__u_drug__,gender,drug,__p__
0,u0,female,no,0.411609
1,u0,female,yes,0.588391
2,u0,male,no,0.638819
3,u0,male,yes,0.361181


In [5]:
create_potential_with_hidden('recovery', r_df)

Unnamed: 0,__u_recovery__,gender,drug,recovery,__p__
0,u0,female,no,no,0.798954
1,u0,female,no,yes,0.201046
2,u0,female,yes,no,0.342357
3,u0,female,yes,yes,0.657643
4,u0,male,no,no,0.916903
5,u0,male,no,yes,0.083097
6,u0,male,yes,no,0.201615
7,u0,male,yes,yes,0.798385
