In [None]:
import numpy as np
import pandas as pd
from scipy.special import rel_entr
from IPython.display import display, HTML

In [None]:
import os
import sys
# Put the absolute path of the parent directory first on sys.path before
# importing madbayes (presumably so the in-repo copy of the package is the one
# that gets imported -- confirm against the repository layout).
# NOTE: the path is resolved relative to the kernel's current working directory.
sys.path.insert(0, os.path.abspath('..'))
import madbayes as mb

In [None]:
def test_em(name, network, sample, ratio):
    dataset = mb.forward_sampling(network, sample)
    dataset_nan = dataset.random_nan(ratio)
    network_nan = mb.expectation_maximization(network, dataset_nan)
    imputed = mb.impute(network_nan, dataset_nan)
    count_nan = dataset_nan.count_nan()
    correct_replacement = np.count_nonzero(dataset.data.values != imputed.data.values)
    absolute_difference = np.array([
        np.sum(np.fabs(network_nan[node]["CPT"] - network[node]["CPT"] ))
        for node in network.nodes()
    ])
    absolute_difference = np.sum(absolute_difference)
    kullback_leibler = np.array([
        np.sum(rel_entr(network_nan[node]["CPT"], network[node]["CPT"]))
        for node in network.nodes()
    ])
    kullback_leibler = np.sum(kullback_leibler)
    return {
        'name' : name,
        'samples': sample,
        'nan_ratio': ratio,
        'correct_replacement': 1 - (correct_replacement / count_nan),
        'absolute_difference': absolute_difference,
        'kullback_leibler': kullback_leibler
    }

In [None]:
# Restrict the benchmark to the 'asia' and 'survey' entries of the NETWORKS
# registry, pairing each name with the attribute of the same name on
# mb.data.network.
networks = [
    (label, getattr(mb.data.network, label))
    for label, _entry in mb.data.network.NETWORKS.items()
    if label in ('asia', 'survey')
]

In [None]:
# Sweep every (network, sample size, missing ratio) combination. Each
# combination is repeated 10 times; the resulting rows are written to a CSV
# file in the working directory and rendered as an HTML table.
for name, network in networks:
    for sample in (100, 250, 500, 1000, 2000):
        for ratio in (0.05, 0.10, 0.15, 0.20):
            # One result row per repetition of test_em.
            records = [test_em(name, network, sample, ratio) for _ in range(10)]
            data = pd.DataFrame(records)
            # The file name carries the ratio as a whole-number percentage.
            percent = int(round(ratio * 100))
            data.to_csv("./{}_{}_{}.csv".format(name, sample, percent))
            table = HTML(data.to_html())
            display(table)