# Comparing the three algorithms by Neal

In [None]:
import numpy as np
import scipy.stats as stats
import subprocess
import matplotlib.pyplot as plt
from google.protobuf.internal.decoder import _DecodeVarint32
import sys
sys.path.insert(0, '..')
from proto.py.marginal_state_pb2 import MarginalState
import arviz as az
# import pip
# pip.main(["install", "arviz"])

In [None]:
# Utility to save files with Unix-like newlines
def save_np(filename, npobj):
    """Write ``npobj`` to ``filename`` as text, five decimals per value.

    The target is opened in binary mode, so the ``\\n`` terminator emitted by
    ``np.savetxt`` reaches the file without any platform newline translation.

    Parameters
    ----------
    filename : path of the file to create or overwrite.
    npobj : array-like accepted by ``np.savetxt``.
    """
    with open(filename, mode='wb') as out_file:
        np.savetxt(out_file, npobj, fmt='%1.5f', newline='\n')

In [None]:
# Generate data: n points, half drawn around mean1 and half around mean2,
# both with unit standard deviation
rng = 20201124  # seed for NumPy's global random generator
np.random.seed(rng)
n = 200
mean1, mean2 = -3.0, +3.0
half = n // 2
# The two draws must stay in this order to reproduce the same sample
norm1 = np.random.normal(mean1, 1.0, half)
norm2 = np.random.normal(mean2, 1.0, half)
data_uni = np.concatenate([norm1, norm2])
# Generate grid: points from -10 (included) to +10 (excluded), step 0.1
grid_uni = np.arange(-10, 10, 0.1)

In [None]:
# Save the sample and the evaluation grid to the input CSV files
for out_path, array in (
    ("../resources/csv/in/data_uni.csv", data_uni),
    ("../resources/csv/in/grid_uni.csv", grid_uni),
):
    save_np(out_path, array)

In [None]:
# True density of data: equal-weight mixture of two unit-scale normals
# centred at mean1 and mean2, evaluated on the grid
true_pdf = (
    0.5 * stats.norm.pdf(grid_uni, loc=mean1, scale=1.0)
    + 0.5 * stats.norm.pdf(grid_uni, loc=mean2, scale=1.0)
)

In [None]:
# Names of the algorithms to compare; this order is reused for the legend
# and for the timing tables
algos = [
    "Neal2",
    "Neal3",
    "Neal8",
]

In [None]:
# Run the executable once per algorithm
for algo in algos:
    # Positional arguments, in the order the executable is invoked with here:
    # algorithm name, seed, three numeric settings, two (model, prior file)
    # pairs, the chain output file, the two input CSVs and four output CSVs.
    # NOTE(review): the meaning of "0", "1000", "100" is not visible here --
    # confirm against the executable's usage message.
    cmd = ["../build/run",
        algo, str(rng), "0", "1000", "100",
        "NNIG", "../resources/asciipb/nnig_ngg_prior.asciipb",
        "DP",   "../resources/asciipb/dp_gamma_prior.asciipb",
        f"../{algo}.recordio",
        "../resources/csv/in/data_uni.csv",
        "../resources/csv/in/grid_uni.csv",
        f"../resources/csv/out/uni_{algo}_dens.csv",
        f"../resources/csv/out/uni_{algo}_mass.csv",
        f"../resources/csv/out/uni_{algo}_nclu.csv",
        f"../resources/csv/out/uni_{algo}_clus.csv"
    ]
    # Output is captured, so without an explicit check a failed run would go
    # unnoticed and the following cells would read stale or missing files.
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        print(f"{algo}: {cmd[0]} exited with code {result.returncode}",
              file=sys.stderr)
        # Decode leniently: the captured bytes are only shown for diagnosis
        print(result.stderr.decode(errors="replace"), file=sys.stderr)

## Clustering

In [None]:
# Read clusterings: one array per algorithm, loaded from its output CSV
clusterings = {
    algo: np.loadtxt(f"../resources/csv/out/uni_{algo}_clus.csv")
    for algo in algos
}

In [None]:
# Compare clusterings: print the 1-norm of the difference for every pair
for left, right in (("Neal2", "Neal3"), ("Neal2", "Neal8"), ("Neal3", "Neal8")):
    difference = clusterings[left] - clusterings[right]
    print(np.linalg.norm(difference, 1))

## Density estimation

In [None]:
# Densities: one estimated curve per algorithm, plus the true density
plt.figure(figsize=(16, 8))
for algo in algos:
    dens_file = f"../resources/csv/out/uni_{algo}_dens.csv"
    matr = np.genfromtxt(dens_file, delimiter=',')
    # Average down the rows, then exponentiate.
    # NOTE(review): presumably each row is one iteration's log-density on the
    # grid -- confirm against the executable's output format.
    avg_curve = np.exp(matr.mean(axis=0))
    plt.plot(grid_uni, avg_curve)
plt.plot(grid_uni, true_pdf, linestyle="--", color="red")
# Labels are matched to the lines in plotting order
plt.legend([*algos, "true"])
plt.title("Univariate densities")

## Effective Sample Size

In [None]:
# Utility to read file collector, courtesy of
# github.com/mberaha/utils/blob/master/proto_utils/py/recordio.py
def readManyFromFile(filename, msgType):
    """Read a sequence of length-prefixed messages from a binary file.

    Each record is a varint-encoded byte length followed by that many bytes,
    which are handed to ``ParseFromString`` of a fresh ``msgType()`` object.

    Parameters
    ----------
    filename : path of the file to read; it is loaded fully into memory.
    msgType : callable returning a new message object that exposes
        ``ParseFromString``.

    Returns
    -------
    list
        The messages decoded, in file order. Reading stops at the first
        record that is truncated or cannot be parsed; the messages decoded
        up to that point are still returned and a warning is issued so the
        shortened result is not mistaken for the complete file.
    """
    import warnings

    with open(filename, "rb") as fp:
        buf = fp.read()

    out = []
    pos = 0
    while pos < len(buf):
        msg_len, pos = _DecodeVarint32(buf, pos)
        end = pos + msg_len
        if end > len(buf):
            # A cut-off payload could otherwise be accepted as a (shorter)
            # valid message, so it is rejected before parsing.
            warnings.warn(
                f"{filename}: record {len(out)} is truncated "
                f"(needs {msg_len} bytes, {len(buf) - pos} available); "
                f"returning the {len(out)} messages read so far",
                stacklevel=2,
            )
            break
        try:
            msg = msgType()
            msg.ParseFromString(buf[pos:end])
        except Exception as err:
            # Best-effort reader: keep what was decoded, but say why it stopped
            warnings.warn(
                f"{filename}: stopped at record {len(out)} "
                f"({type(err).__name__}: {err}); "
                f"returning the {len(out)} messages read so far",
                stacklevel=2,
            )
            break
        out.append(msg)
        pos = end
    return out

In [None]:
# Compute Effective Sample Sizes for each algorithm
ESS = {}
for algo in algos:
    # Read chain
    chain = readManyFromFile(f"../{algo}.recordio", MarginalState)
    # Number of cluster states stored at each iteration, as a float array
    n_clusters = np.array(
        [len(state.cluster_states) for state in chain], dtype=float
    )
    ESS[algo] = az.ess(n_clusters)

In [None]:
# Times of MCMC, collected via the progressbar.
# Values are listed in the same order as `algos` and keyed by algorithm name.
# NOTE(review): "filecoll"/"memocoll" presumably refer to the file and memory
# collectors -- confirm.
filecoll_raw = [5.690, 6.824, 8.636]
memocoll_raw = [5.617, 6.040, 7.348]
filecoll_times = dict(zip(algos, filecoll_raw))
memocoll_times = dict(zip(algos, memocoll_raw))

In [None]:
# Display computed ESS, together with ESS divided by the file-collector time
for algo_name, ess_value in ESS.items():
    ess_per_time = ess_value / filecoll_times[algo_name]
    print(algo_name, "ESS =", ess_value, "-> ESS/time =", ess_per_time, sep="\t")