## PDF bi-variate normal plot

This notebook generates the bivariate normal plot shown as Fig. 4 (Appendix) of the paper *Synthsonic: Fast, Probabilistic Modeling and Synthesis of Tabular Data*.

In [None]:
import logging

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from scipy.stats import norm
from scipy.stats import multivariate_normal
from sklearn.neural_network import MLPClassifier

from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

## Config

In [None]:
# Fix NumPy's global (legacy) random state so the sampled data is reproducible.
np.random.seed(seed=42)

In [None]:
# When True, the comparison figures below are also written to disk as PDF files.
SAVE_PLOTS = True

In [None]:
# Prefix used for the file names of the saved figures.
dataset_name = 'bivariate_normal'

In [None]:
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
plt.rcParams['text.color'] = 'black'
plt.rcParams['figure.max_open_warning'] = 0
colors = [i['color'] for i in plt.rcParams['axes.prop_cycle']]
markers = ['o', 's', 'p', 'x', '^', '+', '*', '<', 'D', 'h', '>']
%matplotlib inline

## Data

In [None]:
# Draw N points from a correlated bivariate Gaussian.
# Means, standard deviations and correlation coefficient of the two axes:
N = 100000
rho = 0.7
mux, muy = 0, 0
sigmax, sigmay = 1, 1

# Covariance matrix is [[sx^2, rho*sx*sy], [rho*sx*sy, sy^2]].
X = np.random.multivariate_normal(
    mean=[mux, muy],
    cov=[
        [sigmax ** 2, rho * sigmax * sigmay],
        [rho * sigmax * sigmay, sigmay ** 2],
    ],
    size=N,
)

## Fit

In [None]:
# Build the density model and fit it to the sampled data; `pdf` ends up bound
# to whatever `fit` returns.
# NOTE(review): the `rho=0.4` constructor argument is a model setting and is
# not the module-level `rho` used to generate X -- confirm its meaning
# against KDECopulaNNPdf.
pdf = KDECopulaNNPdf(rho=0.4).fit(X)

In [None]:
# Invoke the model's classifier calibration on its stored histograms and bin
# edges, with validation plots enabled.
# NOTE(review): this is a private method; whether it re-fits the calibration
# or only produces the plots is not visible from this notebook -- confirm.
pdf._calibrate_classifier(pdf.hist_p0_, pdf.hist_p1_, pdf.bin_edges_, validation_plots=True)

In [None]:
# Score the fitted model on the training sample; the cell displays the result.
# NOTE(review): presumably a log-likelihood-style score -- confirm against
# KDECopulaNNPdf.score.
pdf.score(X)

In [None]:
# Model density and log-density evaluated at every training point.
p, logp = pdf.pdf(X), pdf.logpdf(X)

In [None]:
# Show the model density for the first ten training points.
p[:10]

In [None]:
# Ten hard-coded reference density values, compared against p[:10] in the
# next cell.
# NOTE(review): presumably copied from an earlier run -- confirm provenance.
values = np.array([
    0.19397825,
    0.05675388,
    0.21877299,
    0.04740699,
    0.17337092,
    0.1757445,
    0.0333854,
    0.04177411,
    0.12712715,
    0.05748896,
])

In [None]:
# Quick check: scatter the current densities of the first ten points against
# the hard-coded reference values, with the identity line for comparison.
plt.scatter(values, p[:10])
plt.plot(values, values)

In [None]:
# Theoretical pdf values: the exact density of the bivariate normal that X
# was drawn from, built from the same means, sigmas and correlation.
rv = multivariate_normal(
    mean=[mux, muy],
    cov=[
        [sigmax * sigmax, rho * sigmax * sigmay],
        [rho * sigmax * sigmay, sigmay * sigmay],
    ],
)
p2 = rv.pdf(X)
# Use the distribution's own log-density rather than np.log(p2): taking the
# log of an underflowed density would give -inf (plus a runtime warning) for
# points far out in the tails.
logp2 = rv.logpdf(X)

In [None]:
# Draw ten times as many synthetic points as there are training rows.
# NOTE(review): assumes X_gen is an (n_samples, 2) array-like -- confirm
# against KDECopulaNNPdf.sample_no_weights.
X_gen = pdf.sample_no_weights(n_samples=X.shape[0] * 10, mode='cheap')

# Off-diagonal entry of the sample covariance matrix, i.e. cov(x, y).
s_cov = np.round(np.cov(X_gen.T), 3)[0, 1]
# Per-dimension sample means. Averaging over both columns at once would
# collapse them into a single scalar, so opposite biases on the two axes
# could cancel and go unnoticed.
s_mu = np.round(X_gen.mean(axis=0), 3)
print('mu_hat: ', s_mu)
print('cov_hat: ', s_cov)

In [None]:
# Model density (y) against exact density (x) for every training point;
# points on the dashed diagonal are where the two agree.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 4))

# Identity line, drawn above the scatter (zorder=10). The upper end 0.223 is
# roughly the peak of the generating density, 1 / (2*pi*sqrt(1 - 0.7**2)).
x = np.linspace(0, 0.223, num=100)
ax.plot(x, x, color='black', linestyle='--', linewidth=3, zorder=10)
ax.scatter(p2, p, marker='x', s=0.005, color=colors[0])

ax.set_xlabel(r'$X$', fontsize=18)
ax.set_ylabel(r'$X_{\rm syn}$', fontsize=18)
ax.set_ylim(bottom=-0.03, top=0.35)
ax.tick_params(labelsize=16)

if SAVE_PLOTS:
    fig.savefig(f'{dataset_name}_pdf_rv_vs_gen.pdf', dpi=600, bbox_inches='tight')

In [None]:
# Same comparison on the log scale: model log-density (y) against exact
# log-density (x), with the identity line as reference.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 4))

# Identity line over the hard-coded log-density range, drawn above the
# scatter (zorder=10).
x = np.linspace(-12.5, -1.47, num=100)
ax.plot(x, x, color='black', linestyle='--', linewidth=3, zorder=10)
ax.scatter(logp2, logp, s=2, color=colors[0])

ax.set_xlabel(r'$X$', fontsize=18)
ax.set_ylabel(r'$X_{\rm syn}$', fontsize=18)
ax.tick_params(labelsize=16)

if SAVE_PLOTS:
    fig.savefig(f'{dataset_name}_log_pdf_rv_vs_gen.pdf', dpi=600, bbox_inches='tight')

In [None]:
# Pointwise density error (model minus exact); the cell displays its mean and
# its sample standard deviation (ddof=1).
dp = p - p2
dp.mean(), dp.std(ddof=1)

In [None]:
# Histogram of the pointwise density error, with the bin count chosen
# automatically. The trailing semicolon suppresses the cell's return-value
# output.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax.hist(dp, bins='auto');