# Numerical transformations plots on the ring dataset

This notebook generates the numerical transformation plots on the ring dataset, Fig 1 in the paper -- Synthsonic: Fast, Probabilistic modeling and Synthesis of Tabular Data

In [None]:
import logging

import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
from sdgym import load_dataset
from pgmpy.models import BayesianModel
from pgmpy.estimators import TreeSearch
from pgmpy.sampling import BayesianModelSampling
from pgmpy.inference import BayesianModelProbability

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.gridspec import GridSpec

from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

## Config

In [None]:
# Export switch: when True, each plotting cell also writes its figure to a
# PDF file named after the dataset; when False, figures are only displayed.
SAVE_PLOTS = False

In [None]:
# Configure the root logger so that messages at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
# Global matplotlib settings shared by every figure in this notebook.
matplotlib.rcParams.update({
    # Font type 42 for both the PDF and the PostScript backends.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'text.color': 'black',
    # 0 turns off the "too many open figures" warning.
    'figure.max_open_warning': 0,
})

# Colours of the active property cycle, in order; the plots index into this list.
colors = [entry['color'] for entry in plt.rcParams['axes.prop_cycle']]
markers = ['o', 's', 'p', 'x', '^', '+', '*', '<', 'D', 'h', '>']
%matplotlib inline

## Dataset

In [None]:
# Load the dataset through SDGym; the name is reused for the output file names.
dataset_name = 'ring'
data, categorical_columns, ordinal_columns = load_dataset(dataset_name)

# Wrap the two data columns in a frame so the plots can address them as 'x' and 'y'.
column_names = ['x', 'y']
df = pd.DataFrame(data, columns=column_names)

## Fit

In [None]:
# clf = xgb.XGBClassifier(
#     n_estimators=100,
#     reg_lambda=1,
#     gamma=0,
#     max_depth=3
# )

In [None]:
# Build the Synthsonic density model and fit it on the raw data in one step.
kde_settings = {
    'use_KDE': False,
    'n_uniform_bins': 100,
    'n_quantiles': 1000,
}
kde = KDECopulaNNPdf(**kde_settings).fit(data)

## Plots

#### Fig 1a -- original data

In [None]:
# Fig 1a: scatter plot of the original data with a histogram of each
# coordinate on the marginal axes.
axs = sns.JointGrid(data=df, x='x', y='y', height=7)
axs.ax_joint.scatter(data=df, x='x', y='y', c=colors[0], marker='x', s=0.3)
# The histogram return values (counts, edges, patches) are not needed here.
axs.ax_marg_x.hist(df['x'], bins=50, color=colors[0])
axs.ax_marg_y.hist(df['y'], bins=50, color=colors[0], orientation='horizontal')
axs.ax_joint.tick_params(labelsize=14)
axs.ax_joint.set_xlabel('')
axs.ax_joint.set_ylabel('')
axs.ax_joint.set_xlim(-2.0, 2.0)
axs.ax_joint.set_ylim(-2.0, 2.0)
# One tick per integer across the visible range; each position listed once.
axis_ticks = [-2, -1, 0, 1, 2]
axs.ax_joint.yaxis.set_ticks(axis_ticks)
axs.ax_joint.xaxis.set_ticks(axis_ticks)
if SAVE_PLOTS:
    axs.savefig(f'{dataset_name}_joint_marginal_data.pdf', dpi=600, bbox_inches='tight')

#### Fig 1b quantile transformation to Gaussian

In [None]:
# Apply only the first step of the fitted pipeline (the quantile transformation
# to Gaussian, per this section's title) and wrap the result for plotting.
first_step = kde.pipe_[0]
X_g = first_step.transform(data)
tdf = pd.DataFrame(X_g, columns=['x', 'y'])

In [None]:
# Fig 1b: joint scatter of the transformed data with marginal histograms.
axs = sns.JointGrid(data=tdf, x='x', y='y', height=7)
joint_ax = axs.ax_joint
point_color = colors[0]

joint_ax.scatter(data=tdf, x='x', y='y', c=point_color, marker='x', s=0.3)
cnt, bins, _ = axs.ax_marg_x.hist(tdf['x'], bins='auto', color=point_color)
cnt, bins, _ = axs.ax_marg_y.hist(
    tdf['y'], bins='auto', color=point_color, orientation='horizontal'
)

# Blank out the axis labels on the joint panel and enlarge the tick labels.
joint_ax.set_xlabel('')
joint_ax.set_ylabel('')
joint_ax.tick_params(labelsize=14)

if SAVE_PLOTS:
    output_name = f'{dataset_name}_joint_marginal_quantile.pdf'
    axs.savefig(output_name, dpi=600, bbox_inches='tight')

#### Fig 1c PC rotation

In [None]:
# Run the data through the first two pipeline steps only (ending with the
# PC rotation, per this section's title) and wrap the result for plotting.
first_two_steps = kde.pipe_[:2]
X_p = first_two_steps.transform(data)
tdf = pd.DataFrame(X_p, columns=['x', 'y'])

In [None]:
# Fig 1c: joint scatter of the rotated data with marginal histograms.
axs = sns.JointGrid(data=tdf, x='x', y='y', height=7)
joint_ax = axs.ax_joint
point_color = colors[0]

joint_ax.scatter(data=tdf, x='x', y='y', c=point_color, marker='x', s=0.3)
cnt, bins, _ = axs.ax_marg_x.hist(tdf['x'], bins='auto', color=point_color)
cnt, bins, _ = axs.ax_marg_y.hist(
    tdf['y'], bins='auto', color=point_color, orientation='horizontal'
)

# Blank out the axis labels on the joint panel and enlarge the tick labels.
joint_ax.set_xlabel('')
joint_ax.set_ylabel('')
joint_ax.tick_params(labelsize=14)

if SAVE_PLOTS:
    output_name = f'{dataset_name}_joint_marginal_pca.pdf'
    axs.savefig(output_name, dpi=600, bbox_inches='tight')

#### Fig 1d Quantile transformation to uniform

In [None]:
# Run the data through the complete fitted pipeline (ending with the quantile
# transformation to uniform, per this section's title) and wrap it for plotting.
full_pipeline = kde.pipe_
X_u = full_pipeline.transform(data)
tdf = pd.DataFrame(X_u, columns=['x', 'y'])

In [None]:
# Fig 1d: joint scatter of the fully transformed data with marginal histograms.
axs = sns.JointGrid(data=tdf, x='x', y='y', height=7)
joint_ax = axs.ax_joint
point_color = colors[0]

joint_ax.scatter(data=tdf, x='x', y='y', c=point_color, marker='x', s=0.3)
cnt, bins, _ = axs.ax_marg_x.hist(tdf['x'], bins='auto', color=point_color)
cnt, bins, _ = axs.ax_marg_y.hist(
    tdf['y'], bins='auto', color=point_color, orientation='horizontal'
)

# Blank out the axis labels on the joint panel and enlarge the tick labels.
joint_ax.set_xlabel('')
joint_ax.set_ylabel('')
joint_ax.tick_params(labelsize=14)

if SAVE_PLOTS:
    output_name = f'{dataset_name}_joint_marginal_uniform.pdf'
    axs.savefig(output_name, dpi=600, bbox_inches='tight')

#### Fig 1f synthetic sample

In [None]:
# Rebuild the frame of original data so this cell does not depend on earlier plots.
df = pd.DataFrame(data, columns=['x', 'y'])
n_rows = len(df)

# Generate ten times as many synthetic rows as there are original rows, then
# randomly keep as many rows as the original data has.
X_gen = kde.sample_no_weights(n_rows * 10)
df_gen = pd.DataFrame(X_gen, columns=['x', 'y']).sample(n=n_rows)

In [None]:
# Fig 1f: density contours of the original data and of the synthetic sample on
# the joint panel, with their marginal histograms overlaid on the side panels.
axs = sns.JointGrid(data=df_gen, x='x', y='y', height=7)
# NOTE(review): `data2`, `shade` and `shade_lowest` are deprecated in newer
# seaborn releases (superseded by `x`/`y` and `fill`) -- confirm against the
# seaborn version this notebook is pinned to before upgrading.
sns.kdeplot(data=df['x'], data2=df['y'], color=colors[0], ax=axs.ax_joint, label=r'$X$', shade=False, shade_lowest=False)
sns.kdeplot(data=df_gen['x'], data2=df_gen['y'], color=colors[1], ax=axs.ax_joint, label=r'$X_{\rm syn}$', zorder=10, shade=False, shade_lowest=False)
#axs.ax_joint.scatter(data=df_gen, x='x', y='y', c=colors[1], marker='x', alpha=0.9, s=0.3, label='generated')
axs.ax_joint.set_xlabel('')
axs.ax_joint.set_ylabel('')
axs.ax_joint.legend(fontsize=16)

# Marginal of x: filled histogram for the original data, dashed outline for the
# synthetic sample drawn on the same bin edges so the two are comparable.
_, bins, _ = axs.ax_marg_x.hist(df['x'], bins='auto', color=colors[0], lw=2)
axs.ax_marg_x.hist(df_gen['x'], bins=bins, histtype='step', lw=3, color=colors[1], ls='--')

# Marginal of y: same construction, drawn horizontally.
_, bins, _ = axs.ax_marg_y.hist(df['y'], bins='auto', color=colors[0], orientation='horizontal')
axs.ax_marg_y.hist(df_gen['y'], bins=bins, histtype='step', lw=3, color=colors[1], orientation='horizontal', ls='--')

axs.ax_joint.set_xlim(-2.0, 2.0)
axs.ax_joint.set_ylim(-2.0, 2.0)
# One tick per integer across the visible range; each position listed once.
axis_ticks = [-2, -1, 0, 1, 2]
axs.ax_joint.yaxis.set_ticks(axis_ticks)
axs.ax_joint.xaxis.set_ticks(axis_ticks)
axs.ax_joint.tick_params(labelsize=14)
if SAVE_PLOTS:
    axs.savefig(f'{dataset_name}_joint_marginal_with_sample_contours.pdf', dpi=600, bbox_inches='tight')

#### Fig 1e Weight BN

In [None]:
# Discretise the transformed data X_u into nbins bins per column, fit a
# tree-structured Bayesian network on the bin indices, and evaluate the
# network on every cell of the nbins x nbins grid to obtain per-cell weights.
nbins = 10
bin_width = 1. / nbins
# Bin index of every value; assumes X_u lies in [0, 1] -- TODO confirm.
X_num_discrete = np.floor(X_u / bin_width)
X_num_discrete[X_num_discrete >= nbins] = nbins - 1  # a value of exactly 1 would land in bin `nbins`; fold it into the last bin

df_dis = pd.DataFrame(X_num_discrete)
# The "tan" estimator needs string column names, so the positional integer
# labels become '0', '1', ...
df_dis.columns = [str(c) for c in df_dis.columns]
# Structure search rooted at the first column; column '1' is passed as the class node.
est = TreeSearch(df_dis, root_node=df_dis.columns[0])
dag = est.estimate(
    estimator_type="tan",
    class_node='1',
    show_progress=False,
    edge_weights_fn='mutual_info'
)
# Build a Bayesian network on the learned edges and fit it to the binned data.
bn = BayesianModel(dag.edges())
bn.fit(df_dis)
bn_prob = BayesianModelProbability(bn)
# Column names in positional order. Not referenced again in the visible cells.
bn_ordering = [str(i) for i in range(df_dis.shape[1])]


# Enumerate all nbins**2 grid cells, one row per cell: the first column comes
# from yy, the second from xx with its rows reversed.
# NOTE(review): the reversal presumably aligns the grid with the heatmap's row
# order -- confirm. log_probability is also called without an explicit column
# ordering; verify that X_grid's column order matches what the model expects.
x = np.arange(0, nbins, 1)
xx, yy = np.meshgrid(x, x)
X_grid = np.hstack((yy.reshape(nbins ** 2, 1), xx.reshape(nbins ** 2, 1)[::-1]))
# Exponentiate the log-probabilities and lay them out as an nbins x nbins grid.
P_grid = np.exp(bn_prob.log_probability(X_grid)).reshape(nbins, nbins)
# Weight = cell probability divided by 1 / nbins**2, i.e. relative to the
# probability each cell would have if all cells were equally likely.
weight_grid = P_grid / ( 1 / (nbins ** 2))

In [None]:
# Fig 1e: annotated heatmap of the per-cell weights on the joint panel, with
# histograms of the discretised columns on the marginal panels.
axs = sns.JointGrid(data=tdf, x='x', y='y', height=7)
joint_ax = axs.ax_joint
sns.heatmap(
    weight_grid,
    vmin=0,
    vmax=weight_grid.max(),
    fmt='.2f',
    cmap='Blues',
    cbar=False,
    ax=joint_ax,
    annot=weight_grid,
    annot_kws={'fontsize': 14},
)

# Unit-width bins, one per discrete bin index.
bin_edges = np.arange(0, nbins + 1)
cnt, bins, _ = axs.ax_marg_x.hist(X_num_discrete[:, 0], bins=bin_edges, color=colors[0])
cnt, bins, _ = axs.ax_marg_y.hist(
    X_num_discrete[:, 1], bins=bin_edges, color=colors[0], orientation='horizontal'
)

joint_ax.tick_params(labelsize=14)
joint_ax.set_xlabel('')
joint_ax.set_ylabel('')
joint_ax.set_aspect("equal")

# Place a major tick every two cells, then overwrite the tick labels by hand.
x_tick_labels = [0.0, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
y_tick_labels = [1.0, 1.0, 0.8, 0.6, 0.4, 0.2, 0.0]
joint_ax.xaxis.set_major_locator(ticker.MultipleLocator(2))
joint_ax.set_xticklabels(x_tick_labels)
joint_ax.yaxis.set_major_locator(ticker.MultipleLocator(2))
joint_ax.set_yticklabels(y_tick_labels)

if SAVE_PLOTS:
    output_name = f'{dataset_name}_discrete_uniform_bn_weights_annotated.pdf'
    axs.savefig(output_name, dpi=600, bbox_inches='tight')