# Phik remaining structure plots

This notebook generates the $\phi_K$ (phik) remaining-structure plots on the Titanic data, shown as Fig. 3 in the paper *Synthsonic: Fast, Probabilistic modeling and Synthesis of Tabular Data*.


The Titanic data set was chosen because of its limited number of columns, which allows for clearer visualizations.
Additionally, it contains a mix of numerical and categorical variables that have non-linear dependencies.

In [None]:
import logging

import scipy
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as plt_colors
import matplotlib.transforms as transforms
import seaborn as sns

from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

import phik
from phik import phik
from phik import config
from phik.phik import phik_matrix
from phik.phik import phik_from_binned_array
from phik.report import plot_correlation_matrix as phi_plot_correlation_matrix

from synthsonic.models.phik_utils import phik_matrix
from synthsonic.models.kde_copula_nn_pdf import KDECopulaNNPdf

## Config

In [None]:
# Switch for the plotting cells further down: when True, each heatmap figure
# is also written to a PDF file via fig.savefig.
SAVE_PLOTS = False

In [None]:
# Configure the root logger so that records at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
# Font type 42 for PDF and PostScript output (TrueType embedding according to
# the matplotlib rcParams documentation).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
plt.rcParams['text.color'] = 'black'
# 0 switches off the "too many open figures" warning.
plt.rcParams['figure.max_open_warning'] = 0
# Colours of the active property cycle and a list of marker symbols.
# NOTE(review): neither name is referenced by the cells visible in this notebook.
colors = [i['color'] for i in plt.rcParams['axes.prop_cycle']]
markers = ['o', 's', 'p', 'x', '^', '+', '*', '<', 'D', 'h', '>']
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

### Plotting funcs

In [None]:
def split_figure_vertical(figsize_1, frac, rect_1, rect_2):
    """Create a widened figure holding a main axes and a side-strip axes.

    The new figure keeps the height of ``figsize_1`` and is
    ``figsize_1[0] * frac`` wider, the extra strip being appended on the
    right (intended for a color bar).  ``rect_1`` and ``rect_2`` are
    ``[left, bottom, width, height]`` rectangles as accepted by
    ``Figure.add_axes``, each expressed relative to its own section (the
    original-width part and the added strip respectively).  Their horizontal
    entries are rescaled to fractions of the widened figure.

    The rectangles passed in are copied first, so the caller's sequences are
    left untouched and tuples are accepted as well as lists.

    :param figsize_1: (width, height) of the figure without the color bar
    :param float frac: additional width used for the color bar, as a fraction
        of the original width
    :param rect_1: location of the plotting area within the original section
    :param rect_2: location of the color bar within the added strip
    :returns: tuple ``(fig, ax1, ax2)`` with the figure, the plotting axes
        and the color bar axes
    """
    base_width = figsize_1[0]
    extra_width = base_width * frac
    total_width = base_width + extra_width

    # share of the widened figure taken by the original part / the added strip
    main_share = base_width / total_width
    strip_share = extra_width / total_width

    fig = plt.figure(figsize=(total_width, figsize_1[1]))

    # work on copies: the caller may reuse its rectangles for other figures
    main_rect = list(rect_1)
    strip_rect = list(rect_2)

    # only left (index 0) and width (index 2) depend on the figure width
    main_rect[0] *= main_share
    main_rect[2] *= main_share

    # the strip starts where the original-width section ends
    strip_rect[0] = strip_rect[0] * strip_share + main_share
    strip_rect[2] *= strip_share

    ax1 = fig.add_axes(main_rect)
    ax2 = fig.add_axes(strip_rect)

    return fig, ax1, ax2

In [None]:
def plot_correlation_matrix(
    matrix_colors: np.ndarray,
    x_labels: list,
    y_labels: list,
    fig,
    ax,
    pdf_file_name: str = '',
    title: str = 'correlation',
    vmin: float = -1,
    vmax: float = 1,
    color_map: str = 'RdYlGn',
    x_label: str = '',
    y_label: str = '',
    top: int = 20,
    matrix_numbers: np.ndarray = None,
    print_both_numbers: bool = True,
    usetex: bool = False,
    identity_layout: bool = True,
    fontsize_factor: float = 1
) -> None:
    """Draw a correlation matrix as an annotated heatmap on ``ax``.

    Copied with permission from the eskapade package (pip install eskapade)

    :param matrix_colors: input correlation matrix, used for the cell colors
    :param list x_labels: Labels for histogram x-axis bins
    :param list y_labels: Labels for histogram y-axis bins
    :param fig: figure that owns ``ax``; only used to save the plot when
        ``pdf_file_name`` is set
    :param ax: matplotlib axes to draw on
    :param str pdf_file_name: if set, will store the plot in a pdf file
    :param str title: if set, title of the plot
    :param float vmin: minimum value of color legend (default is -1)
    :param float vmax: maximum value of color legend (default is +1)
    :param str color_map: color map passed to matplotlib pcolormesh. (default is 'RdYlGn')
    :param str x_label: Label for histogram x-axis
    :param str y_label: Label for histogram y-axis
    :param int top: maximum number of characters of a tick label; longer labels
        are cut and end in '...'. (default is 20)
    :param matrix_numbers: input matrix used for plotting numbers. (default is matrix_colors)
    :param bool print_both_numbers: if True and ``matrix_numbers`` is given, print
        the value of ``matrix_numbers`` and of ``matrix_colors`` in every cell
    :param bool usetex: forwarded to ``plt.rc('text', usetex=...)``; note that this
        changes the global matplotlib setting
    :param identity_layout: Plot diagonal from right top to bottom left (True) or
        bottom left to top right (False)
    :param float fontsize_factor: multiplier applied to all font sizes
    :raises TypeError: if ``matrix_colors`` is not a numpy array
    :raises AssertionError: if the matrix shapes do not match the number of labels
    """
    if not isinstance(matrix_colors, np.ndarray):
        raise TypeError('matrix_colors is not a numpy array.')

    # basic matrix checks; one extra label is allowed (labels given as bin edges)
    assert (matrix_colors.shape[0] == len(y_labels)) or (matrix_colors.shape[0] + 1 == len(y_labels)), \
        'matrix_colors shape inconsistent with number of y-labels'
    assert (matrix_colors.shape[1] == len(x_labels)) or (matrix_colors.shape[1] + 1 == len(x_labels)), \
        'matrix_colors shape inconsistent with number of x-labels'
    if matrix_numbers is None:
        matrix_numbers = matrix_colors
        print_both_numbers = False  # only one set of numbers possible
    else:
        assert matrix_numbers.shape[0] == len(y_labels), 'matrix_numbers shape inconsistent with number of y-labels'
        assert matrix_numbers.shape[1] == len(x_labels), 'matrix_numbers shape inconsistent with number of x-labels'

    if identity_layout:
        # reverse the column order of both matrices and of the x labels
        matrix_colors = np.asarray(matrix_colors)[:, ::-1]
        matrix_numbers = np.asarray(matrix_numbers)[:, ::-1]
        x_labels = x_labels[::-1]

    plt.rc('text', usetex=usetex)

    norm = plt_colors.Normalize(vmin=vmin, vmax=vmax)
    ax.pcolormesh(matrix_colors, cmap=color_map, edgecolor='w', linewidth=1, norm=norm)

    def tick(lab):
        """Turn a label into tick text of at most ``top`` characters."""
        if isinstance(lab, (float, int)):
            lab = 'NaN' if np.isnan(lab) else '{0:.0f}'.format(lab)
        lab = str(lab)
        if len(lab) > top:
            # keep room for the ellipsis so the result respects `top`
            lab = lab[:max(top - 3, 0)] + '...'
        return lab

    # axis ticks and tick labels: on the cell edges when one more label than
    # cells is supplied, otherwise on the cell centres
    if len(x_labels) == matrix_colors.shape[1] + 1:
        ax.set_xticks(np.arange(len(x_labels)))
    else:
        ax.set_xticks(np.arange(len(x_labels)) + 0.5)
    ax.set_xticklabels([tick(lab) for lab in x_labels], rotation='vertical', fontsize=10 * fontsize_factor)

    if len(y_labels) == matrix_colors.shape[0] + 1:
        ax.set_yticks(np.arange(len(y_labels)))
    else:
        ax.set_yticks(np.arange(len(y_labels)) + 0.5)
    ax.set_yticklabels([tick(lab) for lab in y_labels], rotation='horizontal', fontsize=10 * fontsize_factor)

    # Turn ticks off in case no labels are provided; act on `ax` itself rather
    # than on whatever axes pyplot currently considers active
    if len(x_labels) == 1 and len(x_labels[0]) == 0:
        ax.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom=False,  # ticks along the bottom edge are off
            top=False,  # ticks along the top edge are off
            labelbottom=False)
    if len(y_labels) == 1 and len(y_labels[0]) == 0:
        ax.tick_params(
            axis='y',  # changes apply to the y-axis
            which='both',  # both major and minor ticks are affected
            left=False,  # ticks along the left edge are off
            right=False,  # ticks along the right edge are off
            labelleft=False)

    # make plot look pretty
    ax.set_title(title, fontsize=14 * fontsize_factor)
    if x_label:
        ax.set_xlabel(x_label, fontsize=12 * fontsize_factor)
    if y_label:
        ax.set_ylabel(y_label, fontsize=12 * fontsize_factor)

    # annotate with correlation values: (matrix, vertical position in the cell)
    if print_both_numbers:
        annotated = [(matrix_numbers, 0.7), (matrix_colors, 0.25)]
    else:
        annotated = [(matrix_numbers, 0.5)]

    n_rows, n_cols = matrix_numbers.shape[0], matrix_numbers.shape[1]
    for i in range(n_cols):
        for j in range(n_rows):
            point_color = float(matrix_colors[j][i])
            # white text on cells near either end of the color range and on NaN cells
            white_cond = (point_color < 0.7 * vmin) or (point_color >= 0.7 * vmax) or np.isnan(point_color)
            color = 'w' if white_cond else 'k'
            for matrix, y_offset in annotated:
                point = float(matrix[j][i])
                label = 'NaN' if np.isnan(point) else '{0:.2f}'.format(point)
                ax.annotate(label, xy=(i + 0.5, j + y_offset), color=color, horizontalalignment='center',
                            verticalalignment='center', fontsize=10 * fontsize_factor)

    # honour the documented pdf_file_name parameter
    if pdf_file_name:
        fig.savefig(pdf_file_name, format='pdf', bbox_inches='tight')

## Data set -- Titanic

In [None]:
# Name of the data set; used as prefix of the PDF file names written by the
# plotting cells.
data_set = 'titanic'

In [None]:
# Load the gzip-compressed CSV (path relative to the working directory) and
# print a summary of its columns.
df = pd.read_csv('titanic.csv.gz')
df.info()

In [None]:
# Show the first rows of the raw data.
df.head()

In [None]:
# Remove the 'Name' column; all remaining columns are kept.
df = df.drop(columns=['Name'])

In [None]:
# Numeric copy of the data: every listed column is replaced by the integer
# codes returned by pd.factorize (one code per distinct value).
df_num = df.copy()
for c in ['Sex']:
    # `uniques` (the distinct values behind the codes) is not used afterwards
    codes, uniques = pd.factorize(df[c])
    df_num[c] = codes

In [None]:
# Show the first rows of the numerically encoded data.
df_num.head()

In [None]:
# Display the phik matrix of the encoded data, declaring 'Age' and 'Fare' as
# interval columns.
# NOTE(review): `phik_matrix` as a DataFrame method is presumably registered by
# the phik import above -- confirm.
df_num.phik_matrix(interval_cols=['Age', 'Fare'])

## Fit

In [None]:
# XGBoost classifier configuration.
# NOTE(review): `clf` is not passed to or referenced by any later cell visible
# in this notebook -- confirm whether it is still needed.
clf = xgb.XGBClassifier(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=5
)

In [None]:
# Positional column indices (into df_num.values) handed to the model as
# categorical and numerical columns.
# NOTE(review): indices 3 and 6 are presumably 'Age' and 'Fare'; this depends on
# the column order of the CSV -- confirm.
cat_cols_idx = [0, 1, 2, 4, 5]
num_cols_idx = [3, 6]

In [None]:
# Configure the Synthsonic model with the column split defined above.
# The meaning of the remaining hyper-parameters is defined by KDECopulaNNPdf
# and is not visible from this notebook.
kde = KDECopulaNNPdf(
    use_KDE=False,
    estimator_type='tan',
    n_uniform_bins=10,
    numerical_columns=num_cols_idx,
    categorical_columns=cat_cols_idx,
    n_quantiles=100,
    n_calibration_bins=40,
    root_node=0
)

In [None]:
# Fit the model on the encoded data, passed as a plain numpy array.
kde = kde.fit(df_num.values)

In [None]:
# Call the model's private calibration routine on its fitted histogram
# attributes with validation_plots=True.
# NOTE(review): presumably re-run here only to obtain the validation plots -- confirm.
kde._calibrate_classifier(kde.hist_p0_, kde.hist_p1_, kde.bin_edges_, validation_plots=True)

## Weighted sample

In [None]:
# Encoded data as a numpy array; its row count sets the sample sizes below.
X = df_num.values

In [None]:
# Pass the original data through the model's private transform with
# discretize=True.
# NOTE(review): presumably yields the discretized representation used by the
# Bayesian network -- confirm.
X_bn = kde._transform_and_slice(X, discretize=True)

In [None]:
# Draw 100 times as many rows as the original data from the model's
# Bayesian-network sampler, with add_uniform=False.
X_noweight = kde._sample_bayesian_network(size=100*X.shape[0], add_uniform=False)

In [None]:
# Display the fitted model's `nonlinear_indices_` attribute.
kde.nonlinear_indices_

In [None]:
# Draw 100 times as many rows as the original data with sample_no_weights and
# pass them through the same discretizing transform as X_bn.
X_weighted = kde._transform_and_slice(kde.sample_no_weights(n_samples=100*X.shape[0]), discretize=True)

In [None]:
# Display the shapes of the transformed data and of the two samples.
X_bn.shape, X_noweight.shape, X_weighted.shape

In [None]:
# Overlay the normalised histograms of column `i` for the transformed data and
# for the Bayesian-network sample as a visual comparison.
i = 6
plt.hist(X_bn[:, i], bins=10, density=True, alpha=0.5)
plt.hist(X_noweight[:, i], bins=10, density=True, alpha=0.5);

## Heatmap plots

Shorten long column names

In [None]:
# Abbreviate the two longest column names so they fit as tick labels.
df = df.rename(columns={'Siblings/Spouses Aboard': 'Sib...Aboard', 'Parents/Children Aboard': 'Par...Aboard'})

Mark the two discretized numerical columns with dedicated labels

In [None]:
# Tick labels for the model-based heatmaps: the column names, with positions 3
# and 6 (the indices listed in num_cols_idx) replaced by LaTeX labels.
columns = df.columns.to_list()
columns[3] = '$X_{bn}[0]$'
columns[6] = '$X_{bn}[1]$'

In [None]:
# phik matrix of the transformed data, with no column treated as interval.
df_bn = pd.DataFrame(X_bn)
pkd = df_bn.phik_matrix(interval_cols=[])

# Colour settings shared by all heatmaps below.
vmin = 0
vmax = 1
color_map = 'Blues'
norm = plt_colors.Normalize(vmin=vmin, vmax=vmax)
# `img` is kept because the last plotting cell uses it as the mappable for the
# colour bar.
img = plt.pcolormesh(pkd.values, cmap=color_map, edgecolor='w', linewidth=1, norm=norm);

In [None]:
# phik matrix of the original data, with 'Fare' and 'Age' as interval columns.
pk_or = df.phik_matrix(interval_cols=['Fare', 'Age'])

In [None]:
# Heatmap of the phik matrix of the original data.
figsize = (7, 7)
rect = [0.2, 0.2, 0.7, 0.7]

fig = plt.figure(figsize=figsize)
ax1 = fig.add_axes(rect)

# This panel uses the real column names on both axes.
plot_correlation_matrix(
    pk_or.values,
    fig=fig,
    ax=ax1,
    x_labels=df.columns.to_list(),
    y_labels=df.columns.to_list(), 
    vmin=vmin,
    vmax=vmax,
    color_map=color_map,
    title=r'$\phi_K$ Original',
    fontsize_factor=1.9,
)
#ax1.set_yticklabels(ax1.get_yticklabels(), rotation=45)
# Only written to disk when the SAVE_PLOTS switch is enabled.
if SAVE_PLOTS:
    fig.savefig(f'{data_set}_phik_original.pdf', dpi=600, bbox_inches='tight')

### Bayesian network

In [None]:
# `phik_matrix` here is the synthsonic.models.phik_utils version (imported
# last, so it shadows the one from phik.phik).
# NOTE(review): presumably compares the transformed data with the
# Bayesian-network sample -- confirm against phik_utils.
pku = phik_matrix(X_bn, X_noweight)

In [None]:
# Heatmap of the matrix computed for the Bayesian-network sample.
figsize = (7, 7)
rect = [0.2, 0.2, 0.7, 0.7]

fig = plt.figure(figsize=figsize)
ax1 = fig.add_axes(rect)

# Uses the relabelled `columns` list for the tick labels.
plot_correlation_matrix(
    pku.values,
    fig=fig,
    ax=ax1,
    x_labels=columns,
    y_labels=columns, 
    vmin=vmin,
    vmax=vmax,
    color_map=color_map,
    title=r'$\phi_K$ Copula Bayesian Network',
    fontsize_factor=1.9,
)
# Hide the y tick labels on this panel.
ax1.set_yticklabels([])
# Only written to disk when the SAVE_PLOTS switch is enabled.
if SAVE_PLOTS:
    fig.savefig(f'{data_set}_phik_copula_bn.pdf', dpi=600, bbox_inches='tight')

### Calibrated classifier

In [None]:
# Same computation as for `pku`, now with the sample drawn via
# sample_no_weights (X_weighted).
pkw = phik_matrix(X_bn, X_weighted)

In [None]:
# Heatmap of the matrix computed for X_weighted, with a colour bar added in an
# extra strip that widens the figure by 10 %.
figsize = (7, 7)
rect = [0.2, 0.2, 0.7, 0.7]

# ax1 receives the heatmap, ax2 the colour bar.
fig, ax1, ax2 = split_figure_vertical(figsize, 0.1, rect, [-0.4, 0.2, 0.7, 0.7])
plot_correlation_matrix(
    pkw.values,
    fig=fig,
    ax=ax1,
    x_labels=columns,
    y_labels=columns, 
    vmin=vmin,
    vmax=vmax,
    color_map=color_map,
    title=r'$\phi_K$ calibrated learner',
    fontsize_factor=1.9,
)
# Hide the y tick labels on this panel.
ax1.set_yticklabels([])
# `img` is the mesh created in the earlier pcolormesh cell; it was built with
# the same color_map and vmin/vmax normalisation as this heatmap.
cb = plt.colorbar(img, cax=ax2, pad=0.01)
cb.ax.tick_params(labelsize=15)
# Only written to disk when the SAVE_PLOTS switch is enabled.
if SAVE_PLOTS:
    fig.savefig(f'{data_set}_phik_calibrated_learner.pdf', dpi=600, bbox_inches='tight')