# Comparing the original data with the newly generated toy data
This notebook creates a bunch of plots to check that the toy data and original data are similar. There is code for the following:
1. Creating a table of means along each parameter
   - This does not work when I try to save the figure, because plt.table works differently from other types of plots.
   - I don't think this is that important, so I am leaving it broken.
2. Plotting two correlation coefficient matrices as heatmaps.
3. Creating overlaid histograms for each parameter in the datasets.
4. Creating overlaid histograms of the full data projected onto a bunch of random lines.
   - I did this because it is possible for two distinct multivariate distributions to have the same marginal distributions, although I'm not sure this is something we even need to worry about in our case. I made this just in case.
   - By checking along random lines, we can help reassure ourselves.

In [45]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp

def meantable(X, Y, mcip, params):
    """Display a table comparing the per-row means of X and Y.

    Parameters
    ----------
    X, Y : array-like
        Averaged along axis 1, giving one mean per row. The X column is
        headed 'TALE MC mean' and the Y column 'toy data mean'.
    mcip
        Primary ID, shown in the plot title.
    params : sequence of str
        Row names, paired in order with the row means of X and Y.

    The table is drawn on a new pyplot figure and shown with ``plt.show()``;
    nothing is written to disk and nothing is returned.
    """
    # A very short figure with the axes hidden, so only the table is visible.
    plt.figure(figsize=(6,0.1))
    plt.axis('off')

    means_x = np.mean(X, axis=1)
    means_y = np.mean(Y, axis=1)

    header = ['parameter', 'TALE MC mean', 'toy data mean']
    body = [
        [name, '{:.2f}'.format(mx), '{:.2f}'.format(my)]
        for name, mx, my in zip(params, means_x, means_y)
    ]

    plt.table([header] + body)
    plt.title(f'Primary ID {mcip}')

    plt.show()
    return None

def corrcoefplot(X, Y, mcip, params):
    """Save side-by-side heatmaps of the correlation matrices of X and Y.

    Parameters
    ----------
    X, Y : array-like
        Passed straight to ``np.corrcoef``, which treats each row as one
        variable and each column as one observation. The X panel is titled
        as TALE MC data and the Y panel as toy data.
    mcip
        Primary ID; used in the figure title and in the output file name.
    params : sequence of str
        Tick labels, one per row of X / Y.

    The figure is written to
    ``data_comparison_plots/primary_id_{mcip}_corr_coefs.png`` and then
    closed. ``savefig`` does not create the directory, so it must exist.
    Nothing is returned.
    """
    # Derive tick positions from the labels instead of assuming 20 parameters.
    ticks = range(len(params))

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(f'Primary ID {mcip}')

    panels = (
        (ax[0], X, 'Correlation coefficients of TALE MC data'),
        (ax[1], Y, 'Correlation coefficients of toy data'),
    )
    for panel, data, title in panels:
        panel.set_title(title)
        # Correlation coefficients lie in [-1, 1]; pinning the colour range
        # makes the two heatmaps directly comparable by eye.
        image = panel.imshow(np.corrcoef(data), vmin=-1.0, vmax=1.0)
        panel.set_xticks(ticks, params)
        panel.tick_params(axis='x', labelrotation=90)
        panel.set_yticks(ticks, params)
        # Attach each colorbar explicitly to its own panel.
        fig.colorbar(image, ax=panel)

    fig.savefig(f'data_comparison_plots/primary_id_{mcip}_corr_coefs.png')
    # pyplot keeps a reference to every figure it creates until it is closed;
    # this function is called in a loop, so release the figure once saved.
    plt.close(fig)
    return

def randomprojection(X, Y, rng=None):
    """Project the columns of X and Y onto one shared random direction.

    A direction is drawn by normalising a standard-normal vector with
    ``len(X)`` components. The same unit vector ``u`` is applied to both
    inputs so the resulting 1-D samples can be compared directly.

    The returned values are the *signed* coordinates ``u . x`` along the
    line. Taking the norm of the projected vector instead would yield
    ``|u . x|``, which folds the distribution about zero and hides any
    difference between a sample and its mirror image.

    Parameters
    ----------
    X, Y : array-like
        Arrays whose first axis has the same length (one row per variable,
        one column per observation).
    rng : numpy.random.Generator, optional
        Source of randomness. Defaults to a fresh ``np.random.default_rng()``;
        pass a seeded generator for reproducible projections.

    Returns
    -------
    (Xp, Yp) : tuple
        ``u @ X`` and ``u @ Y``.

    Raises
    ------
    ValueError
        If X has no rows, so no direction can be drawn.
    """
    if rng is None:
        rng = np.random.default_rng()

    X = np.asarray(X)
    Y = np.asarray(Y)

    dim = len(X)
    if dim == 0:
        raise ValueError('cannot draw a random direction for zero-dimensional data')

    direction = rng.standard_normal(dim)
    norm = np.linalg.norm(direction)
    # An exactly-zero draw should never actually happen; redraw rather than
    # return None, which would break callers that unpack two values.
    while norm == 0.0:
        direction = rng.standard_normal(dim)
        norm = np.linalg.norm(direction)

    u = direction / norm  # random unit vector

    # Scalar coordinate of every column along u. This is equivalent to applying
    # the projector outer(u, u) and reading off the signed length.
    return u @ X, u @ Y

def overlayhistograms(axis, X, Y, title):
    """Draw density histograms of X and Y as overlaid step outlines on `axis`.

    Both samples share one set of 20 evenly spaced bin edges spanning the
    combined minimum and maximum of X and Y. X is drawn in blue with the
    label 'TALE MC' and Y in red with the label 'toy data'. No legend is
    added; `title` becomes the axes title. Nothing is returned.
    """
    lowest = min(min(X), min(Y))
    highest = max(max(X), max(Y))
    edges = np.linspace(lowest, highest, 20)

    # (sample, colour, label) for each outline, in drawing order.
    series = ((X, 'b', 'TALE MC'), (Y, 'r', 'toy data'))

    # Compute both histograms before drawing anything.
    densities = [
        np.histogram(sample, bins=edges, density=True)[0]
        for sample, _, _ in series
    ]

    for density, (_, colour, name) in zip(densities, series):
        axis.stairs(density, edges, color=colour, label=name)

    axis.set_title(title)

    return None

def minihists(X, Y, mcip, params, suptitle, figtitle):
    """Save a 4x5 grid of small overlaid histograms, one panel per entry of `params`.

    Parameters
    ----------
    X, Y : sequence of 1-D samples
        ``X[j]`` and ``Y[j]`` are drawn together in panel ``j`` by
        ``overlayhistograms``.
    mcip
        Primary ID. Currently unused; kept so existing calls keep working.
    params : sequence of str
        Panel titles. Only the first ``len(params)`` panels are filled (at
        most 20, the size of the grid); the rest stay empty.
    suptitle : str
        Title for the whole figure.
    figtitle : str
        Path the figure is saved to.

    The figure is closed after saving. Nothing is returned.
    """
    fig, axs = plt.subplots(4, 5, figsize=(12, 8))
    fig.suptitle(suptitle)

    # axs.flat walks the grid row by row, the same order as nested row loops.
    for j, ax in enumerate(axs.flat):
        # Strip ticks and tick labels from every panel, filled or not.
        ax.tick_params(
            axis='both',
            which='both',
            bottom=False,
            top=False,
            left=False,
            right=False,
            labelbottom=False,
            labelleft=False
        )

        if j < len(params):
            overlayhistograms(ax, X[j], Y[j], params[j])

    fig.savefig(figtitle)
    # pyplot keeps every figure alive until it is closed; this function is
    # called repeatedly in a loop, so release the figure once it is on disk.
    plt.close(fig)
    return

In [46]:
plt.ioff() #prevents plots from displaying in the notebook

# savefig does not create missing directories, so make sure the output
# folder exists before any figure is written.
os.makedirs('data_comparison_plots', exist_ok=True)

dfX = pd.read_csv('betterTALEdata/gw_mc_all.data-combined(core_correction).csv')
dfY = pd.read_csv('toy_data.csv')

params = ["xcore","ycore","th","phi","rp","psi","en","xf","xm","dxm","c2t","c2p","xl","sz","fscin","fckov","fscat","mir_id","mir_nmir","mir_ngtube"]

# Number of random directions to check; matches the 4x5 grid drawn by minihists.
n_projections = 20

for i in [1.0, 2.0, 3.0, 4.0, 5.0]:
    dfXi = dfX[dfX['mcip'] == i]
    dfYi = dfY[dfY['mcip'] == i]

    print('Comparing data for primary ID: ', i)

    # Transpose so that each row is one parameter and each column one event.
    X = dfXi[params].values.T
    Y = dfYi[params].values.T

    #meantable(X, Y, i, params) #doesn't work well. plt.table() is very strange
    corrcoefplot(X, Y, i, params)

    # The colour key in the titles must match overlayhistograms, which draws
    # the first dataset (TALE MC) in blue and the second (toy data) in red.
    minihists(X, Y, i, params, f'Histograms of Parameters for Primary {i} [TALE MC: blue, toy data: red]', f'data_comparison_plots/primary_id_{i}_param_hists.png')

    Xproj, Yproj = [], []
    for _ in range(n_projections):
        Xp, Yp = randomprojection(X, Y)

        Xproj.append(Xp)
        Yproj.append(Yp)

    # Random directions have no meaningful name, so leave the panels untitled.
    blankparams = [''] * n_projections
    minihists(Xproj, Yproj, i, blankparams, f'Histograms of {n_projections} Random Projections for Primary {i} [TALE MC: blue, toy data: red]', f'data_comparison_plots/primary_id_{i}_random_projection_hists.png')

Comparing data for primary ID:  1.0
Comparing data for primary ID:  2.0
Comparing data for primary ID:  3.0
Comparing data for primary ID:  4.0
Comparing data for primary ID:  5.0
