In [3]:
def r_corr_test(df,PTable = False, CoefficientandPtable = False, lower = True ):
    '''Return a table of pairwise Pearson correlation results for the columns of a dataframe.

    Every unordered pair of columns is tested with scipy.stats.pearsonr on the rows in
    which both columns are non-missing (pairwise deletion).

    Args:
    df: The input dataframe. Its columns must be convertible to float.
    PTable: False (default) or True. If True, the returned table holds the p-value
        (probability) of each correlation test. Takes precedence over CoefficientandPtable.
    CoefficientandPtable: False (default) or True. If True (and PTable is False), the
        returned table holds tuples (r coefficient, p-value) for each pair.
    lower: True (default) or False. If True, the lower triangle is filled with the
        transpose of the upper triangle; if False it is left as None.

    Returns:
    A square object-dtype dataframe whose index and columns are the columns of df.
    If PTable and CoefficientandPtable are both False, the cells are the r coefficients.
    The diagonal is always None. A pair with fewer than two jointly observed rows is
    also left as None, so one sparse column does not abort the whole table.

    '''
    from scipy.stats import pearsonr
    import pandas as pd
    import numpy as np

    df_index = (df.keys()).tolist()
    n = len(df_index)

    # pearsonr returns two values (coefficient, p-value); keep one grid for each plus a
    # grid of (coefficient, p-value) tuples. Object arrays keep "no result" as None and
    # can hold a tuple in a single cell.
    coe_grid = np.full((n, n), None, dtype=object)
    p_grid = np.full((n, n), None, dtype=object)
    coe_and_p_grid = np.full((n, n), None, dtype=object)

    for i in range(n):
        for j in range(i + 1, n):
            name1 = df_index[i]
            name2 = df_index[j]

            # Joint dropna keeps exactly the rows where both columns are observed and
            # does not depend on the row index being unique.
            paired = df[[name1, name2]].dropna()
            if len(paired) < 2:
                # pearsonr needs at least two observations; leave the cell as None.
                continue

            (coe, p) = pearsonr(paired[name1].to_numpy(dtype=float),
                                paired[name2].to_numpy(dtype=float))
            coe_grid[i, j] = coe
            p_grid[i, j] = p
            coe_and_p_grid[i, j] = (coe, p)

            if lower:
                # Mirror into the lower triangle so the table reads symmetrically.
                coe_grid[j, i] = coe
                p_grid[j, i] = p
                coe_and_p_grid[j, i] = (coe, p)

    if PTable:
        selected = p_grid
    elif CoefficientandPtable:
        selected = coe_and_p_grid
    else:
        selected = coe_grid

    return pd.DataFrame(selected, index=df_index, columns=df_index, dtype=object)

In [32]:
def SRA(R,S):
    '''Calculate the SRA: the fraction of ordered pairs (i, j), i != j, on which R and S agree.

    A pair agrees when R[i]-R[j] and S[i]-S[j] have the same strict sign, or when both
    differences are exactly zero (a tie in both rankings).

    Args:
    - R: A sequence of performance metrics of different predictive models from TSTS
    - S: A sequence of performance metrics of different predictive models from TRTR, len(S)=len(R)
    Elements are read by position; any labels on R or S (e.g. a pandas Series index) are ignored.

    Returns:
    - SRA: SRA value in [0, 1], or NaN when fewer than two items are given (no pair exists).

    Raises:
    - ValueError: if R and S do not have the same length.

    '''
    # Materialise both inputs as plain lists so that r_values[i] is always positional.
    # Integer indexing on a label-indexed pandas Series is not positional in every
    # pandas version.
    r_values = list(R)
    s_values = list(S)

    k = len(r_values)
    if len(s_values) != k:
        raise ValueError(
            'R and S must have the same length, got %d and %d' % (k, len(s_values))
        )
    if k < 2:
        # No pair to compare; avoid dividing by k*(k-1) == 0.
        return float('nan')

    agreements = 0
    for i in range(k):
        for j in range(k):
            if i == j:
                continue
            r_diff = r_values[i] - r_values[j]
            s_diff = s_values[i] - s_values[j]
            if r_diff == 0:
                # Tied in R: only a tie in S counts as agreement.
                agree = (s_diff == 0)
            else:
                # A tie in S gives a zero product, which counts as disagreement.
                agree = (r_diff * s_diff > 0)
            if agree:
                agreements += 1

    return agreements / (k * (k - 1))

In [22]:
def CorrelationSRA(ori_correlation_df,gen_correlation_df,ColumnWise = False):
    '''Return the SRA of the absolute Pearson correlation coefficients of each column against all other columns.

    SRA is between 0 and 1; the closer it is to 1, the more the two rankings agree and
    the more similar the synthetic data and the real data are.

    Args:
    ori_correlation_df: the correlation coefficient dataframe for the real data, usually
                        generated by the function r_corr_test.
    gen_correlation_df: the correlation coefficient dataframe for the synthetic data, usually
                        generated by the function r_corr_test. It must have the same shape
                        and the same column order as ori_correlation_df.
    ColumnWise: False (default) or True. If True, the return is a Series containing the SRA
                value for each column plus their average under the label 'average'.
                Otherwise, the return is the average of the SRA values of all columns.

    Returns:
    s: Either the column-wise SRA Series or the average SRA value, as selected by ColumnWise.
       The average is NaN for an empty table.

    Raises:
    ValueError: if the two tables do not have the same shape.

    '''
    import numpy as np
    import pandas as pd

    if ori_correlation_df.shape != gen_correlation_df.shape:
        raise ValueError(
            'correlation tables must have the same shape, got %s and %s'
            % (ori_correlation_df.shape, gen_correlation_df.shape)
        )

    columns = (ori_correlation_df.keys()).tolist()
    n = len(columns)
    scores = np.ones(n)

    for i in range(n):
        # to_numeric turns the None placeholders of an object-dtype table into NaN.
        ori_row = pd.to_numeric(ori_correlation_df.iloc[i, :])
        gen_row = pd.to_numeric(gen_correlation_df.iloc[i, :])

        # Keep only positions present in BOTH tables. Dropping missing values from each
        # row independently would pair up unrelated columns when the gaps differ.
        both_present = ori_row.notna().to_numpy() & gen_row.notna().to_numpy()

        # Pass plain lists so SRA indexes by position rather than by column label.
        ori_values = ori_row.abs().to_numpy()[both_present].tolist()
        gen_values = gen_row.abs().to_numpy()[both_present].tolist()
        scores[i] = SRA(ori_values, gen_values)

    average = sum(scores) / n if n else float('nan')

    if ColumnWise:
        s = pd.Series(scores, index = columns)
        s['average'] = average
    else:
        s = average
    return s

In [50]:
def MSE(r_table_ori,r_table_gen):
    '''
    Return the squared error at each position between two tables, and its mean over the off-diagonal cells.

    Missing cells (None/NaN) are treated as 0 in both tables.

    Args:
    r_table_ori: square table for the real data, e.g. the output of r_corr_test.
    r_table_gen: square table for the synthetic data, with the same shape; its column
                 labels are used for the index and columns of the returned dataframe.

    Returns:
    (df, score): df is the float dataframe of element-wise squared differences.
    score is the sum of df divided by n*(n-1), where n is the number of rows; it is
    NaN when n < 2 because there are then no off-diagonal cells.

    Raises:
    ValueError: if the two tables do not have the same shape.
    '''
    import pandas as pd
    import numpy as np

    # Convert explicitly to float: tables built by r_corr_test are object dtype with
    # None placeholders, and fillna alone is not guaranteed to yield numeric data.
    ori = r_table_ori.apply(pd.to_numeric).fillna(0).to_numpy(dtype=float)
    gen = r_table_gen.apply(pd.to_numeric).fillna(0).to_numpy(dtype=float)
    if ori.shape != gen.shape:
        # Fail loudly instead of letting numpy broadcast mismatched tables.
        raise ValueError(
            'tables must have the same shape, got %s and %s' % (ori.shape, gen.shape)
        )

    columns = (r_table_gen.keys()).tolist()
    matrix = (ori-gen)**2
    df = pd.DataFrame(matrix, index = columns, columns = columns)

    n = len(ori)
    if n > 1:
        # The diagonal is always zero, so it is not counted in the denominator.
        score = np.sum(matrix)/(n*(n-1))
    else:
        score = float('nan')
    return df, score

In [54]:
import numpy as np
import pandas as pd
# DoppelGANger experiment: read the real ("ori") and generated ("gen") datasets,
# then build one Pearson correlation table per dataset.
dp_ori_df = pd.read_csv('synthetic data/doppelGANger/dp_ori.csv')
dp_gen_df = pd.read_csv('synthetic data/doppelGANger/dp_gen.csv')

r_table_dp_ori = r_corr_test(dp_ori_df)
r_table_dp_gen = r_corr_test(dp_gen_df)

# Show the generated table first, then the real one, each under its caption.
display(
    'DoppelGANger generated', r_table_dp_gen,
    'DoppelGANger real', r_table_dp_ori,
)

'DoppelGANger generated'

Unnamed: 0,dday,weight,height,age,temp
dday,,0.46059,0.579901,0.385757,0.572924
weight,0.46059,,0.85236,0.765801,0.725225
height,0.579901,0.85236,,0.700063,0.951644
age,0.385757,0.765801,0.700063,,0.561933
temp,0.572924,0.725225,0.951644,0.561933,


'DoppelGANger real'

Unnamed: 0,dday,weight,height,age,temp
dday,,0.547442,0.625742,0.43148,0.624604
weight,0.547442,,0.904009,0.888397,0.787127
height,0.625742,0.904009,,0.739106,0.964485
age,0.43148,0.888397,0.739106,,0.589636
temp,0.624604,0.787127,0.964485,0.589636,


In [55]:
# Column-wise SRA (plus the average) between the real and the DoppelGANger-generated correlation tables.
CorrelationSRA(r_table_dp_ori,r_table_dp_gen,ColumnWise=True)

dday       1.0
weight     1.0
height     1.0
age        1.0
temp       1.0
average    1.0
dtype: float64

In [56]:
# Squared-error table and average between the real and the DoppelGANger-generated correlation tables.
# Arguments follow the MSE signature order (real table first, generated table second).
MSE(r_table_dp_ori, r_table_dp_gen)

(            dday    weight    height       age      temp
 dday    0.000000  0.007543  0.002101  0.002091  0.002671
 weight  0.007543  0.000000  0.002668  0.015030  0.003832
 height  0.002101  0.002668  0.000000  0.001524  0.000165
 age     0.002091  0.015030  0.001524  0.000000  0.000767
 temp    0.002671  0.003832  0.000165  0.000767  0.000000,
 0.0038391879541700945)

In [53]:
# TGAN experiment: read the real ("ori") and generated ("gen") datasets,
# then build one Pearson correlation table per dataset.
tgan_ori_df = pd.read_csv('synthetic data/TGAN/tgan_ori.csv')
tgan_gen_df = pd.read_csv('synthetic data/TGAN/tgan_gen.csv')

r_table_tgan_ori = r_corr_test(tgan_ori_df)
r_table_tgan_gen = r_corr_test(tgan_gen_df)

# Show the generated table first, then the real one, each under its caption.
display(
    'TGANs generated', r_table_tgan_gen,
    'TGANS real', r_table_tgan_ori,
)

'generated'

Unnamed: 0,dday,height,weight,temp,vomit_dur,cough_dur,diar_No,diar_Yes,head_No,head_Yes
dday,,0.630282,0.674201,-0.518343,0.509896,0.10626,-0.508896,0.505435,0.0137667,-0.0133881
height,0.630282,,0.967306,0.271986,0.840133,0.607276,-0.476903,0.444277,-0.627081,0.627144
weight,0.674201,0.967306,,0.264717,0.934518,0.704238,-0.642798,0.610614,-0.679343,0.679569
temp,-0.518343,0.271986,0.264717,,0.417741,0.737285,-0.165282,0.13641,-0.845591,0.845392
vomit_dur,0.509896,0.840133,0.934518,0.417741,,0.848396,-0.777692,0.744161,-0.802912,0.803213
cough_dur,0.10626,0.607276,0.704238,0.737285,0.848396,,-0.72696,0.697473,-0.96676,0.967068
diar_No,-0.508896,-0.476903,-0.642798,-0.165282,-0.777692,-0.72696,,-0.998401,0.575161,-0.575615
diar_Yes,0.505435,0.444277,0.610614,0.13641,0.744161,0.697473,-0.998401,,-0.541316,0.541765
head_No,0.0137667,-0.627081,-0.679343,-0.845591,-0.802912,-0.96676,0.575161,-0.541316,,-0.999998
head_Yes,-0.0133881,0.627144,0.679569,0.845392,0.803213,0.967068,-0.575615,0.541765,-0.999998,


'real'

Unnamed: 0,dday,height,weight,temp,vomit_dur,cough_dur,diar_No,diar_Yes,head_No,head_Yes
dday,,0.176702,0.216054,-0.0909514,-0.0390749,-0.0910155,0.0173973,-0.0173973,0.0698095,-0.0698095
height,0.176702,,0.873474,-0.143156,-0.0267113,-0.0593797,0.0530201,-0.0530201,-0.108343,0.108343
weight,0.216054,0.873474,,-0.122154,-0.0187375,-0.0419148,0.0156906,-0.0156906,-0.0930778,0.0930778
temp,-0.0909514,-0.143156,-0.122154,,0.125559,0.112293,-0.0484279,0.0484279,-0.281744,0.281744
vomit_dur,-0.0390749,-0.0267113,-0.0187375,0.125559,,0.0202579,-0.147209,0.147209,-0.0868671,0.0868671
cough_dur,-0.0910155,-0.0593797,-0.0419148,0.112293,0.0202579,,-0.015304,0.015304,-0.10895,0.10895
diar_No,0.0173973,0.0530201,0.0156906,-0.0484279,-0.147209,-0.015304,,-1.0,0.0406333,-0.0406333
diar_Yes,-0.0173973,-0.0530201,-0.0156906,0.0484279,0.147209,0.015304,-1.0,,-0.0406333,0.0406333
head_No,0.0698095,-0.108343,-0.0930778,-0.281744,-0.0868671,-0.10895,0.0406333,-0.0406333,,-1.0
head_Yes,-0.0698095,0.108343,0.0930778,0.281744,0.0868671,0.10895,-0.0406333,0.0406333,-1.0,


In [36]:
# Column-wise SRA (plus the average) between the real and the TGAN-generated correlation tables.
CorrelationSRA(r_table_tgan_ori,r_table_tgan_gen,ColumnWise=True)

dday         0.694444
height       0.638889
weight       0.555556
temp         0.750000
vomit_dur    0.166667
cough_dur    0.583333
diar_No      0.500000
diar_Yes     0.500000
head_No      0.805556
head_Yes     0.805556
average      0.600000
dtype: float64

In [57]:
# Squared-error table and average between the real and the TGAN-generated correlation tables.
# Arguments follow the MSE signature order (real table first, generated table second).
MSE(r_table_tgan_ori, r_table_tgan_gen)

(               dday    height    weight      temp  vomit_dur  cough_dur  \
 dday       0.000000  0.205735  0.209899  0.182664   0.301369   0.038918   
 height     0.205735  0.000000  0.008805  0.172343   0.751419   0.444430   
 weight     0.209899  0.008805  0.000000  0.149669   0.908696   0.556745   
 temp       0.182664  0.172343  0.149669  0.000000   0.085370   0.390614   
 vomit_dur  0.301369  0.751419  0.908696  0.085370   0.000000   0.685812   
 cough_dur  0.038918  0.444430  0.556745  0.390614   0.685812   0.000000   
 diar_No    0.276985  0.280819  0.433608  0.013655   0.397508   0.506454   
 diar_Yes   0.273354  0.247304  0.392258  0.007741   0.356351   0.465354   
 head_No    0.003141  0.269089  0.343707  0.317924   0.512721   0.735837   
 head_Yes   0.003183  0.269154  0.343971  0.317700   0.513152   0.736366   
 
             diar_No  diar_Yes       head_No      head_Yes  
 dday       0.276985  0.273354  3.140790e-03  3.183373e-03  
 height     0.280819  0.247304  2.690889