In [1]:
# Importing the model
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics
# Importing standard libraries
import numpy as np
import pandas as pd
import glob



In [2]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Adult.csv" 

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_firstgen.csv")

classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_stacked.csv")

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)


          Acc       AUC  F1_Score
lr   3.040229  0.031298 -0.009030
dt   6.643464  0.055101  0.068646
rf   3.767018  0.043888  0.047563
mlp  2.026820  0.021140  0.012876
svm  1.453578  0.034990 -0.047089
           Acc       AUC  F1_Score
lr   20.616235  0.277999  0.155583
dt   16.838980  0.157002  0.173193
rf   25.314771  0.307402  0.255489
mlp  25.529737  0.470203  0.306502
svm  32.347221  0.468133  0.249420


In [3]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Adult.csv" 

# Specifying the categorical columns of the dataset used
adult_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_firstgen.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)

stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_stacked.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)


stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)


   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                         0.01031                           0.101344   

   Correlation Distance  
0              0.669588  
   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.042537                           0.228255   

   Correlation Distance  
0              3.731642  


In [4]:
real_path = "Real_Datasets/Adult.csv" 
fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_firstgen.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

fake_paths = glob.glob("FakeFullRuns2/Adult/Stacked_Adult_fake_0_08_06_stacked.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              0.386695                   0.216545   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.191876                                0.59256   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.442052                     0.424418  
   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              0.993638                   0.216545   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.001946                               0.792187   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.442052                     0.089173  


In [5]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Credit.csv" 

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_firstgen.csv")

classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_stacked.csv")

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)


          Acc       AUC  F1_Score
lr   0.030093  0.092485  0.098257
dt   2.768583  0.248759  0.365205
rf   0.100311  0.049458  0.281636
mlp  0.090280  0.080031  0.336483
svm  0.100311  0.593738  0.312016
           Acc       AUC  F1_Score
lr    0.050155  0.025700  0.180125
dt   20.232721  0.336229  0.431800
rf    0.100311  0.164513  0.281636
mlp   0.060187  0.025091  0.207221
svm   0.110342  0.488705  0.359409


In [6]:

# Specifying the path of the dataset used 
real_path = "Real_Datasets/Credit.csv" 

credit_categorical = ["Class"]

# Specifying the categorical columns of the dataset used
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_firstgen.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,credit_categorical)
    stat_res_avg.append(stat_res)

stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_stacked.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,adult_categorical)
    stat_res_avg.append(stat_res)


stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)


   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.007343                           0.076932   

   Correlation Distance  
0                2.1702  
   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.008682                                NaN   

   Correlation Distance  
0              2.498667  


In [7]:
real_path = "Real_Datasets/Credit.csv" 
fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_firstgen.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

fake_paths = glob.glob("FakeFullRuns2/Credit/Stacked_Credit_fake_2022-06-17T22-11-51_stacked.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              2.057963                   0.352515   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    2.228144                               0.848891   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.433713                     0.852653  
   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              2.128278                   0.352515   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    2.288457                               0.857918   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.433713                     0.848366  


In [8]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Covtype.csv" 

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_firstgen.csv")

classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_stacked.csv")

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)


           Acc       AUC  F1_Score
lr    6.533920  0.055569  0.060162
dt   37.292375  0.261271  0.356532
rf   26.295777  0.171300  0.284122
mlp  14.008405  0.103277  0.136061
svm   8.735241  0.076510  0.095981


ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'

In [9]:
real_path = "Real_Datasets/Covtype.csv" 

covtype_categorical = ['Wilderness_Area1','Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
                                        'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
                                        'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
                                        'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
                                        'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
                                        'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
                                        'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
                                        'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
                                        'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
                                        'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
                                        'Soil_Type39', 'Soil_Type40', 'Cover_Type']

# Specifying the categorical columns of the dataset used
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_firstgen.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,covtype_categorical)
    stat_res_avg.append(stat_res)

stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_stacked.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,covtype_categorical)
    stat_res_avg.append(stat_res)


stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)


   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.024906                           0.033738   

   Correlation Distance  
0              2.802889  
   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.023951                           0.058602   

   Correlation Distance  
0              5.135968  


In [10]:
real_path = "Real_Datasets/Covtype.csv" 
fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_firstgen.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

fake_paths = glob.glob("FakeFullRuns2/Covtype/Stacked_Covtype_fake_2022-06-17T21-13-43_stacked.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              1.174401                   0.355479   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.820841                               0.776075   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                      0.38621                      0.60754  
   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              1.464876                   0.355479   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.216346                               0.829421   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                      0.38621                     0.489045  


In [11]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Intrusion.csv" 

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_firstgen.csv")

classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_stacked.csv")

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:

real_path = "Real_Datasets/Intrusion.csv" 

intrusion_categorical = [ 'protocol_type', 'service', 'flag', 'land', 'wrong_fragment', 'urgent', 'hot',
                                         'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
                                         'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
                                         'num_access_files', 'num_outbound_cmds', 'is_host_login',
                                         'is_guest_login', 'class']

# Specifying the categorical columns of the dataset used
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_firstgen.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,intrusion_categorical)
    stat_res_avg.append(stat_res)

stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_stacked.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,intrusion_categorical)
    stat_res_avg.append(stat_res)


stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)


   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.022765                           0.085176   

   Correlation Distance  
0              3.680587  
   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.027566                           0.050871   

   Correlation Distance  
0              5.655496  


In [12]:
real_path = "Real_Datasets/Intrusion.csv" 
fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_firstgen.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

fake_paths = glob.glob("FakeFullRuns2/Intrusion/Stacked_Intrusion_fake_2022-06-17T23-22-19_stacked.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              0.463484                   0.005297   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.049733                               0.812841   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.239995                     0.397246  
   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              0.522349                   0.005297   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.013951                               0.844914   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.239995                     0.312809  


In [13]:
# Specifying the path of the dataset used 
real_path = "Real_Datasets/Loan.csv" 

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_firstgen.csv")

classifiers_list = ["lr","dt","rf","mlp","svm"]

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_stacked.csv")

# Storing and presenting the results as a dataframe
result_mat = get_utility_metrics(real_path,fake_paths,"MinMax",classifiers_list, test_ratio = 0.20)
result_df  = pd.DataFrame(result_mat,columns=["Acc","AUC","F1_Score"])
result_df.index = classifiers_list
print(result_df)


      Acc       AUC  F1_Score
lr    1.9  0.011073  0.110673
dt   12.0  0.261892  0.281349
rf    4.7  0.044294  0.164411
mlp   3.2  0.029948  0.121450
svm   4.3  0.057868  0.197317
      Acc       AUC  F1_Score
lr    3.3  0.044570  0.109825
dt   10.1  0.218796  0.241087
rf    4.4  0.063226  0.157362
mlp   4.3  0.064839  0.159393
svm   2.8  0.056301  0.113383


In [14]:

real_path = "Real_Datasets/Loan.csv" 

loan_categorical = ["Family","Education","PersonalLoan","Securities Account","CD Account","Online","CreditCard"]

# Specifying the categorical columns of the dataset used
stat_columns = ["Average WD (Continuous Columns","Average JSD (Categorical Columns)","Correlation Distance"]

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_firstgen.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,loan_categorical)
    stat_res_avg.append(stat_res)

stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)

# Collecting the paths to all corresponding generated datasets for evaluation 
fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_stacked.csv")

# Storing and presenting the results as a dataframe
stat_res_avg = []
for fake_path in fake_paths:
    stat_res = stat_sim(real_path,fake_path,loan_categorical)
    stat_res_avg.append(stat_res)


stat_results = pd.DataFrame(np.array(stat_res_avg).mean(axis=0).reshape(1,3),columns=stat_columns)
print(stat_results)


   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                        0.040319                             0.0695   

   Correlation Distance  
0              1.622212  
   Average WD (Continuous Columns  Average JSD (Categorical Columns)  \
0                         0.06053                           0.133287   

   Correlation Distance  
0              1.946194  


In [15]:
real_path = "Real_Datasets/Loan.csv" 
fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_firstgen.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

fake_paths = glob.glob("FakeFullRuns2/Loan/Stacked_Loan_fake_2022-06-17T22-18-10_stacked.csv")
# Storing and presenting the results as a dataframe
priv_res_avg = []
for fake_path in fake_paths:
    priv_res = privacy_metrics(real_path,fake_path)
    priv_res_avg.append(priv_res)
    
privacy_columns = ["DCR between Real and Fake (5th perc)","DCR within Real(5th perc)","DCR within Fake (5th perc)","NNDR between Real and Fake (5th perc)","NNDR within Real (5th perc)","NNDR within Fake (5th perc)"]
privacy_results = pd.DataFrame(np.array(priv_res_avg).mean(axis=0).reshape(1,6),columns=privacy_columns)
print(privacy_results)

   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              0.866941                   0.607868   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.731929                               0.630668   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.494843                     0.584496  
   DCR between Real and Fake (5th perc)  DCR within Real(5th perc)  \
0                              1.051202                   0.607868   

   DCR within Fake (5th perc)  NNDR between Real and Fake (5th perc)  \
0                    0.600003                               0.719512   

   NNDR within Real (5th perc)  NNDR within Fake (5th perc)  
0                     0.494843                     0.502076  
