# Supervised synthetic data validation

### Importing necessary libraries

In [1]:
import numpy as np
from library.generators.NextConvGeN import NextConvGeN
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
from fdc.clustering import *
import pandas as pd

2023-05-09 14:11:00.078668: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-09 14:11:00.078688: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Importing data

In [2]:
data=pd.read_csv('survey lung cancer.csv')

In [3]:
# Encode GENDER as integers (F -> 1, M -> 0); `data` is modified in place.
gender_mod = {
    'GENDER': {'F': 1, 'M': 0},
}
data.replace(to_replace=gender_mod, inplace=True)
data['GENDER']

0      0
1      0
2      1
3      0
4      1
      ..
304    1
305    0
306    0
307    0
308    0
Name: GENDER, Length: 309, dtype: int64

In [4]:
# Encode LUNG_CANCER as integers (YES -> 1, NO -> 0); `data` is modified in place.
lung_cancer_mod = {
    'LUNG_CANCER': {'YES': 1, 'NO': 0},
}
data.replace(to_replace=lung_cancer_mod, inplace=True)
data['LUNG_CANCER']

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

In [5]:
data=data.rename(columns = {'LUNG_CANCER': 'Target'})

In [6]:
np.random.seed(42)
data=data.sample(frac=1)

In [7]:
# Columns kept for the analysis, in the order used to select them from `data`:
# AGE first, then the survey indicators, with the class label 'Target' last.
# NOTE(review): 'FATIGUE ' and 'ALLERGY ' end with a space — presumably to match
# the headers of the source CSV; confirm before "tidying" them.
values = [
    'AGE', 'SMOKING', 'GENDER', 'CHRONIC DISEASE',
    'ALCOHOL CONSUMING', 'SHORTNESS OF BREATH', 'YELLOW_FINGERS', 'ANXIETY',
    'PEER_PRESSURE', 'WHEEZING', 'COUGHING', 'SWALLOWING DIFFICULTY',
    'CHEST PAIN', 'FATIGUE ', 'ALLERGY ', 'Target',
]

In [8]:
data=data[values]

### Train-val split

In [9]:
# Use scikit-learn to split the data into training and testing sets.
from sklearn.model_selection import train_test_split

# Features are every column except "Target"; 25% of the rows are held out.
train_features, test_features, train_labels, test_labels = train_test_split(
    np.array(data.drop(["Target"], axis=1)),
    np.array(data['Target']),
    test_size=0.25,
    random_state=42,
)

### Gradient boosting classifier for original training data (Model-1)

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Model-1: gradient boosting classifier fitted on the original training split.
np.random.seed(42)
model_1 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=42,
)
model_1.fit(train_features, train_labels)

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of Model-1 on the original training split.
np.random.seed(42)
score_lr = cross_val_score(
    model_1,
    train_features,
    train_labels,
    cv=5,
)
print(score_lr)
print("Avg :", score_lr.mean())

[0.87234043 0.91304348 0.91304348 0.86956522 0.91304348]
Avg : 0.8962072155411656


### Preparing training data for synthetic data generation

In [12]:
train_features=pd.DataFrame(train_features)

In [13]:
train_features['Target']=train_labels

In [14]:
# Re-declares the column list with the same 16 names, in the same order, as the
# `values` list defined earlier in the notebook (15 feature columns + 'Target').
# 'FATIGUE ' and 'ALLERGY ' keep their trailing space.
values=['AGE','SMOKING','GENDER','CHRONIC DISEASE','ALCOHOL CONSUMING','SHORTNESS OF BREATH',
'YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','WHEEZING','COUGHING','SWALLOWING DIFFICULTY',
'CHEST PAIN','FATIGUE ','ALLERGY ','Target']

In [15]:
train_features.to_csv('Lung_cancer_training_data(sup).csv',index=False)

In [16]:
# Assemble the hold-out (validation) split as a named table and save it:
# features first, the label appended as 'Target', columns named from `values`.
hold_out_data = pd.DataFrame(test_features).assign(Target=test_labels)
hold_out_data.columns = values
hold_out_data.to_csv(
    'Lung_cancer_hold_out__data(sup).csv',
    index=False,
)

In [17]:
# Map each column name to its position in `values` (e.g. 'AGE' -> 0).
dicts = {name: position for position, name in enumerate(values)}


In [18]:
train_features.rename(columns=dicts,inplace=True)


In [19]:
train_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,51,2,1,1,1,2,2,2,2,1,1,2,1,2,2,1
1,64,2,0,2,2,1,2,2,2,1,1,2,2,1,1,1
2,61,1,0,2,2,2,1,2,1,2,2,2,1,2,1,1
3,64,2,0,2,2,1,2,2,2,1,1,2,2,1,1,1
4,77,2,0,1,2,1,1,1,1,2,2,1,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,62,2,0,1,2,2,1,2,1,2,2,1,2,2,1,1
227,52,2,0,1,2,2,1,1,1,2,2,1,2,2,1,1
228,64,2,1,2,1,1,2,1,2,1,1,1,1,1,1,0
229,59,2,0,1,2,2,1,1,1,2,2,1,2,2,2,1


### FDC for searching neighbors for synthetic data generation

In [20]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

# Configure FDC with the column positions of each feature type:
# position 0 is treated as continuous, positions 1-5 as ordinal and every
# remaining position (6 up to the last column of `data`) as nominal.
np.random.seed(42)
fdc = FDC()
fdc.cont_list = [0]
fdc.ord_list = list(range(1, min(6, data.shape[1])))
fdc.nom_list = list(range(6, data.shape[1]))

In [21]:
train_features=np.array(train_features)

### Preparing and training the NextConvGeN model

In [22]:
gen = NextConvGeN(train_features.shape[1], neb=5, fdc=fdc)

In [23]:
gen.reset(train_features)

2023-05-09 14:11:05.534525: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-05-09 14:11:05.534551: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-09 14:11:05.534568: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f61d203ff696): /proc/driver/nvidia/version does not exist
2023-05-09 14:11:05.534722: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
train_features

array([[51,  2,  1, ...,  2,  2,  1],
       [64,  2,  0, ...,  1,  1,  1],
       [61,  1,  0, ...,  2,  1,  1],
       ...,
       [64,  2,  1, ...,  1,  1,  0],
       [59,  2,  0, ...,  2,  2,  1],
       [70,  2,  1, ...,  1,  2,  1]])

In [25]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
# Seed NumPy's global RNG immediately before training.
np.random.seed(42)
# Train the generator on the training array.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. this
# run was stopped before training finished — re-run before relying on later cells.
gen.train(train_features)

FDC.normalize (init): 0.00001 / 0.000s
FDC.normalize (clustering CONT): 5.96351 / 5.964s
FDC.normalize (clustering ORD): 2.71818 / 8.682s
FDC.normalize (clustering NOM): 1.42787 / 10.110s
FDC.normalize (concat): 0.00023 / 10.110s
FDC.normalize (total): 0.00001 / 10.110s
|N| = (231, 5)
|D| = (231, 16)

KeyboardInterrupt: 

### Generating 1000 synthetic points and randomly selecting 300 points for evaluation

In [None]:
# NOTE(review): `syn_data_gen` is not defined in this notebook; presumably it is
# provided by this star import — confirm, and prefer importing it by name.
from fdc.NN_models import *
np.random.seed(42)
# Ask `syn_data_gen` for 1000 synthetic points from `gen`, passing the column
# names in `values`.
syn_train_data = syn_data_gen(gen,1000,values)

In [None]:
def balanced_synthetic_data(org_data,syn_data):
    """Sample synthetic rows so each class appears as often as in the original data.

    Class frequencies are read from the LAST column of ``org_data``; rows of
    ``syn_data`` are selected through its ``'Target'`` column. For every class,
    exactly as many synthetic rows are drawn (without replacement,
    ``random_state=42``) as that class has rows in ``org_data``.

    When ``syn_data`` holds too few rows of a class, a fresh batch is requested
    with ``syn_data_gen(gen, n_rows, values)`` and replaces ``syn_data`` for the
    remaining classes. ``syn_data_gen``, ``gen`` and ``values`` are notebook-level
    names, not parameters, so the function uses whatever they are bound to at
    call time.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original data; only its row count and last column are used.
    syn_data : pandas.DataFrame
        Synthetic data containing a ``'Target'`` column.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in ascending label order. An empty
        slice of ``syn_data`` is returned when ``org_data`` has no labels.

    Raises
    ------
    ValueError
        If a class is still under-represented after the regeneration attempts.
    """
    import math

    max_regenerations = 5
    org_data_len = org_data.shape[0]
    # value_counts() is indexed by the class labels themselves, so iterate over
    # (label, count) pairs instead of assuming the labels are exactly 0..k-1.
    # sort_index() keeps the output ordered by ascending label.
    class_counts = org_data.iloc[:, -1].value_counts().sort_index()
    if class_counts.empty:
        # Nothing to balance against: keep the columns, return no rows.
        return syn_data.iloc[0:0]

    df_list = []
    for label, count in class_counts.items():
        count = int(count)
        candidates = syn_data[syn_data['Target'] == label]
        attempts = 0
        while len(candidates) < count and attempts < max_regenerations:
            attempts += 1
            if len(candidates):
                # Scale the batch by how far short the current data falls.
                run = math.ceil(count / len(candidates))
            else:
                # No rows of this class at all: there is no ratio to extrapolate
                # from (and dividing by zero would fail), so grow the batch by
                # one original-sized chunk per attempt.
                run = attempts
            syn_data = syn_data_gen(gen, org_data_len * run, values)
            candidates = syn_data[syn_data['Target'] == label]
        if len(candidates) < count:
            raise ValueError(
                "Only {} synthetic rows available for class {!r}; {} required".format(
                    len(candidates), label, count
                )
            )
        df_list.append(candidates.sample(n=count, axis=0, random_state=42))
    # `names=` only applies together with `keys=`, so it is not passed here.
    return pd.concat(df_list)

In [None]:
syn_train_data=balanced_synthetic_data(pd.DataFrame(train_features),syn_train_data)

In [None]:
syn_train_data.to_csv('Lung_cancer_NextConvGen_syn_data(supervised).csv',index=False)

In [None]:
syn_train_data['Target'].value_counts()

In [None]:
# Shuffle the balanced synthetic rows; the global NumPy seed is set right
# before sampling and set again afterwards.
np.random.seed(42)
syn_train_data = syn_train_data.sample(frac=1)
np.random.seed(42)

# Relabel the shuffled rows 0..n-1.
i = list(range(syn_train_data.shape[0]))
syn_train_data.set_index(pd.Series(i), inplace=True)

In [None]:
def exact_match_score(original_df, synthetic_df):
    """Report how many synthetic rows are exact copies of an original row.

    The ``'Target'`` column is dropped from both frames, then every synthetic
    row is compared against all original rows on the remaining columns. The
    fraction of synthetic rows with at least one exact match is printed.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Original data including a ``'Target'`` column.
    synthetic_df : pandas.DataFrame
        Synthetic data including a ``'Target'`` column.

    Returns
    -------
    pandas.DataFrame
        One row per matching synthetic row (features only); empty when there
        are no matches or no synthetic rows.
    """
    original_data = original_df.drop('Target', axis=1, inplace=False)
    synthetic_data = synthetic_df.drop('Target', axis=1, inplace=False)
    n_synthetic = synthetic_data.shape[0]

    repeated = []
    for position in range(n_synthetic):
        candidate = synthetic_data.iloc[position, :]
        # The comparison aligns the row's labels with the original's columns;
        # a match requires every feature of some original row to be equal.
        if (original_data == candidate).all(axis=1).any():
            repeated.append(candidate)

    # Guard the ratio: with no synthetic rows the original code divided by zero.
    fraction = len(repeated) / n_synthetic if n_synthetic else 0.0
    # The printed value is a fraction of the synthetic rows, not a count.
    print('The fraction of synthetic data points that also occur in the original data is', fraction)
    return pd.DataFrame(repeated)

In [None]:
original_df=pd.read_csv('Lung_cancer_training_data(sup).csv')
original_df.columns=values

In [None]:
exact_match_score(original_df,syn_train_data)

In [None]:
len(syn_train_data[syn_train_data.duplicated(keep=False)])

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(syn_train_data.drop(['Target'],axis=1)), np.array(syn_train_data['Target']), test_size = 0.25, random_state = 42)

### Gradient boosting classifier for synthetic samples training data (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
NextConvGen_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [None]:
# 5-fold cross-validation of the NextConvGeN Model-2 on the synthetic training split.
score_lr = cross_val_score(
    NextConvGen_model_2,
    syn_train_features,
    syn_train_labels,
    cv=5,
)
print(score_lr)
print("Avg :", score_lr.mean())


In [None]:
NextConvGen_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: Predicting validation data using Model-2


In [None]:
y=NextConvGen_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### CTGAN

In [None]:
train_features=pd.DataFrame(train_features)

In [None]:
# Map each column position to its name in `values` (e.g. 0 -> 'AGE').
dicts = dict(enumerate(values))

In [None]:
train_features.rename(columns=dicts,inplace=True)


In [None]:
train_features

In [None]:
from sdv.tabular import CTGAN

In [None]:
gen=CTGAN()

In [None]:
gen.fit(train_features)

### Generating 462 synthetic points and randomly selecting 300 points for evaluation

In [None]:
CTGAN_syn_data= gen.sample(num_rows=462)

In [None]:
from fdc.NN_models import *
np.random.seed(42)
CTGAN_syn_data=balanced_synthetic_data_CTGAN(train_features,CTGAN_syn_data,gen)

In [None]:
CTGAN_syn_data.to_csv('Lung_cancer_CTGAN_syn_data(supervised).csv',index=False)

In [None]:
# Shuffle the balanced CTGAN rows; the global NumPy seed is set right before
# sampling and set again afterwards.
np.random.seed(42)
CTGAN_syn_data=CTGAN_syn_data.sample(frac=1) #Shuffle the data set
np.random.seed(42)
# Build the new 0..n-1 index from CTGAN_syn_data's own length. The previous
# version used the length of `syn_train_data` (the NextConvGeN frame), which
# makes set_index fail whenever the two frames differ in size.
i=[x for x in range(np.shape(CTGAN_syn_data)[0])]

CTGAN_syn_data.set_index(pd.Series(i), inplace=True)

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(CTGAN_syn_data.drop(["Target"],axis=1)), np.array(CTGAN_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTGAN generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
CTGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [None]:
# 5-fold cross-validation of the CTGAN Model-2 on the synthetic training split.
score_lr = cross_val_score(
    CTGAN_model_2,
    syn_train_features,
    syn_train_labels,
    cv=5,
)
print(score_lr)
print("Avg :", score_lr.mean())


In [None]:
CTGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: predicting validation data using Model-2


In [None]:
y=CTGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### CTABGAN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from library.generators.ctab import CtabGan

In [None]:
gen=CtabGan()

In [None]:
gen.reset(train_features)

In [None]:
gen.train(train_features)

### Generating 1000 synthetic points and randomly selecting 300 points for evaluation

In [None]:
CTABGAN_syn_data= gen.generateData(1000)

In [None]:
CTABGAN_syn_data=pd.DataFrame(CTABGAN_syn_data)

In [None]:
CTABGAN_syn_data.rename(columns=dicts,inplace=True)


In [None]:
CTABGAN_syn_data

In [None]:
# Round the columns named in values[7:] ('ANXIETY' through 'Target') to whole
# numbers and drop the sign.
# NOTE(review): values[1:7] ('SMOKING' .. 'YELLOW_FINGERS') are integer-coded in
# the training data as well but are not rounded here — confirm the slice start.
for i in values[7:]:
    CTABGAN_syn_data[i] = CTABGAN_syn_data[i].round().abs()

In [None]:
from fdc.NN_models import *
np.random.seed(42)
CTABGAN_syn_data=balanced_synthetic_data_CTABGAN(train_features,CTABGAN_syn_data,gen)

In [None]:
CTABGAN_syn_data.to_csv('Lung_cancer_CTABGAN_syn_data(supervised).csv',index=False)

In [None]:
# Shuffle the balanced CTABGAN rows; the global NumPy seed is set right before
# sampling and set again afterwards.
np.random.seed(42)
CTABGAN_syn_data = CTABGAN_syn_data.sample(frac=1)
np.random.seed(42)

# Relabel the shuffled rows 0..n-1.
i = list(range(CTABGAN_syn_data.shape[0]))
CTABGAN_syn_data.set_index(pd.Series(i), inplace=True)

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(CTABGAN_syn_data.drop(["Target"],axis=1)), np.array(CTABGAN_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTABGAN generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
CTABGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [None]:
# 5-fold cross-validation of the CTABGAN Model-2 on the synthetic training split.
score_lr = cross_val_score(
    CTABGAN_model_2,
    syn_train_features,
    syn_train_labels,
    cv=5,
)
print(score_lr)
print("Avg :", score_lr.mean())


In [None]:
CTABGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: predicting validation data using Model-2


In [None]:
y=CTABGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### Tabddpm

In [None]:

# Load the TabDDPM samples from CSV and discard the 'Unnamed: 0' column.
Tabddpm_syn_data = (
    pd.read_csv('Lung_cancer_synth_data_tddpm.csv')
    .drop(columns=['Unnamed: 0'])
)



In [None]:
Tabddpm_syn_data['Target'].value_counts()

In [None]:
def balanced_synthetic_data(org_data,syn_data):
    """Sample synthetic rows so each class appears as often as in the original data.

    Class frequencies are read from the LAST column of ``org_data``; rows of
    ``syn_data`` are selected through its ``'Target'`` column. For every class,
    exactly as many synthetic rows are drawn (without replacement,
    ``random_state=42``) as that class has rows in ``org_data``.

    NOTE: this redefines (and therefore shadows) the ``balanced_synthetic_data``
    declared earlier in the notebook. Unlike that version it never generates
    additional synthetic data; it only samples from what it is given.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original data; only its last column is used.
    syn_data : pandas.DataFrame
        Synthetic data containing a ``'Target'`` column.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in ascending label order. An empty
        slice of ``syn_data`` is returned when ``org_data`` has no labels.

    Raises
    ------
    ValueError
        If ``syn_data`` has fewer rows of a class than ``org_data`` does.
    """
    # value_counts() is indexed by the class labels themselves, so iterate over
    # (label, count) pairs instead of assuming the labels are exactly 0..k-1.
    # sort_index() keeps the output ordered by ascending label.
    class_counts = org_data.iloc[:, -1].value_counts().sort_index()
    if class_counts.empty:
        # Nothing to balance against: keep the columns, return no rows.
        return syn_data.iloc[0:0]

    df_list = []
    for label, count in class_counts.items():
        count = int(count)
        candidates = syn_data[syn_data['Target'] == label]
        if len(candidates) < count:
            raise ValueError(
                "Only {} synthetic rows available for class {!r}; {} required".format(
                    len(candidates), label, count
                )
            )
        df_list.append(candidates.sample(n=count, axis=0, random_state=42))
    # `names=` only applies together with `keys=`, so it is not passed here.
    return pd.concat(df_list)

In [None]:
Tabddpm_syn_data=balanced_synthetic_data(pd.DataFrame(train_features),Tabddpm_syn_data)

In [None]:
Tabddpm_syn_data['Target'].value_counts()

In [None]:
Tabddpm_syn_data.to_csv('Lung_cancer_Tabddpm_syn_data(sup).csv',index=False)

In [None]:
# Shuffle the balanced TabDDPM rows; the global NumPy seed is set right before
# sampling and set again afterwards.
np.random.seed(42)
Tabddpm_syn_data = Tabddpm_syn_data.sample(frac=1)
np.random.seed(42)

# Relabel the shuffled rows 0..n-1.
i = list(range(Tabddpm_syn_data.shape[0]))
Tabddpm_syn_data.set_index(pd.Series(i), inplace=True)

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(Tabddpm_syn_data.drop(['Target'],axis=1)), np.array(Tabddpm_syn_data['Target']), test_size = 0.25, random_state = 42)

### Gradient boosting classifier for synthetic samples training data (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
Tabddpm_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [None]:
# 5-fold cross-validation of the TabDDPM Model-2 on the synthetic training split.
score_lr = cross_val_score(
    Tabddpm_model_2,
    syn_train_features,
    syn_train_labels,
    cv=5,
)
print(score_lr)
print("Avg :", score_lr.mean())


In [None]:
Tabddpm_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: Predicting validation data using Model-2


In [None]:
y=Tabddpm_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))