# Semi-supervised synthetic data validation

### Importing necessary libraries

In [1]:
import numpy as np
from library.generators.NextConvGeN import NextConvGeN


2023-05-23 13:24:27.349449: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-23 13:24:27.397484: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-23 13:24:27.398351: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'sklearn'

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
from fdc.clustering import *

In [None]:
import pandas as pd

### Importing data

In [None]:
data=pd.read_csv('survey lung cancer.csv')

In [None]:
gender_mod= {'GENDER': {'F':1,'M':0}}
data.replace(gender_mod,inplace=True)


In [None]:
lung_cancer_mod= {'LUNG_CANCER': {'YES':1,'NO':0}}
data.replace(lung_cancer_mod,inplace=True)


In [None]:
data=data.drop(['LUNG_CANCER'],axis=1)

In [None]:
data

In [None]:
np.random.seed(42)
data=data.sample(frac=1)

In [None]:
values=['AGE','SMOKING','GENDER','CHRONIC DISEASE','ALCOHOL CONSUMING','SHORTNESS OF BREATH',
'YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','WHEEZING','COUGHING','SWALLOWING DIFFICULTY',
'CHEST PAIN','FATIGUE ','ALLERGY ']

In [None]:
data=data[values]

### UMAP on entire data

In [None]:
from fdc.fdc import feature_clustering

In [None]:
np.random.seed(42)
umap_emb=feature_clustering(20,0.001,'euclidean',data,True)

In [None]:
from fdc.clustering import Clustering

In [None]:
np.random.seed(42)
umap_clustering=Clustering(umap_emb,umap_emb,True)
umap_cluster_list,umap_cluster_counts=umap_clustering.K_means(2)

### Feature-type Distributed Clustering (FDC)

In [None]:
cont_list=['AGE']

ord_list=['SMOKING','GENDER','CHRONIC DISEASE','ALCOHOL CONSUMING','SHORTNESS OF BREATH']

nom_list=['YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','WHEEZING','COUGHING','SWALLOWING DIFFICULTY','CHEST PAIN','FATIGUE ','ALLERGY ']

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

np.random.seed(42)

fdc = FDC(clustering_cont=Clustering('euclidean',20,0.01,max_components=1)
          , clustering_ord=Clustering('canberra',20,0.01)
          , clustering_nom=Clustering('hamming',20,0.01)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(data,cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)

### Clustering using K-means

In [None]:
from fdc.clustering import *
np.random.seed(42)
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)

In [None]:
cluster_counts

### Concatenating cluster labels as Target to the original data

In [None]:
data["Target"]=cluster_list

In [None]:
data

### Train-val split

In [None]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
np.random.seed(42)
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(np.array(data.drop(["Target"],axis=1)), np.array(cluster_list), test_size = 0.25, random_state = 42)

### Gradient boosting classifier for original training data (Model-1)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
np.random.seed(42)
model_1 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


In [None]:
from sklearn.model_selection import cross_val_score
score_lr=cross_val_score(model_1, train_features, train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))

In [None]:
model_1.fit(train_features,  train_labels)

### Preparing training data for synthetic data generation

In [None]:
train_features=pd.DataFrame(train_features)

In [None]:
train_features["Target"]=train_labels

In [None]:
train_features.to_csv('Lung_cancer_training_data(semi_sup).csv',index=False)

In [None]:
values=['AGE','SMOKING','GENDER','CHRONIC DISEASE','ALCOHOL CONSUMING','SHORTNESS OF BREATH',
'YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','WHEEZING','COUGHING','SWALLOWING DIFFICULTY',
'CHEST PAIN','FATIGUE ','ALLERGY ','Target']

In [None]:
hold_out_data=pd.DataFrame(test_features)
hold_out_data['Target']=test_labels
hold_out_data.columns=values
hold_out_data.to_csv('Lung_hold_out__data(semi_sup).csv',index=False)

In [None]:
dicts = {}
for i in range(len(values)):
        dicts[values[i]]=i


In [None]:
train_features.rename(columns=dicts,inplace=True)


In [None]:
train_features

## NextConvGen

### FDC for searching neighbors for synthetic data generation

In [None]:
# Build a bare FDC object that only carries the column-type lists; it is handed
# to NextConvGeN further down (fdc=fdc). The lists hold column *positions*
# rather than names because the generator is trained on a plain NumPy array.
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
fdc = FDC()
fdc.cont_list = [0]  # position 0 is 'AGE'
# `data` already has the "Target" column appended at this point, so
# data.shape[1] counts the feature columns plus the target.
fdc.ord_list = list(range(data.shape[1]))[1:6]  # positions 1-5
# NOTE(review): this slice runs to the last column, so the "Target" position is
# listed as a nominal feature -- presumably intended so that labels are
# generated together with the features; confirm.
fdc.nom_list = list(range(data.shape[1]))[6:]

In [None]:
train_features=np.array(train_features)

### Preparing and training NextConvGen model

In [None]:
gen = NextConvGeN(train_features.shape[1], neb=5, fdc=fdc)

In [None]:
gen.reset(train_features)

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
gen.train(train_features)

### Generating 1000 synthetic points and randomly selecting the same number of points per class as in the original training data for evaluation

In [None]:
from fdc.NN_models import *
np.random.seed(42)
syn_train_data = syn_data_gen(gen,1000,values)

In [None]:
def balanced_synthetic_data(org_data, syn_data, max_attempts=10):
    """Sample synthetic rows so each class has as many rows as in the original data.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original training data; the class label is read from its LAST column.
    syn_data : pandas.DataFrame
        Synthetic data with a 'Target' column holding the class label.
    max_attempts : int, default 10
        Upper bound on how many times a larger synthetic batch is regenerated
        for a single class before giving up.

    Returns
    -------
    pandas.DataFrame
        For every class (in ascending label order), exactly as many synthetic
        rows as that class has in ``org_data``, drawn with ``random_state=42``.

    Raises
    ------
    ValueError
        If a class is still under-represented after ``max_attempts``
        regenerations.

    Notes
    -----
    Regeneration relies on the notebook-level names ``syn_data_gen``, ``gen``
    and ``values``. A regenerated batch replaces ``syn_data`` for the current
    and all following classes (rows already sampled are kept).
    """
    import math

    org_data_len = org_data.shape[0]
    # sort_index() keeps the ascending label order (0, 1, 2, ...) while no
    # longer requiring the labels to be exactly 0..k-1.
    class_counts = org_data.iloc[:, -1].value_counts().sort_index()

    df_list = []
    for label, needed in class_counts.items():
        needed = int(needed)
        available = int((syn_data['Target'] == label).sum())

        attempts = 0
        while available < needed:
            if attempts >= max_attempts:
                raise ValueError(
                    "Could not generate %d synthetic rows for class %r after %d attempts "
                    "(last batch contained %d)." % (needed, label, max_attempts, available)
                )
            attempts += 1
            # Scale the batch by how badly the class was under-represented;
            # with no rows at all the ratio is undefined, so just double.
            run = math.ceil(needed / available) if available else 2
            # First attempt matches the original batch size; later attempts grow.
            syn_data = syn_data_gen(gen, org_data_len * run * attempts, values)
            available = int((syn_data['Target'] == label).sum())

        c_i = syn_data[syn_data['Target'] == label].sample(n=needed, axis=0, random_state=42)
        df_list.append(c_i)

    if not df_list:
        # No classes in the original data: return an empty frame with the same columns.
        return syn_data.iloc[0:0]
    # `names=` in pd.concat labels hierarchical index levels, not columns, so it is dropped.
    balanced_df = pd.concat(df_list)
    return balanced_df

In [None]:
syn_train_data=balanced_synthetic_data(pd.DataFrame(train_features),syn_train_data)

In [None]:
syn_train_data['Target'].value_counts()

In [None]:
syn_train_data.to_csv('Lung_cancer_NextConvGen_syn_data.csv',index=False)

In [None]:
np.random.seed(42)
syn_train_data=syn_train_data.sample(frac=1) #Shuffle the data set
np.random.seed(42)
i=[x for x in range(np.shape(syn_train_data)[0])]

syn_train_data.set_index(pd.Series(i), inplace=True)

In [None]:
syn_train_data

In [None]:
def exact_match_score(original_df, synthetic_df):
    """Report how many synthetic rows are exact copies of an original row.

    Both frames must contain a 'Target' column; it is dropped before the
    comparison so only feature values are compared. The two frames must have
    identical feature column labels.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Original (real) data including 'Target'.
    synthetic_df : pandas.DataFrame
        Synthetic data including 'Target'.

    Returns
    -------
    pandas.DataFrame
        The synthetic feature rows (without 'Target') that also occur in the
        original data, keeping their index labels and column names. Empty
        (but with the feature columns) when nothing matches.

    Side effects
    ------------
    Prints the fraction of synthetic rows that have an exact match.
    """
    original_data = original_df.drop('Target', axis=1)
    synthetic_data = synthetic_df.drop('Target', axis=1)

    n_rows = synthetic_data.shape[0]
    # A synthetic row counts as repeated when at least one original row equals
    # it in every feature column.
    matched_positions = [
        pos for pos in range(n_rows)
        if (original_data == synthetic_data.iloc[pos, :]).all(axis=1).any()
    ]

    # Guard against an empty synthetic frame (previously a ZeroDivisionError).
    fraction = len(matched_positions) / n_rows if n_rows else 0.0
    # The value is a ratio, so the message says "fraction" rather than "number".
    print('The fraction of synthetic data points that exactly match an original data point is', fraction)
    return synthetic_data.iloc[matched_positions]

In [None]:
original_df=pd.read_csv('Lung_cancer_training_data(semi_sup).csv')
original_df.columns=values

In [None]:
exact_match_score(original_df,syn_train_data)

In [None]:
len(syn_train_data[syn_train_data.duplicated(keep=False)])

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(syn_train_data.drop(["Target"],axis=1)), np.array(syn_train_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on NextConvGen generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
NextConvGen_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [None]:
score_lr=cross_val_score(NextConvGen_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


In [None]:
NextConvGen_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: Predicting validation data using Model-2


In [None]:
y=NextConvGen_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### UMAP on synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
syn_emb_low=feature_clustering(30,0.01,'euclidean',syn_train_data.drop(["Target"],axis=1),True)

### FDC on synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

np.random.seed(42)
fdc = FDC(clustering_cont=Clustering('euclidean',20,0.01,max_components=1)
          , clustering_ord=Clustering('canberra',20,0.01)
          , clustering_nom=Clustering('hamming',20,0.01)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(syn_train_data.drop(['Target'],axis=1),cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)

In [None]:
from fdc.clustering import *
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)

In [None]:
cluster_counts

## CTGAN

In [None]:
train_features= pd.DataFrame(train_features)

In [None]:
dicts = {}
for i in range(len(values)):
        dicts[i]=values[i]


In [None]:
train_features.rename(columns=dicts,inplace=True)


In [None]:
from sdv.tabular import CTGAN

In [None]:
gen=CTGAN()

In [None]:
gen.fit(train_features)

### Generating 1000 synthetic points and randomly selecting the same number of points per class as in the original training data for evaluation

In [None]:
syn_train_data= gen.sample(num_rows=1000)

In [None]:
from fdc.NN_models import *
CTGAN_syn_data=balanced_synthetic_data_CTGAN(train_features,syn_train_data,gen)

In [None]:
CTGAN_syn_data.to_csv('Lung_cancer_CTGAN_syn_data.csv',index=False)

In [None]:
np.random.seed(42)
CTGAN_syn_data=CTGAN_syn_data.sample(frac=1) #Shuffle the data set
np.random.seed(42)
i=[x for x in range(np.shape(CTGAN_syn_data)[0])]

CTGAN_syn_data.set_index(pd.Series(i), inplace=True)

In [None]:
exact_match_score(original_df,CTGAN_syn_data)

In [None]:
len(CTGAN_syn_data[CTGAN_syn_data.duplicated(keep=False)])

### Train-test split on CTGAN generated synthetic data points

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(CTGAN_syn_data.drop(["Target"],axis=1)), np.array(CTGAN_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTGAN generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
CTGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [None]:
score_lr=cross_val_score(CTGAN_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


In [None]:
CTGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: predicting validation data using Model-2


In [None]:
y=CTGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### UMAP on CTGAN synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
syn_emb_low=feature_clustering(30,0.01,'euclidean',CTGAN_syn_data.drop(["Target"],axis=1),True)

### FDC on CTGAN synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

np.random.seed(42)
fdc = FDC(clustering_cont=Clustering('euclidean',20,0.01,max_components=1)
          , clustering_ord=Clustering('canberra',20,0.01)
          , clustering_nom=Clustering('hamming',20,0.01)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(CTGAN_syn_data.drop(['Target'],axis=1),cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)

In [None]:
from fdc.clustering import *
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)

### CTAB GAN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from library.generators.ctab import CtabGan

In [None]:
gen=CtabGan()

In [None]:
gen.reset(train_features)

In [None]:
gen.train(train_features)

### Generating 1000 synthetic points and randomly selecting the same number of points per class as in the original training data for evaluation

In [None]:
CTABGAN_syn_data=  gen.generateData(1000)

In [None]:
CTABGAN_syn_data=pd.DataFrame(CTABGAN_syn_data)

In [None]:
CTABGAN_syn_data.rename(columns=dicts,inplace=True)
CTABGAN_syn_data

In [None]:
# Snap the generated values to non-negative whole numbers for every column from
# values[7] ('ANXIETY') through the last entry of `values` ('Target').
# NOTE(review): values[1:7] ('SMOKING' ... 'YELLOW_FINGERS') are declared
# ordinal/nominal earlier in the notebook but are not rounded here -- confirm
# that the start index 7 is intended.
for i in values[7:]:
    CTABGAN_syn_data[i]=abs(np.round(CTABGAN_syn_data[i]))

In [None]:
CTABGAN_syn_data

In [None]:
from fdc.NN_models import *
CTABGAN_syn_data=balanced_synthetic_data_CTABGAN(train_features,CTABGAN_syn_data,gen)

In [None]:
CTABGAN_syn_data.to_csv('Lung_cancer_CTABGAN_syn_data.csv',index=False)

In [None]:
np.random.seed(42)
CTABGAN_syn_data=CTABGAN_syn_data.sample(frac=1) #Shuffle the data set
np.random.seed(42)
i=[x for x in range(np.shape(CTABGAN_syn_data)[0])]

CTABGAN_syn_data.set_index(pd.Series(i), inplace=True)

In [None]:
exact_match_score(original_df,CTABGAN_syn_data)

In [None]:
len(CTABGAN_syn_data[CTABGAN_syn_data.duplicated(keep=False)])

### Train-test split on CTABGAN generated synthetic data points

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(CTABGAN_syn_data.drop(["Target"],axis=1)), np.array(CTABGAN_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTABGAN generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
CTABGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [None]:
score_lr=cross_val_score(CTABGAN_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


In [None]:
CTABGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: predicting validation data using Model-2


In [None]:
y=CTABGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### UMAP on CTABGAN synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
syn_emb_low=feature_clustering(30,0.01,'euclidean',CTABGAN_syn_data.drop(["Target"],axis=1),True)

### FDC on CTABGAN synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

np.random.seed(42)
fdc = FDC(clustering_cont=Clustering('euclidean',20,0.01,max_components=1)
          , clustering_ord=Clustering('canberra',20,0.01)
          , clustering_nom=Clustering('hamming',20,0.01)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(CTABGAN_syn_data.drop(['Target'],axis=1),cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)

In [None]:
from fdc.clustering import *
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)

### Tabddpm

In [None]:

Tabddpm_syn_data=pd.read_csv('Lung_cancer_synth_data_semi_tddpm.csv')
Tabddpm_syn_data.drop(['Unnamed: 0'],axis=1,inplace=True)



In [None]:
Tabddpm_syn_data['Target'].value_counts()

In [None]:
def balanced_synthetic_data(org_data,syn_data):
    """Sample pre-generated synthetic rows to match the original class sizes.

    This definition shadows the earlier ``balanced_synthetic_data`` in this
    notebook. Unlike that one, it never regenerates data: ``syn_data`` is a
    fixed pool (here read from a CSV file), so every class must already
    contain enough rows.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original training data; the class label is read from its LAST column.
    syn_data : pandas.DataFrame
        Synthetic data with a 'Target' column holding the class label.

    Returns
    -------
    pandas.DataFrame
        For every class (in ascending label order), exactly as many synthetic
        rows as that class has in ``org_data``, drawn with ``random_state=42``.

    Raises
    ------
    ValueError
        If ``syn_data`` holds fewer rows of a class than ``org_data`` does.
    """
    # sort_index() keeps the ascending label order while no longer requiring
    # the labels to be exactly 0..k-1.
    class_counts = org_data.iloc[:, -1].value_counts().sort_index()

    df_list=[]
    for label, needed in class_counts.items():
        needed = int(needed)
        pool = syn_data[syn_data['Target']==label]
        if len(pool) < needed:
            # Fail with an actionable message instead of the generic error
            # raised by DataFrame.sample.
            raise ValueError(
                "Synthetic data has only %d rows for class %r but %d are required."
                % (len(pool), label, needed)
            )
        df_list.append(pool.sample(n=needed,axis=0,random_state=42))

    if not df_list:
        # No classes in the original data: return an empty frame with the same columns.
        return syn_data.iloc[0:0]
    # `names=` in pd.concat labels hierarchical index levels, not columns, so it is dropped.
    balanced_df=pd.concat(df_list)
    return balanced_df

In [None]:
Tabddpm_syn_data=balanced_synthetic_data(pd.DataFrame(train_features),Tabddpm_syn_data)

In [None]:
Tabddpm_syn_data['Target'].value_counts()

In [None]:
Tabddpm_syn_data.to_csv('Lung_cancer_Tabddpm_syn_data(semi_sup).csv',index=False)

In [None]:
np.random.seed(42)
Tabddpm_syn_data=Tabddpm_syn_data.sample(frac=1) #Shuffle the data set
np.random.seed(42)
i=[x for x in range(np.shape(Tabddpm_syn_data)[0])]

Tabddpm_syn_data.set_index(pd.Series(i), inplace=True)

In [None]:
exact_match_score(original_df,Tabddpm_syn_data)

In [None]:
len(Tabddpm_syn_data[Tabddpm_syn_data.duplicated(keep=False)])

### Train test split on synthetic data

In [None]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(Tabddpm_syn_data.drop(["Target"],axis=1)), np.array(Tabddpm_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on Tabddpm generated synthetic training data  (Model-2)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
Tabddpm_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [None]:
score_lr=cross_val_score(Tabddpm_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


In [None]:
Tabddpm_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [None]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

### Case-2: Predicting validation data using Model-2


In [None]:
y=Tabddpm_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

### Comparing predicted values from Case-1 & Case-2

In [None]:
cluster_wise_F1score(list(x),list(y))

### UMAP on synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
syn_emb_low=feature_clustering(30,0.01,'euclidean',Tabddpm_syn_data.drop(["Target"],axis=1),True)

### FDC on synthetic data

In [None]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

np.random.seed(42)
fdc = FDC(clustering_cont=Clustering('euclidean',20,0.01,max_components=1)
          , clustering_ord=Clustering('canberra',20,0.01)
          , clustering_nom=Clustering('hamming',20,0.01)
          , visual=True
          , use_pandas_output=True
          , with_2d_embedding=True
          )

fdc.selectFeatures(continueous=cont_list, nomial=nom_list, ordinal=ord_list)

FDC_emb_high,FDC_emb_low = fdc.normalize(Tabddpm_syn_data.drop(['Target'],axis=1),cont_list=cont_list, nom_list=nom_list, ord_list=ord_list,
                  with_2d_embedding=True,
                  visual=True)

In [None]:
from fdc.clustering import *
clustering=Clustering(FDC_emb_low,FDC_emb_low,True)
cluster_list,cluster_counts=clustering.K_means(4)

In [None]:
cluster_counts