# Supervised synthetic data validation

### Importing necessary libraries

In [1]:
import numpy as np
from library.generators.NextConvGeN import NextConvGeN
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
from fdc.clustering import *
import pandas as pd

2023-05-09 13:47:04.081397: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-09 13:47:04.081422: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Importing data

In [2]:
data=pd.read_csv('heart_failure_clinical_records_dataset.csv')

In [3]:
data=data.rename(columns = {'DEATH_EVENT': 'Target'})

In [4]:
np.random.seed(42)
data=data.sample(frac=1)

In [5]:
values= ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time','anaemia','diabetes','high_blood_pressure','sex','smoking','Target']

In [6]:
data=data[values]

### Train-val split

In [7]:
# Use scikit-learn to hold out 25% of the shuffled records as a validation set.
from sklearn.model_selection import train_test_split

# Features are every column except "Target"; labels are the "Target" column.
# Both are converted to NumPy arrays before being split.
train_features, test_features, train_labels, test_labels = train_test_split(
    np.array(data.drop(columns=["Target"])),
    np.array(data["Target"]),
    test_size=0.25,
    random_state=42,
)

### Gradient boosting classifier for original training data (Model-1)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
np.random.seed(42)
model_1 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)
model_1.fit(train_features,  train_labels)

In [9]:
from sklearn.model_selection import cross_val_score
np.random.seed(42)
score_lr=cross_val_score(model_1, train_features, train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))

[0.82222222 0.75555556 0.75555556 0.82222222 0.81818182]
Avg : 0.7947474747474746


### Preparing training data for synthetic data generation

In [10]:
train_features=pd.DataFrame(train_features)

In [11]:
train_features['Target']=train_labels

In [12]:
train_features.to_csv('Heart_failure_training_data(sup).csv',index=False)

In [13]:
# Persist the validation split: the feature columns followed by the label
# column, relabelled with the names listed in `values` (same order).
hold_out_data = pd.DataFrame(test_features).assign(Target=test_labels)
hold_out_data.columns = values
hold_out_data.to_csv('Heart_failure_hold_out_data(sup).csv', index=False)

In [14]:
# Map every column name in `values` to its position in that list.
dicts = {column_name: position for position, column_name in enumerate(values)}


In [15]:
train_features.rename(columns=dicts,inplace=True)


In [16]:
train_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,67.0,582.0,50.0,263358.03,1.18,137.0,76.0,0.0,0.0,0.0,1.0,1.0,0
1,45.0,2060.0,60.0,742000.00,0.80,138.0,278.0,0.0,1.0,0.0,0.0,0.0,0
2,53.0,446.0,60.0,263358.03,1.00,139.0,215.0,1.0,0.0,1.0,1.0,0.0,0
3,55.0,748.0,45.0,263000.00,1.30,137.0,88.0,0.0,0.0,0.0,1.0,0.0,0
4,70.0,81.0,35.0,533000.00,1.30,139.0,212.0,0.0,1.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,50.0,318.0,40.0,216000.00,2.30,131.0,60.0,0.0,0.0,1.0,0.0,0.0,1
220,65.0,56.0,25.0,237000.00,5.00,130.0,207.0,0.0,0.0,0.0,0.0,0.0,0
221,70.0,125.0,25.0,237000.00,1.00,140.0,15.0,1.0,0.0,1.0,0.0,0.0,1
222,68.0,577.0,25.0,166000.00,1.00,138.0,43.0,1.0,0.0,1.0,1.0,0.0,1


### FDC for searching neighbors for synthetic data generation

In [17]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC

# Register which column positions of `data` belong to each FDC feature group.
# With the column order fixed by `values`, positions 0-6 run from 'age' to
# 'time', 7-11 from 'anaemia' to 'smoking', and 12 is 'Target'.
np.random.seed(42)
fdc = FDC()
fdc.cont_list = list(range(data.shape[1])[:7])
fdc.ord_list = list(range(data.shape[1])[7:12])
fdc.nom_list = list(range(data.shape[1])[12:])

In [18]:
train_features=np.array(train_features)

### Preparing and training NextConvGeN model

In [19]:
gen = NextConvGeN(train_features.shape[1], neb=5, fdc=fdc)

In [20]:
gen.reset(train_features)

2023-05-09 13:47:08.637573: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-05-09 13:47:08.637589: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-09 13:47:08.637601: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f61d203ff696): /proc/driver/nvidia/version does not exist
2023-05-09 13:47:08.637709: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
train_features

array([[6.70e+01, 5.82e+02, 5.00e+01, ..., 1.00e+00, 1.00e+00, 0.00e+00],
       [4.50e+01, 2.06e+03, 6.00e+01, ..., 0.00e+00, 0.00e+00, 0.00e+00],
       [5.30e+01, 4.46e+02, 6.00e+01, ..., 1.00e+00, 0.00e+00, 0.00e+00],
       ...,
       [7.00e+01, 1.25e+02, 2.50e+01, ..., 0.00e+00, 0.00e+00, 1.00e+00],
       [6.80e+01, 5.77e+02, 2.50e+01, ..., 1.00e+00, 0.00e+00, 1.00e+00],
       [7.00e+01, 1.71e+02, 6.00e+01, ..., 1.00e+00, 1.00e+00, 0.00e+00]])

In [22]:
from fdc.fdc import feature_clustering, canberra_modified, Clustering, FDC
np.random.seed(42)
gen.train(train_features)

FDC.normalize (init): 0.00001 / 0.000s
FDC.normalize (clustering CONT): 4.97014 / 4.970s
FDC.normalize (clustering ORD): 2.36401 / 7.334s
FDC.normalize (clustering NOM): 2.23305 / 9.567s
FDC.normalize (concat): 0.00035 / 9.568s
FDC.normalize (total): 0.00005 / 9.568s
|N| = (224, 5)
|D| = (224, 13)

### Generating 1000 synthetic points and randomly selecting 224 of them (matching the class counts of the training data) for evaluation

In [23]:
from fdc.NN_models import *
np.random.seed(42)
syn_train_data = syn_data_gen(gen,1000,values)

In [24]:
def balanced_synthetic_data(org_data, syn_data, regenerate=None, max_rounds=10):
    """Sample synthetic rows so every class appears as often as in the original data.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original data; its LAST column is read as the class label.
    syn_data : pandas.DataFrame
        Synthetic data containing a 'Target' column.
    regenerate : callable, optional
        ``regenerate(n_rows)`` must return a fresh synthetic DataFrame with a
        'Target' column. When omitted, the notebook-level generator is used via
        ``syn_data_gen(gen, n_rows, values)`` (``gen`` and ``values`` are looked
        up as globals at call time, as before).
    max_rounds : int, optional
        Upper bound on regeneration attempts per class, so a generator that
        never produces a class cannot loop forever.

    Returns
    -------
    pandas.DataFrame
        For each class label in ascending order, ``count`` randomly sampled
        synthetic rows (``random_state=42``), where ``count`` is that class's
        frequency in ``org_data``. Row indices are those of the source frames.

    Raises
    ------
    ValueError
        If a class still has too few synthetic rows after ``max_rounds``
        regenerations.
    """
    import math

    if regenerate is None:
        def regenerate(n_rows):
            # Default: fall back to the generator objects defined in the notebook.
            return syn_data_gen(gen, n_rows, values)

    org_data_len = org_data.shape[0]
    class_counts = org_data.iloc[:, -1].value_counts()
    df_list = []
    # Iterate over the labels actually present instead of assuming 0..k-1.
    for label in sorted(class_counts.index):
        needed = int(class_counts.loc[label])
        available = int((syn_data['Target'] == label).sum())

        rounds = 0
        while available < needed:
            if rounds >= max_rounds:
                raise ValueError(
                    'Could not obtain %d synthetic rows for class %r after %d '
                    'regeneration rounds (last batch had %d).'
                    % (needed, label, max_rounds, available)
                )
            # max(..., 1) avoids dividing by zero when the class is absent.
            run = math.ceil(needed / max(available, 1))
            # As before, the regenerated batch replaces syn_data for the
            # remaining classes as well.
            syn_data = regenerate(org_data_len * run)
            available = int((syn_data['Target'] == label).sum())
            rounds += 1

        c_i = syn_data[syn_data['Target'] == label].sample(n=needed, axis=0, random_state=42)
        df_list.append(c_i)
    # `names=` only labels hierarchical index levels created via `keys`, so it
    # was a no-op here and has been dropped.
    balanced_df = pd.concat(df_list)
    return balanced_df

In [25]:
syn_train_data=balanced_synthetic_data(pd.DataFrame(train_features),syn_train_data)

In [26]:
syn_train_data.to_csv('Heart_failure_NextConvGen_syn_data(supervised).csv',index=False)

In [27]:
syn_train_data['Target'].value_counts()

0.0    151
1.0     73
Name: Target, dtype: int64

In [28]:
# Shuffle the rows, then renumber them 0..n-1 so the index no longer carries the
# pre-shuffle labels. `sample` is called without `random_state`, so the shuffle
# is controlled by the global NumPy seed set just before it.
np.random.seed(42)
syn_train_data = syn_train_data.sample(frac=1).reset_index(drop=True)
# Re-seed so the following cells start from the same global RNG state as before.
np.random.seed(42)

In [29]:
def exact_match_score(original_df, synthetic_df):
    """Collect synthetic rows whose features exactly equal those of an original row.

    The 'Target' column is dropped from both frames before comparing, so only
    feature values take part in the match. Both frames must have identically
    labelled feature columns.

    Prints the FRACTION of synthetic rows that have an exact match (the message
    text says "number", but the value is ``matches / n_synthetic_rows``); 0.0 is
    reported for an empty synthetic frame instead of dividing by zero.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Reference data with a 'Target' column.
    synthetic_df : pandas.DataFrame
        Synthetic data with a 'Target' column.

    Returns
    -------
    pandas.DataFrame
        The matched synthetic rows (features only); empty if there are none.
    """
    original_data = original_df.drop('Target', axis=1)
    synthetic_data = synthetic_df.drop('Target', axis=1)
    repeated = []
    for i in range(synthetic_data.shape[0]):
        candidate = synthetic_data.iloc[i, :]
        # A row matches when every feature equals the candidate's value.
        if (original_data == candidate).all(axis=1).any():
            repeated.append(candidate)
    n_synthetic = synthetic_data.shape[0]
    match_fraction = len(repeated) / n_synthetic if n_synthetic else 0.0
    print('The number of data points repeated in the synthetic data are', match_fraction)
    return pd.DataFrame(repeated)

In [30]:
# Reload the saved training split and relabel its columns with the names in `values`.
original_df = pd.read_csv('Heart_failure_training_data(sup).csv').set_axis(values, axis=1)

In [31]:
exact_match_score(original_df,syn_train_data)

The number of data points repeated in the synthetic data are 0.0


In [32]:
len(syn_train_data[syn_train_data.duplicated(keep=False)])

31

### Train test split on synthetic data

In [33]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(syn_train_data.drop(['Target'],axis=1)), np.array(syn_train_data['Target']), test_size = 0.25, random_state = 42)

### Gradient boosting classifier for synthetic samples training data (Model-2)

In [34]:
from sklearn.ensemble import GradientBoostingClassifier
NextConvGen_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [35]:
score_lr=cross_val_score(NextConvGen_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


[0.88235294 0.82352941 0.82352941 0.75757576 0.81818182]
Avg : 0.8210338680926916


In [36]:
NextConvGen_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [37]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

F1_Score of cluster 0 is 0.8269230769230769
Geometric mean of cluster 0 is 0.8269230769230769
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.6086956521739131
Geometric mean of cluster 1 is 0.6086956521739131
Correctly predicted data points in cluster 1 is 60.86956521739131%


weigted average F1_Score of all clusters is 0.76
weighted average Geometric mean of all clusters is 0.76
weighted average of Correctly predicted data points in all clusters is 76.0%


### Case-2: Predicting validation data using Model-2


In [38]:
y=NextConvGen_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

F1_Score of cluster 0 is 0.8349514563106797
Geometric mean of cluster 0 is 0.8349908101841637
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.6382978723404256
Geometric mean of cluster 1 is 0.6384423980690614
Correctly predicted data points in cluster 1 is 65.21739130434783%


weigted average F1_Score of all clusters is 0.7746443572264685
weighted average Geometric mean of all clusters is 0.774715963802199
weighted average of Correctly predicted data points in all clusters is 77.33333333333334%


### Comparing predicted values from Case-1 & Case-2

In [39]:
cluster_wise_F1score(list(x),list(y))

F1_Score of cluster 0 is 0.8737864077669903
Geometric mean of cluster 0 is 0.8738275920531946
Correctly predicted data points in cluster 0 is 86.53846153846155%


F1_Score of cluster 1 is 0.723404255319149
Geometric mean of cluster 1 is 0.7235680511449363
Correctly predicted data points in cluster 1 is 73.91304347826086%


weigted average F1_Score of all clusters is 0.8276692143496522
weighted average Geometric mean of all clusters is 0.8277479995079954
weighted average of Correctly predicted data points in all clusters is 82.66666666666667%


### CTGAN

In [40]:
train_features=pd.DataFrame(train_features)

In [41]:
# Map every column position to its name in `values` (inverse of the earlier mapping).
dicts = dict(enumerate(values))

In [42]:
train_features.rename(columns=dicts,inplace=True)


In [43]:
train_features

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time,anaemia,diabetes,high_blood_pressure,sex,smoking,Target
0,67.0,582.0,50.0,263358.03,1.18,137.0,76.0,0.0,0.0,0.0,1.0,1.0,0.0
1,45.0,2060.0,60.0,742000.00,0.80,138.0,278.0,0.0,1.0,0.0,0.0,0.0,0.0
2,53.0,446.0,60.0,263358.03,1.00,139.0,215.0,1.0,0.0,1.0,1.0,0.0,0.0
3,55.0,748.0,45.0,263000.00,1.30,137.0,88.0,0.0,0.0,0.0,1.0,0.0,0.0
4,70.0,81.0,35.0,533000.00,1.30,139.0,212.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,50.0,318.0,40.0,216000.00,2.30,131.0,60.0,0.0,0.0,1.0,0.0,0.0,1.0
220,65.0,56.0,25.0,237000.00,5.00,130.0,207.0,0.0,0.0,0.0,0.0,0.0,0.0
221,70.0,125.0,25.0,237000.00,1.00,140.0,15.0,1.0,0.0,1.0,0.0,0.0,1.0
222,68.0,577.0,25.0,166000.00,1.00,138.0,43.0,1.0,0.0,1.0,1.0,0.0,1.0


In [44]:
from sdv.tabular import CTGAN

In [45]:
gen=CTGAN()

In [46]:
gen.fit(train_features)

### Generating 462 synthetic points and selecting a subset of them for evaluation

In [47]:
syn_train_data= gen.sample(num_rows=462)

In [48]:
from fdc.NN_models import *
np.random.seed(42)
syn_train_data=balanced_synthetic_data_CTGAN(train_features,syn_train_data,gen)

In [49]:
syn_train_data.to_csv('Heart_failure_CTGAN_syn_data(supervised).csv',index=False)

In [50]:
# Shuffle the rows, then renumber them 0..n-1 so the index no longer carries the
# pre-shuffle labels. `sample` is called without `random_state`, so the shuffle
# is controlled by the global NumPy seed set just before it.
np.random.seed(42)
syn_train_data = syn_train_data.sample(frac=1).reset_index(drop=True)
# Re-seed so the following cells start from the same global RNG state as before.
np.random.seed(42)

### Train test split on synthetic data

In [51]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(syn_train_data.drop(["Target"],axis=1)), np.array(syn_train_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTGAN generated synthetic training data  (Model-2)

In [52]:
from sklearn.ensemble import GradientBoostingClassifier
CTGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [53]:
score_lr=cross_val_score(CTGAN_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


[0.38235294 0.52941176 0.58823529 0.60606061 0.51515152]
Avg : 0.5242424242424242


In [54]:
CTGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [55]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

F1_Score of cluster 0 is 0.8269230769230769
Geometric mean of cluster 0 is 0.8269230769230769
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.6086956521739131
Geometric mean of cluster 1 is 0.6086956521739131
Correctly predicted data points in cluster 1 is 60.86956521739131%


weigted average F1_Score of all clusters is 0.76
weighted average Geometric mean of all clusters is 0.76
weighted average of Correctly predicted data points in all clusters is 76.0%


### Case-2: predicting validation data using Model-2


In [56]:
y=CTGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

F1_Score of cluster 0 is 0.45238095238095233
Geometric mean of cluster 0 is 0.46577582095318704
Correctly predicted data points in cluster 0 is 36.53846153846153%


F1_Score of cluster 1 is 0.30303030303030304
Geometric mean of cluster 1 is 0.31798150037444667
Correctly predicted data points in cluster 1 is 43.47826086956522%


weigted average F1_Score of all clusters is 0.40658008658008654
weighted average Geometric mean of all clusters is 0.42045222930903997
weighted average of Correctly predicted data points in all clusters is 38.666666666666664%


### Comparing predicted values from Case-1 & Case-2

In [57]:
cluster_wise_F1score(list(x),list(y))

F1_Score of cluster 0 is 0.4761904761904762
Geometric mean of cluster 0 is 0.4902903378454601
Correctly predicted data points in cluster 0 is 38.46153846153847%


F1_Score of cluster 1 is 0.33333333333333337
Geometric mean of cluster 1 is 0.34977965041189135
Correctly predicted data points in cluster 1 is 47.82608695652174%


weigted average F1_Score of all clusters is 0.4323809523809524
weighted average Geometric mean of all clusters is 0.4472003936991657
weighted average of Correctly predicted data points in all clusters is 41.333333333333336%


### CTABGAN

In [58]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from library.generators.ctab import CtabGan

In [59]:
gen=CtabGan()

In [60]:
gen.reset(train_features)

In [61]:
gen.train(train_features)

100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


### Generating 1000 synthetic points and selecting 224 of them for evaluation

In [62]:
CTABGAN_syn_data= gen.generateData(1000)

In [63]:
CTABGAN_syn_data=pd.DataFrame(CTABGAN_syn_data)

In [64]:
CTABGAN_syn_data.rename(columns=dicts,inplace=True)


In [65]:
CTABGAN_syn_data

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium,time,anaemia,diabetes,high_blood_pressure,sex,smoking,Target
0,76.502902,1451.955211,24.813334,423318.811480,0.945805,136.348853,56.636855,0.015975,0.967052,-0.019230,0.992520,0.975939,0.972991
1,49.174668,1654.505833,26.328112,324536.881587,1.042887,137.469756,81.540074,0.991753,0.955791,-0.010348,-0.009122,-0.009819,0.996541
2,63.300615,3347.607446,49.379856,315735.383503,4.386351,126.993264,61.796209,-0.042856,-0.015103,0.996991,0.970601,-0.007299,-0.009257
3,77.525052,1544.190729,37.189865,236608.070377,5.523387,135.641191,202.191547,-0.019992,-0.001169,1.009209,0.988566,0.997567,0.972794
4,63.854281,561.549702,49.922254,209255.467541,1.992139,137.084330,197.024269,0.014237,0.978707,0.996688,-0.015984,0.960316,-0.013109
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,76.249015,134.889573,54.301679,255108.453524,1.796815,137.138836,205.828236,-0.003399,-0.007037,0.982228,0.992188,0.962155,0.991160
996,63.275214,146.967008,52.827019,329687.076213,0.934849,135.963923,213.652785,-0.007372,-0.006293,0.001292,0.036789,0.972441,1.020177
997,63.278361,154.426283,50.742566,328220.832928,1.948289,136.320381,59.477613,-0.004380,0.004456,-0.000647,1.008869,-0.007742,0.000739
998,76.042296,4457.592509,38.796080,240772.251519,0.962533,122.014042,216.874923,-0.010556,-0.008604,-0.002725,0.039808,-0.007284,0.938209


In [66]:
# The columns from position 7 of `values` onward hold only 0/1 in the training
# data displayed above, while the generator emits real-valued approximations.
# Snap each value to the nearest valid category. Clipping (instead of taking the
# absolute value of the rounded number) keeps out-of-range outputs inside
# {0, 1}: -0.6 rounds to -1 and must become 0, not 1; 1.6 rounds to 2 and must
# become 1. The trailing abs() only normalises -0.0 to 0.0.
for column in values[7:]:
    CTABGAN_syn_data[column] = (
        CTABGAN_syn_data[column].round().clip(lower=0, upper=1).abs()
    )

In [67]:
from fdc.NN_models import *
np.random.seed(42)
CTABGAN_syn_data=balanced_synthetic_data_CTABGAN(train_features,CTABGAN_syn_data,gen)

In [68]:
CTABGAN_syn_data.to_csv('Heart_failure_CTABGAN_syn_data(supervised).csv',index=False)

In [69]:
# Shuffle the rows, then renumber them 0..n-1 so the index no longer carries the
# pre-shuffle labels. `sample` is called without `random_state`, so the shuffle
# is controlled by the global NumPy seed set just before it.
np.random.seed(42)
CTABGAN_syn_data = CTABGAN_syn_data.sample(frac=1).reset_index(drop=True)
# Re-seed so the following cells start from the same global RNG state as before.
np.random.seed(42)

### Train test split on synthetic data

In [70]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(CTABGAN_syn_data.drop(["Target"],axis=1)), np.array(CTABGAN_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier on CTABGAN generated synthetic training data  (Model-2)

In [71]:
from sklearn.ensemble import GradientBoostingClassifier
CTABGAN_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross validation

In [72]:
score_lr=cross_val_score(CTABGAN_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


[0.52941176 0.70588235 0.47058824 0.48484848 0.36363636]
Avg : 0.5108734402852051


In [73]:
CTABGAN_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [74]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

F1_Score of cluster 0 is 0.8269230769230769
Geometric mean of cluster 0 is 0.8269230769230769
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.6086956521739131
Geometric mean of cluster 1 is 0.6086956521739131
Correctly predicted data points in cluster 1 is 60.86956521739131%


weigted average F1_Score of all clusters is 0.76
weighted average Geometric mean of all clusters is 0.76
weighted average of Correctly predicted data points in all clusters is 76.0%


### Case-2: predicting validation data using Model-2


In [75]:
y=CTABGAN_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

F1_Score of cluster 0 is 0.475
Geometric mean of cluster 0 is 0.4979352974429112
Correctly predicted data points in cluster 0 is 36.53846153846153%


F1_Score of cluster 1 is 0.39999999999999997
Geometric mean of cluster 1 is 0.42580934527073144
Correctly predicted data points in cluster 1 is 60.86956521739131%


weigted average F1_Score of all clusters is 0.45199999999999996
weighted average Geometric mean of all clusters is 0.4758166721101094
weighted average of Correctly predicted data points in all clusters is 44.0%


### Comparing predicted values from Case-1 & Case-2

In [76]:
cluster_wise_F1score(list(x),list(y))

F1_Score of cluster 0 is 0.45
Geometric mean of cluster 0 is 0.47172817652486326
Correctly predicted data points in cluster 0 is 34.61538461538461%


F1_Score of cluster 1 is 0.37142857142857144
Geometric mean of cluster 1 is 0.39539439203710774
Correctly predicted data points in cluster 1 is 56.52173913043478%


weigted average F1_Score of all clusters is 0.4259047619047619
weighted average Geometric mean of all clusters is 0.44831914928195155
weighted average of Correctly predicted data points in all clusters is 41.33333333333333%


In [77]:
CTABGAN_syn_data.shape

(224, 13)

### Tabddpm

In [78]:

# Load the pre-generated TabDDPM samples and discard their 'Unnamed: 0' column.
Tabddpm_syn_data = pd.read_csv('Heart_failure_synth_data_tddpm.csv').drop(columns=['Unnamed: 0'])



In [79]:
Tabddpm_syn_data['Target'].value_counts()

0    617
1    383
Name: Target, dtype: int64

In [80]:
def balanced_synthetic_data(org_data,syn_data):
    """Sample synthetic rows so every class appears as often as in the original data.

    NOTE: this redefinition replaces the earlier ``balanced_synthetic_data`` in
    this notebook. Unlike that version it never regenerates data; it only
    samples from the ``syn_data`` it is given.

    Parameters
    ----------
    org_data : pandas.DataFrame
        Original data; its LAST column is read as the class label.
    syn_data : pandas.DataFrame
        Synthetic data containing a 'Target' column.

    Returns
    -------
    pandas.DataFrame
        For each class label in ascending order, ``count`` randomly sampled
        synthetic rows (``random_state=42``), where ``count`` is that class's
        frequency in ``org_data``. Row indices are those of ``syn_data``.

    Raises
    ------
    ValueError
        If ``syn_data`` holds fewer rows of a class than ``org_data`` does.
    """
    class_counts = org_data.iloc[:, -1].value_counts()
    df_list = []
    # Iterate over the labels actually present instead of assuming 0..k-1.
    for label in sorted(class_counts.index):
        needed = int(class_counts.loc[label])
        candidates = syn_data[syn_data['Target'] == label]
        if len(candidates) < needed:
            raise ValueError(
                'Synthetic data has only %d rows of class %r but %d are required.'
                % (len(candidates), label, needed)
            )
        df_list.append(candidates.sample(n=needed, axis=0, random_state=42))
    # `names=` only labels hierarchical index levels created via `keys`, so it
    # was a no-op here and has been dropped.
    balanced_df = pd.concat(df_list)
    return balanced_df

In [81]:
Tabddpm_syn_data=balanced_synthetic_data(pd.DataFrame(train_features),Tabddpm_syn_data)

In [82]:
Tabddpm_syn_data['Target'].value_counts()

0    151
1     73
Name: Target, dtype: int64

In [83]:
Tabddpm_syn_data.to_csv('Heart_failure_Tabddpm_syn_data(sup).csv',index=False)

In [84]:
# Shuffle the rows, then renumber them 0..n-1 so the index no longer carries the
# pre-shuffle labels. `sample` is called without `random_state`, so the shuffle
# is controlled by the global NumPy seed set just before it.
np.random.seed(42)
Tabddpm_syn_data = Tabddpm_syn_data.sample(frac=1).reset_index(drop=True)
# Re-seed so the following cells start from the same global RNG state as before.
np.random.seed(42)

### Train test split on synthetic data

In [85]:
syn_train_features, syn_test_features, syn_train_labels, syn_test_labels = train_test_split(np.array(Tabddpm_syn_data.drop(["Target"],axis=1)), np.array(Tabddpm_syn_data["Target"]), test_size = 0.25, random_state = 42)

### Gradient boosting classifier for synthetic samples training data (Model-2)

In [86]:
from sklearn.ensemble import GradientBoostingClassifier
tabddpm_model_2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=42)


### 5 Fold Cross Validation

In [87]:
score_lr=cross_val_score(tabddpm_model_2, syn_train_features, syn_train_labels,cv=5)
print(score_lr)
print("Avg :",np.average(score_lr))


[0.82352941 0.91176471 0.82352941 0.78787879 0.84848485]
Avg : 0.8390374331550803


In [88]:
tabddpm_model_2.fit(syn_train_features,syn_train_labels )

### Case-1: predicting validation data using Model-1


In [89]:
x=model_1.predict(np.array(test_features))
cluster_wise_F1score(list(test_labels),list(x))

F1_Score of cluster 0 is 0.8269230769230769
Geometric mean of cluster 0 is 0.8269230769230769
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.6086956521739131
Geometric mean of cluster 1 is 0.6086956521739131
Correctly predicted data points in cluster 1 is 60.86956521739131%


weigted average F1_Score of all clusters is 0.76
weighted average Geometric mean of all clusters is 0.76
weighted average of Correctly predicted data points in all clusters is 76.0%


### Case-2: Predicting validation data using Model-2


In [90]:
y=tabddpm_model_2.predict(np.array(test_features))

cluster_wise_F1score(list(test_labels),list(y))

F1_Score of cluster 0 is 0.86
Geometric mean of cluster 0 is 0.8606888267023434
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.72
Geometric mean of cluster 1 is 0.7223151185146152
Correctly predicted data points in cluster 1 is 78.26086956521739%


weigted average F1_Score of all clusters is 0.8170666666666667
weighted average Geometric mean of all clusters is 0.8182542228581068
weighted average of Correctly predicted data points in all clusters is 81.33333333333333%


### Comparing predicted values from Case-1 & Case-2

In [91]:
cluster_wise_F1score(list(x),list(y))

F1_Score of cluster 0 is 0.86
Geometric mean of cluster 0 is 0.8606888267023434
Correctly predicted data points in cluster 0 is 82.6923076923077%


F1_Score of cluster 1 is 0.72
Geometric mean of cluster 1 is 0.7223151185146152
Correctly predicted data points in cluster 1 is 78.26086956521739%


weigted average F1_Score of all clusters is 0.8170666666666667
weighted average Geometric mean of all clusters is 0.8182542228581068
weighted average of Correctly predicted data points in all clusters is 81.33333333333333%
