In [1]:
# Supress warnings

import warnings
warnings.filterwarnings('ignore')

# Import necessary libraries
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns, datetime, pytz, re, os

from sdv.tabular import CTGAN, CopulaGAN

In [2]:
folder_path = '' #r"drive/MyDrive/LJMUdataset/Insurancedata/"

In [3]:
def token():
  current_time = datetime.datetime.now(pytz.timezone('Asia/Kolkata'))
  return re.sub("[^0-9]",'',str(current_time).split('+')[0])

def create_directory_for_each_run(token_code):
    new_direc_path = os.path.join(folder_path,token_code)
    os.mkdir(new_direc_path)
    return new_direc_path

In [4]:
# load 10% train data

df_10 = pd.read_excel(folder_path+'10_modelling_data.xlsx')
df_10.head()

Unnamed: 0,Gender,Age,Previously_Insured,Vehicle_Damage,Annual_Premium,Vintage,Response,Less_than_1_year,More_than_2_years,Region_Code_10,...,Region_Code_6,Region_Code_8,Policy_Sales_Channel_122,Policy_Sales_Channel_124,Policy_Sales_Channel_152,Policy_Sales_Channel_154,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_160,Policy_Sales_Channel_26
0,1,22,0,1,2630,207,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,29,1,0,36203,200,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,21,1,0,27240,33,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,59,0,1,37559,154,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,1,24,1,0,23031,198,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## **2. Synthetic Data Generation using CTGAN and CopulaGAN**

In [5]:
# Create instances of CTGAN and CopulaGAN model.

ctgan_model = CTGAN(embedding_dim=128, generator_lr=0.0002, discriminator_lr=0.0002, batch_size=500, discriminator_steps=5, epochs=300)
cpgan_model = CopulaGAN(embedding_dim=128, generator_lr=0.0002, discriminator_lr=0.0002, batch_size=500, discriminator_steps=5, epochs=300)

In [6]:
%%time
# Fit the data using CTGAN models

ctgan_model.fit(df_10)

CPU times: total: 19h 56min 53s
Wall time: 5h 1min 18s


In [7]:
%%time
# Fit the data using CopulaGAN models

cpgan_model.fit(df_10)

CPU times: total: 18h 32min 15s
Wall time: 4h 40min 12s


In [8]:
# save models

token_code = token()
model_folder_path = os.path.join(create_directory_for_each_run(token_code), token_code)
ctgan_model.save(model_folder_path+'ctgan_model.pkl')
cpgan_model.save(model_folder_path+'cpgan_model.pkl')

In [9]:
# sample synthetic data

ct_syn_data = ctgan_model.sample(num_rows=len(df_10))
cp_syn_data = cpgan_model.sample(num_rows=len(df_10))

In [10]:
# Evaluate synthetic data from CTGAN model

from sdv.metrics.tabular import KSTest, LogisticDetection

kst_ct = KSTest.compute(df_10, ct_syn_data)
ld_ct = LogisticDetection.compute(df_10, ct_syn_data)
kst_ct, ld_ct

(0.9843880738951312, 0.7587586164715724)

In [11]:
# Evaluate synthetic data from CopulaGAN model

kst_cp = KSTest.compute(df_10, cp_syn_data)
ld_cp = LogisticDetection.compute(df_10, cp_syn_data)
kst_cp, ld_cp

(0.9906376949416493, 0.8234830470424263)

In [16]:
# Condition to synthesize data to handle class imbalance

from sdv.sampling import Condition

condtition = Condition({'Response':1}, num_rows=223686)

In [17]:
%%time
# Resample synthetic data points from CTGAN model

ct_syn_data_10 = ctgan_model.sample(num_rows=26750)
ct_syn_data_20 = ctgan_model.sample(num_rows=53501)
ct_syn_data_40 = ctgan_model.sample(num_rows=107002)
ct_syn_data_1 = ctgan_model.sample_conditions(conditions=[condtition])

Sampling conditions: 100%|████████████████████████████████████████████████████| 223686/223686 [06:20<00:00, 587.67it/s]

CPU times: total: 21min 1s
Wall time: 6min 58s





In [18]:
%%time
# Resample synthetic data points from CopulaGAN model

cp_syn_data_10 = cpgan_model.sample(num_rows=26750)
cp_syn_data_20 = cpgan_model.sample(num_rows=53501)
cp_syn_data_40 = cpgan_model.sample(num_rows=107002)
cp_syn_data_1 = cpgan_model.sample_conditions(conditions=[condtition])

Sampling conditions: 100%|████████████████████████████████████████████████████| 223686/223686 [05:39<00:00, 658.85it/s]

CPU times: total: 16min 37s
Wall time: 6min 11s





In [21]:
# saving data to excel

ct_syn_data_10.to_excel(model_folder_path+'ct_syn_data_10_train.xlsx')
ct_syn_data_20.to_excel(model_folder_path+'ct_syn_data_20_train.xlsx')
ct_syn_data_40.to_excel(model_folder_path+'ct_syn_data_40_train.xlsx')
ct_syn_data_1.to_excel(model_folder_path+'ct_syn_data_1_train.xlsx')

cp_syn_data_10.to_excel(model_folder_path+'cp_syn_data_10_train.xlsx')
cp_syn_data_20.to_excel(model_folder_path+'cp_syn_data_20_train.xlsx')
cp_syn_data_40.to_excel(model_folder_path+'cp_syn_data_40_train.xlsx')
cp_syn_data_1.to_excel(model_folder_path+'cp_syn_data_1_train.xlsx')