In [None]:
import argparse
import copy
import gc
import math
import os
import random
import time
import h5py

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn

from torch.utils.data import DataLoader
from torch.autograd import Variable

In [None]:
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` is a trap with argparse: ``bool("False")`` is ``True`` because
    any non-empty string is truthy. This converter accepts the usual spellings
    and rejects everything else with a proper argparse error.

    :param value: raw command-line token (or an actual ``bool``)
    :return: the parsed boolean
    :raises argparse.ArgumentTypeError: if the token is not a recognised boolean
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))


parser = argparse.ArgumentParser()

# Name of the experiment; only used to build the default export directory below.
experimentName = 'medGAN.MIMIC-III'

parser.add_argument("--dataset-path", type=str, default=os.path.expanduser('~/workspace/data/mimic-iii-processed/BINARY.h5'), help="Dataset file")

parser.add_argument("--n-epochs", type=int, default=100, help="number of epochs of training")
parser.add_argument("--n-epochs-ae", type=int, default=100, help="number of epochs of autoencoder training")
parser.add_argument("--batch-size", type=int, default=64, help="size of the batches")
parser.add_argument("--lr", type=float, default=0.001, help="adam: learning rate")
parser.add_argument("--b1", type=float, default=0.9, help="adam: decay of first order momentum of gradient")
parser.add_argument("--b2", type=float, default=0.999, help="adam: decay of second order momentum of gradient")
parser.add_argument("--n-cpu", type=int, default=32, help="number of cpu threads to use during batch generation")
parser.add_argument('--n-critic', type=int, default=5, help='number of Discriminator iterations per each Generator iteration')
parser.add_argument('--clamp', type=float, default=0.01, help='weight clipping value')
parser.add_argument("--cuda", type=_str2bool, default=True, help="CUDA activation")
parser.add_argument("--multiple-gpu", type=_str2bool, default=True, help="use multiple GPUs")
parser.add_argument("--num-gpu", type=int, default=1, help="Number of GPUs in case of multiple GPU")
parser.add_argument("--latent-dim", type=int, default=128, help="dimensionality of the latent space")
parser.add_argument("--weight-decay", type=float, default=0.0001, help="l2 regularization")

# The hyphenated spelling matches every other flag; the original underscore
# spelling is kept as an alias so existing invocations keep working.
parser.add_argument("--sample-interval", "--sample_interval", dest="sample_interval", type=int, default=100, help="interval between samples")
parser.add_argument("--epoch-time-show", type=_str2bool, default=True, help="show the time spent on each epoch")
parser.add_argument("--epoch-save-model-freq", type=int, default=100, help="number of epochs per model save")
parser.add_argument("--minibatch-averaging", type=_str2bool, default=False, help="Minibatch averaging")

parser.add_argument("--pretrained", type=_str2bool, default=False, help="Use pretrained model")
parser.add_argument("--pretrained-ae", type=_str2bool, default=False, help="Use pretrained model for autoencoder")

parser.add_argument("--expPATH", type=str, default=os.path.expanduser('~/workspace/pytorch-exports/models/{}'.format(experimentName)), help="Export Path")

# An empty argv is passed on purpose: inside a notebook sys.argv belongs to the
# kernel, so only the defaults above (or later manual overrides of `opt`) apply.
opt = parser.parse_args([])
print(opt)

In [None]:
######################
### Initialization ###
######################

# Create the experiment export directory (and any missing parents).
# os.makedirs avoids spawning a shell, so paths containing spaces or shell
# metacharacters are handled correctly and the call also works off POSIX.
os.makedirs(opt.expPATH, exist_ok=True)

# A fresh seed is drawn on every run and printed so the run can be repeated;
# assign a constant here instead (e.g. opt.seed = 1024) to pin it.
opt.seed = random.randint(1, 10000)

print('Random Seed: {}'.format(opt.seed))
random.seed(opt.seed)
torch.manual_seed(opt.seed)
np.random.seed(opt.seed)
cudnn.benchmark = True

if opt.cuda and not torch.cuda.is_available():
    # Without this fallback the later `.cuda()` calls fail on CPU-only hosts.
    print("WARNING: CUDA was requested BUT no CUDA device is available, falling back to CPU...")
    opt.cuda = False
elif torch.cuda.is_available() and not opt.cuda:
    print("WARNING: You have a CUDA device BUT it is not in use...")

device = torch.device("cuda:0" if opt.cuda else "cpu")
print('using \'{}\' as the tensor processor'.format(device))

In [None]:
#################################
### Reading Dataset from File ###
#################################

# Read the whole 'dataset' entry of the HDF5 file into memory; the file handle
# is closed as soon as the array has been materialised.
input_data = None
with h5py.File(opt.dataset_path, 'r') as h5_file:
    _stored = h5_file.get('dataset')
    input_data = _stored[()]

# Axis 0 is treated as samples and axis 1 as features by the rest of the notebook.
_dims = input_data.shape
total_samples = _dims[0]
feature_size = _dims[1]

In [None]:
#####################
### Dataset Model ###
#####################

class EHRDataset(torch.utils.data.Dataset):
    """Thin ``Dataset`` wrapper exposing the rows of a 2-D array-like.

    The number of rows and columns at construction time are recorded as
    ``sample_size`` and ``feature_size``.
    """

    def __init__(self, dataset):
        rows, cols = dataset.shape[0], dataset.shape[1]
        self.dataset = dataset
        self.sample_size = rows
        self.feature_size = cols

    def __len__(self):
        # Length is read from the wrapped object each time, not from sample_size.
        n_rows = self.dataset.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Plain indexing: one row for an integer index.
        row = self.dataset[idx]
        return row

In [None]:
##########################
### Dataset Processing ###
##########################

# First 80% of the rows go to training, the remainder to testing (no shuffling).
_split_at = int(0.8 * total_samples)
train_data, test_data = input_data[:_split_at], input_data[_split_at:]

_summary = 'total samples: {}, features: {}'.format(total_samples, feature_size)
print(_summary)
_shapes = 'training data shape: {}, testing data shape: {}, dataset type: {}'.format(
    train_data.shape, test_data.shape, input_data.dtype)
print(_shapes)

In [None]:
# Both loaders are configured identically: same batch size, shuffling on,
# and the same number of worker processes.
_loader_options = dict(
    batch_size=opt.batch_size,
    shuffle=True,
    num_workers=opt.n_cpu,
)

training_dataloader = DataLoader(EHRDataset(dataset=train_data), **_loader_options)

testing_dataloader = DataLoader(EHRDataset(dataset=test_data), **_loader_options)

In [None]:
def weightsInit(m):
    """
    Custom in-place weight initialization, meant to be used with ``module.apply``.

    - layers whose class name contains ``Conv``: weights ~ N(0, 0.02)
    - layers whose class name contains ``BatchNorm``: weights ~ N(1, 0.02), bias = 0
    - ``nn.Linear`` (exact type): Xavier-uniform weights, bias = 0.01

    Layers built without a bias (``bias=False``) or without affine parameters
    (``affine=False``) expose ``None`` for those attributes; they are skipped
    instead of raising.

    :param m: the module to initialize (inspected by class name / type)
    :return: None, the module is modified in place
    """
    classname = m.__class__.__name__
    if 'Conv' in classname:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif 'BatchNorm' in classname:
        if getattr(m, 'weight', None) is not None:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
        if getattr(m, 'bias', None) is not None:
            nn.init.constant_(m.bias.data, 0)
    # Exact type match on purpose: subclasses of nn.Linear keep their own init.
    if type(m) is nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.01)

In [None]:
########################
### AutoEncoder Loss ###
########################

class AutoEncoderLoss(nn.Module):
    """Binary cross-entropy summed over dim 1 and averaged over dim 0.

    A small epsilon is added inside both logarithms so that predictions of
    exactly 0 or 1 do not produce ``-inf``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        eps = 1e-12
        # Log-likelihood contribution of the positive and negative targets.
        positive_part = target * torch.log(input + eps)
        negative_part = (1. - target) * torch.log(1. - input + eps)
        per_sample = -torch.sum(positive_part + negative_part, 1)
        return torch.mean(per_sample, 0)

In [None]:
#########################
### AutoEncoder Model ###
#########################

class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder over ``feature_size`` inputs.

    The encoder maps the input to a 128-wide Tanh code; the decoder maps a
    128-wide code back to ``feature_size`` values squashed by a Sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Encoder is built before the decoder, as in the original definition.
        encoder_layers = (nn.Linear(feature_size, 128), nn.Tanh())
        decoder_layers = (nn.Linear(128, feature_size), nn.Sigmoid())

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and reconstruct it."""
        return self.decoder(self.encoder(x))

    def decode(self, x):
        """Run only the decoder on an already-encoded ``x``."""
        return self.decoder(x)

In [None]:
#############################
### Generator Model ###
#############################

# Output has shape (batch, genDim); it is fed to the autoencoder's decoder.
class Generator(nn.Module):
    """Two residual blocks (Linear -> BatchNorm -> activation, plus skip).

    Each block adds its input to its output, so the noise vector and both
    hidden layers must share one width: ``opt.latent_dim`` has to equal
    ``genDim``.

    :raises ValueError: if ``opt.latent_dim`` differs from ``genDim``
    """

    def __init__(self):
        super(Generator, self).__init__()
        self.genDim = 128
        if opt.latent_dim != self.genDim:
            # Fail at construction with a clear message instead of a shape
            # mismatch deep inside forward().
            raise ValueError(
                "Generator uses residual connections and needs latent_dim == {} (got {})".format(
                    self.genDim, opt.latent_dim))
        self.linear1 = nn.Linear(opt.latent_dim, self.genDim)
        self.bn1 = nn.BatchNorm1d(self.genDim, eps=0.001, momentum=0.01)
        self.activation1 = nn.ReLU()
        # The second layer consumes the genDim-wide output of the first block,
        # so its input width is genDim, not the latent size.
        self.linear2 = nn.Linear(self.genDim, self.genDim)
        self.bn2 = nn.BatchNorm1d(self.genDim, eps=0.001, momentum=0.01)
        self.activation2 = nn.Tanh()

    def forward(self, x):
        """Map noise ``x`` of shape (batch, latent_dim) to (batch, genDim)."""
        # Layer 1
        residual = x
        temp = self.activation1(self.bn1(self.linear1(x)))
        out1 = temp + residual

        # Layer 2
        residual = out1
        temp = self.activation2(self.bn2(self.linear2(out1)))
        out2 = temp + residual
        return out2

In [None]:
###########################
### Discriminator Model ###
###########################

class Discriminator(nn.Module):
    """Three-layer MLP scoring a ``feature_size``-wide record with one value in (0, 1)."""

    def __init__(self):
        super().__init__()

        # Width of the first hidden layer; the second is half of it.
        self.disDim = 256
        half_width = self.disDim // 2

        stack = [
            nn.Linear(feature_size, self.disDim),
            nn.ReLU(),
            nn.Linear(self.disDim, half_width),
            nn.ReLU(),
            nn.Linear(half_width, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the score for each row of ``x``."""
        return self.model(x)



In [None]:
############################
### Model Initialization ###
############################

# Release anything left over from a previous run of this cell.
gc.collect()
torch.cuda.empty_cache()

autoencoder = Autoencoder()
generator = Generator()
discriminator = Discriminator()

# Tensor type used to cast batches coming out of the data loaders.
Tensor = torch.FloatTensor

# Gradient seeds passed to backward() during WGAN training (+1 / -1).
one = torch.FloatTensor([1])
mone = one * -1

if opt.cuda:
    autoencoder.cuda()
    generator.cuda()
    discriminator.cuda()
    one = one.cuda()
    mone = mone.cuda()
    Tensor = torch.cuda.FloatTensor

# The generator step also fine-tunes the autoencoder's decoder, at its own
# smaller learning rate.
generator_params = [{'params': generator.parameters()}, {'params': autoencoder.decoder.parameters(), 'lr': 1e-4}]

# --b1 / --b2 were parsed but never reached the optimizers; their defaults
# equal Adam's own defaults, so behaviour is unchanged unless they are set.
# NOTE(review): opt.weight_decay is also parsed but unused; it is left
# unapplied here because its default (1e-4) would change training, confirm intent.
adam_betas = (opt.b1, opt.b2)

optimizer_A = torch.optim.Adam(autoencoder.parameters(), lr=opt.lr, betas=adam_betas)
optimizer_G = torch.optim.Adam(generator_params, lr=opt.lr, betas=adam_betas)
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=opt.lr, betas=adam_betas)

# Initialization is in place, so running it after the optimizers were built is fine.
generator.apply(weightsInit)
discriminator.apply(weightsInit)
autoencoder.apply(weightsInit)

In [None]:
#####################################
###### AutoEncoder Training #########
#####################################

criterion = AutoEncoderLoss()

if not opt.pretrained_ae:
    for epoch in range(opt.n_epochs_ae):
        # ---- training pass ----
        autoencoder.train()
        for batch in training_dataloader:
            batch = batch.type(Tensor)
            generated = autoencoder(batch)
            loss_A = criterion(generated, batch)
            optimizer_A.zero_grad()
            loss_A.backward()
            optimizer_A.step()

        # ---- evaluation pass ----
        # errors: number of entries whose rounded reconstruction is off by more
        # than 0.5; testing_loss: sum of the per-batch losses.
        errors = 0
        testing_loss = 0.0
        autoencoder.eval()
        # No gradients are needed here; without no_grad every batch would keep
        # its autograd graph alive through the accumulated loss tensor.
        with torch.no_grad():
            for batch in testing_dataloader:
                batch = batch.type(Tensor)
                generated = autoencoder(batch)
                mismatches = torch.abs(generated.round() - batch) > 0.5
                errors += int(mismatches.sum().item())
                testing_loss += criterion(generated, batch).item()

        print("[Epoch {:3d}/{:3d} of autoencoder training] [Loss: {:10.2f}] [errors: {:6d}]".format(epoch + 1, opt.n_epochs_ae, testing_loss, errors), flush=True)
    torch.save(autoencoder.state_dict(), os.path.join(opt.expPATH, 'autoencoder.model'))
else:
    # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
    autoencoder.load_state_dict(torch.load(os.path.join(opt.expPATH, 'autoencoder.model'), map_location=device))

In [None]:
# Count, over the whole test split, the entries whose rounded reconstruction
# differs from the input by more than 0.5.
errors = 0
with torch.no_grad():  # inference only, no autograd graph needed
    for batch in testing_dataloader:
        batch = batch.type(Tensor)
        generated = autoencoder(batch)
        mismatches = torch.abs(generated.round() - batch) > 0.5
        errors += int(mismatches.sum().item())
print("total number of bad digits: {}".format(errors))

In [None]:
if not opt.pretrained:
    batches_done = 0
    log_every = 100  # generator steps between two progress lines

    discriminator.train()
    generator.train()

    gen_iterations = 0  # not updated in this cell; kept in case later cells read it
    for epoch in range(opt.n_epochs):
        epoch_start = time.time()

        for batch in training_dataloader:
            # ---------------------
            #  Train Discriminator
            # ---------------------
            batch = batch.type(Tensor)

            # Re-enable discriminator gradients (they are switched off for the
            # generator step below).
            for dp in discriminator.parameters():
                dp.requires_grad = True

            # Note: the same real batch is reused for all n_critic updates.
            for _ in range(opt.n_critic):
                # Weight clipping of the discriminator parameters.
                for dp in discriminator.parameters():
                    dp.data.clamp_(-opt.clamp, opt.clamp)

                # reset gradients of discriminator
                optimizer_D.zero_grad()

                # Real term: accumulated with gradient seed +1.
                loss_D_real = torch.mean(discriminator(batch), dim=0)
                loss_D_real.backward(one)

                # Sample noise as generator input
                z = torch.randn(batch.shape[0], opt.latent_dim, device=device)
                # Generate a batch of synthetic records
                fake_batch = autoencoder.decoder(generator(z))

                # Fake term: detached so only the discriminator gets gradients,
                # accumulated with gradient seed -1.
                loss_D_fake = torch.mean(discriminator(fake_batch.detach()), dim=0)
                loss_D_fake.backward(mone)

                # Optimizer step
                optimizer_D.step()

            # The quantity actually minimised above is real - fake (seeds +1 and
            # -1), so that is what gets reported.
            loss_D_value = loss_D_real.item() - loss_D_fake.item()

            # -----------------
            #  Train Generator
            # -----------------

            # Freeze the discriminator so the generator step does not
            # accumulate gradients in it.
            for dp in discriminator.parameters():
                dp.requires_grad = False

            optimizer_G.zero_grad()

            # Sample noise as generator input
            z = torch.randn(batch.shape[0], opt.latent_dim, device=device)

            # Generate a batch of synthetic records through the decoder
            fake_batch = autoencoder.decoder(generator(z))

            loss_G = torch.mean(discriminator(fake_batch), dim=0)
            loss_G.backward(one)
            optimizer_G.step()
            batches_done += 1

            if batches_done % log_every == 0:
                print('[Epoch {:3d}/{:3d}] [Batch {:3d}/{:3d}] [D loss: {:.5f}] [G loss: {:.5f}]'.format(epoch + 1, opt.n_epochs, batches_done % len(training_dataloader), len(training_dataloader), loss_D_value, loss_G.item()))

        print('[Epoch {:3d}/{:3d}] [Time: {:.2f}] [D loss: {:.5f}] [G loss: {:.5f}]'.format(epoch + 1, opt.n_epochs, time.time() - epoch_start, loss_D_value, loss_G.item()))

    torch.save(generator.state_dict(), os.path.join(opt.expPATH, 'generator.model'))
    torch.save(discriminator.state_dict(), os.path.join(opt.expPATH, 'discriminator.model'))
else:
    # map_location lets checkpoints written on GPU be restored on a CPU-only host.
    generator.load_state_dict(torch.load(os.path.join(opt.expPATH, 'generator.model'), map_location=device))
    discriminator.load_state_dict(torch.load(os.path.join(opt.expPATH, 'discriminator.model'), map_location=device))

In [None]:
# Put all three networks into evaluation mode before sampling, so the
# generator's BatchNorm layers stop updating and use their running statistics.
autoencoder.eval()
generator.eval()
# Last expression of the cell: its return value is what the notebook displays.
discriminator.eval()

In [None]:
# Draw num_fake_batches batches of noise, decode them into records, round the
# outputs to the nearest integer and save everything as a single .npy file.
num_fake_batches = 80
fake_batches = []
with torch.no_grad():  # pure inference, no autograd graph needed
    for _ in range(num_fake_batches):
        # Noise width follows the configured latent size instead of a literal 128.
        z = torch.randn(opt.batch_size, opt.latent_dim, device=device)
        fake_batch = autoencoder.decoder(generator(z))
        fake_batches.append(fake_batch.round().cpu())

# Concatenate once at the end instead of growing a tensor inside the loop.
if fake_batches:
    fake_data = torch.cat(fake_batches, 0)
else:
    fake_data = torch.zeros((0, feature_size), device='cpu')
np.save(os.path.join(opt.expPATH, "synthetic.npy"), fake_data.numpy(), allow_pickle=False)