In [1]:
from data_loader import *

In [2]:
import torch
import pandas as pd
import numpy as np
import torchvision

In [3]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: GeForce GTX 1080 Ti


In [4]:
df_train = pd.read_csv("../datasets/AAAI_dataset/gossip_train.csv")
df_test = pd.read_csv("../datasets/AAAI_dataset/gossip_test.csv")

In [5]:
# define a callable image_transform with Compose
image_transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize(size=(224, 224)),
        torchvision.transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
)

In [6]:
root_dir = "../datasets/AAAI_dataset/Images/"

In [17]:
import os  # when loading file paths
import pandas as pd  # for lookup in annotation file
import spacy  # for tokenizer
import torch
import random
from torch.nn.utils.rnn import pad_sequence  # pad batch
from torch.utils.data import DataLoader, Dataset
from PIL import Image  # Load img
import torchvision.transforms as transforms
import torchtext.vocab as vocab
from torchtext.data import Field

from gensim.models import Word2Vec


spacy_eng = spacy.load("en")

class Vocabulary:
    def __init__(self, freq_threshold, embed_size):
        self.itos = {0: "<PAD>"}
        self.stoi = {"<PAD>": 0}
        self.freq_threshold = freq_threshold
        self.embed_size = embed_size
        self.pre_trained_embed = None

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_w2v(self, sentence_list):
        sentences = [self.tokenizer_eng(sentence) for sentence in sentence_list]
        # train model
        model = Word2Vec(sentences, min_count=self.freq_threshold, size=self.embed_size)  

#         print(model.wv.vocab)  

        model.save('embeddings.txt')

        TEXT = Field()
        TEXT.build_vocab(sentences, min_freq=self.freq_threshold)

        w2v_vec = []
        for token, idx in self.stoi.items():
            if token in model.wv.vocab.keys():
                w2v_vec.append(torch.FloatTensor(model.wv[token]))
            else:
                w2v_vec.append(torch.zeros(self.embed_size))

        TEXT.vocab.set_vectors(self.stoi, w2v_vec, self.embed_size)

        self.pre_trained_embed = torch.FloatTensor(TEXT.vocab.vectors)
        

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 1

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1

                else:
                    frequencies[word] += 1

                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)

        return [
            self.stoi[token] if token in self.stoi else random.choice(list(transformed_dataset_train.vocab.stoi.values()))
            for token in tokenized_text
        ]


class FakeNewsDataset(Dataset):
    """Fake News Dataset"""

    def __init__(self, df, root_dir, image_transform, vocab=None, freq_threshold=5, embed_size=32):
        """
        Args:
            csv_file (string): Path to the csv file with text and img name.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = df
        self.root_dir = root_dir
        self.image_transform = image_transform
        self.vocab = vocab

        # Get img, caption columns
        self.imgs = self.df["image"]
        self.txt = self.df["content"]
        self.label = self.df['label']
        
        if self.vocab == None:
        # Initialize vocabulary and build vocab
            self.vocab = Vocabulary(freq_threshold,  embed_size)
            self.vocab.build_vocabulary(self.txt.tolist())

            self.vocab.build_w2v(self.txt.tolist())

    def __len__(self):
        return len(self.df)
    
    def pre_processing_text(self, sent):
        pass
        
    def __getitem__(self, idx):
        
        img_name = self.root_dir + self.imgs[idx]

        image = Image.open(img_name).convert("RGB")
        
        image = self.image_transform(image)
        
        text = self.txt[idx]
        
        # numericalized_text = [self.vocab.stoi["<SOS>"]]
        # numericalized_text += self.vocab.numericalize(text)
        numericalized_text = self.vocab.numericalize(text)
        # numericalized_text.append(self.vocab.stoi["<EOS>"])


        label = self.label[idx]
        label = torch.tensor(label, dtype=torch.float32).unsqueeze(0)

        sample = {'image': image, 'text': torch.tensor(numericalized_text), 'label': label}

        return sample


class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        
#         print(batch)
        imgs = [item['image'].unsqueeze(0) for item in batch]
        
        labels = [item['label'].unsqueeze(0) for item in batch]
        text = [item['text'] for item in batch]
    
#         batch_imgs, batch_text, batch_labels = batch['image'], batch['text'], batch['label']

#         imgs = [item.unsqueeze(0) for item in batch_imgs]
        imgs = torch.cat(imgs, dim=0)

#         labels = [item.unsqueeze(0) for item in batch_labels]
        labels = torch.cat(labels, dim=0)

#         text = [item for item in batch_text]
        text = pad_sequence(text, batch_first=True, padding_value=self.pad_idx)


        sample = {'image': imgs, 'text': text, 'label': labels}

        return sample


def get_loader(
    df,
    root_dir,
    image_transform,
    vocab=None,
    batch_size=8,
    num_workers=8,
    shuffle=True,
    pin_memory=True,
):
    dataset = FakeNewsDataset(df, root_dir, image_transform, vocab=None, freq_threshold=5, embed_size=32)

    pad_idx = dataset.vocab.stoi["<PAD>"]

    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
#         num_workers=num_workers,
        shuffle=shuffle,
#         pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )

    return loader, dataset

In [18]:
# Run function `preprocessing_for_bert` on the dataset
train_dataloader, transformed_dataset_train = get_loader(df=df_train, root_dir=root_dir+"gossip_train/", image_transform=image_transform, vocab=None)



In [27]:
test_dataloader, transformed_dataset_test = get_loader(df=df_test, root_dir=root_dir+"gossip_test/", image_transform=image_transform, vocab=transformed_dataset_train.vocab)



In [24]:
# transformed_dataset_train.__getitem__(2)['text']
print(transformed_dataset_train.__getitem__(0)['text'])

tensor([   19,    20,    21,    24,    15,    16,  1594,     8,     3,    36,
           23,    13,    28,  5062,     7,   777,    14, 15283,  1059,    55,
           20,    25,  1318,    10,     4,    25,    15,    16, 13167,    58,
           26,     9,  1231,     3,     6,    75,    13,   439,     1,    30,
           29,     1,    73, 16744, 16410,   646,     2,    17,    21,     5,
          280,    27,  2412,  2414,    35,    19,    20,    21,  2156,    37,
           10,     4,     1,  2125,     2,  1866,     2,   223, 11305,    17,
            7,  9569,  4448,     2,    99,    75,    13, 26614,     1,  1247,
           14,    34,     9, 10809,    11,    18,     1,    36,    23,     3,
            1,   119,  3455,    32,    28,   511,     2,    26,     1,    83,
          627,    33,     8,     5,     9,    31,  3034,     6,    10,     4,
         1889,   466,    27,   136,     4,    11,    24,    15,    16,    48,
            7,    11,    18,     1,    36,    23,     3,     1, 

In [25]:
for i, x in enumerate(train_dataloader):
    print(x)
    break

{'image': tensor([[[[ 2.1462,  2.1633,  2.1633,  ...,  1.7352,  1.7523,  1.7523],
          [ 2.1462,  2.1633,  2.1633,  ...,  1.7523,  1.7523,  1.7352],
          [ 2.1462,  2.1804,  2.1633,  ...,  1.7352,  1.7352,  1.7180],
          ...,
          [ 1.6495,  1.6838,  1.7009,  ..., -1.2103, -1.2103, -1.2103],
          [ 1.6838,  1.7180,  1.6838,  ..., -0.5253, -0.5596, -0.5596],
          [ 1.6667,  1.7009,  1.6838,  ..., -0.1314, -0.1314, -0.1143]],

         [[ 2.0784,  2.0959,  2.0959,  ...,  1.4482,  1.4132,  1.4132],
          [ 2.0784,  2.0959,  2.0959,  ...,  1.4657,  1.4132,  1.3957],
          [ 2.0784,  2.1134,  2.0959,  ...,  1.4657,  1.4482,  1.4307],
          ...,
          [ 1.7283,  1.7458,  1.7808,  ..., -1.1779, -1.1779, -1.1779],
          [ 1.7283,  1.7458,  1.7633,  ..., -0.5651, -0.5826, -0.6001],
          [ 1.7108,  1.7283,  1.7283,  ..., -0.1975, -0.1625, -0.1450]],

         [[ 1.6814,  1.6988,  1.6988,  ...,  0.5659,  0.5659,  0.5834],
          [ 1.6814, 

In [23]:
data = iter(train_dataloader)
batch = iter(test_dataloader).next()
# print(batch['label'])

TypeError: an integer is required (got type list)

In [16]:
random.choice(list(transformed_dataset_train.vocab.stoi.values()))

21428

In [30]:
import torch.nn as nn

In [31]:
# embedding = nn.Embedding.from_pretrained(transformed_dataset_train.vocab.pre_trained_embed)

In [32]:
# embedding(batch['text']).shape

In [33]:
fnd_loss_fn = nn.BCELoss()

In [34]:
def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

In [35]:
params_dict_model = {
    'pre_trained_embed': transformed_dataset_train.vocab.pre_trained_embed,
    'latent_dim': 32,
    'combined_fc_out': 64,
    'dec_fc_img_1': 1024,
    'enc_img_dim': 4096,
    'vocab_size': len(transformed_dataset_train.vocab.stoi),
    'embedding_size': 32,
    'max_len': 20,
    'text_enc_dim': 32,
    'latent_size': 32,
    'hidden_size': 32,
    'num_layers': 1,
    'bidirectional': True,
    'img_fc1_out': 1024,
    'img_fc2_out': 32,
    'fnd_fc1': 64,
    'fnd_fc2': 32
}

parameter_dict_opt={'l_r': 3e-5,
                    'eps': 1e-8
                    }

EPOCHS = 1

set_seed(42)    # Set seed for reproducibility


In [36]:
params_dict_model['vocab_size']

31298

In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [38]:
from train_val import *

In [39]:
from torch.utils.tensorboard import SummaryWriter

In [40]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torchvision import models, transforms


# class TextEncoder(nn.Module):
#     def __init__(self, pre_trained_embed, vocab_size, embedding_size, text_enc_dim, num_layers, hidden_size, bidirectional):
#         super(TextEncoder, self).__init__()

#         self.bidirectional = bidirectional
#         self.num_layers = num_layers
#         self.hidden_size = hidden_size

#         self.embedding = nn.Embedding.from_pretrained(pre_trained_embed)

#         self.hidden_factor = (2 if bidirectional else 1) * num_layers

#         self.text_encoder = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, bidirectional=True,
#                                batch_first=True)

#         self.text_enc_fc = torch.nn.Linear(self.hidden_size*self.hidden_factor, text_enc_dim)

#     def forward(self, x):
        
#         x = self.embedding(x)
        
# #         print("emb ", x.shape)

#         _, (hidden, _not) = self.text_encoder(x)
        
# #         print("encoding hidden", hidden.shape)

#         if self.bidirectional or self.num_layers > 1:
#             # flatten hidden state
#             hidden = hidden.view(x.shape[0], self.hidden_size*self.hidden_factor)
#         else:
#             hidden = hidden.squeeze()
        
# #         print("encoding hidden", hidden.shape)
        
#         x = self.text_enc_fc(hidden)

#         return x

# class VisualEncoder(nn.Module):
#     def __init__(self, enc_img_dim, img_fc1_out, img_fc2_out):
#         super(VisualEncoder, self).__init__()
        
#         vgg = models.vgg19(pretrained=True)
#         vgg.classifier = nn.Sequential(*list(vgg.classifier.children())[:1])
        
#         self.vis_encoder = vgg

#         self.vis_enc_fc1 = torch.nn.Linear(enc_img_dim, img_fc1_out)

#         self.vis_enc_fc2 = torch.nn.Linear(img_fc1_out, img_fc2_out)
    
#     def forward(self, x):

#         x_cnn = self.vis_encoder(x)

#         x = self.vis_enc_fc1(x_cnn)

#         x = self.vis_enc_fc2(x)

#         return x, x_cnn

# class VisualDecoder(nn.Module):
#     def __init__(self, latent_dim, dec_fc_img_1, decoded_img):
#         super(VisualDecoder, self).__init__()

#         self.vis_dec_fc1 = nn.Linear(latent_dim, dec_fc_img_1)

#         self.vis_dec_fc2 = nn.Linear(dec_fc_img_1, decoded_img)
    
#     def forward(self, x):

#         x = self.vis_dec_fc1(x)

#         x = self.vis_dec_fc2(x)

#         return x


# class TextDecoder(nn.Module):
#     def __init__(self, vocab_size, embedding_size, max_len, latent_size, hidden_size, num_layers, bidirectional):
#         super(TextDecoder, self).__init__()

#         self.max_len = max_len
#         self.bidirectional = bidirectional
#         self.num_layers = num_layers
#         self.hidden_size = hidden_size
#         self.vocab_size = vocab_size

#         self.hidden_factor = (2 if bidirectional else 1) * num_layers

#         self.text_decoder = nn.LSTM(embedding_size, hidden_size, num_layers=num_layers, bidirectional=bidirectional,
#                                batch_first=True)

#         self.latent2hidden = nn.Linear(latent_size, hidden_size )  ## dec text fc 

#         self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)
    
#     def forward(self, x, max_length):

#         hidden = self.latent2hidden(x)
# #         print("hidden shgape ",hidden.shape)
# #         print("max len ", max_length)

# #         if self.bidirectional or self.num_layers > 1:
# #             # unflatten hidden state
# #             hidden = hidden.view(self.hidden_factor, x.shape[0], self.hidden_size)
# #         else:
# #             hidden = hidden.unsqueeze(0)
        
# #         print("hidden shgape ",hidden.shape)
        
# #         hidden = hidden.unsqueeze(1)
        
# #         print("hidden shgape unsqueezed",hidden.shape)
        
#         repeat_hidden = hidden.unsqueeze(1).repeat(1, max_length, 1)  ## repeat the hidden input to the max_len

#         # decoder forward pass
#         outputs, _ = self.text_decoder(repeat_hidden)
        
#         outputs = outputs.contiguous()
#         print("outputs shape after lstm ", outputs.shape)

#         b,s,_ = outputs.size()

#         # project outputs to vocab
# #         logp = nn.functional.log_softmax(self.outputs2vocab(outputs.view(-1, outputs.size(2))), dim=1)
#         logp = nn.functional.log_softmax(self.outputs2vocab(outputs), dim=-1)
#         print("logp shape before ", logp.shape)
#         logp = logp.view(b, s, self.vocab_size)
#         print("logp shape after ", logp.shape)
        
#         return logp


# class MVAE(nn.Module):

#     def __init__(self, params_dict):
#         super(MVAE, self).__init__()

#         self.text_encoder = TextEncoder(params_dict['pre_trained_embed'], params_dict['vocab_size'], params_dict['embedding_size'], params_dict['text_enc_dim'], params_dict['num_layers'], params_dict['hidden_size'], params_dict['bidirectional'])

#         self.visual_encoder = VisualEncoder(params_dict['enc_img_dim'], params_dict['img_fc1_out'], params_dict['img_fc2_out'])

#         self.text_decoder = TextDecoder(params_dict['vocab_size'], params_dict['embedding_size'], params_dict['max_len'], params_dict['latent_dim'], params_dict['hidden_size'], params_dict['num_layers'], params_dict['bidirectional'])

#         self.visual_decoder = VisualDecoder(params_dict['latent_dim'], params_dict['dec_fc_img_1'], params_dict['enc_img_dim'])

#         self.combined_fc = torch.nn.Linear((params_dict['text_enc_dim'] + params_dict['img_fc2_out']), params_dict['combined_fc_out'])

#         self.fc_mu = nn.Linear(params_dict['combined_fc_out'], params_dict['latent_dim'])
#         self.fc_var = nn.Linear(params_dict['combined_fc_out'], params_dict['latent_dim'])

        


#         self.fnd_module = nn.Sequential(
#                             nn.Linear(params_dict['latent_dim'], params_dict['fnd_fc1']),
#                             nn.Tanh(),
#                             nn.Linear(params_dict['fnd_fc1'], params_dict['fnd_fc2']),
#                             nn.Tanh(),
#                             nn.Linear(params_dict['fnd_fc2'], 1),
#                             nn.Sigmoid()
#         )

        

#     def encode(self, text_ip, img_ip):
#         encoded_text = self.text_encoder(text_ip)

#         encoded_img, cnn_enc_img = self.visual_encoder(img_ip)

#         combined = torch.cat(
#             [encoded_text, encoded_img], dim=1
#         )

#         result = self.combined_fc(combined)

#         # Split the result into mu and var components
#         # of the latent Gaussian distribution
#         mu = self.fc_mu(result)
#         log_var = self.fc_var(result)

#         return cnn_enc_img, mu, log_var


#     def decode(self, z, max_len):
#         recon_text = self.text_decoder(z, max_len)

#         recon_img = self.visual_decoder(z)

#         return [recon_text, recon_img]
    
#     def reparameterize(self, mu, logvar):
#         """
#         Reparameterization trick to sample from N(mu, var) from
#         N(0,1).
#         :param mu: (Tensor) Mean of the latent Gaussian [B x D]
#         :param logvar: (Tensor) Standard deviation of the latent Gaussian [B x D]
#         :return: (Tensor) [B x D]
#         """
#         std = torch.exp(0.5 * logvar)
#         eps = torch.randn_like(std)
#         return eps * std + mu
    
    
#     def forward(self, text_ip, img_ip):

#         ## encoder network 
#         cnn_enc_img, mu, log_var = self.encode(text_ip, img_ip)

#         z = self.reparameterize(mu, log_var)
        
# #         print("text ip shape",text_ip.shape)

#         recon_text, recon_img = self.decode(z, text_ip.shape[1])

#         fnd_out = self.fnd_module(z)

#         return  [fnd_out, recon_text, recon_img, mu, log_var, cnn_enc_img]

In [41]:
from models import *
final_model = MVAE(params_dict_model)
final_model.to(device)

MVAE(
  (text_encoder): TextEncoder(
    (embedding): Embedding(31299, 32)
    (text_encoder): LSTM(32, 32, batch_first=True, bidirectional=True)
    (text_enc_fc): Linear(in_features=64, out_features=32, bias=True)
  )
  (visual_encoder): VisualEncoder(
    (vis_encoder): VGG(
      (features): Sequential(
        (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU(inplace=True)
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (6): ReLU(inplace=True)
        (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (8): ReLU(inplace=True)
        (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), pad

In [42]:
## Instantiate the tensorboard summary writer
writer = SummaryWriter('runs/mvae_exp1')

In [43]:
# Create the optimizer
optimizer = AdamW(final_model.parameters(),
                  lr=parameter_dict_opt['l_r'],
                  eps=parameter_dict_opt['eps'])

# Total number of training steps
total_steps = len(train_dataloader) * EPOCHS

# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # Default value
                                            num_training_steps=total_steps)

In [44]:
def loss_function(ip_text, ip_img, ip_label, mu, log_var, rec_text, rec_img, fnd_label, lambda_wts) -> dict:
    """
    Computes the VAE loss function.
    KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2}
    """
    
    print("correct ", ip_text.shape)
    print(" rec  ", rec_text)
    fnd_loss = fnd_loss_fn(fnd_label, ip_label)

    recons_loss =F.mse_loss(ip_img, rec_img)

    text_loss = nn.NLLLoss(rec_text, ip_text)

    kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0)

    loss = lambda_wts['fnd'] * fnd_loss + lambda_wts['img'] * recons_loss + lambda_wts['kld'] * kld_loss + lambda_wts['text'] * text_loss
    
    return loss

In [45]:
train(model=final_model, loss_fn=loss_function, optimizer=optimizer, scheduler=scheduler, train_dataloader=train_dataloader, val_dataloader=test_dataloader, epochs=EPOCHS, evaluation=True, device=device, param_dict_model=params_dict_model, param_dict_opt=parameter_dict_opt, save_best=True, file_path='./saved_models/mvae_model.pt', writer=writer)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
outputs shape after lstm  torch.Size([8, 6169, 64])


RuntimeError: CUDA out of memory. Tried to allocate 5.76 GiB (GPU 0; 10.92 GiB total capacity; 7.01 GiB already allocated; 3.14 GiB free; 7.04 GiB reserved in total by PyTorch)

In [26]:
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()

In [32]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.tensor([1, 0, 4])

In [33]:
output = loss(m(input), target)

In [46]:
!nvidia-smi

Thu Nov 19 11:23:53 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   34C    P2    56W / 250W |   7965MiB / 11177MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    