In [1]:
import math
import torch
import glob
import os
import torch.nn as nn
import numpy as np
import torch.utils.data as td
import torch.nn.functional as F
from PIL import Image


from torchvision import datasets, transforms
import torchvision.models as models
from torch.autograd import Variable

# import torchtext
# from torchtext.data.utils import get_tokenizer

class TransformerModel(nn.Module):
    """Causal token-level sequence model built on ``nn.TransformerEncoder``.

    Pipeline: token embedding (scaled by ``sqrt(ninp)``) -> positional encoding
    -> stack of transformer encoder layers with a causal mask -> linear
    projection back to vocabulary size.

    Args:
        ntoken: Vocabulary size; number of embedding rows and decoder outputs.
        ninp: Embedding / model dimension.
        nhead: Number of attention heads passed to each encoder layer.
        nhid: Feed-forward width passed to each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability used by the positional encoding and the
            encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        # Lazily built causal mask; cached between forward calls.
        self.src_mask = None
        # PositionalEncoding is defined in a later cell; it only has to exist
        # by the time the model is instantiated.
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: 0.0 on and below the diagonal, -inf above."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialise embedding/decoder weights uniformly in [-0.1, 0.1] and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Run the model on a batch of token ids.

        Args:
            src: Integer tensor whose first dimension is the sequence length
                (``len(src)`` determines the size of the causal mask).

        Returns:
            Tensor of per-position vocabulary scores; the last dimension has
            size ``ntoken``.
        """
        seq_len = len(src)
        # Rebuild the cached mask when the sequence length changes *or* when
        # the input lives on a different device than the cached mask (the
        # mask is a plain attribute, so ``model.to(device)`` does not move it).
        mask_is_stale = (
            self.src_mask is None
            or self.src_mask.size(0) != seq_len
            or self.src_mask.device != src.device
        )
        if mask_is_stale:
            self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

In [2]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    A ``(max_len, 1, d_model)`` table is precomputed and stored as the buffer
    ``pe``: even feature columns hold sines and odd columns hold cosines of the
    position scaled by geometrically spaced frequencies.

    Args:
        d_model: Feature dimension of the inputs (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angle table; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(0)`` rows of the table to ``x`` and apply dropout.

        Args:
            x: Tensor whose first dimension indexes positions and whose last
                dimension is ``d_model``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [196]:
# Legacy torchtext language-modelling setup: build a Field, load the WikiText2
# splits through it, and build the vocabulary from the training split.
# NOTE(review): `torchtext` and `get_tokenizer` are only imported in
# commented-out lines of the first cell, so on a fresh kernel this cell raises
# NameError unless those imports are restored.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                            #tokenize=get_tokenizer("spacy"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_txt)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    """Turn a text dataset into ``bsz`` parallel token columns on ``device``.

    The text of the first example in ``data`` is numericalized with the
    module-level ``TEXT`` field, trailing tokens that do not fill a complete
    row of ``bsz`` entries are discarded, and the remainder is laid out so
    that each column is one contiguous slice of the original stream.

    Args:
        data: Dataset object exposing ``examples[0].text``.
        bsz: Number of columns (parallel streams) to produce.

    Returns:
        A contiguous 2-D tensor with ``bsz`` columns, moved to ``device``.
    """
    token_ids = TEXT.numericalize([data.examples[0].text])
    # Largest multiple of bsz that fits in the stream.
    usable_len = (token_ids.size(0) // bsz) * bsz
    trimmed = token_ids.narrow(0, 0, usable_len)
    # One row per stream, then transpose so streams become columns.
    streams = trimmed.view(bsz, -1)
    columns = streams.t().contiguous()
    return columns.to(device)

# Number of parallel token streams for training vs. evaluation.
batch_size = 20
eval_batch_size = 10
# Lay each split out as columns via batchify.
# NOTE: `test_data` is rebound to a MyDataset instance in a later cell, so the
# tensor produced here is not what later cells see under that name.
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

In [8]:
# Frozen ResNet-18 used as a per-frame visual feature extractor by MyDataset.
resnet18 = models.resnet18(pretrained=True)
# Keep every child module except the last one (the classification head in
# torchvision's ResNet), so the network outputs pooled features, not logits.
modules=list(resnet18.children())[:-1]
# Inputs are fed as float64 tensors, so the weights are cast to double as well.
resnet18=nn.Sequential(*modules).double()
for p in resnet18.parameters():
    p.requires_grad = False
# Inference mode: without this the BatchNorm layers normalise with per-batch
# statistics and keep updating their running averages on every forward pass,
# so extracted features would depend on which frames share a clip.
resnet18.eval()

In [9]:
class MyDataset(td.Dataset):
    """Audio-visual dataset backed by per-utterance ``.npy`` files.

    Each file is loaded with ``np.load(..., allow_pickle=True)`` and must hold
    a dict (wrapped in a 0-d object array) with the keys ``audio_dim``,
    ``labels_length``, ``labels``, ``video_dim``, ``video``, ``aus`` and the
    key named by ``audio_mode``. Audio, labels and video are padded along
    their first axis to fixed lengths so samples can be batched.

    Args:
        root_dir: Directory containing ``train_npy`` and ``test_npy``.
        mode: ``'train'`` reads ``train_npy``; anything else reads ``test_npy``.
        audio_mode: Key of the audio array to read from each sample dict.
        vid_out: ``'resnet'`` returns per-frame features computed with the
            module-level ``resnet18``; any other value returns the normalised,
            channel-first raw frames.
    """

    def __init__(self, root_dir, mode='train', audio_mode='audio_clean', vid_out='resnet'):
        super(MyDataset, self).__init__()
        self.mode = mode
        self.audio_mode = audio_mode
        self.vid_out = vid_out

        # Both splits use the same padded lengths; only the folder differs.
        split_folder = 'train_npy' if mode == 'train' else 'test_npy'
        self.folder_dir = os.path.join(root_dir, split_folder)
        self.maxlen_label = 129
        self.maxlen_audio = 204
        self.maxlen_video = 155

        # glob returns files in arbitrary order; sort so that an index maps to
        # the same file on every run and machine.
        self.files = sorted(glob.glob(self.folder_dir + "/*/*.npy", recursive=True))

    def __len__(self):
        return len(self.files)

    def __repr__(self):
        return "MyDataset(mode={})".format(self.mode)

    @staticmethod
    def _pad_first_axis(array, current_len, target_len, fill_value=0.0):
        """Append ``target_len - current_len`` rows of ``fill_value``; no-op if that is <= 0."""
        missing = target_len - current_len
        if missing <= 0:
            return array
        padding = np.full((missing,) + tuple(np.shape(array)[1:]), fill_value, dtype=np.float64)
        return np.concatenate((array, padding), axis=0)

    def __getitem__(self, idx):
        """Load, normalise and pad the sample at ``idx``.

        Returns:
            ``((audio, audio_dim), (video, aus, video_dim), (labels, labels_length))``
            where ``audio`` / ``video`` / ``labels`` are padded (zeros for
            audio and video, -1 for labels) and the ``*_dim`` / ``labels_length``
            values describe the unpadded data. In ``'resnet'`` mode ``video``
            is a ``(frames, 512)`` feature matrix and ``video_dim`` is
            ``(frames, 512)``; otherwise ``video`` is channel-first frames and
            ``video_dim`` is ``(frames, channels, height, width)``.
        """
        # NOTE: allow_pickle=True unpickles file contents; only load trusted files.
        record = np.load(self.files[idx], allow_pickle=True).item()

        audio_dim = record.get("audio_dim")
        audio = self._pad_first_axis(record.get(self.audio_mode), audio_dim[0], self.maxlen_audio)

        labels_length = record.get("labels_length")
        labels = self._pad_first_axis(
            record.get("labels"), labels_length, self.maxlen_label, fill_value=-1.0
        )

        video_dim = record.get("video_dim")
        video = record.get("video")
        aus = record.get("aus")

        # Per-channel statistics over frames, height and width (channels are last).
        mean = np.mean(video, axis=(0, 1, 2))
        std = np.std(video, axis=(0, 1, 2))

        # Re-standardise each channel to these target statistics (the values
        # torchvision documents for its ImageNet-pretrained models).
        new_mean = np.array([0.485, 0.456, 0.406])
        new_std = np.array([0.229, 0.224, 0.225])
        normalized_video = new_mean + (video - mean) * (new_std / std)

        # Channels-last -> channels-first: axis 3 moves to position 1.
        rolled_video = np.rollaxis(normalized_video, 3, 1)
        video_dim = (video_dim[0], video_dim[3], video_dim[1], video_dim[2])

        if self.vid_out == 'resnet':
            tensor_video = torch.from_numpy(rolled_video).type(torch.DoubleTensor)
            # Pure feature extraction: no autograd graph is needed.
            with torch.no_grad():
                resnet_video = resnet18(tensor_video).numpy()
            # Drop the two trailing singleton spatial axes left by pooling.
            resnet_video = resnet_video.squeeze((2, 3))

            resnet_video_dim = (video_dim[0], 512)
            resnet_video = self._pad_first_axis(
                resnet_video, resnet_video_dim[0], self.maxlen_video
            )
            return (audio, audio_dim), (resnet_video, aus, resnet_video_dim), (labels, labels_length)

        # Always bind final_video, including when the clip already has
        # maxlen_video frames and no padding is required.
        final_video = self._pad_first_axis(rolled_video, video_dim[0], self.maxlen_video)
        return (audio, audio_dim), (final_video, aus, video_dim), (labels, labels_length)

In [10]:
bptt = 35
def get_batch(source, i):
    """Slice an (input, target) pair for next-step prediction starting at row ``i``.

    At most ``bptt`` rows are taken, fewer when the end of ``source`` is near.
    The target is the same window shifted one row forward and flattened.

    Args:
        source: Tensor indexed along its first dimension.
        i: Index of the first input row.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the flattened ``source[i+1:i+1+seq_len]``.
    """
    rows_left = len(source) - 1 - i
    seq_len = bptt if bptt < rows_left else rows_left
    stop = i + seq_len
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.view(-1)

In [11]:
# Build the test split with ResNet video features.
# NOTE(review): the root directory is a hard-coded absolute path specific to
# one machine, and this rebinds `test_data`, which an earlier cell assigned
# from batchify(test_txt, ...).
test_data = MyDataset('/home/pi242/xai/CSE291E_AVSR_project/gdrive',mode='test',vid_out='resnet')

In [12]:
# Shuffled loader yielding batches of 5 samples; drop_last=True discards a
# final batch that would contain fewer than 5.
dummydataloader = td.DataLoader(test_data, batch_size=5, shuffle=True, drop_last=True)

In [50]:
# Project-local `encoders` and `decoder` modules are imported and then
# reloaded so that edits to them take effect without restarting the kernel.
import importlib
import encoders
import decoder
importlib.reload(encoders)
importlib.reload(decoder)
# NOTE(review): the meaning of the positional arguments is not visible here.
# The recorded output of the smoke-test cell shows 240-dim audio and 512-dim
# video inputs both mapped to 512-dim outputs, so the first two arguments are
# presumably (input_dim, model_dim) — confirm against the encoders module.
aenc = encoders.AudioTEncoder(240, 512, 4, 512, 4)
venc = encoders.VideoTEncoder(512, 512, 4, 512, 4)

# NOTE(review): presumably (model_dim, num_heads) — confirm against the decoder module.
declayer = decoder.CrossModalTDecoderLayer(512, 4)

In [55]:
# Smoke test: push one batch through the audio encoder, the video encoder and
# the cross-modal decoder layer, printing shapes along the way.
for i, sample in enumerate(dummydataloader):
    print(i)
    # sample = ((audio, audio_dim), (video, aus, video_dim), (labels, labels_length));
    # after collation each *_dim is a list with one tensor per dimension, so
    # element [0] holds the unpadded length of every item in the batch.
    print(sample[0][1])
    print(sample[1][2])
    audio_lens = sample[0][1][0]
    video_lens = sample[1][2][0]

    # Swap the first two axes: batch-first -> sequence-first.
    aip = sample[0][0].permute(1, 0, 2)
    vip = sample[1][0].permute(1, 0, 2)
    print(aip.shape, vip.shape)

    # The batch arrives as float64 tensors; cast with .to() rather than
    # torch.tensor(existing_tensor, ...), which copy-constructs from a tensor
    # and emits a UserWarning.
    # NOTE(review): if the encoders/decoder layer are nn.Modules, prefer
    # calling them directly (as done for venc) over .forward so hooks run.
    aop = aenc.forward(aip.to(torch.float32), audio_lens)
    vop = venc(vip.to(torch.float32), video_lens)
    print(aop.shape, vop.shape)
    # Sum of the encoder output over the unpadded frames of the first item.
    print(torch.sum(aop[:audio_lens[0].item(), 0, :]))
    print()
    dummy = declayer.forward(aop, vop, audio_lens, video_lens)
    print(dummy.shape)
    # One batch is enough for a shape check.
    break

0
[tensor([100,  53,  77, 129,  59]), tensor([240, 240, 240, 240, 240])]
[tensor([77, 42, 60, 99, 46]), tensor([512, 512, 512, 512, 512])]
torch.Size([204, 5, 240]) torch.Size([155, 5, 512])
torch.Size([204, 5, 512]) torch.Size([155, 5, 512])
100

torch.Size([563, 5, 512])


In [13]:
# Fetch a single un-batched sample. `test_data` was built with
# vid_out='resnet', so `final_video` here holds the padded ResNet feature
# matrix and `video_dim` its (frames, 512) unpadded size.
(audio, audio_dim), (final_video, aus, video_dim), (labels, labels_length) = test_data[0]

In [16]:
# Padded audio shape: the first axis is padded up to maxlen_audio (204).
print(audio.shape)

(204, 240)


In [14]:
# Unpadded sizes recorded in the sample, followed by the label vector, which
# is padded with -1 up to maxlen_label (129).
print(audio_dim, video_dim, labels_length)
print(labels, len(labels))

(92, 240) (71, 512) 53
[17. 16.  7.  1. 22. 10. 17. 23. 21.  3. 16.  6.  1.  7. 11.  9. 10. 22.
  1. 10. 23. 16.  6. 20.  7.  6.  1.  3. 16.  6.  1. 22. 10. 11. 20. 22.
 27.  1. 21. 11. 26.  1. 18.  7. 17. 18. 14.  7.  1.  6. 11.  7.  6. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1.] 129


In [9]:
# Show the sample's "aus" entry (the recorded output is an empty list).
# NOTE(review): this cell's execution count is lower than that of the cell
# that defines `aus`, so it was run out of order — confirm on a fresh kernel.
print(aus)

[]


In [83]:
def generate_square_subsequent_mask(sz):
    """Build an ``(sz, sz)`` causal attention mask.

    Entries strictly above the main diagonal are ``float('-inf')`` (masked);
    entries on and below the diagonal are ``0.0`` (unmasked).
    """
    all_masked = torch.full((sz, sz), float('-inf'))
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(all_masked, diagonal=1)

In [84]:
# Sanity check: row r may attend to columns 0..r (0.0); later columns are -inf.
print(generate_square_subsequent_mask(3))

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])


In [21]:
# Scratch check of slice assignment: positions 2, 3 and 4 of a length-10 zero
# vector are set to 1 in place.
a = torch.zeros((10, ))
a[2:5] = 1
print(a)

tensor([0., 0., 1., 1., 1., 0., 0., 0., 0., 0.])
