In [1]:
import random
import sys
import os
import json
import re
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch

In [2]:
# Root directory that holds the label JSON files and the feature folders.
filepath = '/scratch1/nsuresh/DL/'
with open(filepath + 'training_label.json', 'r') as f:
    file = json.load(f)

# Count how often each token occurs across every caption of every entry.
# The punctuation pattern is the same one `s_split` applies when encoding
# captions, so a word is counted under exactly the spelling that will later
# be looked up (e.g. "man," and "man" both count as "man").
word_count = {}
for d in file:
    for s in d['caption']:
        for word in re.sub(r'[.!,;?]', ' ', s).split():
            word_count[word] = word_count.get(word, 0) + 1

In [3]:
# Vocabulary: only words that occur more than 10 times are kept
# (maps word -> occurrence count, in the iteration order of `word_count`).
word_dict = {w: n for w, n in word_count.items() if n > 10}

# Special tokens occupy ids 0-3; vocabulary words are numbered right after.
useful_tokens = [('<PAD>', 0), ('<SOS>', 1), ('<EOS>', 2), ('<UNK>', 3)]
i2w = dict(enumerate(word_dict, start=len(useful_tokens)))
w2i = {w: i for i, w in i2w.items()}

# Register the special tokens in both directions.
i2w.update((index, token) for token, index in useful_tokens)
w2i.update(useful_tokens)

In [4]:
def s_split(sentence):
    """Encode a caption string as a list of integer token ids.

    The characters ``. ! , ; ?`` are replaced by spaces and the result is
    split on whitespace. Each token found in the module-level ``word_dict``
    is mapped through ``w2i``; every other token becomes 3 (the ``<UNK>``
    id). The sequence is wrapped with 1 (``<SOS>``) and 2 (``<EOS>``).

    Args:
        sentence: caption text.

    Returns:
        list[int]: ``[1, id_1, ..., id_n, 2]``.
    """
    tokens = re.sub(r'[.!,;?]', ' ', sentence).split()
    encoded = [w2i[tok] if tok in word_dict else 3 for tok in tokens]
    return [1] + encoded + [2]

In [5]:
def annotate(label_file):
    """Load a label JSON file and encode every caption it contains.

    The file is read from ``filepath + label_file``. It is expected to hold
    a list of entries, each with an ``'id'`` and a ``'caption'`` list; every
    caption is encoded with ``s_split``.

    Args:
        label_file: file name appended to the module-level ``filepath``.

    Returns:
        list[tuple]: one ``(id, encoded_caption)`` pair per caption, in file
        order, so an id is repeated once for each of its captions.
    """
    with open(filepath + label_file, 'r') as handle:
        entries = json.load(handle)

    pairs = []
    for entry in entries:
        pairs.extend(
            (entry['id'], encoded) for encoded in map(s_split, entry['caption'])
        )
    return pairs

In [6]:
def avi(files_dir):
    """Load every array stored in a feature directory.

    Each entry of the directory ``filepath + files_dir`` is read with
    ``np.load`` and stored under its file name truncated at the first
    occurrence of ``'.npy'``.

    Args:
        files_dir: directory path appended to the module-level ``filepath``.

    Returns:
        dict: truncated file name -> loaded array.
    """
    feature_root = filepath + files_dir
    return {
        name.split('.npy')[0]: np.load(os.path.join(feature_root, name))
        for name in os.listdir(feature_root)
    }

In [7]:
def minibatch(data):
    """Collate ``(features, caption)`` pairs into one zero-padded batch.

    Pairs are ordered by caption length, longest first (ties keep their
    incoming order). The caller's container is left untouched: a sorted
    copy is used instead of sorting ``data`` in place, so any iterable of
    pairs is accepted, not only a list.

    Args:
        data: iterable of ``(features, caption)`` pairs. The feature tensors
            are combined with ``torch.stack`` and must therefore share one
            shape; captions must support ``len()`` and slicing.

    Returns:
        tuple: ``(avi_data, targets, lengths)`` where ``avi_data`` is the
        stacked feature tensor, ``targets`` is a ``(batch, max_len)`` long
        tensor padded with 0 after each caption, and ``lengths`` is the list
        of caption lengths in batch order.

    Raises:
        ValueError: if ``data`` is empty.
    """
    ordered = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
    avi_data, captions = zip(*ordered)
    avi_data = torch.stack(avi_data, 0)

    # Merge captions (from tuple of 1D tensors to one padded 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for row, (cap, end) in enumerate(zip(captions, lengths)):
        targets[row, :end] = cap[:end]
    return avi_data, targets, lengths

In [8]:
class training_data(Dataset):
    """Map-style dataset pairing video features with encoded captions.

    Features are loaded with ``avi`` and ``(id, caption)`` pairs with
    ``annotate``; item ``idx`` is the feature array of the pair's id plus
    the pair's caption, both converted to tensors.

    Args:
        feature_dir: directory (relative to ``filepath``) passed to ``avi``.
        caption_json: label file name (relative to ``filepath``) passed to
            ``annotate``.

    Backward compatibility: when an argument is omitted, the module-level
    globals that the original no-argument constructor used are read
    instead. Their names are swapped relative to their contents:
    ``label_file`` holds the feature directory and ``files_dir`` holds the
    label JSON name. A ``NameError`` is raised if a needed global is not
    defined.
    """

    def __init__(self, feature_dir=None, caption_json=None):
        if feature_dir is None:
            feature_dir = label_file      # legacy global: feature directory
        if caption_json is None:
            caption_json = files_dir      # legacy global: label JSON name
        # Attribute names are kept as in the original for existing readers.
        self.label_file = feature_dir
        self.files_dir = caption_json
        self.avi = avi(feature_dir)
        self.data_pair = annotate(caption_json)

    def __len__(self):
        """Return the number of (id, caption) pairs."""
        return len(self.data_pair)

    def __getitem__(self, idx):
        """Return ``(features, caption)`` tensors for pair ``idx``.

        Out-of-range indices raise ``IndexError`` (from the list lookup)
        rather than ``AssertionError``, so iterating the dataset directly
        terminates normally instead of crashing past the last item.
        """
        avi_file_name, sentence = self.data_pair[idx]
        data = torch.Tensor(self.avi[avi_file_name])
        # Additive noise: integers drawn from [0, 2000) scaled by 1/10000,
        # i.e. values in [0, 0.2), redrawn on every access.
        data += torch.Tensor(data.size()).random_(0, 2000) / 10000.
        return data, torch.Tensor(sentence)

In [9]:
# Keyword arguments shared by both loaders; `minibatch` pads the captions
# of each batch to a common length.
loader_options = dict(batch_size=128, shuffle=True, num_workers=8, collate_fn=minibatch)

# `training_data()` called without arguments reads the module-level names
# `label_file` and `files_dir`, so they must be assigned before each call.
# Despite the names, `label_file` holds the feature directory and
# `files_dir` holds the label JSON file name.
label_file, files_dir = '/training_data/feat', 'training_label.json'
train_dataset = training_data()
train_dataloader = DataLoader(train_dataset, **loader_options)

# NOTE(review): the test loader is shuffled as well - confirm this is intended.
label_file, files_dir = '/testing_data/feat', 'testing_label.json'
test_dataset = training_data()
test_dataloader = DataLoader(test_dataset, **loader_options)