In [1]:
from nltk.tokenize import word_tokenize
import numpy as np
import pickle
import json

In [2]:
# Number of samples in the preprocessed subset; it also selects the input file
# and is reused later to compute the split boundaries.
num_data = 5000

# JSON dump of the question/answer records for that subset size.
f_name = 'processed_data/data_%d.json' % num_data

# json.load accepts a binary file object and works out the UTF encoding itself.
with open(f_name, 'rb') as json_file:
    data = json.load(json_file)

In [3]:
# Preview the first two loaded records to check which fields each one carries.
data[:2]

[{'ques_id': 448309007,
  'img_path': 'val2014/COCO_val2014_000000448309.jpg',
  'question': "what kind of plant is the table's centerpiece?",
  'question_type': 'what kind of',
  'ann': 'fern'},
 {'ques_id': 58254000,
  'img_path': 'val2014/COCO_val2014_000000058254.jpg',
  'question': 'what is it pointing at?',
  'question_type': 'what is',
  'ann': 'tv'}]

In [4]:
def token_ques_build_vocab(data, vocab, ans_vocab, max_length=20, tokenizer=None):
    """Tokenize every question, frame and pad it, and grow both vocabularies.

    Each record gains a ``'ques_tokens'`` entry of exactly ``max_length``
    tokens laid out as ``<start> q_1 ... q_n <end> <pad> ... <pad>``.
    Questions with more than ``max_length - 2`` tokens are truncated so that
    every row has the same width and ``<end>`` is always present.

    Args:
        data: Sequence of dicts with at least the keys ``'question'`` and
            ``'ann'``. The dicts are modified in place.
        vocab: Set of word tokens; updated in place with every emitted token.
        ans_vocab: Set of answers; updated in place with every ``'ann'`` value.
        max_length: Fixed row width, including ``<start>`` and ``<end>``.
        tokenizer: Callable mapping a question string to an iterable of
            tokens. Defaults to the ``word_tokenize`` imported by the notebook.

    Returns:
        Tuple ``(data, vocab, ans_vocab, quess_length)`` where ``quess_length``
        lists, per record, the number of non-padding tokens
        (question tokens + 2), which is never larger than ``max_length``.

    Raises:
        ValueError: If ``max_length`` cannot hold ``<start>`` and ``<end>``.
    """
    if max_length < 2:
        raise ValueError('max_length must be at least 2 to hold <start> and <end>')

    if tokenizer is None:
        tokenizer = word_tokenize

    # Room left for real question tokens once <start> and <end> are reserved.
    max_ques_tokens = max_length - 2

    quess_length = []

    for sample in data:
        ques_tokens = list(tokenizer(sample['question']))[:max_ques_tokens]

        # <end> closes the question *before* any padding, so the first
        # `length` positions are exactly <start> ... <end>.
        framed = ['<start>'] + ques_tokens + ['<end>']
        quess_length.append(len(framed))

        framed += ['<pad>'] * (max_length - len(framed))

        sample['ques_tokens'] = framed
        vocab.update(framed)
        ans_vocab.add(sample['ann'])

    return data, vocab, ans_vocab, quess_length

In [5]:
# Seed the word vocabulary with the special tokens; the answer vocabulary
# starts empty and is filled from the records' 'ann' field.
vocab = {'<start>', '<end>', '<unk>', '<pad>'}
ans_vocab = set()

# Tokenizes in place: every record in `data` gains a 'ques_tokens' entry.
data, vocab, ans_vocab, quess_length = token_ques_build_vocab(
    data, vocab, ans_vocab
)

In [6]:
# Iterating a set of strings directly yields an order that can differ between
# interpreter runs (string hashing is randomized), which would make the index
# assignment irreproducible after a kernel restart. Sorting first pins it.
sorted_vocab = sorted(vocab)

# Word indices start at 1, so index 0 is never assigned to any token.
wtoi = {w: i + 1 for i, w in enumerate(sorted_vocab)}
itow = {i: w for w, i in wtoi.items()}

# Answer indices start at 0. key=str keeps the sort well-defined even if some
# answers were decoded from JSON as non-string values.
sorted_answers = sorted(ans_vocab, key=str)

atoi = {w: i for i, w in enumerate(sorted_answers)}
itoa = {i: w for w, i in atoi.items()}

In [7]:
def encode_ques_ann(data, wtoi, ans_to_idx=None):
    """Turn tokenized records into index lists, answer ids and image paths.

    Args:
        data: Sequence of dicts carrying ``'ques_tokens'`` (list of tokens),
            ``'ann'`` (answer) and ``'img_path'``.
        wtoi: Mapping from word token to integer index. When it contains
            ``'<unk>'``, tokens missing from the mapping are encoded with
            that index instead of raising.
        ans_to_idx: Mapping from answer to integer index. When omitted, the
            notebook-level ``atoi`` mapping is used, which keeps existing
            two-argument calls working.

    Returns:
        Tuple ``(processed_quess, processed_anns, img_pathes)`` of three
        lists aligned with ``data``: encoded questions, answer indices and
        image paths.

    Raises:
        KeyError: If a token is unknown and ``wtoi`` has no ``'<unk>'``
            entry, or if an answer is missing from the answer mapping.
    """
    if ans_to_idx is None:
        # Backward-compatible fallback to the mapping built earlier in the notebook.
        ans_to_idx = atoi

    has_unk = '<unk>' in wtoi
    unk_idx = wtoi['<unk>'] if has_unk else None

    processed_quess = []
    processed_anns = []
    img_pathes = []

    for sample in data:
        ques_tokens = sample['ques_tokens']

        if has_unk:
            ques_encode = [wtoi.get(w, unk_idx) for w in ques_tokens]
        else:
            ques_encode = [wtoi[w] for w in ques_tokens]

        processed_quess.append(ques_encode)
        processed_anns.append(ans_to_idx[sample['ann']])
        img_pathes.append(sample['img_path'])

    return processed_quess, processed_anns, img_pathes

In [8]:
# Encode every record, then convert each of the three parallel lists
# (questions, answers, image paths) into its own NumPy array.
encoded_parts = encode_ques_ann(data, wtoi)
processed_quess, processed_anns, img_pathes = (
    np.array(part) for part in encoded_parts
)

# Per-question token counts produced during tokenization.
quess_length = np.array(quess_length)

# Sanity check: all four arrays must agree on their first dimension.
processed_quess.shape, processed_anns.shape, img_pathes.shape, quess_length.shape

((5000, 20), (5000,), (5000,), (5000,))

In [9]:
# Despite the `_per` suffix these are absolute row indices (cumulative cut
# points), not fractions: rows [0, train_per) are train, [train_per, val_per)
# are val and [val_per, test_per) are test.
train_per = int(num_data * 0.8)
val_per = int(num_data * 0.995)
test_per = num_data

In [10]:
# The same three cut points are applied to every array so the splits stay
# row-aligned. np.split returns one more piece than there are cut points; the
# trailing piece (everything from `test_per` onwards) is discarded via `_`.
split_points = [train_per, val_per, test_per]

train_quess, val_quess, test_quess, _ = np.split(processed_quess, split_points)

train_anns, val_anns, test_anns, _ = np.split(processed_anns, split_points)

train_img_pathes, val_img_pathes, test_img_pathes, _ = np.split(img_pathes, split_points)

train_quess_length, val_quess_length, test_quess_length, _ = np.split(quess_length, split_points)

In [11]:
# Shapes of the encoded-question splits, in train / val / test order.
train_quess.shape, val_quess.shape, test_quess.shape

((4000, 20), (975, 20), (25, 20))

In [12]:
# Shapes of the answer-index splits, in train / val / test order.
train_anns.shape, val_anns.shape, test_anns.shape

((4000,), (975,), (25,))

In [13]:
# Shapes of the image-path splits, in train / val / test order.
train_img_pathes.shape, val_img_pathes.shape, test_img_pathes.shape

((4000,), (975,), (25,))

In [14]:
# Shapes of the question-length splits, in train / val / test order.
train_quess_length.shape, val_quess_length.shape, test_quess_length.shape

((4000,), (975,), (25,))

In [15]:
# Train data
# Each array goes to its own pickle; the number of rows is embedded in the
# file name.
train_outputs = [
    ('processed_data/train/train_quess_%d.pkl' % (len(train_quess)), train_quess),
    ('processed_data/train/train_anns_%d.pkl' % (len(train_anns)), train_anns),
    ('processed_data/train/train_imgs_path_%d.pkl' % (len(train_img_pathes)), train_img_pathes),
    ('processed_data/train/train_quess_length_%d.pkl' % (len(train_quess_length)), train_quess_length),
]

for out_path, array in train_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(array, f)

In [16]:
# Val data
# Each array goes to its own pickle; the number of rows is embedded in the
# file name.
# NOTE(review): the first three file names keep a 'train_' prefix inside the
# val/ directory while the length file uses 'val_'. This looks like a
# copy-paste leftover; confirm against whatever loads these files before
# renaming.
val_outputs = [
    ('processed_data/val/train_quess_%d.pkl' % (len(val_quess)), val_quess),
    ('processed_data/val/train_anns_%d.pkl' % (len(val_anns)), val_anns),
    ('processed_data/val/train_imgs_path_%d.pkl'% (len(val_img_pathes)), val_img_pathes),
    ('processed_data/val/val_quess_length_%d.pkl'% (len(val_quess_length)), val_quess_length),
]

for out_path, array in val_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(array, f)

In [17]:
# Test data
# Each array goes to its own pickle; the number of rows is embedded in the
# file name.
# NOTE(review): the first three file names keep a 'train_' prefix inside the
# test/ directory while the length file uses 'test_'. This looks like a
# copy-paste leftover; confirm against whatever loads these files before
# renaming.
test_outputs = [
    ('processed_data/test/train_quess_%d.pkl' % (len(test_quess)), test_quess),
    ('processed_data/test/train_anns_%d.pkl' % (len(test_anns)), test_anns),
    ('processed_data/test/train_imgs_path_%d.pkl' % (len(test_img_pathes)), test_img_pathes),
    ('processed_data/test/test_quess_length_%d.pkl' % (len(test_quess_length)), test_quess_length),
]

for out_path, array in test_outputs:
    with open(out_path, 'wb') as f:
        pickle.dump(array, f)

In [18]:
# Lookup tables and split sizes, stored as one list in this order:
# wtoi, itow, atoi, itoa, #train, #val, #test
utility = [
    wtoi,
    itow,
    atoi,
    itoa,
    len(train_quess),
    len(val_quess),
    len(test_quess),
]

with open('processed_data/utility.pkl', 'wb') as f:
    pickle.dump(utility, f)