## Make a vocab file for the transformers tokenizer and add the special tokens to it

In [52]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

import numpy as np
import time
import sys
import os
import math
import tqdm

from nltk.tokenize import RegexpTokenizer
from transformers import BertTokenizer, AutoTokenizer
from PIL import Image
import argparse

from catr.models import caption
from catr.models import utils as mtils
from catr.datasets import coco, utils
from catr.cfg_damsm_vocab import Config

import json, pickle
import pandas as pd
from pycocotools.coco import COCO as CC
import matplotlib.pyplot as plt


## make coco2014 vocab

In [3]:
## Load the COCO caption pickle whose vocab will back the BERT-style tokenizer used in CATR
fpath = '/media/MyDataStor1/mmrl/MMRL/data/coco/captions.pickle'
with open(fpath, 'rb') as pickle_file:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    vocab = pickle.load(pickle_file)
# The pickle unpacks into the train/test captions plus the two vocab mappings
# (index -> word and word -> index).
(train_captions, test_captions, ixtoword, wordtoix) = vocab
# Show the vocabulary size.
len(ixtoword)

27297

In [4]:
# Register the BERT-style special tokens in both vocab mappings.
# [CLS]/[SEP]/[UNK] are appended after the last existing index (27297.. for
# this vocab) instead of hard-coding the ids. The guard keeps the cell
# re-runnable: a second run must not append the tokens again.
if '[CLS]' not in wordtoix:
    first_free_ix = len(ixtoword)
    for offset, token in enumerate(('[CLS]', '[SEP]', '[UNK]')):
        ixtoword[first_free_ix + offset] = token
        wordtoix[token] = first_free_ix + offset
# [PAD] takes over index 0, which '<end>' occupied (the pop below returns 0).
ixtoword[0] = '[PAD]'
wordtoix['[PAD]'] = 0
# Remove the replaced '<end>' entry; the default avoids a KeyError on re-run.
wordtoix.pop('<end>', None)

0

In [5]:
# Sanity check: both mappings have the same size and agree on the special-token ids.
special_tokens = ('[CLS]', '[SEP]', '[UNK]', '[PAD]')
special_ids = (27297, 27298, 27299, 0)
print(len(ixtoword), len(wordtoix))
print(*(wordtoix[token] for token in special_tokens))
print(*(ixtoword[ix] for ix in special_ids))

27300 27300
27297 27298 27299 0
[CLS] [SEP] [UNK] [PAD]


In [15]:
## save ixtoword to .txt file: one token per line, in index order
vocab_txt = 'catr/damsm_vocab.txt'
# Write as UTF-8 explicitly: BertTokenizer reads its vocab file as UTF-8, so
# the platform default encoding must not be used here.
with open(vocab_txt, 'w', encoding='utf-8') as f:
    # Iterate over the actual vocab size rather than a hard-coded 27300.
    f.writelines("%s\n" % ixtoword[i] for i in range(len(ixtoword)))

In [20]:
## test bert_tokenizer on damsm new vocab_dict
# `do_lower_case` is the keyword BertTokenizer accepts; the previous `do_lower`
# is not one of its parameters, so lower-casing only happened via the default.
damsm_tker = BertTokenizer.from_pretrained("catr/damsm_vocab.txt", do_lower_case=True)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


27300
27297


In [21]:
# Print the vocab size, then each special token with its id.
# Uses the public token properties rather than the private `_cls_token`-style
# attributes, which are an implementation detail of the tokenizer class.
print(damsm_tker.vocab_size)
for special in (damsm_tker.cls_token, damsm_tker.sep_token,
                damsm_tker.pad_token, damsm_tker.unk_token):
    print(special, damsm_tker.convert_tokens_to_ids(special))

27300
[CLS] 27297
[SEP] 27298
[PAD] 0
[UNK] 27299


In [26]:
# Round-trip one sentence through the custom-vocab tokenizer:
# encode with padding to a fixed length, then decode back to text.
sentences = ['a man is playing ball']
encode_options = dict(
    padding='max_length',
    truncation=True,
    max_length=129,
    return_attention_mask=True,
    return_token_type_ids=False,
)
tks = damsm_tker(sentences, **encode_options)
print(tks)
damsm_tker.batch_decode(tks['input_ids'], skip_special_tokens=True)

{'input_ids': [[27297, 10, 80, 115, 137, 154, 27298, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


['a man is playing ball']

In [37]:
# Tokenizer used in the DAMSM dataloader: split the lower-cased sentence
# into runs of word characters and count the resulting tokens.
sentences = ['A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains']
reg_tk = RegexpTokenizer(r'\w+')
lowered_sentence = sentences[0].lower()
regds = reg_tk.tokenize(lowered_sentence)
print(len(regds))

29


In [28]:
# Read the raw-token dataset file (from the Show, Attend and Tell setup) into `otk`.
with open('catr/coco/dataset.json', 'r') as dataset_file:
    otk = json.load(dataset_file)

In [33]:
# Display the first image record to inspect its fields.
otk['images'][0]

{'filepath': 'val2014',
 'sentids': [770337, 771687, 772707, 776154, 781998],
 'filename': 'COCO_val2014_000000391895.jpg',
 'imgid': 0,
 'split': 'test',
 'sentences': [{'tokens': ['a',
    'man',
    'with',
    'a',
    'red',
    'helmet',
    'on',
    'a',
    'small',
    'moped',
    'on',
    'a',
    'dirt',
    'road'],
   'raw': 'A man with a red helmet on a small moped on a dirt road. ',
   'imgid': 0,
   'sentid': 770337},
  {'tokens': ['man',
    'riding',
    'a',
    'motor',
    'bike',
    'on',
    'a',
    'dirt',
    'road',
    'on',
    'the',
    'countryside'],
   'raw': 'Man riding a motor bike on a dirt road on the countryside.',
   'imgid': 0,
   'sentid': 771687},
  {'tokens': ['a',
    'man',
    'riding',
    'on',
    'the',
    'back',
    'of',
    'a',
    'motorcycle'],
   'raw': 'A man riding on the back of a motorcycle.',
   'imgid': 0,
   'sentid': 772707},
  {'tokens': ['a',
    'dirt',
    'path',
    'with',
    'a',
    'young',
    'person',

In [20]:
# sentences = ['This framework generates embeddings for each input sentence',
#              'Sentences are passed as a list of string.',
#              'The quick brown fox jumps over the lazy dog.']
# Encode an out-of-vocabulary word with the stock BERT vocab for comparison.
sentences = ['a sizzor']
# `do_lower_case` is the real BertTokenizer keyword; `do_lower` is not one of
# its parameters.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
print(bert_tokenizer.vocab_size)
bkds = bert_tokenizer(sentences, padding='max_length', return_attention_mask=True,
                 return_token_type_ids=False, truncation=True, max_length=129)
print(bkds)

bert_tokenizer.batch_decode(bkds['input_ids'], skip_special_tokens=True)

30522
{'input_ids': [[101, 1037, 9033, 12036, 2099, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


['a sizzor']

In [22]:
## another tokenizer used in AttnGAN by Abuzar
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'The quick brown fox jumps over the lazy dog.']
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/bert-base-nli-mean-tokens")
print(tokenizer.vocab_size)
tkds = tokenizer(sentences, padding='max_length', return_attention_mask=True, 
                 return_token_type_ids=False, truncation=True, max_length=129)
tkds

30522


{'input_ids': [[101, 2023, 7705, 19421, 7861, 8270, 4667, 2015, 2005, 2169, 7953, 6251, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 11746, 2024, 2979, 2004, 1037, 2862, 1997, 5164, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Test how punctuation (periods, apostrophes, hyphens) in the sentences is handled

In [4]:
# Custom-vocab tokenizer. `do_lower_case` is the keyword BertTokenizer accepts;
# the previous `do_lower` is not one of its parameters.
tokenizer = BertTokenizer.from_pretrained("catr/damsm_vocab.txt", do_lower_case=True)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [6]:
# Strip periods and turn apostrophes/hyphens into spaces before encoding,
# then show the first 20 ids and mask entries.
sss = 'a dog is playing on the whole-green grass\'s.   '
print(sss)
for old, new in (('.', ''), ('\'', ' '), ('-', ' ')):
    sss = sss.replace(old, new)
print(sss)
ttt = tokenizer.encode_plus(
    sss,
    max_length=129,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
)
print(ttt['input_ids'][:20], ttt['attention_mask'][:20])

a dog is playing on the whole-green grass's.   
a dog is playing on the whole green grass s   
[27297, 10, 108, 115, 137, 11, 8, 1885, 41, 200, 84, 27298, 0, 0, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]


## Preprocess all the captions and caption masks in the training and validation data, to save time and to strip punctuation from the sentences

In [18]:
# Load the raw COCO train2014 caption annotations and peek at the last entry.
raw_train_path = os.path.join(
    '/media/MyDataStor1/mmrl/MMRL/data/coco', 'annotations', 'captions_train2014.json')
with open(raw_train_path, 'rb') as annotation_file:
    test = json.load(annotation_file)
annotations = test['annotations']
print(len(annotations))
print(annotations[-1])

414113
{'image_id': 133071, 'id': 829717, 'caption': 'A dinner plate has a lemon wedge garnishment.'}


In [31]:
# Print the index of every training caption that contains the substring 'mmm'.
# Reads from `test` (the raw captions_train2014.json loaded in the previous
# cell): `d` is only defined further down, so referencing it here fails on a
# fresh top-to-bottom run.
for i, annotation in enumerate(test['annotations']):
    if 'mmm' in annotation['caption']:
        print(i)

90149
122899
155889
221020
256632
286858


In [2]:
## Training set: load the raw caption annotations into `d` and show a summary
train_file = os.path.join(
    '/media/MyDataStor1/mmrl/MMRL/data/coco', 'annotations', 'captions_train2014.json')
with open(train_file, 'r') as annotation_file:
    d = json.load(annotation_file)
print(len(d['annotations']))   # number of captions
print(d.keys())                # top-level sections of the file
print(d['annotations'][:2])    # first two annotation records

414113
dict_keys(['info', 'images', 'licenses', 'annotations'])
[{'image_id': 318556, 'id': 48, 'caption': 'A very clean and well decorated empty bathroom'}, {'image_id': 116100, 'id': 67, 'caption': 'A panoramic view of a kitchen and all of its appliances.'}]


In [3]:
# Custom-vocab BERT tokenizer plus a word-character regexp tokenizer.
# `do_lower_case` is the keyword BertTokenizer accepts; the previous `do_lower`
# is not one of its parameters.
tokenizer = BertTokenizer.from_pretrained("catr/damsm_vocab.txt", do_lower_case=True)
reger = RegexpTokenizer(r'\w+')

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [4]:
# Replace every raw caption in `d` with its padded token ids and store the
# matching attention mask under 'cap_mask'.
# NOTE: this mutates `d` in place, so the cell can only be run once per load.
for annotation in d['annotations']:
    # Lower-case and keep only word-character runs, then re-join with spaces.
    words = reger.tokenize(annotation['caption'].lower())
    caption_encoded = tokenizer.encode_plus(
        ' '.join(words),
        max_length=129,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    annotation['caption'] = caption_encoded['input_ids']
    annotation['cap_mask'] = caption_encoded['attention_mask']

In [6]:
# Inspect one encoded training annotation and decode it back to text.
i = 1
sample = d['annotations'][i]
print(len(d['annotations']))
print(sample.keys())
print(len(sample['caption']), len(sample['cap_mask']))
print(sample)

tokenizer.decode(sample['caption'], skip_special_tokens=True)

414113
dict_keys(['image_id', 'id', 'caption', 'cap_mask'])
129 129
{'image_id': 116100, 'id': 67, 'caption': [27297, 10, 6766, 415, 21, 10, 288, 58, 48, 21, 343, 1101, 27298, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'cap_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


'a panoramic view of a kitchen and all of its appliances'

In [7]:
# Persist the encoded training annotations as JSON next to the raw annotation file.
tr_pk_file = os.path.join(
    '/media/MyDataStor1/mmrl/MMRL/data/coco', 'annotations', 'tokens_train2014.json')
with open(tr_pk_file, 'w') as out_file:
    json.dump(d, out_file)

In [None]:
## val set

In [2]:
# Validation set: load the raw caption annotations into `g` and show a summary.
val_file = os.path.join(
    '/media/MyDataStor1/mmrl/MMRL/data/coco', 'annotations', 'captions_val2014.json')
with open(val_file, 'r') as annotation_file:
    g = json.load(annotation_file)
print(len(g['annotations']))   # number of captions
print(g.keys())                # top-level sections of the file
print(g['annotations'][:2])    # first two annotation records

202654
dict_keys(['info', 'images', 'licenses', 'annotations'])
[{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}, {'image_id': 179765, 'id': 38, 'caption': 'A black Honda motorcycle parked in front of a garage.'}]


In [3]:
# Custom-vocab BERT tokenizer plus a word-character regexp tokenizer.
# `do_lower_case` is the keyword BertTokenizer accepts; the previous `do_lower`
# is not one of its parameters.
tokenizer = BertTokenizer.from_pretrained("catr/damsm_vocab.txt", do_lower_case=True)
reger = RegexpTokenizer(r'\w+')

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [4]:
# Replace every raw caption in `g` with its padded token ids and store the
# matching attention mask under 'cap_mask'.
# NOTE: this mutates `g` in place, so the cell can only be run once per load.
for annotation in g['annotations']:
    # Lower-case and keep only word-character runs, then re-join with spaces.
    words = reger.tokenize(annotation['caption'].lower())
    caption_encoded = tokenizer.encode_plus(
        ' '.join(words),
        max_length=129,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    annotation['caption'] = caption_encoded['input_ids']
    annotation['cap_mask'] = caption_encoded['attention_mask']

In [6]:
# Inspect one encoded validation annotation and decode it back to text.
i = 1
sample = g['annotations'][i]
print(len(g['annotations']))
print(sample.keys())
print(len(sample['caption']), len(sample['cap_mask']))
print(sample)

tokenizer.decode(sample['caption'], skip_special_tokens=True)

202654
dict_keys(['image_id', 'id', 'caption', 'cap_mask'])
129 129
{'image_id': 179765, 'id': 38, 'caption': [27297, 10, 518, 3812, 206, 36, 7, 54, 21, 10, 4894, 27298, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'cap_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


'a black honda motorcycle parked in front of a garage'

In [7]:
# Persist the encoded validation annotations as JSON next to the raw annotation file.
vl_pk_file = os.path.join(
    '/media/MyDataStor1/mmrl/MMRL/data/coco', 'annotations', 'tokens_val2014.json')
with open(vl_pk_file, 'w') as out_file:
    json.dump(g, out_file)

## make CUB vocab

In [53]:
## Load the CUB (birds) caption pickle whose vocab will back the BERT-style tokenizer used in CATR
fpath = '/media/MyDataStor1/mmrl/MMRL/data/birds/captions.pickle'
with open(fpath, 'rb') as pickle_file:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    vocab = pickle.load(pickle_file)
# The pickle unpacks into the train/test captions plus the two vocab mappings
# (index -> word and word -> index).
(train_captions, test_captions, ixtoword, wordtoix) = vocab
# Show the vocabulary size.
len(ixtoword)

5450

In [3]:
# Register the BERT-style special tokens in both vocab mappings.
# [CLS]/[SEP]/[UNK] are appended after the last existing index (5450.. for
# this vocab) instead of hard-coding the ids. The guard keeps the cell
# re-runnable: a second run must not append the tokens again.
if '[CLS]' not in wordtoix:
    first_free_ix = len(ixtoword)
    for offset, token in enumerate(('[CLS]', '[SEP]', '[UNK]')):
        ixtoword[first_free_ix + offset] = token
        wordtoix[token] = first_free_ix + offset
# [PAD] takes over index 0, which '<end>' occupied (the pop below returns 0).
ixtoword[0] = '[PAD]'
wordtoix['[PAD]'] = 0
# Remove the replaced '<end>' entry; the default avoids a KeyError on re-run.
wordtoix.pop('<end>', None)

0

In [5]:
# Sanity check: both mappings have the same size and agree on the special-token ids.
special_tokens = ('[CLS]', '[SEP]', '[UNK]', '[PAD]')
special_ids = (5450, 5451, 5452, 0)
print(len(ixtoword), len(wordtoix))
print(*(wordtoix[token] for token in special_tokens))
print(*(ixtoword[ix] for ix in special_ids))

5453 5453
5450 5451 5452 0
[CLS] [SEP] [UNK] [PAD]


In [6]:
## save ixtoword to .txt file: one token per line, in index order
vocab_txt = 'catr/bird_vocab.txt'
# Write as UTF-8 explicitly: BertTokenizer reads its vocab file as UTF-8, so
# the platform default encoding must not be used here.
with open(vocab_txt, 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % ixtoword[i] for i in range(len(ixtoword)))

In [54]:
## test bert_tokenizer on the new bird (CUB) vocab file
# `do_lower_case` is the keyword BertTokenizer accepts; the previous `do_lower`
# is not one of its parameters, so lower-casing only happened via the default.
damsm_tker = BertTokenizer.from_pretrained("catr/bird_vocab.txt", do_lower_case=True)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [55]:
# Print the vocab size, then each special token with its id.
# Uses the public token properties rather than the private `_cls_token`-style
# attributes, which are an implementation detail of the tokenizer class.
print(damsm_tker.vocab_size)
for special in (damsm_tker.cls_token, damsm_tker.sep_token,
                damsm_tker.pad_token, damsm_tker.unk_token):
    print(special, damsm_tker.convert_tokens_to_ids(special))

5453
[CLS] 5450
[SEP] 5451
[PAD] 0
[UNK] 5452


In [56]:
# Round-trip one sentence through the bird-vocab tokenizer:
# encode with padding to a fixed length, then decode back to text.
sentences = ['a bird with black beak']
encode_options = dict(
    padding='max_length',
    truncation=True,
    max_length=129,
    return_attention_mask=True,
    return_token_type_ids=False,
)
tks = damsm_tker(sentences, **encode_options)
print(tks)
damsm_tker.batch_decode(tks['input_ids'], skip_special_tokens=True)

{'input_ids': [[5450, 3066, 4217, 2622, 2074, 4839, 5451, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}


['a bird with black beak']

In [10]:
# Tokenizer used in the DAMSM dataloader: split the lower-cased sentence
# into runs of word characters and count the resulting tokens.
sentences = ['A bird with black beek.']
reg_tk = RegexpTokenizer(r'\w+')
lowered_sentence = sentences[0].lower()
regds = reg_tk.tokenize(lowered_sentence)
print(len(regds))

5


In [57]:
## Check the training captions: count them and decode the first one
n_train_captions = len(train_captions)
print(n_train_captions)
tks = train_captions[0]   # a list of token ids
print(tks)
damsm_tker.batch_decode([tks], skip_special_tokens=True)

88550
[3066, 4217, 2622, 3066, 2774, 3959, 340, 3950, 1946, 3066, 3959, 1692, 4839]


['a bird with a very long wing span and a long pointed beak']

In [58]:
## make the json file for image - caption pairs
## train set processing
# load filenames
data_dir = '/media/MyDataStor1/mmrl/MMRL/data/birds'
split = 'train'
# split = 'test'
filepath = '%s/%s/filenames.pickle' % (data_dir, split)
# Fail loudly when the pickle is missing. Previously the load was skipped and
# the print below either raised NameError or showed a stale `filenames` left
# over from an earlier run.
if not os.path.isfile(filepath):
    raise FileNotFoundError('filenames pickle not found: %s' % filepath)
with open(filepath, 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    filenames = pickle.load(f, encoding='iso-8859-1')
print('Load filenames from: %s (%d)' % (filepath, len(filenames)))
print(filenames[0])

Load filenames from: /media/MyDataStor1/mmrl/MMRL/data/birds/train/filenames.pickle (8855)
002.Laysan_Albatross/Laysan_Albatross_0002_1027


In [59]:
## load bbox for all images (train + test)
bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
# sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
df_bounding_boxes = pd.read_csv(bbox_path, sep=r'\s+', header=None).astype(int)
imgpath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
df_filenames = pd.read_csv(imgpath, sep=r'\s+', header=None)
imgnames = df_filenames[1].tolist()
print('Total filenames: ', len(imgnames), imgnames[0])
numImgs = len(imgnames)
# The two files are paired row by row, so they must have the same length.
assert len(df_bounding_boxes) == numImgs, \
    'bounding_boxes.txt and images.txt have different numbers of rows'
# bbox = [x-left, y-top, width, height]; the first column (presumably the
# image id -- TODO confirm) is dropped. One bulk conversion replaces the
# per-row iloc lookups and yields plain Python ints, which json can serialise.
bboxes = df_bounding_boxes.iloc[:, 1:].values.tolist()
# Keys are the image paths with the 4-character extension removed.
filename_bbox = {img_file[:-4]: box for img_file, box in zip(imgnames, bboxes)}
print(len(filename_bbox))
print(imgnames[0][:-4], filename_bbox[imgnames[0][:-4]])

Total filenames:  11788 001.Black_footed_Albatross/Black_Footed_Albatross_0046_18.jpg
11788
001.Black_footed_Albatross/Black_Footed_Albatross_0046_18 [60, 27, 325, 304]


In [62]:
## Pair every training image with its captions
# Each image owns 10 consecutive entries of `cap_list`; every entry is decoded
# back to text and stored together with the image filename and bounding box.
cap_list = train_captions
captions_bird = []
for img_ix, fname in enumerate(filenames):
    first_cap = img_ix * 10
    for j in range(10):
        cap = damsm_tker.decode(cap_list[first_cap + j], skip_special_tokens=True)
        record = {'filename': fname, 'bbox': filename_bbox[fname], 'caption': cap}
        captions_bird.append(record)

In [64]:
# Preview the bird records: total count, then up to 50 entries starting at k.
print(len(captions_bird))
k = 0
preview = captions_bird[k:k + 50]
for record in preview:
    print(record)

88550
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'a bird with a very long wing span and a long pointed beak'}
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'the long beaked bird has a white body with long brown wings'}
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'this is a white bird with brown wings and a large pointy beak'}
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'this large bird has long bill a white breast belly head and a black back wings'}
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'bird has an extremely long wingspan with a darker top and white belly and head'}
{'filename': '002.Laysan_Albatross/Laysan_Albatross_0002_1027', 'bbox': [144, 40, 333, 165], 'caption': 'this bird has wi

In [65]:
# Save the training image/caption records as JSON.
cub_train_json = '/media/MyDataStor1/mmrl/MMRL/data/birds/captions_cub_train.json'
with open(cub_train_json, 'w') as out_file:
    json.dump(captions_bird, out_file)

In [66]:
## test set processing

In [67]:
## Check the test captions: count them and decode the first one
n_test_captions = len(test_captions)
print(n_test_captions)
tks = test_captions[0]   # a list of token ids
print(tks)
damsm_tker.decode(tks, skip_special_tokens=True)

29330
[4426, 4382, 2887, 4217, 2622, 3066, 1902, 1227, 1946, 3973, 4472, 4839]


'light tan colored bird with a white head and an orange beak'

In [68]:
## make the json file for image - caption pairs
## test set processing
# load filenames
data_dir = '/media/MyDataStor1/mmrl/MMRL/data/birds'
# split = 'train'
split = 'test'
filepath = '%s/%s/filenames.pickle' % (data_dir, split)
# Fail loudly when the pickle is missing. Previously the load was skipped and
# `filenames` silently kept the training filenames, which would then be paired
# with the test captions.
if not os.path.isfile(filepath):
    raise FileNotFoundError('filenames pickle not found: %s' % filepath)
with open(filepath, 'rb') as f:
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    filenames = pickle.load(f, encoding='iso-8859-1')
print('Load filenames from: %s (%d)' % (filepath, len(filenames)))
print(filenames[0])

Load filenames from: /media/MyDataStor1/mmrl/MMRL/data/birds/test/filenames.pickle (2933)
001.Black_footed_Albatross/Black_Footed_Albatross_0046_18


In [69]:
## Pair every test image with its captions
# Each image owns 10 consecutive entries of `cap_list`; every entry is decoded
# back to text and stored together with the image filename and bounding box.
cap_list = test_captions
captions_bird = []
for img_ix, fname in enumerate(filenames):
    first_cap = img_ix * 10
    for j in range(10):
        cap = damsm_tker.decode(cap_list[first_cap + j], skip_special_tokens=True)
        record = {'filename': fname, 'bbox': filename_bbox[fname], 'caption': cap}
        captions_bird.append(record)

In [70]:
# Preview the bird records: total count, then up to 50 entries starting at k.
print(len(captions_bird))
k = 0
preview = captions_bird[k:k + 50]
for record in preview:
    print(record)

29330
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', 'bbox': [60, 27, 325, 304], 'caption': 'light tan colored bird with a white head and an orange beak'}
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', 'bbox': [60, 27, 325, 304], 'caption': 'the bird has a very thick curved and beige beak'}
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', 'bbox': [60, 27, 325, 304], 'caption': 'this bird has a long neck that is grainy and a pastel orange blue narrow beak that droops down at the tip'}
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', 'bbox': [60, 27, 325, 304], 'caption': 'this bird is light brown has a long hooked bill and looks dumb'}
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', 'bbox': [60, 27, 325, 304], 'caption': 'this large white bird has a large curved bill and a brown eye'}
{'filename': '001.Black_footed_Albatross/Black_Footed_Albatross_0046_18', '

In [71]:
# Save the test image/caption records as JSON.
cub_test_json = '/media/MyDataStor1/mmrl/MMRL/data/birds/captions_cub_test.json'
with open(cub_test_json, 'w') as out_file:
    json.dump(captions_bird, out_file)

In [None]:
## check the json files

In [74]:
# Read back the saved CUB training caption JSON to verify it:
# print the number of records and one sample entry.
saved_train_json = '/media/MyDataStor1/mmrl/MMRL/data/birds/captions_cub_train.json'
with open(saved_train_json, 'r') as in_file:
    otk = json.load(in_file)
print(len(otk))
print(otk[29329])

88550
{'filename': '071.Long_tailed_Jaeger/Long_Tailed_Jaeger_0008_797066', 'bbox': [31, 141, 341, 127], 'caption': 'this is a gray bird with a white breast and a gray beak that curves downwards at the tip'}
