In [None]:
import os
import json
import numpy as np

In [None]:
root_captioning = "../../data"

In [6]:
def get_img_info(name, num=np.inf):
    """
    Returns img paths and captions

    Parameters:
    -----------
    name: str
        the json file name
    num: int (default: np.inf)
        the number of observations to get

    Return:
    --------
    list, dict, int
        img paths, corresponding captions, max length of captions
    """
    img_path = []
    caption = [] 
    max_length = 0
    if AWS:
        with open(f'{root_captioning}/json/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for filename in data.keys():
                if num is not None and len(caption) == num:
                    break
                img_path.append(
                    f'{root_captioning}/{name}/{filename}'
                )
                sen_list = []
                for sentence in data[filename]['sentences']:
                    max_length = max(max_length, len(sentence['tokens']))
                    sen_list.append(sentence['raw'])

                caption.append(sen_list)    
    else:            
        with open(f'{root_captioning}/interim/{name}.json', 'r') as json_data:
            data = json.load(json_data)
            for set_name in ['rsicd', 'ucm']:
                for filename in data[set_name].keys():
                    if num is not None and len(caption) == num:
                        break

                    img_path.append(
                        f'{root_captioning}/raw/imgs/{set_name}/{filename}'
                    )
                    sen_list = []
                    for sentence in data[set_name][filename]['sentences']:
                        max_length = max(max_length, len(sentence['tokens']))
                        sen_list.append(sentence['raw'])

                    caption.append(sen_list)
    
    return img_path, caption, max_length            


In [7]:
# get img path and caption list
# only test 800 train samples and 200 valid samples
# train_paths, train_descriptions, max_length_train = get_img_info('train', 800)
# test_paths, test_descriptions, max_length_test = get_img_info('valid', 200)

train_paths, train_descriptions, max_length_train = get_img_info('train')
test_paths, test_descriptions, max_length_test = get_img_info('valid')
max_length = max(max_length_train, max_length_test)

      
lex = set()
for sen in train_descriptions:
    [lex.update(d.split()) for d in sen]

for sen in test_descriptions:
    [lex.update(d.split()) for d in sen]


Stats on what was collected.

In [8]:
print(len(train_descriptions)) # How many images? 
print(len(test_descriptions)) # How many images? 
print(len(lex)) # How many unique words (vocab)
print(max_length) # Maximum length of a caption (in words)


8332
2084
2912
34


Display the size of the train and test sets.

In [9]:
print(len(train_paths))
print(len(test_paths))

8332
2084


In [10]:
train_paths[0]

'../../s3/train/ucm_1080.jpg'

Build the sequences.  We include a **start** and **stop** token at the beginning/end.  We will later use the **start** token to begin the process of generating a caption.  Encountering the **stop** token in the generated text will let us know we are done.

In [11]:
for v in train_descriptions: 
    for d in range(len(v)):
        v[d] = f'{START} {v[d]} {STOP}'

See how many discriptions were extracted.

In [12]:
train_descriptions[0]

['startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq',
 'startseq Lots of boats docked neatly at the harbor . endseq',
 'startseq Many boats docked neatly at the harbor and the water is deep blue . endseq',
 'startseq Many boats docked neatly at the harbor and some positions are free . endseq',
 'startseq Lots of boats docked neatly at the harbor and the boats are closed to each other . endseq']

In [65]:
all_captions = []
for descriptions in [train_descriptions, test_descriptions]:
    for val in descriptions:
        for cap in val:
            all_captions.append(cap)
len(all_captions)

52080

In [68]:
all_captions[-1]

'a vast artificial lake was built in the park .'

In [69]:
all_captions[0]

'startseq Lots of boats docked at the harbor and the boats are closed to each other . endseq'

In [74]:
all_word_counts = {}
nsents = 0
for sent in all_captions:
    nsents += 1
    for w in sent.split(' '):
        all_word_counts[w] = all_word_counts.get(w, 0) + 1

all_vocab = [w for w in all_word_counts]
print(f'Found {len(all_vocab)} words.')

Found 2915 words.


### Loading Wikipedia2vec Embeddings

In [63]:
embeddings_index = {} 

f = open(
    f'{root_captioning}/download/enwiki_20180420_500d.txt', 
    encoding="utf-8"
)

f.readline()

for line in f:
    values = line.split()
    word = ' '.join(values[:-500])
    coefs = np.asarray(values[-500:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

[A
4520033it [10:10, 7414.19it/s][A
4520787it [10:10, 7448.57it/s][A
4521542it [10:10, 7478.09it/s][A
4522299it [10:10, 7504.18it/s][A
4523050it [10:10, 7462.57it/s][A
4523798it [10:10, 7466.95it/s][A
4524545it [10:11, 7432.35it/s][A
4525289it [10:11, 7385.06it/s][A
4526028it [10:11, 7264.30it/s][A
4526755it [10:11, 7174.87it/s][A
4527485it [10:11, 7209.47it/s][A
4528207it [10:11, 7206.07it/s][A
4528946it [10:11, 7258.89it/s][A
4530030it [10:11, 7404.62it/s][A

Found 4530030 word vectors.





In [102]:
embedding_dim = 500

# Get 500-dim dense vector for each of the 4530030 words in out vocabulary
embedding_matrix = {}
count = 0
for word in all_word_counts.keys():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        count += 1
        # Words not found in the embedding index will be all zeros
        embedding_matrix[word] = embedding_vector.tolist()
        
print(f'{count} out of {len(all_vocab)} words are found in the pre-trained matrix.')

2338 words out of 2915 are fount in the pre-trained matrix.


In [103]:
# save the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'w', encoding='utf-8') as file:
    json.dump(embedding_matrix, file)

In [104]:
# read the embedding matrix 
with open(f'{root_captioning}/enwiki_20180420_2338_words_500d.json', 'r', encoding='utf-8') as file:
    em = json.load(file)