#### Build a vocabulary of the most commonly occurring words in the caption text

In [0]:
import ast
import collections
import json

In [0]:
# Read the file tokens_clean.txt and store the cleaned captions in a dictionary
with open("Neural_Image_Caption_Generator/data/textFiles/tokens_clean.txt", 'r') as file:
    raw_text = file.read()

# NOTE(review): the file appears to hold a Python dict literal with single-quoted
# strings rather than strict JSON -- confirm against whatever writes it.
# Parsing it as a literal avoids swapping every ' for " before JSON decoding,
# which breaks as soon as a caption contains an apostrophe or a double quote.
content = ast.literal_eval(raw_text)

In [14]:
# Sanity check: show the type of the object parsed from tokens_clean.txt
print(type(content))

<class 'dict'>


In [15]:
# Build the vocabulary: the set of distinct whitespace-separated tokens
# found across every caption of every image.
vocab = {
    token
    for captions in content.values()
    for caption in captions
    for token in caption.split()
}

print("Vocab size = %d" % len(vocab))

Vocab size = 8441


In [16]:
# Flatten all captions into a single list of tokens, keeping duplicates,
# so that per-word frequencies can be counted from it.
total_words = [
    token
    for captions in content.values()
    for caption in captions
    for token in caption.split()
]

print("Total Words = %d" % len(total_words))

Total Words = 437466


In [17]:
# Count how many times each word occurs across all captions
counter = collections.Counter(total_words)

# Keep a plain-dict copy of the word -> count mapping
freq_cnt = dict(counter.items())

# Number of distinct words counted
print(len(freq_cnt))

8441


In [18]:
# Rank (word, count) pairs from most to least frequent, then keep only the
# words whose count is strictly greater than the threshold (a word that
# occurs exactly `threshold` times is dropped as well).
threshold = 5
sorted_freq_cnt = [
    pair
    for pair in sorted(freq_cnt.items(), key=lambda pair: pair[1], reverse=True)
    if pair[1] > threshold
]

# total_words is rebound here: it now holds the retained words in ranked
# order, no longer the flat list of every token.
total_words = [word for word, _count in sorted_freq_cnt]

print(len(total_words))

2644


#### Prepare train and test data

In [0]:
# Read training and testing image names

# Start from empty strings, then load the full text of each list file.
train_file_data = test_file_data = ""

with open("Neural_Image_Caption_Generator/data/textFiles/trainImages.txt", 'r') as train_list:
    train_file_data = train_list.read()

with open("Neural_Image_Caption_Generator/data/textFiles/testImages.txt", 'r') as test_list:
    test_file_data = test_list.read()

In [0]:
# Obtain a list of train and test images: one file name per non-blank line.
# Blank lines are filtered out explicitly instead of slicing off the last
# split element, so the final image is kept even when the file has no
# trailing newline, and stray empty lines never become an empty image ID.
train_data = [line for line in train_file_data.split("\n") if line.strip()]
test_data = [line for line in test_file_data.split("\n") if line.strip()]

# Obtain image ID from image file name (the text before the first ".")
train_data = [image.partition(".")[0] for image in train_data]
test_data = [image.partition(".")[0] for image in test_data]

In [25]:
# Preview the first five training image IDs
train_data[:5]

['2513260012_03d33305cf',
 '2903617548_d3e38d7f88',
 '3338291921_fe7ae0c8f8',
 '488416045_1c6d903fe0',
 '2644326817_8f45080b87']

In [0]:
# Map each training imageID to its captions, with every caption wrapped in
# a start sequence token ("startseq") and an end sequence token ("endseq").
# The caption text itself is concatenated unchanged, so any whitespace it
# already carries is preserved.
train_content = {
    imageID: ["startseq " + caption + " endseq" for caption in content[imageID]]
    for imageID in train_data
}

In [28]:
# Show the tagged captions stored for one image in train_content
train_content['1007320043_627395c3d8']

['startseq a child playing on a rope net  endseq',
 'startseq a little girl climbing on red roping  endseq',
 'startseq a little girl in pink climbs a rope bridge at the park  endseq',
 'startseq a small child grips onto the red ropes at the playground  endseq',
 'startseq the small child climbs on a red ropes on a playground  endseq']