In [1]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=5):
    # borrowed this function from NeuralTalk
    print ('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, ))

    word_counts = {}
    nsents = 0

    for sent in sentence_iterator:
        nsents += 1
        tmp_sent = sent.lower().split(' ')
        if '' in tmp_sent:
            tmp_sent.remove('')

        for w in tmp_sent:
           word_counts[w] = word_counts.get(w, 0) + 1

    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print ('filtered words from %d to %d' % (len(word_counts), len(vocab)))

    ixtoword = {}
    ixtoword[0] = '<bos>'
    ixtoword[1] = '<eos>'
    ixtoword[2] = '<pad>'
    ixtoword[3] = '<unk>'

    wordtoix = {}
    wordtoix['<bos>'] = 0
    wordtoix['<eos>'] = 1
    wordtoix['<pad>'] = 2
    wordtoix['<unk>'] = 3

    for idx, w in enumerate(vocab):
        wordtoix[w] = idx + 4
        ixtoword[idx+4] = w

    word_counts['<eos>'] = nsents
    word_counts['<bos>'] = nsents
    word_counts['<pad>'] = nsents
    word_counts['<unk>'] = nsents

    bias_init_vector = np.array([1.0 * word_counts[ ixtoword[i] ] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range

    return wordtoix, ixtoword, bias_init_vector


In [12]:


import pickle
import numpy as np



#######################################################################################################
# Parameters Setting
#######################################################################################################
batch_size = 50 # Being support batch_size
num_boxes = 50 # number of Detected regions in each image
feats_dim = 4096 # feature dimensions of each regions
project_dim = 1024 # project the features to one vector, which is 1024 dimensions

sentRNN_lstm_dim = 512 # the sentence LSTM hidden units
sentRNN_FC_dim = 1024 # the fully connected units
wordRNN_lstm_dim = 512 # the word LSTM hidden units
word_embed_dim = 1024 # the learned embedding vectors for the words

S_max = 6
N_max = 30
T_stop = 0.5

n_epochs = 500
learning_rate = 0.0001

open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\genomedata\paragraphs_v1.json').read()
#######################################################################################################
# Word vocubulary and captions preprocessing stage
#######################################################################################################
img2paragraph = pickle.load(open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\img2paragraph','rb'))
all_sentences = []
for key, paragraph in img2paragraph.items():
    for each_sent in paragraph[1]:
        each_sent.replace(',', ' ,')
        all_sentences.append(each_sent)
word2idx, idx2word, bias_init_vector = preProBuildWordVocab(all_sentences, word_count_threshold=2)
np.save('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\idx2word_batch', idx2word)

img2paragraph_modify = {}
for img_name, img_paragraph in img2paragraph.items():
    img_paragraph_1 = img_paragraph[1]

    # img_paragraph_1 is a list
    # it may contain the element: '' or ' ', like this:
    # [["a man is walking"], ["the dog is running"], [""], [" "]]
    # so, we should remove them ' ' and '' element
    if '' in img_paragraph_1:
        img_paragraph_1.remove('')
    if ' ' in paragraph[1]:
        img_paragraph_1.remove(' ')

    # the number sents in each paragraph
    # if the sents is bigger than S_max,
    # we force the number of sents to be S_max
    img_num_sents = len(img_paragraph_1)
    if img_num_sents > S_max:
        img_num_sents = S_max

    # if a paragraph has 4 sentences
    # then the img_num_distribution will be like this:
    # [0, 0, 0, 1, 1, 1]
    img_num_distribution = np.zeros([S_max], dtype=np.int32)
    img_num_distribution[img_num_sents-1:] = 1

    # we multiply the number 2, because the <pad> is encoded into 2
    img_captions_matrix = np.ones([S_max, N_max+1], dtype=np.int32) * 2 # zeros([6, 50])
    for idx, img_sent in enumerate(img_paragraph_1):
        # the number of sentences is img_num_sents
        if idx == img_num_sents:
            break

        # because we treat the ',' as a word
        img_sent = img_sent.replace(',', ' ,')

        # Because I have preprocess the paragraph_v1.json file in VScode before,
        # and I delete all the 2, 3, 4...bankspaces
        # so, actually, the 'elif' code will never run
        #print(img_sent)
        if len(img_sent)>=3 and img_sent[0] == ' ' and img_sent[1] != ' ':
            img_sent = img_sent[1:]
        elif len(img_sent)>=3 and img_sent[0] == ' ' and img_sent[1] == ' ' and img_sent[2] != ' ':
            img_sent = img_sent[2:]

        # Be careful the last part in a sentence, like this:
        # '...world.'
        # '...world. '
        if len(img_sent)>=1 and img_sent[-1] == '.':
            img_sent = img_sent[0:-1]
        elif len(img_sent)>=2 and img_sent[-1] == ' ' and img_sent[-2] == '.':
            img_sent = img_sent[0:-2]

        # Last, we add the <bos> and the <eos> in each sentences
        img_sent = '<bos> ' + img_sent + ' <eos>'

        # translate each word in a sentence into the unique number in word2idx dict
        # when we meet the word which is not in the word2idx dict, we use the mark: <unk>
        for idy, word in enumerate(img_sent.lower().split(' ')):
            # because the biggest number of words in a sentence is N_max, here is 50
            if idy == N_max:
                break

            if word in word2idx:
                img_captions_matrix[idx, idy] = word2idx[word]
            else:
                img_captions_matrix[idx, idy] = word2idx['<unk>']

    # Pay attention, the value type 'img_name' here is NUMBER, I change it to STRING type
    img2paragraph_modify[str(img_name)] = [img_num_distribution, img_captions_matrix]
with open('E:\Projects\Story-Telling-Using-Show-Attend-and-tell\img2paragraph_modify_batch', 'wb') as f:
    pickle.dump(img2paragraph_modify, f)

preprocessing word counts and creating vocab based on word count threshold 2
filtered words from 18926 to 9933
