Importing files

In [32]:
import tensorflow as tf
import json
import os
import unicodedata
import re
import logging
import pickle
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

Downloading Captions from MS-COCO dataset

In [10]:
#https://www.tensorflow.org/tutorials/text/image_captioning
annotation_directory = '/annotations/'
if not os.path.exists(os.path.abspath('.') + annotation_directory):
  annotation_tar = tf.keras.utils.get_file('captions.zip',
                                          cache_subdir=os.path.abspath('.'),
                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                          extract = True)
  annotation_files = os.path.dirname(annotation_tar)+'/annotations/captions_train2014.json'
  os.remove(annotation_tar)

Downloading images from MS-COCO dataset

In [11]:
#https://www.tensorflow.org/tutorials/text/image_captioning
image_directory = '/train2014/'
if not os.path.exists(os.path.abspath('.') + image_directory):
  image_tar = tf.keras.utils.get_file('train2014.zip',
                                      cache_subdir=os.path.abspath('.'),
                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',
                                      extract = True)
  PATH = os.path.dirname(image_tar) + image_directory
  os.remove(image_tar)
else:
  PATH = os.path.abspath('.') + image_directory

Function for reading JSON file

In [22]:
def open_json_file(file_name):
    with open(file_name, 'r') as f:
        annotations = json.load(f)
    return annotations

Function for removing HTML noise from text

In [13]:
def remove_html_markup(s):
    tag = False
    quote = False
    out = ""
    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c
    return out

Function for converting unicode text to ASCII

In [14]:
#https://www.tensorflow.org/tutorials/text/nmt_with_attention
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

Function for cleaning sentences

In [15]:
def preprocess_sentence(w):
    w = remove_html_markup(w)
    w = w.lower().strip()
    if w == '':
        return 0
    else:
        w = unicode_to_ascii(w)
        w = re.sub(r"[^-!$&(),./%0-9:;?a-z'\"]+", " ", w)
        w = re.sub(r'(\d)th', r'\1 th', w, flags=re.I)
        w = re.sub(r'(\d)st', r'\1 st', w, flags=re.I)
        w = re.sub(r'(\d)rd', r'\1 rd', w, flags=re.I)
        w = re.sub(r'(\d)nd', r'\1 nd', w, flags=re.I)
        punc = list("-!$&(),./%:;?¿¡€'")
        for i in punc:
            w = w.replace(i, " "+i+" ")
        w = w.strip()
        w = re.sub(r'\s+', ' ', w)
        return w

Function for preprocessing images and extracting features

In [19]:
def preprocess_image(file_name, model):
    w = tf.io.read_file(file_name)
    w = tf.image.decode_jpeg(w, channels=3)
    w = tf.image.resize(w, (299, 299))
    w = tf.keras.applications.inception_v3.preprocess_input(w)
    w = tf.convert_to_tensor([w])
    w = model(w)
    w = tf.reshape(w, [w.shape[0], -1, w.shape[3]])
    return w

Function for creating dataset with the help of annotation file

In [43]:
def create_dataset(annotation_file, model):
    annotations = open_json_file(annotation_file)
    annotations = pd.DataFrame(annotations['annotations'])
    annotations = shuffle(annotations)
    images, image_id, captions = [], [], []
    for i in range(len(annotations)):
        if annotations['image_id'].iloc[i] in image_id:
            continue
        image_path = PATH + 'COCO_train2014_' + '%012d.jpg' \
                     % int(annotations['image_id'][i])
        new_annotations = annotations[annotations['image_id'] == annotations['image_id'].iloc[i]]
        for j in range(len(new_annotations)):
            caption = preprocess_sentence(new_annotations['caption'].iloc[j])
            if caption == 0:
                continue
            captions.append(caption)
            images.append(preprocess_image(image_path, model))
            image_id.append(str(new_annotations['image_id'].iloc[j]))
        if len(np.unique(image_id)) % 100 == 0:
            print('No. of images processed: ', len(np.unique(image_id)))
        if len(np.unique(image_id)) == 5000:
            break
    return images, captions

Inception V3 model for extracting features

In [24]:
model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')
new_model = tf.keras.Model(model.input, model.layers[-1].output)
new_model.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [41]:
images, captions = create_dataset(annotation_files, new_model)

No. of images processed:  100
No. of images processed:  200
No. of images processed:  300
No. of images processed:  400
No. of images processed:  500
No. of images processed:  600
No. of images processed:  700
No. of images processed:  800
No. of images processed:  900
No. of images processed:  1000
No. of images processed:  1100
No. of images processed:  1200
No. of images processed:  1300
No. of images processed:  1400
No. of images processed:  1500
No. of images processed:  1600
No. of images processed:  1700
No. of images processed:  1800
No. of images processed:  1900
No. of images processed:  2000
No. of images processed:  2100
No. of images processed:  2200
No. of images processed:  2300
No. of images processed:  2400
No. of images processed:  2500
No. of images processed:  2600
No. of images processed:  2700
No. of images processed:  2800
No. of images processed:  2900
No. of images processed:  3000
No. of images processed:  3100
No. of images processed:  3200
No. of images pro

ResourceExhaustedError: ignored

In [35]:
def lines_to_text(lines, sep):
    text = ''
    for i in range(len(lines)):
        if i == len(lines) - 1:
            text += str(lines[i])
        else:
            text += str(lines[i]) + sep
    return text

['a man that is standing up with a racquet .',
 'a man swinging a tennis racket at a tennis match .',
 'the tennis player is looking up to swing at his serve .',
 'a tennis player hitting the ball from a high point .',
 'there is man that is playing in a tennis match',
 'a narrow street is filled with shopping pedestrians .',
 'a small crowded ally between tons of buildings',
 'several people sitting down next to stores in a busy city .',
 'several people walking down an alleyway near stores .',
 'a crowded street is shown with shops and apartment buildings .',
 'two people on the same team playing soccer .',
 'the men are playing a game of soccer on the field .',
 'a group of young men kicking around a soccer ball .',
 'some players in action on the soccer field .',
 'two soccer players wear red and and black and are on green grass with a white ball .',
 'this is a skateboarder about to jump back into the pipe after a trick .',
 'this gay border is getting ready to skateboard on his k

In [None]:
def dataset_save(lines, name):
    text = lines_to_text(lines, '\n')
    f = open(name, 'w', encoding='utf-8')
    f.write(text)
    f.close()

In [None]:
from random import shuffle

c = list(zip(images, captions))
shuffle(c)
images, captions = zip(*c)

In [36]:
train_images, val_images, train_captions, val_captions = train_test_split(images, captions, test_size=0.2)
val_images, test_images, val_captions, test_captions = train_test_split(val_image_id, val_captions, test_size=0.5)
print('Training set size: ', len(train_images))
print('Validation set size: ', len(val_images))
print('Testing set size: ', len(test_images))

In [37]:
dataset_save(train_captions, 'train_captions')
dataset_save(val_captions, 'val_captions')
dataset_save(test_captions, 'test_captions')

TensorShape([50, 1, 64, 2048])

In [None]:
#https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
!pip install sentencepiece
import sentencepiece as spm
spm.SentencePieceTrainer.train('--input=train_captions --model_prefix=en --vocab_size=2000')
sp = spm.SentencePieceProcessor()
sp.load('en.model')

In [None]:
new_train_captions = []
for i in train_captions:
    new_train_captions.append(' '.join(['<s>']+sp.encode(i)+['</s>']))

In [42]:
images

[<tf.Tensor: shape=(1, 64, 2048), dtype=float32, numpy=
 array([[[0.        , 0.        , 1.729485  , ..., 0.03974912,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.11589386,
          0.        , 0.        ],
         [0.        , 0.54879624, 0.0477189 , ..., 0.05925789,
          0.        , 0.        ],
         ...,
         [0.        , 0.47074562, 2.9153275 , ..., 0.        ,
          1.1122824 , 0.40483135],
         [0.71824956, 0.97843695, 3.5662599 , ..., 0.        ,
          1.0443866 , 0.56287646],
         [0.14668028, 0.        , 0.7850392 , ..., 0.        ,
          1.5057677 , 0.80599815]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 64, 2048), dtype=float32, numpy=
 array([[[0.        , 0.        , 1.729485  , ..., 0.03974912,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.11589386,
          0.        , 0.        ],
         [0.        , 0.54879624, 0.0477189 , ..., 0.05925789,
        