In [25]:

import json 

In [26]:
num_to_train = 500
num_to_val = 50

In [27]:
train_annotation = json.load(open("Dataset/annotations/train.json"))

In [28]:
print(train_annotation['info'])
total_image = len(train_annotation['images'])
print('Total Images: ',total_image)

{'description': 'This dataset contains crowdsourced captions of images from VizWiz datasets. This file contains the train partition.', 'license': {'url': 'https://creativecommons.org/licenses/by/4.0/', 'name': 'Attribution 4.0 International (CC BY 4.0)'}, 'url': 'https://vizwiz.org', 'version': 'VizWiz-Captions 1.0', 'year': 2019, 'contributor': 'VizWiz-Captions Consortium', 'date_created': '2019-12-23'}
Total Images:  23431


In [29]:

# load and show an image with Pillow
from matplotlib import image
from matplotlib import pyplot
from PIL import Image

In [30]:
descriptions = dict()
for i in range(num_to_train):
    image = Image.open('Dataset/train/'+train_annotation['images'][i]['file_name'])
    image = image.convert(mode='L')
    image = image.resize((640, 480))
    print(image.size)
    descriptions[i] = []
    for j in range(5):
        print('Caption #',j,': ',train_annotation['annotations'][i*5+j]['caption'])
        descriptions[i].append(train_annotation['annotations'][i*5+j]['caption'])
#     pyplot.subplot(5,2,i+1)
#     pyplot.imshow(image)
# pyplot.show()


(640, 480)
Caption # 0 :  ITS IS A BASIL LEAVES CONTAINER ITS CONTAINS THE NET WEIGHT TOO.
Caption # 1 :  A green and white plastic condiment bottle containing Basil leaves.
Caption # 2 :  Quality issues are too severe to recognize visual content.
Caption # 3 :  A bottle of spices in a plastic container laying on a surface.
Caption # 4 :  some basil leaves in a container on a counter
(640, 480)
Caption # 0 :  A can of Coca Cola on a counter is shown for when one can use a nice, cold drink.
Caption # 1 :  A black can of Coca Cola Zero calorie soda is on the counter near the coffee maker.
Caption # 2 :  A kitchen counter the various items on top including a can of Coca-Cola, metal containers, and a teapot.
Caption # 3 :  a black tin of Coca Cola placed on a black surface
Caption # 4 :  Black counter with canisters, kettle and can of soda.
(640, 480)
Caption # 0 :  A can of crushed tomatoes are on a brown surface, the tomatoes read crushed tomatoes on the brand.
Caption # 1 :  A can of cr

In [31]:
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.preprocessing.text import Tokenizer


from numpy import array
from pickle import load
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint

In [32]:
from pickle import dump
from pickle import load

In [33]:

# extract features from each photo in the directory
def extract_features(annotation,isTrain = True):
	# load the model
	model = VGG16()
	# re-structure the model
	model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()

	for i in range(num_to_train):
		if isTrain==False:
			if i==num_to_val:
				break
		# load an image from file
		filename = ''
		if isTrain:
			filename = 'Dataset/train/'+annotation['images'][i]['file_name']
		else:
			filename = 'Dataset/val/'+annotation['images'][i]['file_name']
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# store feature
		features[i] = feature
		# print('>%s' % name)
	return features

In [34]:
features = extract_features(train_annotation)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

In [35]:
print('Loaded: %d ' % len(descriptions))

Loaded: 500 


In [36]:
import string

def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)

# clean descriptions
clean_descriptions(descriptions)

In [37]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
	# build a list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc

# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 2654


In [38]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(str(key) + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

# save descriptions
save_descriptions(descriptions, 'descriptions.txt')

In [39]:
# load photo features
def load_photo_features(filename, num_to_train):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in range(num_to_train)}
	return features
# photo features
train_features = load_photo_features('features.pkl', num_to_train)
print('Photos: train=%d' % len(train_features))

Photos: train=500


In [40]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
# load clean descriptions into memory
def load_clean_descriptions(filename, num_to_train):
	# load document
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		# split id from description
		image_id, image_desc = int(tokens[0]), tokens[1:]
		# skip images not in the set
		if image_id in range(num_to_train):
			# create list
			if image_id not in descriptions:
				descriptions[image_id] = list()
			# wrap description in tokens
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			# store
			descriptions[image_id].append(desc)
	return descriptions
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', num_to_train)
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=500


In [41]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
	all_desc = list()
	for key in descriptions.keys():
		[all_desc.append(d) for d in descriptions[key]]
	return all_desc

# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
	lines = to_lines(descriptions)
	tokenizer = Tokenizer()
	tokenizer.fit_on_texts(lines)
	return tokenizer

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 2657


In [42]:
# calculate the length of the description with the most words
def max_lengths(descriptions):
	lines = to_lines(descriptions)
	return max(len(d.split()) for d in lines)
 
# create sequences of images, input sequences and output words for an image
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
	X1, X2, y = list(), list(), list()
	#print(desc_list)
	# walk through each description for the image
	for desc in desc_list:
		# print(desc)
		# encode the sequence
		seq = tokenizer.texts_to_sequences([desc])[0]
		# split one sequence into multiple X,y pairs
		for i in range(1, len(seq)):
			# split into input and output pair
			in_seq, out_seq = seq[:i], seq[i]
			# pad input sequence
			in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
			# encode output sequence
			out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
			# store
			X1.append(photo)
			X2.append(in_seq)
			y.append(out_seq)
	return array(X1), array(X2), array(y)

In [43]:
# determine the maximum sequence length
max_length = max_lengths(train_descriptions)
print('Description Length: %d' % max_length)

Description Length: 52


In [44]:
val_annotation = json.load(open("Dataset/annotations/val.json"))
print(val_annotation['info'])
val_descriptions = dict()
for i in range(num_to_train//2):
    image = Image.open('Dataset/val/'+val_annotation ['images'][i]['file_name'])
    image = image.convert(mode='L')
    image = image.resize((640, 480))
    print(image.size)
    val_descriptions[i] = []
    for j in range(5):
        print('Caption #',j,': ',val_annotation['annotations'][i*5+j]['caption'])
        val_descriptions[i].append(val_annotation['annotations'][i*5+j]['caption'])
val_features = extract_features(val_annotation,False)
print('Extracted Features: %d' % len(val_features))
# save to file
dump(val_features, open('val_features.pkl', 'wb'))

{'description': 'This dataset contains crowdsourced captions of images from VizWiz datasets. This file contains the val partition.', 'license': {'url': 'https://creativecommons.org/licenses/by/4.0/', 'name': 'Attribution 4.0 International (CC BY 4.0)'}, 'url': 'https://vizwiz.org', 'version': 'VizWiz-Captions 1.0', 'year': 2019, 'contributor': 'VizWiz-Captions Consortium', 'date_created': '2019-12-23'}
(640, 480)
Caption # 0 :  A computer screen shows a repair prompt on the screen.
Caption # 1 :  a computer screen with a repair automatically pop up
Caption # 2 :  partial computer screen showing the need of repairs
Caption # 3 :  Part of a computer monitor showing a computer repair message.
Caption # 4 :  The top of a laptop with a blue background and dark blue text.
(640, 480)
Caption # 0 :  A person is holding a bottle that has medicine for the night time.
Caption # 1 :  A bottle of medication has a white twist top.
Caption # 2 :  night time medication bottle being held by someone
Cap

In [45]:
clean_descriptions(val_descriptions)
# summarize vocabulary
val_vocabulary = to_vocabulary(val_descriptions)
print('Vocabulary Size: %d' % len(val_vocabulary))
# save descriptions
save_descriptions(val_descriptions, 'val_descriptions.txt')
val_features = load_photo_features('val_features.pkl', num_to_val)
print('Photos: val=%d' % len(val_features))
val_descriptions = load_clean_descriptions('val_descriptions.txt', num_to_val)
print('Descriptions: val=%d' % len(val_descriptions))
# prepare tokenizer
val_tokenizer = create_tokenizer(val_descriptions)
val_vocab_size = len(val_tokenizer.word_index) + 1
print('Vocabulary Size: %d' % val_vocab_size)
# determine the maximum sequence length
val_max_length = max_lengths(val_descriptions)
print('Description Length: %d' % val_max_length)

Vocabulary Size: 1651
Photos: val=50
Descriptions: val=50
Vocabulary Size: 664
Description Length: 36


In [46]:
# define the captioning model
def define_model(vocab_size, max_length):
	# feature extractor model
	inputs1 = Input(shape=(4096,))
	fe1 = Dropout(0.5)(inputs1)
	fe2 = Dense(256, activation='relu')(fe1)
	# sequence model
	inputs2 = Input(shape=(max_length,))
	se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
	se2 = Dropout(0.5)(se1)
	se3 = LSTM(256)(se2)
	# decoder model
	decoder1 = add([fe2, se3])
	decoder2 = Dense(256, activation='relu')(decoder1)
	outputs = Dense(vocab_size, activation='softmax')(decoder2)
	# tie it together [image, seq] [word]
	model = Model(inputs=[inputs1, inputs2], outputs=outputs)
	model.compile(loss='categorical_crossentropy', optimizer='adam')
	# summarize model
	print(model.summary())
	plot_model(model, to_file='model.png', show_shapes=True)
	return model

In [47]:
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
	# loop for ever over images
	while 1:
		for key, desc_list in descriptions.items():
			# retrieve the photo feature
			photo = photos[key][0]
			in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo, vocab_size)
			yield [[in_img, in_seq], out_word]

In [48]:
# define the model
model = define_model(vocab_size, max_length)
epochs = 20
steps = len(train_descriptions)
for i in range(epochs):
	# create the data generator
	generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
	# fit for one epoch
	model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
	# save model
	model.save('Model\model_' + str(i) + '.h5')

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 52)           0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 52, 256)      680192      input_8[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 4096)         0           input_7[0][0]                    
____________________________________________________________________________________________