# Check GPU version.

In [None]:
!nvidia-smi

# Mount google drive.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Install TensorFlow-1.14 GPU.

In [None]:
# Select TensorFlow-1.x version.
%tensorflow_version 1.x

# Uninstall previous TensorFlow version.
!pip uninstall tensorflow -y 1>/dev/null 2>/dev/null 
!pip uninstall tensorflow-gpu -y 1>/dev/null 2>/dev/null 

# Install TensorFlow-1.14.
!pip install --upgrade tensorflow==1.14.0 1>/dev/null 2>/dev/null 
!pip install --upgrade tensorflow-gpu==1.14.0 1>/dev/null 2>/dev/null 

# Restart the runtime.

# Set the root directory.

In [None]:
import os

root_dir = '/content/'
os.chdir(root_dir)

!ls -al

# Import TensorFlow-1.14.

In [None]:
try:
  %tensorflow_version 1.x
except Exception:
  pass

import tensorflow as tf
from tensorflow.keras import backend as K

import tensorflow.keras.layers as layers
import tensorflow.keras.models as models

import numpy as np
np.random.seed(7)

import matplotlib.pyplot as plot

print(tf.__version__)

# Download Flickr8K dataset.

### Download dataset.

In [None]:
!gdown --id 15IPp8p_b4BrLuOIWAmm1jknt0Ip-WZ-F
!ls -al

### Extract dataset.

In [None]:
!tar -xzf Flickr8K.tar.gz
!ls -al
!ls -al Flickr8K

# Load raw descriptions.

In [None]:
Flickr8K_root_dir = 'Flickr8K'
dataset_images_dir = os.path.join(Flickr8K_root_dir, 'images') + '/'

In [None]:
image_features_root_dir = 'Inception-v3'

In [None]:
descriptions_filename = os.path.join(image_features_root_dir, 'token.txt')
train_dataset_filename = os.path.join(image_features_root_dir, 'train_images.txt')
test_dataset_filename = os.path.join(image_features_root_dir, 'test_images.txt')

In [None]:
processed_descriptions_filename = os.path.join(image_features_root_dir, 'descriptions.txt')
train_features_filename = os.path.join(image_features_root_dir, 'train_features.pkl')
test_features_filename = os.path.join(image_features_root_dir, 'test_features.pkl')

In [None]:
def load_document(descriptions_filename):
    """Read a whole text file and return its contents as one string.

    Args:
        descriptions_filename: Path of the text file to read.

    Returns:
        The complete file contents as a single ``str``.
    """
    # The context manager closes the handle even when read() raises.
    with open(descriptions_filename, 'r') as text_file:
        return text_file.read()

### Load raw descriptions.

In [None]:
raw_descriptions = load_document(descriptions_filename)
print(raw_descriptions[:300])

# Parse descriptions.

In [None]:
def parse_descriptions(raw_descriptions):
    """Group raw caption lines by image identifier.

    Each usable line is split on whitespace. The first token names the image
    (everything from its first '.' onwards is dropped) and the remaining tokens
    are re-joined with single spaces to form one caption.

    Args:
        raw_descriptions: Full text of the captions file, one caption per line.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    mapping = dict()

    for line in raw_descriptions.split('\n'):
        tokens = line.split()
        # Skip very short lines as before, and also whitespace-only lines:
        # those have no tokens, so tokens[0] would raise IndexError.
        if len(line) < 2 or not tokens:
            continue

        image_id = tokens[0].split('.')[0]
        caption = ' '.join(tokens[1:])
        mapping.setdefault(image_id, list()).append(caption)

    return mapping

### Parse descriptions.

In [None]:
descriptions = parse_descriptions(raw_descriptions)
print('loaded - %d descriptions.' % len(descriptions))

### View descriptions keys.

In [None]:
list(descriptions.keys())[:5]

### View sample descriptions.

In [None]:
descriptions['1000268201_693b08cb0e']

In [None]:
descriptions['1001773457_577c3a7d70']

# Clean descriptions.

In [None]:
import string

def clean_descriptions(descriptions):
    """Normalise every caption stored in ``descriptions``, in place.

    Each caption is split on whitespace; every word is lower-cased and stripped
    of punctuation, then dropped if it is a single character or contains
    anything other than letters. The surviving words are re-joined with spaces
    and written back to the same list position.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        None. The lists inside ``descriptions`` are modified.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)

    def _clean(caption):
        kept_words = []
        for raw_word in caption.split():
            word = raw_word.lower().translate(punctuation_remover)
            # Drops leftovers such as a hanging 's' or 'a' and tokens with digits.
            if len(word) > 1 and word.isalpha():
                kept_words.append(word)
        return ' '.join(kept_words)

    for captions in descriptions.values():
        for position, caption in enumerate(captions):
            captions[position] = _clean(caption)

### Clean descriptions.

In [None]:
clean_descriptions(descriptions)

### View cleaned sample descriptions.

In [None]:
descriptions['1000268201_693b08cb0e']

In [None]:
descriptions['1001773457_577c3a7d70']

# Create vocabulary of words.

In [None]:
def create_vocabulary(descriptions):
    """Collect the set of distinct words used across all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        set of every whitespace-separated word found in any caption.
    """
    all_descriptions = set()
    # Plain loops instead of a list comprehension run only for its side effects.
    for current_descriptions in descriptions.values():
        for description in current_descriptions:
            all_descriptions.update(description.split())
    return all_descriptions

### Create vocabulary of words.

In [None]:
# Build the word set from the cleaned captions and report how many distinct words it holds.
vocabulary = create_vocabulary(descriptions)
print('vocabulary size -', len(vocabulary))

# Save descriptions.

In [None]:
def save_descriptions(descriptions, filename):
    """Write all captions to a text file, one ``'<image id> <caption>'`` per line.

    Lines are joined with '\\n' and no trailing newline is written.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the file to create or overwrite.
    """
    lines = [
        key + ' ' + description
        for key, description_list in descriptions.items()
        for description in description_list
    ]

    # The context manager flushes and closes the file even if write() fails.
    with open(filename, 'w') as text_file:
        text_file.write('\n'.join(lines))

### Save descriptions.

In [None]:
save_descriptions(descriptions, processed_descriptions_filename)

In [None]:
!ls -al 'Inception-v3'

# Load dataset splits.

In [None]:
def load_dataset(filename):
    """Load the image identifiers listed in a dataset split file.

    The file is read with ``load_document``; each non-empty line contributes
    the text before its first '.' as an identifier.

    Args:
        filename: Path of the split file, one image filename per line.

    Returns:
        set of identifier strings.
    """
    contents = load_document(filename)
    return {
        entry.split('.')[0]
        for entry in contents.split('\n')
        if len(entry) >= 1
    }

### Load training dataset split.

In [None]:
train_dataset = load_dataset(train_dataset_filename)
print('number of train dataset samples -', len(train_dataset))

# Create a list of all image filenames in the directory.

In [None]:
import glob

In [None]:
image_filenames = glob.glob(dataset_images_dir + '*.jpg')

### View sample image filenames.

In [None]:
print(image_filenames[:5])

### Read the train image filenames.

In [None]:
# Read the split file through a context manager so the handle is closed right away.
with open(train_dataset_filename, 'r') as split_file:
    train_image_filenames = set(split_file.read().strip().split('\n'))

# Keep the image paths whose name (the path minus the directory prefix) is in the train split.
train_images = []
for image_filename in image_filenames:
    if image_filename[len(dataset_images_dir):] in train_image_filenames:
        train_images.append(image_filename)
print('number of training samples -',len(train_images))

### View sample train image filenames.

In [None]:
print(train_images[:5])

### Read the test image filenames.

In [None]:
# Read the split file through a context manager so the handle is closed right away.
with open(test_dataset_filename, 'r') as split_file:
    test_image_filenames = set(split_file.read().strip().split('\n'))

# Keep the image paths whose name (the path minus the directory prefix) is in the test split.
test_images = []
for image_filename in image_filenames:
    if image_filename[len(dataset_images_dir):] in test_image_filenames:
        test_images.append(image_filename)
print('number of test samples -',len(test_images))

### View sample test image filenames.

In [None]:
print(test_images[:5])

# Load cleaned descriptions.

In [None]:
def load_cleaned_descriptions(filename, dataset):
    """Load the cleaned captions of the images that belong to ``dataset``.

    Every kept caption is wrapped in the ``startseq`` / ``endseq`` markers.

    Args:
        filename: Path of a file holding one ``'<image id> <caption>'`` per line.
        dataset: Container of image ids to keep (tested with ``in``).

    Returns:
        dict mapping image id -> list of wrapped caption strings.
    """
    text_data = load_document(filename)

    descriptions = dict()
    for line in text_data.split('\n'):
        tokens = line.split()
        # A blank line (for example a trailing newline at end of file) has no
        # tokens; indexing tokens[0] would raise IndexError.
        if not tokens:
            continue

        image_id, image_description = tokens[0], tokens[1:]
        if image_id not in dataset:
            continue

        # Joining the markers as tokens avoids a doubled space when the
        # cleaned caption is empty.
        current_description = ' '.join(['startseq'] + image_description + ['endseq'])
        descriptions.setdefault(image_id, list()).append(current_description)

    return descriptions

### Load cleaned descriptions for train dataset split.

In [None]:
train_descriptions = load_cleaned_descriptions(processed_descriptions_filename, train_dataset)
print('number of training descriptions -' , len(train_descriptions))

# Preprocess an input image.

In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image

In [None]:
def preprocess(image_path):
    """Load an image file and turn it into a one-image batch for Inception-v3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The array produced by ``preprocess_input`` for the image, resized to
        299x299 and given a leading axis of length 1.
    """
    pixels = image.img_to_array(image.load_img(image_path, target_size=(299, 299)))
    # Add the leading axis first, then apply the Inception-v3 input preprocessing.
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

# Create feature extractor model.

### Load Inception-v3 model.

In [None]:
base_model = InceptionV3(weights='imagenet')

### Create feature extractor model.

In [None]:
feature_extractor = models.Model(base_model.input, base_model.layers[-2].output)

# Encode input images.

### Create function to encode input images.

In [None]:
def encode_image(image_filename):
    """Run one image through ``feature_extractor`` and return a flat feature vector.

    Args:
        image_filename: Path of the image file to encode.

    Returns:
        1-D array whose length is the size of axis 1 of the model output.
    """
    batch_features = feature_extractor.predict(preprocess(image_filename))
    # Flatten the prediction to a vector of length shape[1].
    feature_length = batch_features.shape[1]
    return np.reshape(batch_features, feature_length)

### Create function to encode dataset.

In [None]:
from time import time

def encode_images(input_images):
  """Encode every image path and report how long the whole pass took.

  Args:
    input_images: Iterable of image paths that start with ``dataset_images_dir``.

  Returns:
    dict mapping each path, minus the ``dataset_images_dir`` prefix, to the
    result of ``encode_image`` for that path.
  """
  started_at = time()
  prefix_length = len(dataset_images_dir)

  feature_dictionary = {}
  for image_path in input_images:
    feature_dictionary[image_path[prefix_length:]] = encode_image(image_path)

  elapsed = time() - started_at
  print("time taken in seconds -", elapsed)

  return feature_dictionary

### Encode train dataset.

In [None]:
import pickle

### Encode train images and store features in a file.

In [None]:
train_features = encode_images(train_images)
with open(train_features_filename, 'wb') as pickle_file:
    pickle.dump(train_features, pickle_file)

### Encode test images and store features in a file.

In [None]:
test_features = encode_images(test_images)
with open(test_features_filename, 'wb') as pickle_file:
    pickle.dump(test_features, pickle_file)

In [None]:
!ls -al Inception-v3

In [None]:
#!tar -czf Inception-v3.tar.gz Inception-v3
#!mv Inception-v3.tar.gz '/content/drive/My Drive/datasets/Flickr8K/.'

# OR

### Download image descriptions and Inception-v3 encoded image features from google drive.

In [None]:
!gdown --id 12f8iomzzeZNh0OVuLhjJkimKja21bYnn
!ls -al

In [None]:
!tar -xzf Inception-v3.tar.gz

In [None]:
!ls -al
!ls -al Inception-v3

In [None]:
!rm Inception-v3.tar.gz
!ls -al

# Load training dataset.

### Load training image features.

In [None]:
# NOTE: pickle can run arbitrary code while loading; only load feature files you trust.
# The context manager closes the file handle once loading finishes.
with open(train_features_filename, 'rb') as pickle_file:
    train_features = pickle.load(pickle_file)
print('number of training samples -', len(train_features))

### Load training image descriptions.

# Create a list of all the training captions.

In [None]:
# Despite its name, this list holds whole captions (one string per caption), not single words.
all_training_words = []
for key, current_descriptions in train_descriptions.items():
    for description in current_descriptions:
        all_training_words.append(description)

print('number of training captions -', len(all_training_words))

# Keep only words which occur at least a given number of times in the corpus.

In [None]:
word_count_threshold = 10

In [None]:
# Count how often each space-separated token occurs across all training captions.
word_counts = {}
nsents = len(all_training_words)
for sent in all_training_words:
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

# Keep only the tokens seen at least `word_count_threshold` times.
vocab = [token for token, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

In [None]:
# Two lookup tables between vocabulary words and integer indices:
#   wordtoix: word  -> index
#   ixtoword: index -> word
ixtoword = {}
wordtoix = {}

# Indices start at 1, so index 0 is never assigned to a word.
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

In [None]:
vocab_size = len(ixtoword) + 1 # one for appended 0's
vocab_size

In [None]:
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten a dict of caption lists into one list of captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        list of all captions, in dict iteration order.
    """
    all_desc = list()
    # extend() replaces a list comprehension that was run only for its side effects.
    for desc_list in descriptions.values():
        all_desc.extend(desc_list)
    return all_desc

# calculate the length of the description with the most words
def max_description_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        int, the largest number of whitespace-separated words in any caption.

    Raises:
        ValueError: if there are no captions at all (``max`` of an empty sequence).
    """
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

# determine the maximum sequence length
# The helper has its own name so that the integer `max_length` used by later
# cells does not overwrite the function that computes it.
max_length = max_description_length(train_descriptions)
print('maximum description length -', max_length)

In [None]:
# data generator, intended to be used in a call to model.fit_generator()
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch, vocab_size=None):
    """Endlessly yield training batches for ``model.fit_generator()``.

    Every caption is expanded into one sample per next-word step: the prefix of
    word indices (padded to ``max_length`` by ``pad_sequences``) is the input
    and the following word, one-hot encoded, is the target. A batch holds the
    samples of ``num_photos_per_batch`` images. Images left over at the end of
    one pass over ``descriptions`` are carried into the next pass.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        photos: dict keyed by ``image id + '.jpg'`` holding that image's features.
        wordtoix: dict mapping word -> integer index; unknown words are dropped.
        max_length: Length every input sequence is padded to.
        num_photos_per_batch: Number of images whose samples form one batch.
        vocab_size: Width of the one-hot targets. Defaults to
            ``len(wordtoix) + 1``, which is how the notebook derives its own
            ``vocab_size`` (word indices start at 1).

    Yields:
        ``[[image_features, padded_sequences], one_hot_targets]`` as arrays.

    Raises:
        KeyError: if an image id has no entry in ``photos``.
    """
    # Derive the target width from the mapping instead of a hidden global.
    if vocab_size is None:
        vocab_size = len(wordtoix) + 1

    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while 1:
        for key, desc_list in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = photos[key+'.jpg']
            for desc in desc_list:
                # encode the sequence
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # split one sequence into multiple X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pair
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # store
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
            # yield the batch data
            if n == num_photos_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0

# Load GloVe vectors.

In [None]:
!gdown --id 1-0SXOvbpDNvW8v3eFhKRqXb68xUQ86M1 # glove.6B.200d.txt

### OR

In [98]:
!wget -O glove.6B.200d.txt https://www.floydhub.com/api/v1/resources/Av2ThePYtAHXMAuSXEBV8X/glove.6B.200d.txt?content=true&rename=glove6b200dtxt 

--2020-06-26 11:29:26--  https://www.floydhub.com/api/v1/resources/Av2ThePYtAHXMAuSXEBV8X/glove.6B.200d.txt?content=true
Resolving www.floydhub.com (www.floydhub.com)... 104.26.0.30, 172.67.72.144, 104.26.1.30, ...
Connecting to www.floydhub.com (www.floydhub.com)|104.26.0.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘glove.6B.200d.txt’

glove.6B.200d.txt       [              <=>   ] 661.31M  37.9MB/s    in 17s     

2020-06-26 11:29:44 (38.9 MB/s) - ‘glove.6B.200d.txt’ saved [693432828]



In [99]:
!ls -al

total 703044
drwxr-xr-x 1 root root      4096 Jun 26 11:29 .
drwxr-xr-x 1 root root      4096 Jun 26 09:08 ..
drwxr-xr-x 1 root root      4096 Jun 19 16:15 .config
drwx------ 4 root root      4096 Jun 26 09:10 drive
-rw-r--r-- 1 root root 693432828 Jun 26 11:29 glove.6B.200d.txt
drwxr-xr-x 2 root root      4096 Jun 26 08:12 Inception-v3
drwxr-xr-x 2 root root      4096 Jun 26 09:39 .ipynb_checkpoints
-rw-r--r-- 1 root root  19161704 Jun 26 10:24 model_9.h5
-rw-r--r-- 1 root root   7278880 Jun 26 11:18 model.h5
drwxr-xr-x 1 root root      4096 Jun 17 16:18 sample_data


In [None]:
glove_dir = '/content/'
embedding_dim = 200

In [None]:
# Map each word in the GloVe file to its vector: the first token on a line is
# the word and the remaining tokens are parsed as float32 coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('found %s word vectors.' % len(embeddings_index))

In [None]:
# Build a (vocab_size, embedding_dim) matrix whose row i is the GloVe vector of
# the vocabulary word with index i.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    # Words missing from embeddings_index keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix.shape

# Create the model.


In [None]:
from keras import Input
from keras.preprocessing import sequence
from keras.layers import LSTM, Embedding, Dense, Dropout
from keras.layers.merge import add
from keras.models import Model

In [91]:
# Image branch: 2048 input features -> Dropout(0.5) -> Dense(256, relu).
image_features = Input(shape=(2048,))
features_layer = Dropout(0.5)(image_features)
features_layer = Dense(256, activation='relu')(features_layer)
# Caption branch: sequence of max_length word indices -> Embedding (index 0 is
# masked via mask_zero=True) -> Dropout(0.5) -> LSTM(256).
image_caption = Input(shape=(max_length,))
embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)(image_caption)
embedding_layer = Dropout(0.5)(embedding_layer)
embedding_layer = LSTM(256)(embedding_layer)
# Decoder: add the two 256-unit branches -> Dense(256, relu) -> softmax over vocab_size.
decoder_layer = add([features_layer, embedding_layer])
decoder_layer = Dense(256, activation='relu')(decoder_layer)
output_predictions = Dense(vocab_size, activation='softmax')(decoder_layer)
# Two inputs (image features, partial caption), one output (next-word distribution).
model = Model(inputs=[image_features, image_caption], outputs=output_predictions)

In [None]:
model.summary()

In [None]:
model.layers[2]

In [None]:
# Load embedding_matrix into the embedding layer and freeze it so training does not update it.
# NOTE(review): this assumes model.layers[2] is the Embedding layer — confirm against
# model.summary() before relying on the index.
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
epochs = 10
number_pics_per_bath = 3
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
# Train one epoch per iteration. A fresh generator is built for each pass, so
# every epoch starts again from the first image in train_descriptions.
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)    
# Weights are written once, after the whole loop has finished.
model.save_weights('model_10.h5')

In [None]:
# Same loop as the previous cell: trains the current model for another `epochs` passes.
# NOTE(review): the checkpoint name suggests 20 cumulative epochs, which only holds
# if the previous training cell ran first with epochs = 10.
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)    
# Weights are written once, after the whole loop has finished.
model.save_weights('model_20.h5')

In [65]:
from keras.optimizers import Adam
optimizer = Adam(learning_rate=0.0001)

model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [67]:
epochs = 10
number_pics_per_bath = 6
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
# Same training loop again, using whatever `epochs`, `steps` and
# `number_pics_per_bath` values and compiled optimizer are current when this cell runs.
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, wordtoix, max_length, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)    
# Weights are written once, after the whole loop has finished.
model.save_weights('model.h5')

# Load test dataset.

### Load test image features.

In [69]:
# NOTE: pickle can run arbitrary code while loading; only load feature files you trust.
# The context manager closes the file handle once loading finishes.
with open(test_features_filename, 'rb') as pickle_file:
    test_features = pickle.load(pickle_file)
print('number of test samples -', len(test_features))

number of training samples - 1000


# Evaluate the model.

In [92]:
model.load_weights('model.h5')

In [94]:
# NOTE(review): no cell visible in this notebook writes 'model_9.h5' (the training
# cells save model_10.h5, model_20.h5 and model.h5) — confirm where this
# checkpoint comes from. Loading it replaces any weights loaded before.
model.load_weights('model_9.h5')

In [82]:
def greedy_search(photo):
    """Generate a caption by repeatedly taking the most probable next word.

    Uses the notebook-level ``model``, ``wordtoix``, ``ixtoword``,
    ``max_length`` and ``pad_sequences``. Generation stops at 'endseq', after
    ``max_length`` steps, or when the predicted index has no word.

    Args:
        photo: Image feature batch passed to ``model.predict`` as first input.

    Returns:
        The generated caption as a string, without 'startseq' / 'endseq'.
    """
    words = ['startseq']
    for _ in range(max_length):
        sequence = [wordtoix[w] for w in words if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        # Index 0 is never assigned to a word, so a plain ixtoword[...] lookup
        # would raise KeyError; treat a missing index as the end of the caption.
        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
    # 'endseq' is never appended, so only 'startseq' is stripped. The old
    # final[1:-1] slice also removed the last real word whenever the loop
    # ended without producing 'endseq'.
    return ' '.join(words[1:])

In [95]:
z = 1
pic = list(test_features.keys())[z]
# Named `photo`, not `image`: rebinding `image` here would shadow the Keras
# `image` module that preprocess() calls.
photo = test_features[pic].reshape((1,2048))
# Optional preview (needs the Flickr8K images on disk):
#x=plot.imread(dataset_images_dir+pic)
#plot.imshow(x)
#plot.show()
print("Greedy:",greedy_search(photo))

Greedy: man in red uniform rides bike on track
