<a href="https://colab.research.google.com/github/mahdiimanzadeh/Image-Captioning-with-Deep-Learning/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install necessary packages

In [None]:
!pip install tensorflow keras pillow numpy tqdm

## Import all the necessary packages

## Getting and performing data cleaning

In [None]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np

import tensorflow as tf

from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
# from keras.utils import to_categorical
from tensorflow.keras.layers import Add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout

# small library for seeing the progress of loops.
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`.
from tqdm.notebook import tqdm
# Register the pandas `progress_apply` helpers via the class method; calling
# `tqdm().pandas()` on an instance also rendered an empty "0it" progress bar.
tqdm.pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [None]:
# Loading a text file into memory
def load_doc(filename):
    """Read a whole text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file content as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# get all imgs with their captions
def all_img_captions(filename):
    """Group the captions of a token file by image name.

    Each non-empty line is expected to look like ``<image>#<n>\\t<caption>``.

    Args:
        filename: Path of the caption token file.

    Returns:
        dict mapping the image name (the first field minus its last two
        characters) to the list of its captions, in file order.

    Raises:
        ValueError: If a non-empty line contains no tab separator.
    """
    descriptions = {}
    for line in load_doc(filename).split('\n'):
        # Skip blank lines instead of blindly dropping the last element, so a
        # file without a trailing newline does not lose its final caption.
        if not line.strip():
            continue
        # Split on the first tab only: a tab inside the caption stays in it.
        img, caption = line.split('\t', 1)
        # Drop the two-character suffix of the first field
        # (presumably "#<digit>" caption index; verify against the token file).
        descriptions.setdefault(img[:-2], []).append(caption)
    return descriptions

# Data cleaning: lower-casing, removing punctuation and words containing numbers
def cleaning_text(captions):
    """Normalise every caption in place and return the same mapping.

    For each caption: hyphens become spaces, words are lower-cased,
    punctuation is stripped, and tokens that are a single character or
    contain non-alphabetic characters are dropped.

    Args:
        captions: dict mapping an image name to a list of caption strings.
            The lists are modified in place.

    Returns:
        The same ``captions`` dict, now holding the cleaned captions.
    """
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            # str.replace returns a new string; the result has to be kept,
            # otherwise hyphenated words are glued together when the
            # punctuation is stripped below.
            img_caption = img_caption.replace("-", " ")
            desc = img_caption.split()

            # convert to lowercase
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            # remove hanging 's and a
            desc = [word for word in desc if len(word) > 1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]

            # convert back to string
            captions[img][i] = ' '.join(desc)
    return captions

def text_vocabulary(descriptions):
    """Collect the set of distinct words used across all captions.

    Args:
        descriptions: dict mapping an image name to a list of caption strings.

    Returns:
        set of every whitespace-separated word found in any caption.
    """
    words = set()
    for caption_list in descriptions.values():
        for caption in caption_list:
            words.update(caption.split())
    return words

#All descriptions in one file
def save_descriptions(descriptions, filename):
    """Write all captions to one text file, one ``<image>\\t<caption>`` per line.

    Args:
        descriptions: dict mapping an image name to a list of caption strings.
        filename: Destination path; an existing file is overwritten.
    """
    lines = [
        key + '\t' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the file even if write() fails.
    with open(filename, "w") as file:
        file.write("\n".join(lines))

In [None]:
!mkdir -p /kaggle/working/Flickr8k_text

# Download Text Dataset
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip -O /kaggle/working/Flickr8k_text/Flickr8k_text.zip

# Extract dataset
!unzip /kaggle/working/Flickr8k_text/Flickr8k_text.zip -d /kaggle/working/Flickr8k_text/

In [None]:
# download image dataset
!wget https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip -O /kaggle/working/Flickr8k_Dataset.zip

# extract dataset
!unzip /kaggle/working/Flickr8k_Dataset.zip -d /kaggle/working/Flickr8k_Dataset/

In [None]:
!ls "/kaggle/working/Flickr8k_Dataset/Flicker8k_Dataset"

In [None]:
# Folder that holds the extracted Flickr8k text files; the bare name on the
# last line echoes the value as the cell output.
dataset_text = "/kaggle/working/Flickr8k_text"
dataset_text

'/kaggle/working/Flickr8k_text'

In [None]:
# Set these paths according to the project folder on your system
dataset_text = "/kaggle/working/Flickr8k_text"
dataset_images = "/kaggle/working/Flickr8k_Dataset/Flicker8k_Dataset"


# Prepare the text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
# Load the token file and map every image name to the list of its captions
descriptions = all_img_captions(filename)
print("Length of descriptions =" , len(descriptions))

# Clean the captions. cleaning_text() edits the dict in place and returns it,
# so `descriptions` and `clean_descriptions` refer to the same cleaned mapping.
clean_descriptions = cleaning_text(descriptions)

# Build the vocabulary of unique words
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

# Save every cleaned caption to one file (one "<image>\t<caption>" per line)
save_descriptions(clean_descriptions, "/kaggle/working/Flickr8k_text/descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8763


## Extracting the feature vector from all images (TAKES TIME)

In [None]:
!ls "/kaggle/working/features.p"

ls: cannot access '/kaggle/working/features.p': No such file or directory


In [None]:
# Peek at the cached feature pickle if an earlier run left one behind.
# On a fresh kernel the file does not exist yet, so guard the load instead of
# failing before the extraction cell has had a chance to run.
features_cache_path = "/kaggle/working/features.p"
if os.path.exists(features_cache_path):
    # NOTE: pickle must only be loaded from files this notebook wrote itself.
    with open(features_cache_path, "rb") as cache_file:
        features = load(cache_file)
    print(len(features))
else:
    print("features.p not found - run the feature extraction cell first")

In [None]:
def extract_features(directory):
    """Run every file in ``directory`` through Xception and collect the outputs.

    Each image is resized to 299x299, scaled to the [-1, 1] range and passed
    through an Xception network built without its top layers and with average
    pooling.

    Args:
        directory: Folder whose entries are all readable image files.

    Returns:
        dict mapping each file name (as returned by ``os.listdir``) to the
        array returned by ``model.predict`` for that single image.
    """
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = os.path.join(directory, img)
        # The context manager releases the file handle once the pixels are read;
        # convert("RGB") guarantees three channels for grayscale/RGBA files.
        with Image.open(filename) as pil_image:
            pixels = np.asarray(pil_image.convert("RGB").resize((299, 299)), dtype="float32")
        batch = np.expand_dims(pixels, axis=0)
        # Manual scaling to [-1, 1] (used in place of preprocess_input)
        batch = batch / 127.5 - 1.0

        # verbose=0: tqdm already reports progress; avoid one log line per image
        features[img] = model.predict(batch, verbose=0)
    return features

# Feature extraction is slow, so reuse the pickle from a previous run when it
# exists. Delete the file to force a fresh extraction.
features_path = "/kaggle/working/features.p"
if os.path.exists(features_path):
    # NOTE: pickle must only be loaded from files this notebook wrote itself.
    with open(features_path, "rb") as features_file:
        features = load(features_file)
else:
    features = extract_features(dataset_images)
    # `with` makes sure the pickle is flushed and closed before it is read back
    with open(features_path, "wb") as features_file:
        dump(features, features_file)

## Loading dataset for Training the model

In [None]:
#load the data
def load_photos(filename):
    """Read a list of image names, one per line.

    Args:
        filename: Path of a text file with one image name per line.

    Returns:
        list of the non-empty lines (surrounding whitespace removed), in file
        order.
    """
    # Filter blank lines rather than dropping the last element, so the final
    # name survives when the file has no trailing newline.
    stripped_lines = (line.strip() for line in load_doc(filename).split("\n"))
    return [name for name in stripped_lines if name]


def load_clean_descriptions(filename, photos):
    """Load the cleaned captions of the requested images and add boundary markers.

    Args:
        filename: Path of the descriptions file; the first whitespace-separated
            token of each line is the image name, the rest is the caption.
        photos: Iterable of image names to keep.

    Returns:
        dict mapping each kept image name to a list of captions, each wrapped
        as ``'<start> ... <end>'``.
    """
    # Set membership is O(1); testing against the original list made the loop
    # cost proportional to len(photos) for every line of the file.
    wanted = set(photos)
    descriptions = {}
    for line in load_doc(filename).split("\n"):
        words = line.split()
        if len(words) < 1:
            continue

        image, image_caption = words[0], words[1:]
        if image in wanted:
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions.setdefault(image, []).append(desc)

    return descriptions


def load_features(photos, path="/kaggle/working/features.p"):
    """Load the pickled feature dictionary and keep only the requested images.

    Args:
        photos: Iterable of image names to select.
        path: Location of the pickled ``{image name: feature}`` dictionary.
            Defaults to the file written by the extraction step.

    Returns:
        dict with one entry per name in ``photos``.

    Raises:
        KeyError: If a requested name is missing from the pickle.
    """
    # NOTE: pickle must only be loaded from files this notebook wrote itself.
    with open(path, "rb") as features_file:
        all_features = load(features_file)
    # select only the needed features
    return {k: all_features[k] for k in photos}


# File listing the image names that belong to the training split
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"

# Image names, their cleaned captions (wrapped in '<start>' / '<end>') and
# their pre-computed features, all restricted to the training split
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("/kaggle/working/Flickr8k_text/descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

## Tokenizing the vocabulary

In [None]:
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    """Flatten a ``{image: [captions]}`` mapping into one list of captions.

    Captions keep the dictionary's iteration order and, within an image,
    their list order.
    """
    return [
        caption
        for caption_group in descriptions.values()
        for caption in caption_group
    ]

#creating tokenizer class
#this will vectorise text corpus
#each integer will represent token in dictionary

from tensorflow.keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    """Fit a Keras ``Tokenizer`` (default settings) on every caption.

    Args:
        descriptions: dict mapping an image name to a list of caption strings.

    Returns:
        The fitted tokenizer.
    """
    all_captions = dict_to_list(descriptions)
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(all_captions)
    return caption_tokenizer

# Give each word an index and store the fitted tokenizer in tokenizer.p
tokenizer = create_tokenizer(train_descriptions)
# `with` closes the pickle file so it is fully written before it is reloaded
with open('/kaggle/working/tokenizer.p', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
# +1 because the word indices in word_index start at 1; index 0 is the padding value
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7577

In [None]:
#calculate maximum length of descriptions
def _longest_caption_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``."""
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

# The helper has its own name so that binding the result to `max_length` no
# longer overwrites the function: previously re-running this cell raised
# "TypeError: 'int' object is not callable".
# NOTE(review): this measures `descriptions` (all cleaned captions, without the
# '<start>'/'<end>' markers) rather than `train_descriptions` - confirm intended.
max_length = _longest_caption_length(descriptions)
max_length

32

## Create data generator

In [None]:
#create input-output sequence pairs from the image description.

import tensorflow as tf

def data_generator(descriptions, features, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    Args:
        descriptions: dict mapping an image name to its list of captions.
        features: dict mapping an image name to its feature array; only the
            first row (``features[name][0]``) is used.
        tokenizer: Fitted tokenizer passed through to ``create_sequences``.
        max_length: Padded length of the input word sequences.

    Yields:
        ``((image_features, input_sequences), next_words)`` as tensors, built
        from all captions of a single image.
    """
    while True:
        for image_name, captions in descriptions.items():
            # first row of the stored feature array for this photo
            photo_vector = features[image_name][0]
            image_batch, sequence_batch, target_batch = create_sequences(
                tokenizer, max_length, captions, photo_vector
            )

            # numpy arrays -> tensors, grouped as (inputs, target)
            model_inputs = (
                tf.convert_to_tensor(image_batch),
                tf.convert_to_tensor(sequence_batch),
            )
            yield model_inputs, tf.convert_to_tensor(target_batch)

def create_sequences(tokenizer, max_length, desc_list, feature, num_classes=None):
    """Expand the captions of one image into (image, partial caption) -> next word samples.

    Every caption of length n yields n-1 samples: the first i words (padded to
    ``max_length``) as input and word i+1, one-hot encoded, as target.

    Args:
        tokenizer: Fitted tokenizer used to encode the captions.
        max_length: Length the input sequences are padded to.
        desc_list: List of caption strings for a single image.
        feature: Feature vector of that image, repeated for every sample.
        num_classes: Size of the one-hot target. Defaults to
            ``len(tokenizer.word_index) + 1``, the same formula the notebook
            uses for ``vocab_size``, so the function no longer depends on that
            global being defined.

    Returns:
        Tuple of numpy arrays ``(image_features, input_sequences, targets)``.
    """
    if num_classes is None:
        num_classes = len(tokenizer.word_index) + 1

    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=num_classes)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Check the shape of one batch of model inputs and outputs.
# Uses `train_features`, like the training loop, so the cell does not depend
# on the full `features` dict from the extraction step.
(a,b),c = next(data_generator(train_descriptions, train_features, tokenizer, max_length))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))

(TensorShape([47, 2048]), TensorShape([47, 32]), TensorShape([47, 7577]))

## Defining the CNN-RNN model

In [None]:
from tensorflow.keras.layers import Layer
import tensorflow as tf

class NotEqual(Layer):
    """Layer that marks which entries of its input differ from zero."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        # Element-wise `inputs != 0`; zeros are the padding value (masking)
        nonzero_mask = tf.math.not_equal(inputs, 0)
        return nonzero_mask

    def get_config(self):
        # No extra constructor arguments, so the base config is sufficient
        base_config = super().get_config()
        return base_config

In [None]:
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import add

# define the captioning model
def define_model(vocab_size, max_length):
    """Build and compile the merge-style captioning model.

    The image branch maps a 2048-d feature vector to 256 units, the text
    branch embeds the partial caption and runs it through an LSTM(256); both
    are added and decoded into a softmax over the vocabulary.

    Args:
        vocab_size: Number of output classes / embedding rows.
        max_length: Length of the padded input word sequence.

    Returns:
        The compiled Keras ``Model`` taking ``[image_features, sequence]``.
        As a side effect the summary is printed and the architecture diagram
        is written to ``model.png``.
    """
    # Feature Extractor
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Sequence Model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256, use_cudnn=False)(se2)

    # Decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)

    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Combine [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model


## TRAINING SECTION

In [None]:

# Train the model: report the dataset sizes first
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
print(model,'model')
epochs = 10
# The generator yields one batch per image, so one pass over the training
# set takes len(train_descriptions) steps.
steps = len(train_descriptions)
# NOTE(review): checkpoints are written under /kaggle/working/models6/ - confirm
# the folder exists (or is created by model.save) before a fresh run.
#os.mkdir("/content/drive/MyDrive/ML/models")
for i in range(epochs):
    # A fresh generator per epoch restarts the iteration from the first image
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    # One checkpoint per epoch: model_0.h5 ... model_9.h5
    model.save("/kaggle/working/models6/model_" + str(i) + ".h5")

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32


None
<Functional name=functional, built=True> model
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 35ms/step - accuracy: 0.1985 - loss: 4.9742
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 11ms/step - accuracy: 0.2881 - loss: 3.7119
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 11ms/step - accuracy: 0.3102 - loss: 3.3860
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 11ms/step - accuracy: 0.3217 - loss: 3.2064
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 11ms/step - accuracy: 0.3313 - loss: 3.0773
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 11ms/step - accuracy: 0.3384 - loss: 2.9872
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 11ms/step - accuracy: 0.3440 - loss: 2.9172
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 11ms/step - accuracy: 0.3504 - loss: 2.8621
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
steps

6000

In [None]:
!ls /kaggle/working/models6/

model_0.h5  model_2.h5	model_4.h5  model_6.h5	model_8.h5
model_1.h5  model_3.h5	model_5.h5  model_7.h5	model_9.h5


In [None]:
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Returns:
        The first matching word, or ``None`` when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)


In [None]:


def generate_desc(model, tokenizer, photo, max_length):
    """Greedily generate a caption for one photo, one word at a time.

    Starting from ``'start'``, the most probable next word is appended until
    ``'end'`` is produced, the predicted index has no word, or ``max_length``
    words have been added.

    Args:
        model: Trained captioning model taking ``[photo, sequence]``.
        tokenizer: Tokenizer used to encode the partial caption.
        photo: Feature array of the photo, fed to the model as is.
        max_length: Padded sequence length and maximum number of steps.

    Returns:
        The generated text, including the leading ``'start'`` and, when it was
        predicted, the trailing ``'end'``.
    """
    caption = 'start'
    for _ in range(max_length):
        # encode and pad the caption generated so far
        encoded = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        # pick the index with the highest predicted probability
        probabilities = model.predict([photo, padded], verbose=0)
        next_word = word_for_id(np.argmax(probabilities), tokenizer)
        if next_word is None:
            break
        caption = caption + ' ' + next_word
        if next_word == 'end':
            break
    return caption

In [None]:
tf.__version__

'2.17.1'

# Test the model using the following sample image and code

In [None]:
print("Train descriptions:", len(train_descriptions))
print("Train features:", len(train_features))


Train descriptions: 6000
Train features: 6000


In [None]:

# Local import: `plt` is used at the end of this cell but was never imported
import matplotlib.pyplot as plt

img = Image.open('/kaggle/working/Flickr8k_Dataset/Flicker8k_Dataset/111537222_07e56d5a30.jpg')


#path = 'Flicker8k_Dataset/111537222_07e56d5a30.jpg'
# Must match the sequence length the model was trained with
max_length = 32
with open("/kaggle/working/tokenizer.p", "rb") as tokenizer_file:
    tokenizer = load(tokenizer_file)

from tensorflow.keras.utils import get_custom_objects

# NOTE(review): the recorded run of this cell ended in a ValueError, presumably
# raised while loading the legacy .h5 checkpoint - confirm where it originates.
model = load_model(
    "/kaggle/working/models6/model_9.h5",
    custom_objects={'NotEqual': NotEqual},  # register the custom layer
    compile=False  # the model is only used for inference, no compile needed
)

xception_model = Xception(include_top=False, pooling="avg")

# extract_features() as defined in this notebook takes a directory, not an
# (image, model) pair, so the single image is preprocessed here the same way:
# 299x299 RGB, scaled to [-1, 1], with a leading batch axis.
photo_input = np.asarray(img.convert("RGB").resize((299, 299)), dtype="float32")
photo_input = np.expand_dims(photo_input, axis=0)
photo_input = photo_input / 127.5 - 1.0
photo = xception_model.predict(photo_input, verbose=0)


description = generate_desc(model, tokenizer, photo, max_length)
print("\n\n")
print(description)
plt.imshow(img)




ValueError: Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 0 (of type <class 'int'>)

In [None]:
!ls "/kaggle/working/Flickr8k_Dataset/Flicker8k_Dataset"

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Evaluate model performance
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    """Print corpus BLEU-1..4 of the generated captions against the references.

    Args:
        model: Trained captioning model passed to ``generate_desc``.
        descriptions: dict mapping an image name to its reference captions.
        photos: dict mapping an image name to its feature array.
        tokenizer: Tokenizer passed to ``generate_desc``.
        max_length: Maximum caption length passed to ``generate_desc``.
    """
    def _strip_markers(tokens, start_marker, end_marker):
        # Drop a leading start marker and a trailing end marker, if present
        if tokens and tokens[0] == start_marker:
            tokens = tokens[1:]
        if tokens and tokens[-1] == end_marker:
            tokens = tokens[:-1]
        return tokens

    actual, predicted = list(), list()

    # step over the whole set
    for key, desc_list in descriptions.items():
        # generate description
        yhat = generate_desc(model, tokenizer, photos[key], max_length)

        # References built by load_clean_descriptions carry '<start>'/'<end>',
        # while generate_desc emits 'start'/'end'. Those markers can never
        # match each other, so remove them from both sides before scoring.
        references = [_strip_markers(d.split(), '<start>', '<end>') for d in desc_list]
        actual.append(references)
        predicted.append(_strip_markers(yhat.split(), 'start', 'end'))

    # calculate BLEU score
    # NOTE(review): the BLEU-3 weights sum to 0.9 rather than 1 - confirm whether
    # (1/3, 1/3, 1/3, 0) was intended.
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

