#### We are going to use the Flickr8k dataset, which is available on Kaggle: https://www.kaggle.com/datasets/adityajn105/flickr8k?resource=download

#### **Preprocess data**

In [1]:
import os
import numpy as np
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import preprocess_input

# Locations of the Flickr8k image directory and caption file.
# NOTE: both values are machine-specific absolute paths. Edit them to point at
# your local copy of the dataset before running this cell.
image_folder = r"C:\Applied AI\Advanced deep learning\Advanced_deep_learning\Advanced-deep-learning-labs\lab_3\Flickr8_data\Images" # directory holding the .jpg images -- CHANGE TO YOUR OWN PATH
caption_file = r"C:\Applied AI\Advanced deep learning\Advanced_deep_learning\Advanced-deep-learning-labs\lab_3\Flickr8_data\captions.txt" # caption text file -- CHANGE TO YOUR OWN PATH

# Load captions
def load_captions(filename):
    """Parse a Flickr8k caption file into a mapping of image id -> captions.

    Two line layouts are accepted:

    * CSV, as in the Kaggle ``captions.txt``: ``<image>.jpg,<caption>``,
      optionally preceded by an ``image,caption`` header row (skipped).
    * Whitespace-separated: ``<image>.jpg#<n> <caption>``.

    Blank lines are ignored, so an empty file yields an empty dict.

    Args:
        filename: Path to the caption file.

    Returns:
        dict mapping each image id (the file name up to its first ``.``) to
        the list of its captions in file order. Runs of whitespace inside a
        caption are collapsed to single spaces.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    captions = {}
    # Explicit encoding so parsing does not depend on the platform default;
    # the "-sig" variant also swallows a leading byte-order mark if present.
    with open(filename, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            # Header row of the CSV layout carries no caption data.
            if line.lower() == 'image,caption':
                continue
            head, comma, tail = line.partition(',')
            if comma and head.lower().endswith(image_suffixes):
                # CSV layout: everything after the first comma is the caption
                # (captions themselves may contain further commas).
                image_name, words = head, tail.split()
            else:
                # Whitespace layout: first token is the image reference.
                tokens = line.split()
                image_name, words = tokens[0], tokens[1:]
            image_id = image_name.split('.')[0]
            captions.setdefault(image_id, []).append(' '.join(words))
    return captions

# Parse the caption file into a dict of image id -> list of caption strings.
captions = load_captions(caption_file)

# Load images
def load_images(image_folder, captions):
    """Load every captioned image from disk as an RGB pixel array.

    Args:
        image_folder: Directory containing the ``<image_id>.jpg`` files.
        captions: Mapping (or other iterable) of image ids; only the ids
            are used to decide which files to read.

    Returns:
        dict mapping image id -> ``np.ndarray`` of the decoded image,
        converted to 3-channel RGB. All images are kept in memory at their
        original resolution.

    Raises:
        FileNotFoundError: propagated from ``Image.open`` when an expected
            image file is missing.
    """
    images = {}
    for image_id in captions:
        image_path = os.path.join(image_folder, image_id + '.jpg')
        # The context manager releases the file handle as soon as the pixels
        # are copied out. Converting to RGB guarantees a (height, width, 3)
        # array even for grayscale / palette / RGBA files, which is the
        # layout the 3-channel VGG16 preprocessing expects.
        with Image.open(image_path) as image:
            images[image_id] = np.array(image.convert('RGB'))
    return images

# Read the image file for every captioned image id into memory as a NumPy array.
images = load_images(image_folder, captions)




def preprocess_images(images):
    """Resize each image to 224x224 and apply VGG16 input preprocessing.

    Args:
        images: dict mapping image id -> pixel array accepted by
            ``Image.fromarray``.

    Returns:
        dict with the same keys, each value being the resized image after
        ``preprocess_input``.
    """
    def _prepare(pixels):
        # Resize through PIL, then hand the array to the VGG16 normaliser.
        resized = Image.fromarray(pixels).resize((224, 224))
        return preprocess_input(np.array(resized))

    return {image_id: _prepare(pixels) for image_id, pixels in images.items()}

# Resize and VGG16-normalise every loaded image.
images_preprocessed = preprocess_images(images)


# Word-level tokenizer: num_words caps how many of the most frequent words are
# kept when texts are converted to sequences; other words map to "<unk>".
tokenizer = Tokenizer(num_words=10000, oov_token="<unk>")
# Flatten the per-image caption lists into a single list for fitting.
all_captions = [caption for caption_list in captions.values() for caption in caption_list]
tokenizer.fit_on_texts(all_captions)
# For each image id, convert its captions into lists of integer word indices.
captions_seq = {image_id: tokenizer.texts_to_sequences(captions_list) for image_id, captions_list in captions.items()}

2024-05-08 14:28:20.586566: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Applied AI\\Advanced deep learning\\Advanced_deep_learning\\Advanced-deep-learning-labs\\lab_3\\Flickr8_data\\captions.txt'

#### **Here we create the model. The image encoder is a frozen VGG16 that takes inputs of shape (224, 224, 3) and uses ImageNet weights.**

#### **The image encoder is then combined with an LSTM caption encoder for image captioning.**



In [8]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate
from tensorflow.keras.applications import VGG16

def create_model(vocab_size, max_length):
    """Build and compile the two-input image-captioning network.

    The image branch runs a frozen, ImageNet-initialised VGG16 (without its
    classifier head) over a 224x224x3 input, global-average-pools the
    feature map and projects it with a 256-unit ReLU layer. The caption
    branch embeds the token sequence and encodes it with a 256-unit LSTM.
    Both encodings are concatenated and fed to a softmax over the vocabulary.

    Args:
        vocab_size: Number of embedding rows and width of the softmax output.
        max_length: Length of the caption token sequences fed to the model.

    Returns:
        A Keras ``Model`` with inputs ``[image, caption]``, compiled with
        categorical cross-entropy loss and the Adam optimizer.
    """
    hidden_units = 256
    image_shape = (224, 224, 3)

    # --- Image branch (layers are created in the same order as before so
    # that Keras' auto-generated layer names are unaffected) ---
    backbone = VGG16(include_top=False, input_shape=image_shape, weights='imagenet')
    backbone.trainable = False
    image_input = Input(shape=image_shape)
    feature_map = backbone(image_input)
    pooled = tf.keras.layers.GlobalAveragePooling2D()(feature_map)
    image_encoding = Dense(hidden_units, activation='relu')(pooled)

    # --- Caption branch ---
    caption_input = Input(shape=(max_length,))
    embedded = Embedding(vocab_size, hidden_units)(caption_input)
    caption_encoding = LSTM(units=hidden_units)(embedded)

    # --- Fuse both encodings and predict a distribution over the vocabulary ---
    merged = Concatenate()([image_encoding, caption_encoding])
    word_probs = Dense(vocab_size, activation='softmax')(merged)

    captioner = Model(inputs=[image_input, caption_input], outputs=word_probs)
    captioner.compile(optimizer='adam', loss='categorical_crossentropy')
    return captioner
