In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout, Add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from scipy.spatial.distance import cosine
import pickle
import os
import glob
from PIL import Image
from tqdm import tqdm

2024-10-27 15:24:38.160410: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load MSCOCO dataset - Replace this path with your MSCOCO path
dataset_path = "/Users/Aneesh/Downloads/captions_train2017.json"

# Preprocess captions
def preprocess_caption(caption):
    caption = caption.lower()
    caption = caption.replace("[^a-zA-Z]", " ")
    caption = "startseq " + caption + " endseq"
    return caption

# Load captions and preprocess
import json
with open(dataset_path, 'r') as f:
    captions_data = json.load(f)
    
captions = {}
for annot in captions_data['annotations']:
    img_id = annot['image_id']
    caption = preprocess_caption(annot['caption'])
    captions[img_id] = captions.get(img_id, []) + [caption]


In [3]:
# Load the InceptionV3 model and remove the last layer
inception_model = InceptionV3(weights='imagenet')
inception_model = Model(inception_model.input, inception_model.layers[-2].output)

def extract_features(filename, model):
    img = Image.open(filename)
    img = img.resize((299, 299))
    img = np.array(img)
    if img.shape == (299, 299, 3):
        img = np.expand_dims(img, axis=0)
        img = tf.keras.applications.inception_v3.preprocess_input(img)
        feature = model.predict(img, verbose=0)
        return feature.reshape(-1)
    return None

# Extract features for all images
image_dir = "/Users/Aneesh/Downloads/train2017"
features = {}
for img_file in tqdm(glob.glob(image_dir + '*.jpg')):
    img_id = int(os.path.basename(img_file).split('.')[0])
    features[img_id] = extract_features(img_file, inception_model)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
[1m96112376/96112376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


0it [00:00, ?it/s]


In [4]:
# Prepare tokenizer
all_captions = [cap for caps in captions.values() for cap in caps]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Save the tokenizer for future use
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in all_captions)

# Create sequences and pad them
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)


In [5]:
# Define LSTM and GRU models
def define_model(vocab_size, max_length, embedding_dim=256, rnn_type="LSTM"):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    if rnn_type == "LSTM":
        se3 = LSTM(256)(se2)
    else:
        se3 = GRU(256)(se2)
    
    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Create models for both LSTM and GRU
lstm_model = define_model(vocab_size, max_length, rnn_type="LSTM")
gru_model = define_model(vocab_size, max_length, rnn_type="GRU")


In [7]:
for img_id, cap_list in tqdm(captions.items()):
    if img_id in features:
        img_features = features[img_id]
        X1, X2, y = create_sequences(tokenizer, max_length, cap_list, img_features)
        X1train.extend(X1)
        X2train.extend(X2)
        ytrain.extend(y)
    else:
        print(f"Image ID {img_id} not found in features.")


 43%|█████████████▊                  | 51182/118287 [00:00<00:00, 258211.08it/s]

Image ID 203564 not found in features.
Image ID 322141 not found in features.
Image ID 16977 not found in features.
Image ID 106140 not found in features.
Image ID 571635 not found in features.
Image ID 301837 not found in features.
Image ID 315702 not found in features.
Image ID 189634 not found in features.
Image ID 472598 not found in features.
Image ID 162113 not found in features.
Image ID 126657 not found in features.
Image ID 285421 not found in features.
Image ID 71988 not found in features.
Image ID 193622 not found in features.
Image ID 459912 not found in features.
Image ID 548498 not found in features.
Image ID 52087 not found in features.
Image ID 119964 not found in features.
Image ID 230610 not found in features.
Image ID 226419 not found in features.
Image ID 218026 not found in features.
Image ID 352538 not found in features.
Image ID 202913 not found in features.
Image ID 202658 not found in features.
Image ID 79472 not found in features.
Image ID 346071 not found in 

 85%|██████████████████████████▍    | 100863/118287 [00:00<00:00, 227610.58it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
# Train the LSTM model
X1train, X2train, ytrain = [], [], []
for img_id, cap_list in tqdm(captions.items()):
    img_features = features[img_id]
    X1, X2, y = create_sequences(tokenizer, max_length, cap_list, img_features)
    X1train.extend(X1)
    X2train.extend(X2)
    ytrain.extend(y)

X1train, X2train, ytrain = np.array(X1train), np.array(X2train), np.array(ytrain)

lstm_model.fit([X1train, X2train], ytrain, epochs=20, batch_size=64)
gru_model.fit([X1train, X2train], ytrain, epochs=20, batch_size=64)


  0%|                                                | 0/118287 [00:00<?, ?it/s]


KeyError: 203564

In [None]:
def generate_caption(model, tokenizer, photo, max_length):
    in_text = 'startseq'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Test on sample images
for img_id in list(captions.keys())[:5]:
    caption = generate_caption(lstm_model, tokenizer, features[img_id].reshape(1, 2048), max_length)
    print("LSTM Caption:", caption)
    caption = generate_caption(gru_model, tokenizer, features[img_id].reshape(1, 2048), max_length)
    print("GRU Caption:", caption)


In [None]:
# Calculate BLEU and semantic distance
def evaluate_model(model, tokenizer, photos, captions, max_length):
    actual, predicted = [], []
    for key, desc_list in captions.items():
        y_pred = generate_caption(model, tokenizer, photos[key], max_length)
        references = [d.split() for d in desc_list]
        y_pred = y_pred.split()
        bleu = sentence_bleu(references, y_pred)
        actual.append(references)
        predicted.append(y_pred)
        print(f'BLEU: {bleu:.3f}')
