In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from nltk.translate.bleu_score import corpus_bleu
from underthesea import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import lime
import lime.lime_image
import numpy as np
import matplotlib.pyplot as plt
from skimage.segmentation import mark_boundaries
from PIL import Image

In [None]:
def load_dataset_train(base_path='../dataset/train'):
    """Load training image paths and their word-segmented captions.

    Every ``*.jpg`` file in ``{base_path}/images`` is paired with the text
    file of the same stem in ``{base_path}/captions``. Each caption is passed
    through ``word_tokenize(..., format="text")`` and wrapped in the
    ``startseq`` / ``endseq`` markers that ``generate_caption`` seeds with and
    stops on.

    Args:
        base_path: Directory containing the ``images`` and ``captions``
            sub-directories.

    Returns:
        A tuple ``(image_paths, captions)`` of two equally long lists, in
        sorted file-name order.

    Raises:
        FileNotFoundError: If a directory or a caption file is missing.
    """
    images_dir = f'{base_path}/images'
    captions_dir = f'{base_path}/captions'
    image_paths = []
    captions = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # image/caption order reproducible between runs and machines.
    for img_name in sorted(os.listdir(images_dir)):
        if not img_name.endswith('.jpg'):
            continue

        # Swap only the extension (str.replace would rewrite every ".jpg"
        # occurrence in the name).
        stem = img_name[:-len('.jpg')]
        caption_path = f'{captions_dir}/{stem}.txt'

        # Captions are Vietnamese text: read them as UTF-8 instead of relying
        # on the platform default encoding.
        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read()

        # Tokenize Vietnamese captions
        caption_tokens = word_tokenize(caption, format="text")

        # Without these markers the tokenizer never sees 'startseq' and the
        # model can never predict 'endseq'. Captions that already carry the
        # start marker are left untouched.
        if caption_tokens.split()[:1] != ['startseq']:
            caption_tokens = f'startseq {caption_tokens} endseq'

        captions.append(caption_tokens)
        image_paths.append(f'{images_dir}/{img_name}')

    return image_paths, captions

image_paths, captions = load_dataset_train()
print(f'Loaded {len(image_paths)} images and {len(captions)} captions')

In [None]:
# Build the caption vocabulary and integer-encode every caption.
# NOTE(review): Keras' Tokenizer strips '_' through its default `filters`;
# if word_tokenize joins multi-syllable words with underscores, that joining
# is undone here - confirm whether this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)

# word_index starts at 1, so one extra slot is needed for index 0.
vocab_size = 1 + len(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(captions)
max_seq_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_seq_length)

print(f'Vocab size: {vocab_size}, Max sequence length: {max_seq_length}')

In [None]:
# ImageNet-pretrained ResNet50 without its classification head, used as a
# fixed image encoder; pooling='avg' applies global average pooling to the
# final feature map.
resnet = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
    input_shape=(224, 224, 3),
)

In [None]:
# Extract features from images
def extract_features(image_paths, batch_size=32):
    """Encode images with the module-level ``resnet`` model.

    Images are loaded at 224x224, run through the ResNet50 preprocessing and
    encoded in batches. One ``predict`` call per batch replaces the earlier
    one call per image, and ``verbose=0`` keeps the cell output free of
    per-call progress bars.

    Args:
        image_paths: List of image file paths.
        batch_size: Number of images encoded per ``predict`` call.

    Returns:
        Array of shape ``(len(image_paths), 1, D)`` where ``D`` is the encoder
        output size. The singleton axis is kept because downstream code
        squeezes or reshapes it away.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    keras_image = tf.keras.preprocessing.image
    feature_batches = []
    for start in range(0, len(image_paths), batch_size):
        batch = np.stack([
            keras_image.img_to_array(
                keras_image.load_img(img_path, target_size=(224, 224))
            )
            for img_path in image_paths[start:start + batch_size]
        ])
        batch = tf.keras.applications.resnet50.preprocess_input(batch)
        feature_batches.append(resnet.predict(batch, verbose=0))

    if not feature_batches:
        # No images: return an empty array that still has the usual rank.
        return np.empty((0, 1, resnet.output_shape[-1]), dtype=np.float32)

    features = np.concatenate(feature_batches, axis=0)
    # Re-insert the per-image batch axis of size 1.
    return features[:, np.newaxis, :]

image_features = extract_features(image_paths)

In [None]:
# Caption model: two inputs whose 256-dimensional encodings are summed and
# decoded into a distribution over the vocabulary (the next word).

# Image branch: 2048-dimensional feature vector -> dropout -> 256-unit projection.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word-id sequence -> embedding -> dropout -> LSTM.
# mask_zero=True makes the embedding treat id 0 as padding to be masked.
inputs2 = Input(shape=(max_seq_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Decoder: element-wise sum of both branches -> dense -> softmax over vocab_size.
decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [None]:
# Categorical cross-entropy matches the softmax output layer of the model;
# it expects one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Convert words to integers using Tokenizer
# NOTE(review): these four statements repeat the tokenizer fitting already done
# in the earlier tokenization cell on the same `captions`; they rebind
# `tokenizer`, `vocab_size` and `sequences` but not `max_seq_length`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(captions)

# Generate sequences of input-output pairs for training
def create_sequences(sequences, max_seq_length, vocab_size, features=None):
    """Expand encoded captions into (image, partial caption) -> next-word pairs.

    For a caption ``[w0, w1, ..., wn]`` one sample is produced per prefix:
    input ``[w0..w(i-1)]`` (left-padded to ``max_seq_length``) with target
    ``wi`` one-hot encoded over ``vocab_size`` classes. Every sample of a
    caption is paired with the feature vector of that caption's image.

    Args:
        sequences: List of integer-encoded captions, aligned by position with
            ``features``.
        max_seq_length: Length every input sequence is padded to.
        vocab_size: Number of classes of the one-hot target.
        features: Per-image feature array aligned with ``sequences``.
            Defaults to the module-level ``image_features``.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays: image features, padded input
        sequences and one-hot targets, one row per training pair.

    Raises:
        ValueError: If there are fewer feature rows than captions.
    """
    if features is None:
        features = image_features
    if len(features) < len(sequences):
        raise ValueError(
            f'Got {len(sequences)} captions but only {len(features)} image features'
        )

    X1, X2, y = [], [], []
    for img_idx, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            in_seq = pad_sequences([seq[:i]], maxlen=max_seq_length)[0]
            out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
            # Index by caption (img_idx), not by word position (i): every
            # prefix of a caption belongs to the same image.
            X1.append(features[img_idx])
            X2.append(in_seq)  # Sequence up to the current word
            y.append(out_seq)  # The next word
    return np.array(X1), np.array(X2), np.array(y)

# Create input-output sequences
X1, X2, y = create_sequences(sequences, max_seq_length, vocab_size)

# Each stored feature has a leading axis of size 1; drop it so X1 is 2-D.
X1 = np.squeeze(X1, axis=1)

In [None]:
# Report the shapes of the three training arrays.
for label, array in (("X1 shape:", X1), ("X2 shape:", X2), ("y shape:", y)):
    print(label, array.shape)

In [None]:
def load_test_dataset(base_path='../dataset_test'):
    """Load test image paths and their raw reference captions.

    Every ``*.jpg`` file in ``{base_path}/images`` is paired with the text
    file of the same stem in ``{base_path}/captions``. Captions are returned
    exactly as stored in the file.

    Args:
        base_path: Directory containing the ``images`` and ``captions``
            sub-directories.

    Returns:
        A tuple ``(test_image_paths, test_captions)`` of two equally long
        lists, in sorted file-name order.

    Raises:
        FileNotFoundError: If a directory or a caption file is missing.
    """
    images_dir = f'{base_path}/images'
    captions_dir = f'{base_path}/captions'
    test_image_paths = []
    test_captions = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # image/caption order reproducible between runs and machines.
    for img_name in sorted(os.listdir(images_dir)):
        if not img_name.endswith('.jpg'):
            continue

        # Swap only the extension (str.replace would rewrite every ".jpg"
        # occurrence in the name).
        stem = img_name[:-len('.jpg')]
        caption_path = f'{captions_dir}/{stem}.txt'

        # Captions are Vietnamese text: read them as UTF-8 instead of relying
        # on the platform default encoding.
        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read()

        test_image_paths.append(f'{images_dir}/{img_name}')
        test_captions.append(caption)

    return test_image_paths, test_captions

# Load the test image paths and captions from the default location and
# report how many of each were found.
test_image_paths, test_captions = load_test_dataset()
print(f'Loaded {len(test_image_paths)} images and {len(test_captions)} captions')

In [None]:
# NOTE(review): same call as in the previous cell; it re-reads the test set
# from disk and rebinds the same two names.
test_image_paths, test_captions = load_test_dataset()

# Function to map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the vocabulary word whose index is ``integer``.

    Uses the tokenizer's ``index_word`` reverse mapping when it is available
    and populated, which avoids scanning the whole vocabulary on every
    decoding step. Falls back to a linear scan of ``word_index`` otherwise.

    Args:
        integer: Word index to look up.
        tokenizer: Object exposing ``word_index`` (word -> index) and,
            optionally, ``index_word`` (index -> word).

    Returns:
        The matching word, or ``None`` if no word has that index.
    """
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)

    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# Greedy caption decoding for a single image
def generate_caption(model, photo, tokenizer, max_length):
    """Decode a caption word by word, always taking the most probable word.

    Starts from the 'startseq' marker and performs at most ``max_length``
    prediction steps. Decoding stops early when the predicted index maps to
    no word or when 'endseq' is produced.

    Args:
        model: Model called as ``model.predict([photo, sequence])``.
        photo: Image feature input passed to the model unchanged.
        tokenizer: Tokenizer used to encode the text and look up words.
        max_length: Padding length and maximum number of decoding steps.

    Returns:
        The generated text, including 'startseq' and, if it was produced,
        'endseq'.
    """
    words = ['startseq']
    steps = 0
    while steps < max_length:
        steps += 1
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        probabilities = model.predict([photo, padded], verbose=0)
        next_word = word_for_id(np.argmax(probabilities), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)

In [None]:
from keras.callbacks import Callback

class BLEUEvaluator(Callback):
    """Keras callback that prints the corpus BLEU score after every epoch.

    Captions are generated for the *test* images with the model being trained
    (``self.model``, set by Keras) and compared against the test captions.
    Test image features are computed once, on first use, with
    ``extract_features`` unless they are passed in.

    Args:
        test_image_paths: Paths of the test images.
        test_captions: Reference captions aligned with ``test_image_paths``.
        tokenizer: Tokenizer used to encode and decode captions.
        max_seq_length: Padding length / maximum caption length.
        test_features: Optional precomputed features, one entry per test
            image, each reshapeable to a single row.
    """

    # Decoder control tokens that must not count as caption words.
    _MARKERS = ('startseq', 'endseq')

    def __init__(self, test_image_paths, test_captions, tokenizer, max_seq_length,
                 test_features=None):
        super().__init__()
        self.test_image_paths = test_image_paths
        self.test_captions = test_captions
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.test_features = test_features

    def on_epoch_end(self, epoch, logs=None):
        """Generate captions for the test images and print the BLEU score."""
        if not self.test_image_paths:
            print("BLEU Score after epoch {}: skipped (no test images)".format(epoch+1))
            return

        # Encode the test images themselves; the module-level image_features
        # belong to the training images.
        if self.test_features is None:
            self.test_features = extract_features(self.test_image_paths)

        predicted_captions = []
        for features in self.test_features:
            photo = np.reshape(features, (1, -1))
            caption = generate_caption(self.model, photo, self.tokenizer, self.max_seq_length)
            predicted_captions.append(caption)

        references = [[caption.split()] for caption in self.test_captions]
        # Drop the start/end markers so they are not scored as caption words.
        hypotheses = [
            [token for token in caption.split() if token not in self._MARKERS]
            for caption in predicted_captions
        ]

        bleu_score = corpus_bleu(references, hypotheses)
        print("BLEU Score after epoch {}: {}".format(epoch+1, bleu_score))

# Callback that reports BLEU on the test data at the end of every epoch.
bleu_evaluator = BLEUEvaluator(test_image_paths, test_captions, tokenizer, max_seq_length)

# Train for 2 epochs on the (image feature, input sequence) -> next-word pairs.
model.fit([X1, X2], y, epochs=2, batch_size=32, callbacks=[bleu_evaluator])

In [None]:
# Explain, with LIME, which image regions drive the caption model's prediction
# of the first word after 'startseq'.
# Relative to the notebook, like the dataset loaders, instead of a
# user-specific absolute path.
# NOTE(review): assumes the notebook sits next to `dataset_test` - confirm.
image_path = '../dataset_test/images/20586908_6c613a14b80a8591_MG_R_CC_ANON.jpg'

# LIME needs a 3-channel image, and the ResNet50 encoder was built for
# 224x224x3 input.
image = Image.open(image_path).convert('RGB').resize((224, 224))
image_data = np.array(image)

explainer = lime.lime_image.LimeImageExplainer()

# Text input fed to the caption model for every perturbed image: just the
# start marker, padded to the model's sequence length.
seed_sequence = pad_sequences(tokenizer.texts_to_sequences(['startseq']), maxlen=max_seq_length)

def predict_fn(images):
    """Return next-word probabilities for a batch of raw RGB images.

    The caption model takes [image features, word sequence], not pixels, so
    the images are first encoded with the ResNet50 encoder and paired with
    the seed sequence.
    """
    # Copy to float32 so preprocessing never touches LIME's own arrays.
    batch = np.array(images, dtype=np.float32)
    batch = tf.keras.applications.resnet50.preprocess_input(batch)
    features = resnet.predict(batch, verbose=0)
    sequences = np.repeat(seed_sequence, len(features), axis=0)
    return model.predict([features, sequences], verbose=0)

explanation = explainer.explain_instance(image_data, predict_fn, top_labels=5, hide_color=0, num_samples=1000)

# Show the explanation
temp, mask = explanation.get_image_and_mask(explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=True)
# The image is in the 0-255 range, so scale to [0, 1] for display.
img_boundry1 = mark_boundaries(temp / 255.0, mask)
plt.imshow(img_boundry1)
plt.show()