In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from nltk.translate.bleu_score import corpus_bleu
from underthesea import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import lime
import lime.lime_image
import numpy as np
import matplotlib.pyplot as plt
from skimage.segmentation import mark_boundaries
from PIL import Image

In [2]:
def load_dataset_train(base_path='../dataset/train'):
    """Load image paths and word-segmented captions for the training split.

    The directory layout read here is ``<base_path>/images/<name>.jpg`` with a
    matching ``<base_path>/captions/<name>.txt`` for every image.

    Args:
        base_path: Root directory of the split.

    Returns:
        A tuple ``(image_paths, captions)`` of two equally long lists. Entry
        ``k`` of ``captions`` is the output of
        ``word_tokenize(..., format="text")`` for the image at
        ``image_paths[k]``.

    Raises:
        FileNotFoundError: If an image has no matching caption file.
    """
    image_paths = []
    captions = []
    # Sort so that the image/caption order is reproducible across runs and
    # machines; os.listdir makes no ordering guarantee.
    for img_name in sorted(os.listdir(f'{base_path}/images')):
        if not img_name.endswith('.jpg'):
            continue

        image_path = f'{base_path}/images/{img_name}'
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".jpg" that happens to appear in the middle of the file name.
        caption_name = img_name[:-len('.jpg')] + '.txt'
        caption_path = f'{base_path}/captions/{caption_name}'

        # Captions are Vietnamese text: decode explicitly as UTF-8 instead of
        # relying on the platform default encoding.
        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read()

        # Tokenize Vietnamese captions
        caption_tokens = word_tokenize(caption, format="text")
        captions.append(caption_tokens)
        image_paths.append(image_path)

    return image_paths, captions

# Read the training split once; both lists are index-aligned (caption k belongs to image k).
image_paths, captions = load_dataset_train(base_path='../dataset/train')
n_images, n_captions = len(image_paths), len(captions)
print(f'Loaded {n_images} images and {n_captions} captions')

Loaded 288 images and 288 captions


In [3]:
# Fit the caption vocabulary and turn every caption into a list of word indices.
# NOTE(review): the default Tokenizer filters appear to include "_", which would
# split the compound words joined by word_tokenize(format="text") - confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
sequences = tokenizer.texts_to_sequences(captions)

# One extra slot on top of the fitted word_index entries.
vocab_size = 1 + len(tokenizer.word_index)

# Longest caption (in tokens) defines the common padded width.
max_seq_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_seq_length)

print(f'Vocab size: {vocab_size}, Max sequence length: {max_seq_length}')

Vocab size: 269, Max sequence length: 139


In [4]:
# CNN encoder that maps one 224x224 RGB image to a single pooled feature vector.
# This must be ResNet50 rather than VGG16: the images are normalised with
# resnet50.preprocess_input and the recorded feature matrix is 2048 wide, which is
# the ResNet50 pooled output size (VGG16 with pooling='avg' yields 512 values).
resnet = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

In [5]:
# Extract features from images
def extract_features(image_paths, batch_size=32):
    """Run every image through the notebook-level ``resnet`` encoder.

    Images are loaded at 224x224, normalised with ``resnet50.preprocess_input``
    and pushed through the encoder in batches. Batching avoids one ``predict``
    call (and one progress bar) per image.

    Args:
        image_paths: Iterable of image file paths.
        batch_size: Number of images encoded per ``predict`` call.

    Returns:
        ``np.ndarray`` with one entry per image and a singleton axis at
        position 1, i.e. ``(len(image_paths), 1, ...)`` followed by the
        encoder's per-image output shape. An empty input yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    image_paths = list(image_paths)
    if not image_paths:
        return np.array([])

    load_img = tf.keras.preprocessing.image.load_img
    img_to_array = tf.keras.preprocessing.image.img_to_array
    preprocess_input = tf.keras.applications.resnet50.preprocess_input

    feature_batches = []
    for start in range(0, len(image_paths), batch_size):
        chunk = image_paths[start:start + batch_size]
        batch = np.stack([
            img_to_array(load_img(img_path, target_size=(224, 224)))
            for img_path in chunk
        ])
        batch = preprocess_input(batch)
        # verbose=0 keeps the cell output free of one progress bar per call.
        feature_batches.append(resnet.predict(batch, verbose=0))

    features = np.concatenate(feature_batches, axis=0)
    # Keep the historical layout (one leading singleton axis per image) so that
    # downstream np.squeeze(..., axis=1) keeps working.
    return np.expand_dims(features, axis=1)

# Encode the whole training split; row k holds the feature of image_paths[k].
image_features = extract_features(image_paths=image_paths)



In [8]:
# Map caption words to integer ids with a freshly fitted Tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)

# Integer-encoded captions, one list of word ids per caption.
sequences = tokenizer.texts_to_sequences(captions)

# One more than the number of fitted words.
vocab_size = 1 + len(tokenizer.word_index)

# Generate sequences of input-output pairs for training
def create_sequences(sequences, max_seq_length, vocab_size, features=None):
    """Expand every caption into next-word samples paired with its own image.

    For a caption ``[w0, w1, ..., wn]`` this emits one sample per prefix:
    ``([w0] -> w1)``, ``([w0, w1] -> w2)``, and so on. Each sample carries the
    feature of the image the caption belongs to.

    Args:
        sequences: Integer-encoded captions; caption ``k`` describes image ``k``.
        max_seq_length: Width every input prefix is padded to by ``pad_sequences``.
        vocab_size: Number of classes of the one-hot encoded target word.
        features: Per-image features, index-aligned with ``sequences``.
            Defaults to the notebook-level ``image_features`` array.

    Returns:
        Tuple ``(X1, X2, y)`` of arrays with one row per sample: image feature,
        padded word prefix, and one-hot next word.

    Raises:
        ValueError: If there are fewer image features than captions.
    """
    if features is None:
        # Backward-compatible default: the array produced by extract_features.
        features = image_features
    if len(features) < len(sequences):
        raise ValueError(
            f'Got {len(sequences)} captions but only {len(features)} image features'
        )

    X1, X2, y = [], [], []
    for img_idx, seq in enumerate(sequences):
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_seq_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Index by caption/image position, not by the word position ``i``:
            # every prefix of one caption must share that caption's image.
            X1.append(features[img_idx])
            X2.append(in_seq)  # Sequence up to the current word
            y.append(out_seq)  # The next word
    return np.array(X1), np.array(X2), np.array(y)

# Expand the captions into (image feature, word prefix, next word) training samples.
X1, X2, y = create_sequences(sequences, max_seq_length, vocab_size)

# Drop the singleton axis at position 1 of the stacked image features.
X1 = X1.squeeze(axis=1)

In [9]:
# Sanity check: all three arrays must share the same number of samples (first axis).
print("X1 shape:", np.shape(X1))
print("X2 shape:", np.shape(X2))
print("y shape:", np.shape(y))

X1 shape: (10721, 2048)
X2 shape: (10721, 139)
y shape: (10721, 269)


In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
# tensorflow.keras.layers has no PositionalEncoding layer (importing it raises
# ImportError), so a small sinusoidal implementation is defined in this cell.
from tensorflow.keras.layers import Layer, MultiHeadAttention, LayerNormalization


class PositionalEncoding(Layer):
    """Add fixed sinusoidal position encodings to a (batch, seq_len, d_model) tensor."""

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.d_model = d_model
        positions = np.arange(max_len)[:, np.newaxis]
        dims = np.arange(d_model)[np.newaxis, :]
        angles = positions / np.power(10000.0, (2 * (dims // 2)) / d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # even channels
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd channels
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.encoding = angles[np.newaxis, ...].astype('float32')

    def call(self, inputs):
        return inputs + tf.cast(self.encoding, inputs.dtype)

    def get_config(self):
        config = super().get_config()
        config.update({'max_len': self.max_len, 'd_model': self.d_model})
        return config


# Take both input widths from the training arrays instead of hard-coding them, so
# the model always matches what model.fit receives (a fixed 100 did not match
# the padded width of X2).
feature_dim = X1.shape[1]
seq_length = X2.shape[1]

# Image branch: pooled CNN feature -> 256-d vector.
inputs1 = Input(shape=(feature_dim,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: word ids -> 256-d embeddings, one per position.
inputs2 = Input(shape=(seq_length,))
se1 = Embedding(vocab_size, 256)(inputs2)
se2 = Dropout(0.5)(se1)

def transformer_encoder(inputs):
    """Apply positional encoding, self-attention, a residual add and layer norm.

    Args:
        inputs: Tensor of shape (batch, seq_len, d_model).

    Returns:
        Tensor with the same shape as ``inputs``.
    """
    positional_encoding = PositionalEncoding(inputs.shape[1], inputs.shape[-1])(inputs)
    attention = MultiHeadAttention(num_heads=2, key_dim=256)(positional_encoding, positional_encoding)
    attention = Dropout(0.1)(attention)
    add_attention = Add()([attention, positional_encoding])
    norm_attention = LayerNormalization(epsilon=1e-6)(add_attention)
    return norm_attention

se3 = transformer_encoder(se2)

# Collapse the sequence axis so the text branch is (batch, 256) like the image
# branch and the softmax output matches y of shape (samples, vocab_size).
# create_sequences pads on the left (pad_sequences default), so the last position
# always holds the most recent word of the prefix.
se4 = se3[:, -1, :]

decoder1 = Add()([fe2, se4])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [10]:
# Train on (image feature, caption prefix) -> next word; the returned History is the cell's output.
model.fit(x=[X1, X2], y=y, batch_size=16, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2e8eb1520>