In [3]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from underthesea import word_tokenize
import shap
from lime import lime_image
from sklearn.decomposition import PCA
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
def load_dataset(base_path='../dataset'):
    image_paths = []
    captions = []
    for img_name in os.listdir(f'{base_path}/images'):
        if img_name.endswith('.jpg'):
            image_path = f'{base_path}/images/{img_name}'
            caption_path = f'{base_path}/captions/{img_name.replace(".jpg", ".txt")}'

            with open(caption_path, 'r') as f:
                caption = f.read()

            image_paths.append(image_path)
            captions.append(caption)

    return image_paths, captions

image_paths, captions = load_dataset()
print(f'Loaded {len(image_paths)} images and {len(captions)} captions')

Loaded 296 images and 296 captions


In [5]:
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array_expanded = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array_expanded)

resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# PCA cho hình ảnh
pca = PCA(n_components=128) 

image_data = []
image_features = {}

for img_path in image_paths:
    preprocessed_img = preprocess_image(img_path)
    features = resnet.predict(preprocessed_img, verbose=0)
    image_data.append(features.flatten())

image_data = np.array(image_data)
image_data_pca = pca.fit_transform(image_data)

for i, img_path in enumerate(image_paths):
    image_id = img_path.split('/')[-1].split('.')[0]
    image_features[image_id] = image_data_pca[i]

images = list(image_features.keys())

In [6]:
# Tokenization và Padding cho tiếng Việt
tokenizer = Tokenizer(oov_token="<unk>")
captions_tokenized = [word_tokenize(caption) for caption in captions]
tokenizer.fit_on_texts(captions_tokenized)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(captions_tokenized)
max_length = max(len(s) for s in sequences)
captions_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

print(f'Vocab size: {vocab_size}, Max length: {max_length}')
# print samples
for i in range(3):
    print(f'Original: {captions[i]}')
    print(f'Tokenized: {captions_tokenized[i]}')
    print(f'Padded: {captions_padded[i]}')
    print()

Vocab size: 277, Max length: 128
Original: Nhũ ảnh ghi lại sự biến dạng của mô đệm của vú trái, có đường kính 15 mm, tổn thương có tiêu chí nghi ngờ trung gian.  Kết quả hình ảnh lành tính BiRads 2.
Tokenized: ['Nhũ', 'ảnh', 'ghi', 'lại', 'sự', 'biến dạng', 'của', 'mô đệm', 'của', 'vú', 'trái', ',', 'có', 'đường kính', '15', 'mm', ',', 'tổn thương', 'có', 'tiêu chí', 'nghi ngờ', 'trung gian', '.', 'Kết quả', 'hình ảnh', 'lành tính', 'BiRads', '2', '.']
Padded: [ 19  20 115 101  57 159  43 160  43   5  40   3   4  49 126  37   3  61
   4 127  30 145   2  18   6  12   7  17   2   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]

Original: Vôi

In [22]:
def build_model(vocab_size, max_length):
    # Image feature extractor layer
    inputs1 = Input(shape=(128,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Sequence processor layer
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder layer
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)  # Update output dimension

    # Tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

vocab_size = len(tokenizer.word_index) + 1
model = build_model(vocab_size, max_length)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 128, 256)             70912     ['input_9[0][0]']             
                                                                                                  
 dropout_6 (Dropout)         (None, 128)                  0         ['input_8[0][0]']             
                                                                                            

In [29]:
def data_generator(captions, image_features, tokenizer, max_length, batch_size):
    while True:
        for i in range(0, len(captions), batch_size):
            X1_batch = []
            X2_batch = []
            y_batch = []

            for j in range(i, min(i + batch_size, len(captions))):
                caption = captions[j]
                image_id = images[j]
                photo = image_features[image_id]

                # Convert caption to string if it's a numpy array
                if isinstance(caption, np.ndarray):
                    caption = ' '.join([tokenizer.index_word[idx] for idx in caption if idx != 0])

                # Tokenize caption
                seq = tokenizer.texts_to_sequences([caption])[0]

                for k in range(1, len(seq)):
                    in_seq, out_seq = seq[:k], seq[k]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1_batch.append(photo)
                    X2_batch.append(in_seq)
                    y_batch.append(out_seq)

            yield ([np.array(X1_batch), np.array(X2_batch)], np.array(y_batch))

In [30]:
from sklearn.model_selection import train_test_split

# Chia dataset thành tập huấn luyện và tập kiểm tra
X_train_img, X_test_img, X_train_seq, X_test_seq, y_train, y_test = train_test_split(image_data_pca, captions_padded, captions_padded, test_size=0.2, random_state=42)

print(f'Size of train set: {len(X_train_img)} images and {len(X_train_seq)} captions')
print(f'Size of test set: {len(X_test_img)} images and {len(X_test_seq)} captions')

Size of train set: 236 images and 236 captions
Size of test set: 60 images and 60 captions


In [31]:
# Thiết lập tham số
batch_size = 32
epochs = 2

# Huấn luyện mô hình
model.fit(
    data_generator(X_train_seq, image_features, tokenizer, max_length, batch_size),
    steps_per_epoch=len(X_train_seq)//batch_size,
    epochs=epochs,
    validation_data=([X_test_img, X_test_seq], y_test),
    validation_steps=len(X_test_seq)//batch_size
)

Epoch 1/2

ValueError: in user code:

    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/training.py", line 1972, in test_function  *
        return step_function(self, iterator)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/training.py", line 1956, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/training.py", line 1944, in run_step  **
        outputs = model.test_step(data)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/training.py", line 1852, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/nghiempt/Library/Python/3.8/lib/python/site-packages/keras/src/backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (60, 128) and (60, 277) are incompatible
