In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt

In [9]:
def load_dataset(base_path='Inbreast'):
    image_paths = []
    captions = []
    for img_name in os.listdir(f'{base_path}/image'):
        if img_name.endswith('.jpg'):
            image_path = f'{base_path}/image/{img_name}'
            caption_path = f'{base_path}/caption/{img_name.replace(".jpg", ".txt")}'

            with open(caption_path, 'r') as f:
                caption = f.read()

            image_paths.append(image_path)
            captions.append(caption)

    return image_paths, captions

image_paths, captions = load_dataset()

In [10]:
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array_expanded = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array_expanded)

resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [11]:
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
tokenizer.fit_on_texts(captions)
sequences = tokenizer.texts_to_sequences(captions)
max_length = max(len(s) for s in sequences)
captions_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

In [12]:
def build_model(vocab_size, max_length):
    # Image feature extractor layer
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Sequence processor layer
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder layer
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

vocab_size = len(tokenizer.word_index) + 1
model = build_model(vocab_size, max_length)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 139)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 139, 256)             70656     ['input_3[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 2048)                 0         ['input_2[0][0]']             
                                                                                              

In [13]:
image_features = {}
for img_path in image_paths:
    preprocessed_img = preprocess_image(img_path)
    features = resnet.predict(preprocessed_img, verbose=0)
    image_id = img_path.split('/')[-1].split('.')[0]
    image_features[image_id] = features

images = list(image_features.keys())

In [14]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

def data_generator(captions, image_features, tokenizer, max_length, batch_size):
    X1, X2, y = list(), list(), list()
    n=0
    while 1:
        for i, caption in enumerate(captions):
            n+=1
            image_id = images[i]
            photo = image_features[image_id][0]
            seq = tokenizer.texts_to_sequences([caption])[0]

            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                X1.append(photo)
                X2.append(in_seq)
                y.append(out_seq)

            if n == batch_size:
                yield [[np.array(X1), np.array(X2)], np.array(y)]
                X1, X2, y = list(), list(), list()
                n=0

In [15]:
batch_size = 1
steps = len(captions) // batch_size

for i in range(2):
    generator = data_generator(captions, image_features, tokenizer, max_length, batch_size)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)




In [18]:
pip install rouge

Defaulting to user installation because normal site-packages is not writeable
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [19]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
import numpy as np

In [20]:
def generate_caption(image_path, model, tokenizer, max_length):
    photo = preprocess_image(image_path)
    photo_feature = resnet.predict(photo, verbose=0)

    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_feature, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text

In [None]:
def evaluate_model(model, captions, image_features, tokenizer, max_length):
    actual, predicted = list(), list()
    rouge = Rouge()

    for i, caption in enumerate(captions):
        image_id = images[i]
        image_path = image_paths[i]

        yhat = generate_caption(image_path, model, tokenizer, max_length)

        references = [caption.split()]
        actual.append(references)
        predicted.append(yhat.split())

    bleu_score = corpus_bleu(actual, predicted)
    print(f'BLEU Score: {bleu_score}')

    actual_rouge = [' '.join(a[0]) for a in actual]
    predicted_rouge = [' '.join(p) for p in predicted]

    rouge_score = rouge.get_scores(predicted_rouge, actual_rouge, avg=True)
    print(f'ROUGE Score: {rouge_score}')

evaluate_model(model, captions, image_features, tokenizer, max_length)

In [40]:
pip install py-rouge

Defaulting to user installation because normal site-packages is not writeable
Collecting py-rouge
  Downloading py_rouge-1.1-py3-none-any.whl.metadata (8.7 kB)
Downloading py_rouge-1.1-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m942.3 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: py-rouge
Successfully installed py-rouge-1.1
Note: you may need to restart the kernel to use updated packages.


In [41]:
from sklearn.model_selection import train_test_split

# Tách dữ liệu huấn luyện thành dữ liệu huấn luyện và dữ liệu kiểm tra
train_image_paths, test_image_paths, train_captions, test_captions = train_test_split(image_paths, captions, test_size=0.2, random_state=42)

# Sử dụng train_image_paths và train_captions để huấn luyện mô hình
# Sử dụng test_image_paths và test_captions để kiểm tra mô hình
print(test_image_paths)
print(test_captions)

['Inbreast/image/50998113_66adfbb4f19c76d2_MG_L_CC_ANON.jpg', 'Inbreast/image/22613918_f23fa352e7de3dc7_MG_R_CC_ANON.jpg', 'Inbreast/image/53587131_7b71aa9928e6975e_MG_L_CC_ANON.jpg', 'Inbreast/image/53582395_3f0db31711fc9795_MG_L_ML_ANON.jpg', 'Inbreast/image/50999273_cb65e8dac169f596_MG_R_ML_ANON.jpg', 'Inbreast/image/20587544_d571b5880ad2a016_MG_R_CC_ANON.jpg', 'Inbreast/image/50997823_cbb6c98a81e69eeb_MG_R_CC_ANON.jpg', 'Inbreast/image/51049682_6f64793857feb5d0_MG_L_CC_ANON.jpg', 'Inbreast/image/53586361_dda3c6969a34ff8e_MG_L_ML_ANON.jpg', 'Inbreast/image/22579870_301f1776aebbf5d2_MG_L_CC_ANON.jpg', 'Inbreast/image/24055203_606e9b184978a350_MG_L_CC_ANON.jpg', 'Inbreast/image/22678980_b9a4da5f2dae63a9_MG_L_CC_ANON.jpg', 'Inbreast/image/24055502_ac3185e18ffdc7b6_MG_R_CC_ANON.jpg', 'Inbreast/image/24065407_83db89f57aea498a_MG_R_ML_ANON.jpg', 'Inbreast/image/22670511_7e677f3d530e41ed_MG_L_ML_ANON.jpg', 'Inbreast/image/20587054_b6a4f750c6df4f90_MG_R_CC_ANON.jpg', 'Inbreast/image/5099680

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from tensorflow.keras.preprocessing.sequence import pad_sequences
from underthesea import word_tokenize

# Tính BLEU Score
def evaluate_bleu(model, image_features, test_captions_padded):
    references = [[caption.split() for caption in captions] for captions in test_captions]
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text.split())

    return corpus_bleu(references, predictions)

# Tính METEOR Score
def evaluate_meteor(model, image_features, test_captions_padded):
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text)

    return meteor_score([caption for caption in test_captions], predictions)

# Tính ROUGE Score
def evaluate_rouge(model, image_features, test_captions_padded):
    rouge = Rouge()
    references = [' '.join(caption) for caption in test_captions_tokenized]
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text)

    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

# Tính CIDEr Score
def evaluate_cider(model, image_features, test_captions_padded):
    references = [' '.join(caption) for caption in test_captions_tokenized]
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text)

    cider_scorer = Cider()
    cider_score, _ = cider_scorer.compute_score(references, predictions)

    return cider_score

# Chuẩn bị dữ liệu cho đánh giá
test_captions_tokenized = [word_tokenize(caption) for caption in test_captions]
test_sequences = tokenizer.texts_to_sequences(test_captions_tokenized)
test_captions_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Đánh giá mô hình
bleu_score = evaluate_bleu(model, image_features, test_captions_padded)
meteor_score = evaluate_meteor(model, image_features, test_captions_padded)
rouge_scores = evaluate_rouge(model, image_features, test_captions_padded)
# cider_score = evaluate_cider(model, image_features, test_captions_padded)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor_score)
print("ROUGE Scores:", rouge_scores)
# print("CIDEr Score:", cider_score)


In [44]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
from sklearn.model_selection import train_test_split

image_paths_train, image_paths_test, captions_train, captions_test = train_test_split(image_paths, captions_padded, test_size=0.2, random_state=42)

# Tạo generator cho tập huấn luyện và tập kiểm tra
batch_size = 32
train_steps = len(captions_train) // batch_size
test_steps = len(captions_test) // batch_size

train_generator = data_generator(captions_train, image_features, tokenizer, max_length, batch_size)
test_generator = data_generator(captions_test, image_features, tokenizer, max_length, batch_size)

# # Huấn luyện mô hình
# epochs = 10
# history = model.fit(train_generator, epochs=epochs, steps_per_epoch=train_steps, validation_data=test_generator, validation_steps=test_steps)

# Đánh giá mô hình trên tập kiểm tra
loss, accuracy = model.evaluate(test_generator, steps=test_steps)
print("Loss on test set:", loss)
print("Accuracy on test set:", accuracy)

# Vẽ biểu đồ loss và accuracy
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [38]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
# from nltk.translate.cider import Cider
from nltk.tokenize import word_tokenize

# Chuẩn bị dữ liệu cho đánh giá
test_captions_tokenized = [word_tokenize(caption) for caption in test_captions]
test_sequences = tokenizer.texts_to_sequences(test_captions_tokenized)
test_captions_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Tính BLEU Score
def evaluate_bleu(model, image_features, test_captions_padded):
    references = [[caption.split() for caption in captions] for captions in test_captions_tokenized]
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text.split())

    return corpus_bleu(references, predictions)

# Tính METEOR Score
def evaluate_meteor(model, image_features, test_captions_padded):
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text)

    return meteor_score([caption for caption in test_captions], predictions)

# Tính ROUGE Score
def evaluate_rouge(model, image_features, test_captions_padded):
    rouge = Rouge()
    references = [' '.join(caption) for caption in test_captions_tokenized]
    predictions = []

    for i, image_path in enumerate(test_image_paths):
        image_id = image_path.split('/')[-1].split('.')[0]
        photo = np.array([image_features[image_id]])
        in_text = '<start>'
        for _ in range(max_length):
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            sequence = pad_sequences([sequence], maxlen=max_length)
            yhat = model.predict([photo, sequence], verbose=0)
            yhat = np.argmax(yhat)
            word = word_for_id(yhat, tokenizer)
            if word is None:
                break
            in_text += ' ' + word
            if word == '<end>':
                break
        predictions.append(in_text)

    scores = rouge.get_scores(predictions, references, avg=True)
    return scores

# Tính CIDEr Score
# def evaluate_cider(model, image_features, test_captions_padded):
#     references = [' '.join(caption) for caption in test_captions_tokenized]
#     predictions = []

#     for i, image_path in enumerate(test_image_paths):
#         image_id = image_path.split('/')[-1].split('.')[0]
#         photo = np.array([image_features[image_id]])
#         in_text = '<start>'
#         for _ in range(max_length):
#             sequence = tokenizer.texts_to_sequences([in_text])[0]
#             sequence = pad_sequences([sequence], maxlen=max_length)
#             yhat = model.predict([photo, sequence], verbose=0)
#             yhat = np.argmax(yhat)
#             word = word_for_id(yhat, tokenizer)
#             if word is None:
#                 break
#             in_text += ' ' + word
#             if word == '<end>':
#                 break
#         predictions.append(in_text)

#     cider_scorer = Cider()
#     cider_score, _ = cider_scorer.compute_score(references, predictions)

#     return cider_score

# Đánh giá mô hình
bleu_score = evaluate_bleu(model, image_features, test_captions_padded)
meteor_score = evaluate_meteor(model, image_features, test_captions_padded)
rouge_scores = evaluate_rouge(model, image_features, test_captions_padded)
# cider_score = evaluate_cider(model, image_features, test_captions_padded)

print("BLEU Score:", bleu_score)
print("METEOR Score:", meteor_score)
print("ROUGE Scores:", rouge_scores)
# print("CIDEr Score:", cider_score)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/nghiempt/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/share/nltk_data'
    - '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************
