In [None]:
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Load the best model checkpoint.
model = load_model("../checkpoints/caption_model_best.h5")

# Load the vocabulary mappings and the sequence-length setting.
FEATURE_DIR = "../data/features"


def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only open files that this
    # project itself produced.
    with open(path, "rb") as handle:
        return pickle.load(handle)


word_to_idx = _read_pickle(f"{FEATURE_DIR}/word_to_idx.pkl")
idx_to_word = _read_pickle(f"{FEATURE_DIR}/idx_to_word.pkl")
max_length = _read_pickle(f"{FEATURE_DIR}/max_length.pkl")

# The +1 presumably reserves index 0 for padding -- TODO confirm against the
# notebook that built the vocabulary.
vocab_size = len(word_to_idx) + 1


In [None]:
# Quick sanity check of the loaded artifacts; a missing special token shows
# up as None because the lookup uses dict.get.
for label, value in (
    ("Vocab size:", vocab_size),
    ("Max length:", max_length),
    ("<start> idx:", word_to_idx.get("<start>")),
    ("<end> idx:", word_to_idx.get("<end>")),
):
    print(label, value)


In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# ImageNet-pretrained ResNet50 used purely as a fixed feature extractor:
# the classification head is left off and average pooling is requested so
# each image yields a single feature vector.
resnet = ResNet50(include_top=False, pooling="avg", weights="imagenet")

# Inference only: mark the backbone as non-trainable.
resnet.trainable = False


In [None]:
def extract_image_feature(img_path):
    """Return the ResNet50 feature vector for the image stored at ``img_path``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, passed through ``preprocess_input`` and then through the
    module-level ``resnet`` model.

    Args:
        img_path: Path of the image file to encode.

    Returns:
        The first (and only) row of the model's prediction for the
        one-image batch.
    """
    pixels = img_to_array(load_img(img_path, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return resnet.predict(batch, verbose=0)[0]   # (2048,)


In [None]:
def generate_caption(model, feature, word_to_idx, idx_to_word, max_length):
    """Greedily decode a caption for one image feature vector.

    Starting from the ``<start>`` token, the model is asked for the next
    token up to ``max_length`` times; at each step the highest-scoring index
    is taken. Decoding stops early when the predicted index has no entry in
    ``idx_to_word`` or when ``<end>`` is produced.

    The predicted indices are fed back to the model directly instead of
    re-tokenising the caption text on every step. Round-tripping through text
    silently split tokens containing whitespace and dropped tokens missing
    from ``word_to_idx``, so the decoder input could diverge from what the
    model had actually predicted.

    Args:
        model: Object whose ``predict`` accepts a dict with the keys
            ``"image_input"`` and ``"seq_input"`` and returns next-token scores.
        feature: Array holding the image feature; it is reshaped to one row.
        word_to_idx: Mapping from token string to integer index.
        idx_to_word: Mapping from integer index to token string.
        max_length: Width of the decoder input and maximum number of
            decoding steps.

    Returns:
        The generated caption as a space-separated string that begins with
        ``<start>`` and ends with ``<end>`` if the model produced it.
    """
    # Loop-invariant: the image batch never changes between decoding steps.
    image_batch = feature.reshape(1, -1)

    # Decoder input, zero-padded on the right. This matches what
    # pad_sequences(..., maxlen=max_length, padding="post") yields for
    # sequences that are not longer than max_length (int32, pad value 0).
    seq = np.zeros((1, max_length), dtype="int32")
    n_tokens = 0

    # Best effort, as before: if <start> is not in the vocabulary the
    # decoder simply begins from an all-padding input.
    start_idx = word_to_idx.get("<start>")
    if start_idx is not None and max_length > 0:
        seq[0, 0] = start_idx
        n_tokens = 1

    words = ["<start>"]
    for _ in range(max_length):
        yhat = model.predict(
            {"image_input": image_batch, "seq_input": seq},
            verbose=0,
        )
        yhat_idx = int(np.argmax(yhat))
        word = idx_to_word.get(yhat_idx)

        # An index without a word (e.g. one not present in idx_to_word)
        # ends decoding.
        if word is None:
            break

        words.append(word)
        if word == "<end>":
            break

        # Feed the prediction back as the next decoder input token.
        if n_tokens < max_length:
            seq[0, n_tokens] = yhat_idx
            n_tokens += 1

    return " ".join(words)


In [None]:
# Caption one example image: encode it, then greedily decode a caption.
img_path = "../data/images/example.jpg"
feature = extract_image_feature(img_path)
raw_caption = generate_caption(model, feature, word_to_idx, idx_to_word, max_length)

# The raw caption still carries the <start> marker that decoding begins with.
print("Raw caption:", raw_caption, sep="\n")


In [None]:
def clean_caption(caption):
    """Strip the ``<start>`` and ``<end>`` marker tokens from a caption.

    The caption is split on whitespace, every token equal to one of the two
    markers is discarded, and the remaining tokens are joined with single
    spaces.

    Args:
        caption: Caption string, possibly containing marker tokens.

    Returns:
        The caption without marker tokens.
    """
    markers = ("<start>", "<end>")
    kept = []
    for token in caption.split():
        if token in markers:
            continue
        kept.append(token)
    return " ".join(kept)

# Drop the marker tokens and show the caption as it would be presented.
final_caption = clean_caption(raw_caption)
print("Final caption:", final_caption, sep="\n")
