In [None]:
import numpy as np
import zipfile
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split, KFold
import pickle
from PIL import Image
import os
from tensorflow.keras.applications.vgg16 import preprocess_input

# Extract the image archive once; later runs reuse the already-extracted folder.
if not os.path.exists("temp_images"):
    # Validate the archive up front so a missing, corrupt, or mislabelled file
    # fails with an actionable message instead of a bare "File is not a zip file".
    if not os.path.isfile("Images.zip"):
        raise FileNotFoundError("Images.zip was not found in the working directory.")
    if not zipfile.is_zipfile("Images.zip"):
        raise zipfile.BadZipFile(
            "Images.zip is not a valid ZIP archive; re-download or re-upload it."
        )
    with zipfile.ZipFile("Images.zip", "r") as zip_ref:
        zip_ref.extractall("temp_images")

# Read one caption per line of captions.txt, skipping blank lines so that a
# trailing newline does not turn into an empty caption.
# NOTE(review): every line is used verbatim as a caption. If the file has a
# header row or an "image,caption" layout, it needs to be parsed first - confirm.
with open("captions.txt", "r", encoding="utf-8") as f:
    captions = [line.strip() for line in f if line.strip()]

# Define constants
vocab_size = 5000  # Passed to Tokenizer(num_words=...) and used as the softmax width
max_length = 30  # Captions are padded/truncated to this many tokens

# Extract features using VGG16
_vgg16_feature_model = None  # built lazily on first use, then shared by every call


def _get_vgg16_feature_model():
    """Return the headless VGG16 feature extractor, building it only once."""
    global _vgg16_feature_model
    if _vgg16_feature_model is None:
        vgg = VGG16(weights='imagenet', include_top=False)
        _vgg16_feature_model = Model(inputs=vgg.input, outputs=vgg.output)
    return _vgg16_feature_model


def extract_features_from_image(image_path):
    """Return the flattened VGG16 (include_top=False) features of one image.

    Args:
        image_path: Path of an image file readable by PIL.

    Returns:
        A 1-D NumPy array holding the flattened output of the convolutional
        VGG16 network for the image resized to 224x224.

    Raises:
        Whatever PIL raises when the file is missing or cannot be decoded.
    """
    # Re-creating VGG16 (and reloading its weights) for every image is by far
    # the most expensive step, so the network is cached across calls.
    model = _get_vgg16_feature_model()

    # The context manager closes the file handle; convert() forces the pixel
    # data to be loaded before the file is closed.
    with Image.open(image_path) as raw:
        img = raw.convert('RGB').resize((224, 224))

    img = np.asarray(img, dtype=np.float32)
    img = np.expand_dims(img, axis=0)  # Add batch dimension
    img = preprocess_input(img)  # Preprocess for VGG16

    features = model.predict(img, verbose=0)
    return features.flatten()

image_features = []
image_filenames = []

# Check if 'temp_images' directory contains files
if not os.listdir("temp_images"):
    raise ValueError("The 'temp_images' directory is empty. Please ensure it contains image files.")

# Path to the subfolder containing images
images_folder = os.path.join("temp_images", "Images")
if not os.path.isdir(images_folder):
    # Fail with a clear message instead of an opaque error from os.listdir.
    raise FileNotFoundError(
        f"Expected image folder '{images_folder}' does not exist. "
        "Check the folder layout inside Images.zip."
    )

# os.listdir returns entries in arbitrary order; sorting makes the feature rows
# reproducible between runs, which matters because images are later paired with
# captions by position.
for filename in sorted(os.listdir(images_folder)):
    file_path = os.path.join(images_folder, filename)

    if os.path.isfile(file_path) and filename.lower().endswith(".jpg"):
        try:
            features = extract_features_from_image(file_path)
            image_features.append(features)
            image_filenames.append(filename)
            print(f"Processed: {filename}")
        except Exception as e:
            # Best effort: report the unreadable image and keep going.
            # NOTE(review): a skipped image shifts every later row relative to
            # the captions list - confirm this is acceptable.
            print(f"Error processing {filename}: {e}")

if not image_features:
    raise ValueError("No image features extracted. Check if images exist in 'temp_images'.")

image_features = np.array(image_features)  # Convert to NumPy array
print(f"Total number of image features extracted: {len(image_features)}")

# Tokenize captions
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(captions)
captions_sequences = tokenizer.texts_to_sequences(captions)
# Pad and truncate on the same (right) side so that over-long captions keep
# their opening words instead of losing them to the default truncating="pre".
captions_padded = pad_sequences(
    captions_sequences, maxlen=max_length, padding="post", truncating="post"
)

# Persist the fitted tokenizer: the inference and evaluation cells load it from
# data/tokenizer.pkl and must use exactly the same word <-> id mapping.
os.makedirs("data", exist_ok=True)
with open("data/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Split dataset
# Images and captions are paired purely by row position, so the two arrays must
# have the same number of rows. Check it here to give a precise message.
if len(image_features) != len(captions_padded):
    raise ValueError(
        f"Cannot pair images with captions: {len(image_features)} image feature rows "
        f"but {len(captions_padded)} caption rows. Build one caption row per image "
        "feature row (or repeat the image features per caption) before splitting."
    )

X_train, X_val, y_train, y_val = train_test_split(
    image_features, captions_padded, test_size=0.2, random_state=42
)

# Define CNN-LSTM Model
def build_model(vocab_size, max_length, dropout=0.5, lstm_units=256, image_feature_dim=25088):
    """Build and compile the merge-style image captioning model.

    The image branch projects the feature vector with a Dense layer, the text
    branch embeds the token ids and summarises them with an LSTM, and the two
    branches are summed before a softmax over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the softmax output.
        max_length: Length of the padded token-id input sequence.
        dropout: Dropout rate applied to the merged representation.
        lstm_units: Width of the LSTM and of the image projection.
        image_feature_dim: Length of the flattened image feature vector; the
            default matches the original hard-coded VGG16 size.

    Returns:
        A compiled Keras Model taking [image_features, token_ids] and returning
        one probability distribution over the vocabulary per sample.
    """
    image_input = Input(shape=(image_feature_dim,))
    # The projection width must equal lstm_units, otherwise add() below fails
    # with a shape mismatch for any lstm_units other than 256.
    img_dense = Dense(lstm_units, activation="relu")(image_input)

    text_input = Input(shape=(max_length,))
    # mask_zero=True makes the LSTM ignore the 0-valued padding positions.
    text_embed = Embedding(vocab_size, 256, mask_zero=True)(text_input)
    text_lstm = LSTM(lstm_units, return_sequences=False)(text_embed)

    merged = add([img_dense, text_lstm])
    merged = Dropout(dropout)(merged)
    output = Dense(vocab_size, activation="softmax")(merged)

    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

    return model

# Train Model with K-Fold Cross Validation
def _next_word_samples(sequences):
    """Expand padded captions into (row, prefix, next-word) samples.

    The model emits a single softmax over the vocabulary, so each caption is
    unrolled into one sample per predicted word: the words seen so far
    (right-padded with 0) are the input and the following word is the target.

    Returns:
        Three aligned arrays: the source row of each sample, the padded prefix
        of token ids, and the id of the word to predict.
    """
    rows, prefixes, targets = [], [], []
    width = sequences.shape[1]
    for row, padded in enumerate(sequences):
        tokens = [int(t) for t in padded if t != 0]  # 0 is the padding id
        for cut in range(1, len(tokens)):
            rows.append(row)
            prefixes.append(tokens[:cut] + [0] * (width - cut))
            targets.append(tokens[cut])
    return (
        np.asarray(rows, dtype=np.int64),
        np.asarray(prefixes, dtype=np.int32).reshape(-1, width),
        np.asarray(targets, dtype=np.int32),
    )


def _sample_batches(features, rows, prefixes, targets, batch_size, seed):
    """Yield ((image_features, prefix), next_word) batches forever.

    Image features are gathered per batch, so the large feature matrix is never
    duplicated once per caption word. Samples are reshuffled on every pass.
    """
    rng = np.random.default_rng(seed)
    while True:
        order = rng.permutation(len(targets))
        for start in range(0, len(order), batch_size):
            pick = order[start:start + batch_size]
            yield (features[rows[pick]], prefixes[pick]), targets[pick]


batch_size = 64
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_loss = float("inf")

# Folds are drawn over caption rows (not over unrolled samples) so that
# prefixes of the same caption never land in both training and validation.
for train_idx, val_idx in kf.split(X_train):
    train_rows, train_prefixes, train_targets = _next_word_samples(y_train[train_idx])
    val_rows, val_prefixes, val_targets = _next_word_samples(y_train[val_idx])
    if len(train_targets) == 0 or len(val_targets) == 0:
        raise ValueError(
            "No next-word samples could be built; captions need at least two "
            "in-vocabulary words each."
        )

    # Translate fold-local row numbers back into rows of X_train.
    train_rows = train_idx[train_rows]
    val_rows = val_idx[val_rows]

    model = build_model(vocab_size, max_length)

    history = model.fit(
        _sample_batches(X_train, train_rows, train_prefixes, train_targets, batch_size, seed=42),
        steps_per_epoch=int(np.ceil(len(train_targets) / batch_size)),
        validation_data=_sample_batches(
            X_train, val_rows, val_prefixes, val_targets, batch_size, seed=42
        ),
        validation_steps=int(np.ceil(len(val_targets) / batch_size)),
        epochs=10, verbose=1
    )

    # The weights held by `model` are those of the last epoch, so compare folds
    # on the last epoch's validation loss rather than the minimum over epochs.
    val_loss = history.history['val_loss'][-1]
    if val_loss < best_loss:
        best_loss = val_loss
        os.makedirs("model", exist_ok=True)  # Ensure directory exists
        model.save("model/best_caption_generator.h5")

print("Training complete. Best model saved.")


BadZipFile: File is not a zip file

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from PIL import Image
import cv2

# Load Model & Tokenizer
model = load_model("model/best_caption_generator.h5")
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer file that
# this project produced itself.
with open("data/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
max_length = 30

# Load CNN Model for Feature Extraction
# The captioning model is trained on the flattened include_top=False output of
# VGG16 (25088 values), so inference must use the same headless network rather
# than the 4096-d fc2 layer.
vgg = VGG16(weights="imagenet", include_top=False)
cnn_model = tf.keras.models.Model(inputs=vgg.input, outputs=vgg.output)

def extract_features(img_path):
    """Return VGG16 features of one image as a (1, n_features) array.

    Args:
        img_path: Path of an image file readable by PIL.

    Returns:
        A 2-D NumPy array with a single row holding the flattened output of
        `cnn_model` for the image, ready to be fed to the captioning model.
    """
    # Imported locally because this cell does not import it at the top.
    from tensorflow.keras.applications.vgg16 import preprocess_input

    # convert("RGB") guarantees three channels for grayscale/RGBA/palette files.
    with Image.open(img_path) as raw:
        image = raw.convert("RGB").resize((224, 224))

    batch = np.expand_dims(np.asarray(image, dtype=np.float32), axis=0)
    # Use the same VGG16 preprocessing as the training cell, not a /255 rescale.
    batch = preprocess_input(batch)

    features = cnn_model.predict(batch, verbose=0)
    return features.reshape(1, -1)

def generate_caption(img_path):
    """Greedily decode a caption for one image.

    Starting from the "<start>" seed, the current text is tokenized and padded,
    the model predicts the next word, and the most probable word is appended.
    Decoding stops after `max_length` words or as soon as the predicted id has
    no entry in the tokenizer's vocabulary (reported as "<end>").

    Args:
        img_path: Path of the image to caption.

    Returns:
        The generated caption with the "<start>"/"<end>" markers removed.
    """
    features = extract_features(img_path)
    caption = "<start>"
    # NOTE(review): the markers contain "<" and ">"; if the tokenizer was built
    # with its default filters these characters are stripped, so the seed would
    # be looked up as the plain word "start" - confirm against training data.

    for _ in range(max_length):
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Training pads captions with padding="post"; pad the same way here so
        # the model sees inputs laid out as they were during training.
        seq = pad_sequences([seq], maxlen=max_length, padding="post")

        # verbose=0 avoids printing one progress bar per generated word.
        y_pred = model.predict([features, seq], verbose=0)
        word_id = int(np.argmax(y_pred))  # plain int for the dictionary lookup

        word = tokenizer.index_word.get(word_id, "<end>")
        caption += " " + word

        if word == "<end>":
            break

    return caption.replace("<start>", "").replace("<end>", "").strip()

# Caption a single sample image as a quick end-to-end check of the pipeline.
image_path = "test_images/sample.jpg"
caption = generate_caption(img_path=image_path)
print(f"Generated Caption: {caption}")


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import pickle

# Load Data
# The context managers close the files as soon as they are read.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote.
with open("data/actual_captions.pkl", "rb") as actual_file:
    actual_captions = pickle.load(actual_file)
with open("data/predicted_captions.pkl", "rb") as predicted_file:
    predicted_captions = pickle.load(predicted_file)

# BLEU Score Calculation
def calculate_bleu(actual, predicted):
    """Return the sentence-level BLEU of `predicted` against the single reference `actual`.

    Both captions are split on whitespace before scoring.
    """
    reference_tokens = actual.split()
    hypothesis_tokens = predicted.split()
    return sentence_bleu([reference_tokens], hypothesis_tokens)

# ROUGE Score Calculation
def calculate_rouge(actual, predicted):
    """Return the ROUGE scores of `predicted` against the reference `actual`.

    Args:
        actual: Reference caption.
        predicted: Generated caption.

    Returns:
        A dict keyed by "rouge-1", "rouge-2" and "rouge-l", each mapping to a
        dict with recall ("r"), precision ("p") and F-score ("f"). When either
        caption is empty or whitespace-only, every score is 0.0.
    """
    # The rouge package raises on an empty hypothesis or reference, and a
    # caption generator can legitimately produce an empty string; score such
    # pairs as zero overlap instead of aborting the whole evaluation.
    if not actual.strip() or not predicted.strip():
        return {
            metric: {"r": 0.0, "p": 0.0, "f": 0.0}
            for metric in ("rouge-1", "rouge-2", "rouge-l")
        }

    rouge = Rouge()
    scores = rouge.get_scores(predicted, actual)
    return scores[0]

# Convert Words to Numerical IDs for Precision/Recall/F1
def words_to_ids(sentence, tokenizer):
    """Encode one sentence with `tokenizer` and return its sequence of ids."""
    batch = [sentence]  # the tokenizer works on a list of texts
    encoded = tokenizer.texts_to_sequences(batch)
    return encoded[0]

# Load the fitted tokenizer; the context manager closes the file after reading.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote.
with open("data/tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Compute Metrics
bleu_scores = []
rouge_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Captions are compared pairwise by position, so a length mismatch means the
# two lists are not aligned; stop instead of scoring the wrong pairs.
if len(actual_captions) != len(predicted_captions):
    raise ValueError(
        f"Got {len(actual_captions)} actual captions but "
        f"{len(predicted_captions)} predicted captions; they must be paired one-to-one."
    )

for actual, predicted in zip(actual_captions, predicted_captions):
    bleu = calculate_bleu(actual, predicted)
    rouge = calculate_rouge(actual, predicted)

    actual_ids = words_to_ids(actual, tokenizer)
    predicted_ids = words_to_ids(predicted, tokenizer)

    # Ensure lengths match
    min_len = min(len(actual_ids), len(predicted_ids))
    actual_ids, predicted_ids = actual_ids[:min_len], predicted_ids[:min_len]

    if min_len == 0:
        # Nothing to compare position by position; score the pair as zero
        # rather than handing empty arrays to the sklearn metrics.
        precision = recall = f1 = 0.0
    else:
        # NOTE(review): these treat the token id at each position as a class
        # label, i.e. they measure position-wise agreement - confirm intent.
        precision = precision_score(actual_ids, predicted_ids, average="macro", zero_division=0)
        recall = recall_score(actual_ids, predicted_ids, average="macro", zero_division=0)
        f1 = f1_score(actual_ids, predicted_ids, average="macro", zero_division=0)

    bleu_scores.append(bleu)
    rouge_scores.append(rouge["rouge-l"]["f"])
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Print Results
print(f"Average BLEU Score: {np.mean(bleu_scores):.4f}")
print(f"Average ROUGE-L Score: {np.mean(rouge_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")
