## Enhancing Image Understanding and Captioning: Project Segment Overview

This segment outlines the initial steps towards enhancing image captioning capabilities, focusing on key frame extraction from videos and leveraging advanced AI models for generating and refining captions.

In [2]:
import json
import os
from pathlib import Path
import random
from io import BytesIO

import cv2
import matplotlib.pyplot as plt
import numpy as np
import requests
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer, util
from transformers import (
    CLIPProcessor, CLIPModel,
    VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, ViTFeatureExtractor,
    AutoModelForCausalLM, AutoProcessor, BlipProcessor, BlipForConditionalGeneration
)

import openai




### Step 1 Workflow:
1. **Frame Extraction**: Extract middle frames from videos to capture the essence of each scene.
2. **Caption Generation**: Generate diverse captions for these frames using ViT_GPT-2, BLIP, and Transformer models.
3. **Caption Selection**: Filter captions based on relevance and accuracy using the CLIP Score model, selecting the top two.

#### References

Bianco, S., Celona, L., Donzella, M., & Napoletano, P. (2023). *Improving Image Captioning Descriptiveness by Ranking and LLM-based Fusion*. arXiv.Org. [https://doi.org/10.48550/arxiv.2306.11593](https://doi.org/10.48550/arxiv.2306.11593)
Betker, J., Goh, G., Jing, L., Brooks, T., Wang, J., Li, L., Ouyang, L., Zhuang, J., Lee, J., Guo, Y., ... Ramesh, A. (2023). Improving Image Generation with Better Captions. OpenAI. https://cdn.openai.com/papers/dall-e-3.pdf

#### Note on Implementation Limitations:
Due to limited computational resources, we encountered crashes when attempting to fully implement this step. As a result, the captions used in subsequent steps are the original ones provided.

In [None]:
def extract_frames_from_dict(folder, video_caption_dict, output_base="extracted_frames", limit=-1):
    """
    Extracts middle frames from videos specified in video_caption_dict and saves them.

    Args:
    - folder (str): Base directory where video files are stored.
    - video_caption_dict (dict): Dictionary with video paths as keys and captions as values.
    - output_base (str): Base directory for saving extracted frames.
    - limit (int): Maximum number of videos to process. If -1, no limit is applied.
    """
    cnt = 0
    for video_path, caption in video_caption_dict.items():
        if limit != -1 and cnt >= limit:
            break

        full_path = os.path.join(folder, video_path)
        frame = capture_middle_frame(full_path)

        if frame is None:
            print(f"Failed to capture middle frame for {video_path}. Skipping.")
            continue

        output_dir = Path(output_base) / Path(full_path).parent.name
        output_dir.mkdir(parents=True, exist_ok=True)
        frame_output_path = output_dir / (Path(full_path).stem + '.png')
        frame.save(frame_output_path)
        cnt += 1
    print(f"Processed {cnt} videos.")

def capture_middle_frame(video_path):
    """
    Captures and returns the middle frame of a video as a PIL Image.

    Args:
    - video_path (str): Path to the video file.

    Returns:
    - PIL.Image or None: The middle frame of the video or None if capture failed.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Failed to open video: {video_path}")
        return None

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, total_frames // 2)
    success, frame = cap.read()
    cap.release()

    if success:
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    else:
        print("Failed to capture middle frame.")
        return None
def load_captions(jsonl_file_path, limits=-1):
    """
    Loads video captions from a .jsonl file.

    Args:
    - jsonl_file_path (str): Path to the .jsonl file containing captions.
    - limits (int): Maximum number of captions to load. If -1, no limit is applied.

    Returns:
    - dict: A dictionary mapping video paths to captions.
    """
    caption_video_dict = {}
    with open(jsonl_file_path, 'r') as file:
        for cnt, line in enumerate(file):
            if limits != -1 and cnt >= limits:
                break
            data = json.loads(line)
            caption_video_dict[data['clip']] = [data['caption']]
    return caption_video_dict

def generate_image_pairs(clip_dict):
    """
    Converts video paths in clip_dict to image paths by changing the file extension.

    Args:
    - clip_dict (dict): Dictionary with video paths as keys and captions as values.

    Returns:
    - dict: A dictionary with modified paths pointing to images instead of videos.
    """
    return {Path(key).with_suffix('.png').as_posix(): value for key, value in clip_dict.items()}
def load_images(base_folder, image_dict, max_images=4000):
    """
    Loads images from a base folder given an image dictionary.
    """
    images = []
    image_names_with_parent = []
    count = 0
    for relative_path in image_dict.keys():
        if count >= max_images:
            break
        image_path = os.path.join(base_folder, relative_path)
        if os.path.isfile(image_path):
            img = Image.open(image_path).convert('RGB')
            images.append(img)
            image_names_with_parent.append(f"{Path(relative_path).parent.name}/{Path(relative_path).name}")
            count += 1
    return images, image_names_with_parent

def update_captions_dict(captions_dict, new_captions_list, names):
    """
    Updates the captions in captions_dict with new captions from new_captions_list.
    """
    for name, new_caption in zip(names, new_captions_list):
        captions_dict[name] = new_caption
    return captions_dict

def extract_captions(data):
    """
    Extracts and returns a list of captions from the data dictionary.
    """
    return [caption[0] for caption in data.values()]

In [None]:
def ViT_GPT2(images):
    """
    Generates captions for a list of images using the ViT_GPT2 model.
    """
    model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")  # Correct class name
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    captions = []
    for image in images:
        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)

        max_length = 16
        num_beams = 4
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

        output_ids = model.generate(pixel_values, **gen_kwargs)

        preds = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        captions.append(preds.strip())
    print(captions)
    return captions

def GIT(images):
    """
    Generates captions for a list of images using the GIT model.
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_id = "microsoft/git-large-coco"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

    captions = []
    for im in images:
        inputs = processor(images=im, return_tensors="pt").to(device)
        generated_ids = model.generate(pixel_values=inputs.pixel_values, num_beams=3, max_length=20, min_length=5)
        generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        captions.append(generated_caption)
    print(captions)
    return captions

def Transformer(images):
    """
    Generates captions for a list of images using the Transformer model.
    """
    model = pipeline('image-to-text')
    captions = []
    for im in images:
        # Image Captioning
        generated_text = model(im)[0]['generated_text']
        captions.append(generated_text)
    print(captions)
    return captions

def BLIP(images):
    """
    Generates captions for a list of images using the BLIP model.
    """
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    captions = []
    for im in images:
        inputs = processor(text=None, images=im, return_tensors="pt", padding=True)
        outputs = model.generate(**inputs, max_length=20, num_beams=3, return_dict_in_generate=True, output_scores=True)
        caption = processor.decode(outputs.sequences[0], skip_special_tokens=True)
        captions.append(caption)
    print(captions)
    return captions

def CLIP_score(images, captions):
    model_name = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name)

    text_inputs = processor(text=captions, return_tensors="pt", padding=True)
    image_inputs = processor(images=images, return_tensors="pt", padding=True)

    image_embeddings = model.get_image_features(**{"pixel_values": image_inputs.pixel_values})
    text_embeddings = model.get_text_features(**{"input_ids": text_inputs.input_ids, "attention_mask": text_inputs.attention_mask})

    image_embeddings = image_embeddings / image_embeddings.norm(dim=1, keepdim=True)
    text_embeddings = text_embeddings / text_embeddings.norm(dim=1, keepdim=True)

    cosine_similarities = torch.einsum('nd,nd->n', image_embeddings, text_embeddings)
    cosine_similarities = cosine_similarities.detach().tolist()
    print(cosine_similarities)
    return cosine_similarities

# Compare and Remove low-quality captions
def compare(lists, n_keep=1):
    '''
    lists: [list1, list2,...]
    n_keep: number of captions kept after ranking
    return max_values_info: ((max1, list of max1), (max2, list of max2), ...)
                            max1 > max2 > ....
    '''
    max_values_info = []

    for index in range(len(lists[0])):
        values = [(lst[index], i) for i, lst in enumerate(lists, start=0)]
        max_values = sorted(values, key=lambda x: x[0], reverse=True)[:n_keep]
        max_values_info.append(max_values)
    print(max_values_info)
    return max_values_info

# Filter captions
def first_filter(captions, max_scores):
    '''
    After generating scores and ranking, this function is for choosing captions based on scores

    return: list(list(caption)), at index ith, it corresponds to the image at index ith in images list and names list
    '''
    res = []
    for i, score in enumerate(max_scores):
        temp = []
        for j in score:
            temp.append(captions[j[1]][i])
        res.append(temp)
    print(res)
    return res

In [4]:
clip_dict = load_captions('cut_part0.jsonl')
image_dict = generate_image_pairs(clip_dict)

print(len(image_dict))

9497


In [5]:
# Extract old captions
old_captions = extract_captions(image_dict)
print(len(old_captions))

9497


In [7]:
extract_frames_from_dict('video_clips', clip_dict)
images, names = load_images('extracted_frames', image_dict, max_images = 1500)

Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_1.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_1.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_2.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_2.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_3.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_3.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_4.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_4.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_5.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_5.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_6.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2UaaU.11_6.mp4. Skipping.
Failed to open video: video_clips\44ot5_2UaaU/44ot5_2UaaU.11_7.mp4
Failed to capture middle frame for 44ot5_2UaaU/44ot5_2Uaa

In [8]:
captions_GIT = GIT(images)
captions_BLIP = BLIP(images)
captions_ViT = ViT_GPT2(images)
captions_Trans = Transformer(images)
captions = [captions_BLIP, captions_ViT, old_captions]

# Calculate CLIP Score
scores_GIT = CLIP_score(images, captions_GIT)
scores_BLIP = CLIP_score(images, captions_BLIP)
scores_ViT = CLIP_score(images, captions_ViT)
scores_Trans = CLIP_score(images, captions_Trans)
scores_Extracted = CLIP_score(images, old_captions)

scores = [scores_BLIP, scores_ViT, scores_Trans, scores_GIT, scores_Extracted]


# Ranking
max_scores = compare(scores)
filtered_captions = first_filter(captions, max_scores)

res = update_captions_dict(clip_dict, filtered_captions, names)


['a person cutting apples on a cutting board', 'a toothbrush in a glass on a counter', 'a person is pouring milk into a glass', 'a woman in a pink shirt holding a bowl of fruit', 'a person putting muffins into a microwave oven', 'a red pot on the stove', 'how to fold a napkin', 'a woman holding a tray with eggs in it', 'a person is pouring water into a cup', 'a person is pouring milk into a glass', 'a woman in a pink dress standing in a kitchen', 'a plate with some food and flowers on it', 'a person putting a cupcake into a muffin', 'a woman in a pink dress holding an apple', 'a woman in a pink dress standing in a kitchen', 'a woman in a pink dress standing in a kitchen', 'a woman in a pink dress standing in a kitchen', 'a person is putting an egg into a muffin', 'a tray with food on it', 'a glass filled with yellow liquid sitting on a wooden table', 'a tray of muffins on a wooden table', 'a person is peeling a piece of cheese on a plate', 'a woman in a pink dress standing in a kitchen

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


['a person cutting up a piece of fruit on a cutting board', 'a glass of milk sitting on top of a counter top', 'a person holding a spoon in front of a blender', 'a woman holding a green and yellow plastic bag', 'a kitchen counter with a toaster oven and a toaster', 'a bowl of cereal with bananas in it', 'a person holding a piece of paper on a wooden surface', 'a woman holding a tray of food in a kitchen', 'a glass of water sitting on top of a counter top', 'a glass of milk sitting on top of a counter top', 'a woman in a pink shirt is holding a carrot', 'a white plate topped with a piece of cake', 'a person is using a knife to cut a loaf of bread', 'a woman holding a bowl of fruit in a kitchen', 'a woman standing in a kitchen with a toothbrush in her mouth', 'a woman standing in a kitchen holding a white frisbee', 'a woman standing in a kitchen holding a wii remote', 'a plate of food on a stove top', 'a bowl of fruit and a spoon on a counter', 'a glass of orange juice sitting on top of 

: 

In [None]:
def write_to_jsonl_with_captions_list(data):
    """
    Writes the given dictionary to a uniquely named JSONL file, where each line is a JSON object
    with 'clip' as the key and a 'captions' key linking to a list of captions.

    Parameters:
    - data: Dictionary containing the data to write.
    """
    file_path = f'clips_captions.jsonl'

    with open(file_path, 'w') as f:
        for key, captions in data.items():
            # Structure the JSON object to include both the clip and captions
            json_object = {"clip": key, "captions": captions}
            # Write the JSON object to the file
            f.write(json.dumps(json_object) + '\n')

    return file_path

# Call the function with the modified data and get the path to the new file
new_captions_file_path = write_to_jsonl_with_captions_list(res)


### Step 2: Object Detection, Caption Enhancement, Image Generation, and Integrated Metric Calculation

In this step, we undertake a multifaceted approach to enrich the dataset by generating new images based on enhanced captions and evaluating these images and their captions through a combination of similarity metrics. This process includes several key sub-steps, each contributing to the final goal of creating accurate and relevant generated content.

#### Object Detection with YOLOv5
- **Model Initialization**: We utilize the YOLOv5 model, pre-trained on a comprehensive dataset, for detecting objects within frames extracted from video clips. This detection step is crucial for identifying the key elements within each scene.
- **Frame Extraction and Detection**: For each video, a representative frame is extracted, and distinct objects within this frame are identified. The detected objects provide a contextual foundation for generating more detailed and contextually accurate captions.

#### Caption Enhancement with OpenAI's GPT-4
- **Enhancement Logic**: Given the list of detected objects and the overall context caption, an enhanced caption is generated using OpenAI's GPT-4. This enhanced caption aims to integrate the detected objects into the context more naturally and descriptively, providing a richer description for image generation.

#### Image Generation with DALL-E 3
- **Image Creation**: Using the enhanced captions, new images are generated that visually represent the described scene. This image generation step leverages the DALL-E 3 model to produce images that closely match the detailed descriptions provided.

#### Captioning Generated Images with BLIP
- **Caption Generation**: The BLIP model is employed to generate captions for the newly created images. This automated captioning process provides a textual description of the content of generated images, facilitating a direct comparison between the original and generated content.

#### Integrated Metric Calculation and Similarity Scoring
- **Evaluation Framework**: The relevance and quality of generated images and captions are evaluated based on image similarity, text (caption) similarity, and CLIP scores. These metrics are integrated into a single metric to assess the accuracy and relevance of each generated output comprehensively.
    - **Image Similarity Score**: Quantifies the visual similarity between the generated image and the original frame.
    - **Text Similarity Score**: Measures the semantic closeness between the generated and original captions.
    - **CLIP Score**: Evaluates the contextual coherence between the generated image and caption.

#### References
Betker, J., Goh, G., Jing, L., Brooks, T., Wang, J., Li, L., Ouyang, L., Zhuang, J., Lee, J., Guo, Y., ... Ramesh, A. (2023). Improving Image Generation with Better Captions. OpenAI. https://cdn.openai.com/papers/dall-e-3.pdf


In [105]:
model_yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

def get_frame_path(video_path):
    """
    Constructs the path for the frame image based on the video's path.
    """
    base_name = os.path.basename(video_path)
    subfolder_name, _ = base_name.split('.', 1) 
    frame_name = base_name.rsplit('.', 1)[0] + ".png"
    frame_path = os.path.join('extracted_frames', subfolder_name, frame_name)

    if os.path.exists(frame_path):
        return frame_path
    else:
        print(f"Frame does not exist: {frame_path}")
        return None

def detect_objects_in_frame(video_path):
    """
    Detects objects in a frame and returns a list of unique detected object names.
    """
    frame_path = get_frame_path(video_path) 

    if frame_path is None:
        return []

    results = model_yolo(frame_path)
    detected_objects = [results.names[int(obj)] for obj in results.xyxy[0][:, -1]]
    unique_objects = list(set([str(obj) for obj in detected_objects]))

    return unique_objects
def enhance_caption_with_openai(object_names ,overall_context_caption):
    """
    Enhances the given caption with detailed scene description using OpenAI's GPT-4.
    """
    integrated_caption = " ".join(object_names)
    prompt = f"""
    Task: You are part of a team of bots that creates images. You work with an assistant bot that will draw anything
    you say in square brackets . For example , outputting " a beautiful morning in the woods with the sun peaking
    through the trees " will trigger your partner bot to output an image of a forest morning , as described .
    You will be prompted by people looking to create detailed images. The way to accomplish this
    is to take their short prompts(15-80 words) and make them extremely detailed and descriptive. You will be given overall context caption
    and list of other components which are in the same image and same context.

    Overall Context Caption: {overall_context_caption}
    List of Components: {object_names}

    Rules to follow:
    1. Remove components which are abnormal associations for example ’ elephant under a sea ’.or not relevant by comparing to the overall context caption.
    2. Construct a scene that logically includes remained components within the setting described by the overall caption.
    3. Ensure the scene is described in a realistic manner.
    4. Avoid adding emotional, sound, strong adjectives or exaggerations that could create noise to generated image, remember this is the common context.
    5. The integrated scene description should provide a clearer picture of the situation as described in the overall context caption.
    6. For Descriptions of Components, just link them together, do not provide any redundant desscription.
    7. The main object in overall context caption should be the main object in your description.
    8. Avoid providing your own comment on the context.
    Integrated Scene Description:
    """

    response = openai.ChatCompletion.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": "You are an excellent prompt engineering."},
            {"role": "user", "content": prompt}
        ],
        max_tokens = 100,
        seed = 42,
        temperature = 0.2

    )
    enhanced_caption = response.choices[0].message.content.strip()
    return enhanced_caption
def generate_and_save_image(enhanced_caption, clip_path, folder='generated_images'):
    """
    Generates an image based on the enhanced caption using DALL-E 3 and saves it.
    """
    os.makedirs(folder, exist_ok=True)
    
    base_name = os.path.basename(clip_path)
    image_name = base_name.replace('.mp4', '.jpg')  
    response = openai.Image.create(
        prompt=f"A image captured by a phone camera in a real-life, common context with specific details: {enhanced_caption}.",
        n=1,
        model="dall-e-3",
        style='vivid',
        quality="standard"
    )

    image_data = response['data'][0]['url']
    image_response = requests.get(image_data)
    image = Image.open(BytesIO(image_response.content))

    generated_image_path = os.path.join(folder, image_name) 
    print(f"Attempting to save image to {generated_image_path}")
    image.save(generated_image_path)  
    return generated_image_path

def caption_image_with_blip(image_path):
    """
    Generates a caption for the given image using BLIP model.
    """
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs, max_length=20, num_beams=3, return_dict_in_generate=True, output_scores=True)

    caption = processor.decode(outputs.sequences[0], skip_special_tokens=True)

    return caption

def save_generated_image_info(json_file_path, clip_path, generated_image_path, generated_caption, enhanced_caption, original_caption):
    """
    Saves information about the generated image and captions to a JSON file.
    """
    base_name = os.path.basename(json_file_path)
    output_json_file_name = base_name.rsplit('.', 1)[0] + "_generated_info.json"
    output_json_file_path = os.path.join(os.getcwd(), output_json_file_name)

    data_to_append = {
        "clip": clip_path,
        "original_caption": original_caption,
        "generated_image_path": generated_image_path,
        "generated_caption": generated_caption,
        "enhanced_caption": enhanced_caption
    }

    try:
        if os.path.isfile(output_json_file_path):
            with open(output_json_file_path, 'r') as file:
                existing_data = json.load(file)
        else:
            existing_data = []
    except Exception as e:
        print(f"Error reading from {output_json_file_path}: {e}")
        existing_data = []

    existing_data.append(data_to_append)
    with open(output_json_file_path, 'w') as file:
        json.dump(existing_data, file, indent=4)

def second_filter(json_file_path, images_limit=None):
    """
    Process each video clip to detect objects, enhance captions, generate and caption new images,
    and save the information. Limited by an optional images limit.
    """
    openai.api_key = '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'
    processed_images_count = 0  

    with open(json_file_path, 'r') as file:
        lines = file.readlines()
        data = [json.loads(line) for line in lines]

    for item in data:
        if images_limit is not None and processed_images_count >= images_limit:
            break  

        clip_path = item['clip']
        overall_context_caption = item['caption']
        video_path = os.path.join('video_clips', clip_path)

        try:
            detected_objects = detect_objects_in_frame(video_path)
            if detected_objects:
                objects_names_str = ", ".join(detected_objects)
                enhanced_caption = enhance_caption_with_openai(detected_objects, overall_context_caption) 
                generated_image_path = generate_and_save_image(enhanced_caption, clip_path)
                generated_caption = caption_image_with_blip(generated_image_path)
                
                save_generated_image_info(
                    json_file_path,
                    item['clip'],
                    generated_image_path,
                    generated_caption,
                    enhanced_caption,
                    item['caption']
                )

                processed_images_count += 1  

                print(f"Processed {item['clip']}: Information saved.")
        except Exception as e:
            print(f"Error processing {item['clip']}: {e}")


json_file_path = 'cut_part0.jsonl'
images_limit = 0
second_filter(json_file_path, images_limit)

Using cache found in C:\Users\admin/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-3-5 Python-3.10.13 torch-2.2.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [48]:
resnet_model = resnet18(pretrained=True)
resnet_model.eval()
def integrated_metric(image_similarity_score, text_similarity_score, clip_score):
    """
    Calculates an integrated metric score based on normalized image, text, and clip scores.
    The distribution of weights based on the step that component has been through, lower weight for more number of steps.
    """
    acceptable_scores = {'text': 0.6, 'clip': 0.3, 'image': 0.5}
    weights = {'image': 0.2, 'text': 0.5, 'clip': 0.3}

    normalized_scores = {
        'image': (image_similarity_score / acceptable_scores['image']) * weights['image'],
        'text': (text_similarity_score / acceptable_scores['text']) * weights['text'],
        'clip': (clip_score / acceptable_scores['clip']) * weights['clip']
    }

    integrated_score = sum(normalized_scores.values())
    return integrated_score
def load_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = Image.open(image_path).convert('RGB')
    return transform(image).unsqueeze(0)

def image_similarity(model, image1, image2):
    """
    This function calculates image similarity using a pretrained ResNet18 model and cosine similarity.
    """
    
    with torch.no_grad():
        features1 = model(image1)
        features2 = model(image2)
    return cosine_similarity(features1, features2).item()

def calculate_text_similarity(text1, text2, model):
    """
    This function calculates text similarity using a pretrained SentenceTransformer model cosine similarity.
    """
    embedding1 = model.encode(text1, convert_to_tensor=True)
    embedding2 = model.encode(text2, convert_to_tensor=True)

    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    return similarity

def CLIP_score(image_path, caption):
    model_name = "openai/clip-vit-base-patch32"
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name)

    image = Image.open(image_path).convert("RGB")
    images = [image]  

    text_inputs = processor(text=[caption], return_tensors="pt", padding=True, truncation=True)
    image_inputs = processor(images=images, return_tensors="pt", padding=True)

    image_embeddings = model.get_image_features(**image_inputs)
    text_embeddings = model.get_text_features(**text_inputs)

    image_embeddings = image_embeddings / image_embeddings.norm(dim=1, keepdim=True)
    text_embeddings = text_embeddings / text_embeddings.norm(dim=1, keepdim=True)

    cosine_similarities = torch.einsum('nd,nd->n', image_embeddings, text_embeddings)
    cosine_similarity = cosine_similarities.detach().cpu().numpy()[0]

    return cosine_similarity

def save_met_criteria_enhanced_caption(json_file_path, output_json_path):
    """
    Main function to process the generated images and captions, evaluating them with integrated metrics.
    """
    resnet_model = resnet18(pretrained=True)
    resnet_model.eval()
    
    text_model = SentenceTransformer('all-MiniLM-L6-v2')

    met_criteria_entries = []

    with open(json_file_path, 'r') as file:
        data = json.load(file)

    for item in data:
        clip_base = item['clip'].rsplit('.', 1)[0].split("/")[1]
        generated_image_path = os.path.join('generated_images', f"{clip_base}.jpg")

        original_caption = item['original_caption']
        generated_caption = item['generated_caption']
        enhanced_caption = item['enhanced_caption']

        try:
            clip_base = item['clip'].rsplit('.', 1)[0]
            image1 = load_image(os.path.join('extracted_frames', f"{clip_base}.png"))  
            image2 = load_image(generated_image_path)
            image_similarity_score = image_similarity(resnet_model, image1, image2)
        except FileNotFoundError as e:
            print(f"File not found: {e}")
            continue

        text_similarity_score = calculate_text_similarity(original_caption, generated_caption, text_model)
        clip_score = CLIP_score(generated_image_path, generated_caption)

        metric_score = integrated_metric(image_similarity_score, text_similarity_score, clip_score)

        print(f"{clip_base}: Image Similarity Score = {image_similarity_score:.3f}, Text Similarity Score = {text_similarity_score:.3f}, CLIP Score = {clip_score:.3f}, Integrated Metric Score = {metric_score:.3f}")

        if metric_score >= 1.0: 
            met_criteria_entries.append({
                'clip_base': clip_base,
                'enhanced_caption': enhanced_caption,
                'metric_score': metric_score
            })

    with open(output_json_path, 'w') as outfile:
        json.dump(met_criteria_entries, outfile, indent=4)

json_file_path = "cut_part0_generated_info.json"
output_json_path = "met_criteria_entries.json"
save_met_criteria_enhanced_caption(json_file_path, output_json_path)

-bmS0RumV9U/-bmS0RumV9U.21_0: Image Similarity Score = 0.714, Text Similarity Score = 0.859, CLIP Score = 0.332, Integrated Metric Score = 1.333
-bmS0RumV9U/-bmS0RumV9U.27_0: Image Similarity Score = 0.506, Text Similarity Score = 0.716, CLIP Score = 0.246, Integrated Metric Score = 1.045
-bmS0RumV9U/-bmS0RumV9U.27_1: Image Similarity Score = 0.577, Text Similarity Score = 0.479, CLIP Score = 0.353, Integrated Metric Score = 0.982
-bmS0RumV9U/-bmS0RumV9U.5_0: Image Similarity Score = 0.516, Text Similarity Score = 0.453, CLIP Score = 0.276, Integrated Metric Score = 0.859
-bmS0RumV9U/-bmS0RumV9U.22_0: Image Similarity Score = 0.565, Text Similarity Score = 0.566, CLIP Score = 0.328, Integrated Metric Score = 1.025
-bmS0RumV9U/-bmS0RumV9U.25_0: Image Similarity Score = 0.624, Text Similarity Score = 0.387, CLIP Score = 0.242, Integrated Metric Score = 0.814
-bmS0RumV9U/-bmS0RumV9U.11_0: Image Similarity Score = 0.627, Text Similarity Score = 0.548, CLIP Score = 0.306, Integrated Metric 

### Final Evaluation Function Overview

The `final_evaluation` function performs a comprehensive evaluation of the generated images before and after applying enhanced captions. It assesses how closely these images, both initially generated and regenerated, match the original images from the dataset. This evaluation helps in understanding the effectiveness of using enhanced captions for image regeneration in improving the visual similarity to the original scenes.

#### Key Components of the Function

- **Image Regeneration and Similarity Calculation**:
  - For each selected entry, two key processes are undertaken:
    1. **Regeneration of Images**: Generates new images using enhanced captions derived from the original captions but without specific object details. This step aims to produce images that better capture the overall scene or context described in the original caption.
    2. **Similarity Scoring**: Computes and compares the similarity scores between the original images and both the initially generated and the newly regenerated images. This comparison provides insights into the impact of the regeneration process on image quality and relevance.

- **Evaluation Metrics**:
  - Reports the similarity scores for the original versus initially generated images and the original versus regenerated images. It calculates the average similarity scores to provide a broad measure of the regeneration process's effectiveness.


#### Purpose and Outcome

The primary goal of the `final_evaluation` function is to measure the impact of regenerating images with enhanced captions on the visual similarity to the original images. A higher similarity score post-regeneration indicates that the approach is effective in producing images that are more aligned with the original context and visual details. 


In [108]:
def final_evaluation(json_file_path, captions_file_path, old_images_folder, new_images_folder, resnet_model, num_images):
    """
    Evaluates the similarity between original and generated images before and after enhancement.
    """
    captions_dict = load_captions(captions_file_path)
    os.makedirs(new_images_folder, exist_ok=True)

    similarity_scores_old = []
    similarity_scores_new = []

    with open(json_file_path, 'r') as file:
        met_criteria_entries = json.load(file)[:num_images]

    for entry in met_criteria_entries:
        clip_base = entry['clip_base']
        clip_folder, clip_name = clip_base.split('/')[1].split('.', 1)
        captions_key = f"{clip_folder}.{clip_name}"

        if captions_key in captions_dict:
            original_caption = captions_dict[captions_key]
            print(f"Original caption found for {captions_key}: {original_caption}")
        else:
            print(f"No original caption found for {captions_key}. Skipping.")
            continue

        original_image_path = os.path.join('extracted_frames', clip_folder, f"{clip_folder}.{clip_name}.png")
        old_generated_image_path = os.path.join(old_images_folder, f"{clip_folder}.{clip_name}.jpg")
        new_generated_image_path = os.path.join(new_images_folder, f"{clip_folder}.{clip_name}.jpg")

        enhanced_caption = enhance_caption_with_openai([], original_caption)
        generate_and_save_image(enhanced_caption, new_generated_image_path, folder = new_images_folder)

        try:
            original_image = load_image(original_image_path)
            old_generated_image = load_image(old_generated_image_path)
            new_generated_image = load_image(new_generated_image_path)

            old_similarity_score = image_similarity(resnet_model, original_image, old_generated_image)
            new_similarity_score = image_similarity(resnet_model, original_image, new_generated_image)

            similarity_scores_old.append(old_similarity_score)
            similarity_scores_new.append(new_similarity_score)

            print(f"Processed {clip_base}: Old Similarity = {old_similarity_score}, New Similarity = {new_similarity_score}")
        except Exception as e:
            print(f"Error processing {clip_base}: {e}")

    average_old_similarity = np.mean(similarity_scores_old) if similarity_scores_old else 0
    average_new_similarity = np.mean(similarity_scores_new) if similarity_scores_new else 0
    print(f"Average Old Similarity: {average_old_similarity:.4f}")
    print(f"Average New Similarity: {average_new_similarity:.4f}")

resnet_model = resnet18(pretrained=True)

final_evaluation(
    json_file_path="met_criteria_entries.json",
    captions_file_path="cut_part0.jsonl",
    old_images_folder="generated_images",
    new_images_folder="new_generated_images",
    resnet_model=resnet_model,
    num_images=150
)


Original caption found for -bmS0RumV9U.21_0: a person cutting up apples on a cutting board
Attempting to save image to new_generated_images\-bmS0RumV9U.21_0.jpg
Processed -bmS0RumV9U/-bmS0RumV9U.21_0: Old Similarity = 0.9931387901306152, New Similarity = 0.9938879013061523
Original caption found for -bmS0RumV9U.27_0: a person pouring milk into a glass
Attempting to save image to new_generated_images\-bmS0RumV9U.27_0.jpg
Processed -bmS0RumV9U/-bmS0RumV9U.27_0: Old Similarity = 0.9932164549827576, New Similarity = 0.9939292073249817
Original caption found for -bmS0RumV9U.22_0: a person taking a tray of cupcakes out of the oven
