In [1]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
from io import StringIO

In [2]:
import os
import google.generativeai as genai

# Read the Gemini API key from the environment instead of hardcoding it in the
# notebook; when it is not set, prompt for it without echoing the value.
# NOTE(review): the key that used to be committed in this cell must be treated
# as leaked and revoked.
import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("GOOGLE_API_KEY: ")

# Configure the genai module with the API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Create the GenerativeModel instance (bound to the module-level name `model`)
model = genai.GenerativeModel('gemini-2.0-flash-001')

In [3]:
# Download the abo-images-small.tar archive (shell escape; wget saves it under its remote file name).
!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar

--2025-05-11 15:33:21--  https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
Resolving amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)... 3.5.6.11, 52.216.53.241, 3.5.27.111, ...
Connecting to amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)|3.5.6.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3253381120 (3.0G) [application/x-tar]
Saving to: ‘abo-images-small.tar’


2025-05-11 15:34:54 (33.4 MB/s) - ‘abo-images-small.tar’ saved [3253381120/3253381120]



In [4]:
# Extract the downloaded archive into the current directory.
# NOTE(review): presumably this produces the images/small tree that image_dataset_path points at — confirm.
!tar -xf abo-images-small.tar

In [6]:
# The listing file and the image metadata live on Google Drive. In document order
# the mount cell comes after this one, so make sure Drive is mounted before reading;
# otherwise a fresh "Restart & Run All" fails on the read_csv below.
if not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount('/content/drive')

# Gzipped listings file (read line by line later on)
listing_file = "/content/drive/MyDrive/abo-listings/listings/metadata/listings_1.json.gz"
# Image metadata table; the code below uses its 'image_id' and 'path' columns
image_metadata = pd.read_csv("/content/drive/MyDrive/images/metadata/images.csv")
# Local directory that the relative 'path' values are resolved against
image_dataset_path = "/content/images/small"

In [7]:
# Output CSV path on Drive: the listing file's base name up to its first dot,
# followed by "_VQA.csv".
output_file = "/content/drive/MyDrive/images/{}_VQA.csv".format(
    listing_file.rsplit('/', 1)[-1].partition('.')[0]
)

In [5]:
# Mount Google Drive at /content/drive; the listing file, the image metadata CSV
# and the output CSV are all addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# def get_listing_lines(listing_file):
#     """
#     Read the listing file and return a list of lines.
#     """
#     with open(listing_file, 'r') as file:
#         lines = file.readlines()
#     return lines
import gzip

def get_listing_lines(listing_file):
    """
    Return every line of the listing file as a list of strings.

    Files whose name ends with '.gz' are opened through gzip; anything else
    is opened as a regular file. Both are read as UTF-8 text.
    """
    # Pick the opener once; both callables accept the same text-mode arguments.
    opener = gzip.open if listing_file.endswith('.gz') else open
    with opener(listing_file, 'rt', encoding='utf-8') as handle:
        return handle.readlines()


In [9]:
def preprocess_product_json(product_json):
    """
    Strip image/model reference fields from a product JSON string.

    The string is parsed, the keys 'main_image_id', 'node', 'other_image_id',
    'spin_id' and '3dmodel_id' are removed when present, and the remainder is
    returned re-serialised with a 4-space indent.
    """
    parsed = json.loads(product_json)

    for unwanted in ('main_image_id', 'node', 'other_image_id', 'spin_id', '3dmodel_id'):
        if unwanted in parsed:
            del parsed[unwanted]

    return json.dumps(parsed, indent=4)
def prompt_for_product(product_json):
        """
        Generate the QA-generation prompt for the given product JSON.

        Args:
            product_json: product metadata as a JSON string.

        Returns:
            The prompt text with the preprocessed metadata embedded between
            the two '---' markers.
        """
        # Use the preprocessed JSON in the prompt. Previously the result of
        # preprocess_product_json() was discarded, so the raw metadata (including
        # the image/model id fields it is meant to remove) was embedded instead.
        product_json = preprocess_product_json(product_json)
        prompt = f"""
        You are a QA dataset generator that creates short, factual, and human-readable question-answer pairs from Amazon product metadata and image. Each question must target a specific field from the metadata and be answerable with a *single word only*.

        Below is the product metadata in structured format. Generate *5 to 10 diverse QA pairs*, where:
        - Each question is clear and unambiguous.
        - Each answer is strictly a *single word* (no phrases, no multi-word answers).
        - Avoid repeating the same field.
        - Prefer commonly relevant fields like: brand, bullet_points, color, material, product type, model name, style, fabric type, finish type, pattern, item shape, product description and color code.
        - Questions should be such a way that they can be answered by looking at the image.
        - The output should be in CSV format with columns: question, answer.

        If a value is not meaningful or not present, skip that field. Ensure that QA pairs are diverse and aligned with the data provided.

        ---
        {product_json}
        ---
        """

        return prompt

In [10]:
def get_images_paths(image_ids):
    """
    Resolve image IDs to the relative image paths that exist on disk.

    Each ID is looked up in `image_metadata` (columns 'image_id' and 'path');
    the first matching path is kept only when the file is present under
    `image_dataset_path`. IDs without a match or without a file are skipped.
    """
    resolved = []
    for image_id in image_ids:
        id_mask = image_metadata['image_id'] == image_id
        matches = image_metadata.loc[id_mask, 'path'].values
        if len(matches) == 0:
            continue
        relative_path = matches[0]
        if not os.path.exists(f"{image_dataset_path}/{relative_path}"):
            continue
        resolved.append(f"{relative_path}")
    return resolved

def _extract_csv_text(generated_text):
    """Return the CSV payload of a model reply with any Markdown code fence removed."""
    # Strip whitespace first: a reply that ends with a newline after the closing
    # fence would otherwise keep its "```" line and end up as a bogus CSV row.
    text = generated_text.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line ("```" or "```csv").
        _, _, text = text.partition("\n")
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def generate_VQA(prompt, image_path):
    """
    Ask the model for QA pairs about one image and parse the reply as CSV.

    Args:
        prompt: text prompt sent together with the image.
        image_path: image path relative to `image_dataset_path`.

    Returns:
        A DataFrame parsed from the CSV text in the model's reply.

    Raises:
        Whatever the image loading, the model call or `pd.read_csv` raise;
        nothing is caught here.
    """
    # Close the file handle as soon as the pixels are loaded; convert() returns
    # an independent in-memory image.
    with Image.open(f"{image_dataset_path}/{image_path}") as raw_image:
        img = raw_image.convert("RGB")
    # Generate the VQA using the model
    response = model.generate_content([prompt, img])
    # Parse the CSV contained in the generated text
    csv_data = pd.read_csv(StringIO(_extract_csv_text(response.text)))
    return csv_data

def get_VQA_for_product(product_json):
    """
    Generate QA pairs for every available image of one product.

    Args:
        product_json: product metadata as a JSON string; its 'main_image_id'
            and optional 'other_image_id' (single value or list) select the images.

    Returns:
        A DataFrame whose first columns are "image_path", "question", "answer"
        (followed by any extra columns the model produced), with one row per
        generated QA pair. It is empty when no image could be resolved.

    Raises:
        Errors from `generate_VQA` propagate to the caller.
    """
    # Kept local so the function does not rely on a notebook-level import.
    import time

    base_columns = ["image_path", "question", "answer"]

    prompt = prompt_for_product(product_json)
    product_dict = json.loads(product_json)

    # Collect the main image first, then the other images.
    list_of_image_ids = []
    main_image_id = product_dict.get('main_image_id')
    if main_image_id:
        list_of_image_ids.append(main_image_id)
    other_image_ids = product_dict.get('other_image_id')
    if isinstance(other_image_ids, list):
        list_of_image_ids.extend(other_image_ids)
    elif other_image_ids is not None:
        list_of_image_ids.append(other_image_ids)

    # Drop repeated paths (order preserved) so the same image is not sent twice.
    image_paths = list(dict.fromkeys(get_images_paths(list_of_image_ids)))

    print(f"Image paths: {image_paths}")
    frames = []
    for image_path in image_paths:
        print(f"Generating VQA for image: {image_path}")
        # Generate the VQA using the model
        csv_data = generate_VQA(prompt, image_path)
        # Pause after every request to space out calls to the API.
        time.sleep(2)
        csv_data['image_path'] = image_path
        frames.append(csv_data)

    if not frames:
        return pd.DataFrame(columns=base_columns)

    # Concatenate once instead of growing a DataFrame inside the loop, then
    # put the three standard columns first.
    combined = pd.concat(frames, ignore_index=True)
    extra_columns = [col for col in combined.columns if col not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)


In [11]:
from tqdm import tqdm

# Resume from the existing output CSV when there is one, otherwise start empty
if not os.path.exists(output_file):
    output_df = pd.DataFrame(columns=["image_path", "question", "answer"])
else:
    output_df = pd.read_csv(output_file)

# Image *paths* (not image ids) that already have QA rows in the output file
processed_ids = set(output_df['image_path'].unique())

# Load metadata lines and pick the slice of listings handled in this run
lines = get_listing_lines(listing_file)
subset_lines = lines[900:9000]

try:
    for line in tqdm(subset_lines):
        # Only keep listings whose raw line contains an "en_..." value
        if "\"en_" not in line:
            continue

        try:
            product_dict = json.loads(line)
            main_image_id = product_dict.get('main_image_id', None)
        except (ValueError, AttributeError):
            # Malformed JSON or a non-object line: skip it
            continue
        if not main_image_id:
            continue

        # processed_ids holds image paths, so resolve the main image id to its
        # path before comparing; comparing the raw id never matched, which made
        # every re-run start over and repeat the API calls.
        # NOTE(review): a product whose main image was already processed as
        # another product's image is skipped as well — confirm this is acceptable.
        main_image_paths = get_images_paths([main_image_id])
        if main_image_paths and main_image_paths[0] in processed_ids:
            continue

        df = get_VQA_for_product(line)
        output_df = pd.concat([output_df, df], ignore_index=True)

        # Remember the images handled so far
        processed_ids.update(df['image_path'].unique())

        # Save incrementally to avoid data loss
        output_df.to_csv(output_file, index=False)

except Exception as e:
    # Deliberate best-effort: stop the run (e.g. when the API quota is exhausted)
    # and keep what has been saved so far.
    print(f"An error occurred: {e}")


  0%|          | 0/8100 [00:00<?, ?it/s]

Image paths: ['9c/9ca0d27d.jpg', 'ee/ee8ee952.jpg', '49/49b1b22b.jpg', '66/66f3a68c.jpg', 'b2/b2ff8632.jpg']
Generating VQA for image: 9c/9ca0d27d.jpg
Generating VQA for image: ee/ee8ee952.jpg
Generating VQA for image: 49/49b1b22b.jpg
Generating VQA for image: 66/66f3a68c.jpg
Generating VQA for image: b2/b2ff8632.jpg


  0%|          | 1/8100 [00:22<51:24:58, 22.85s/it]

Image paths: ['a1/a1b1d8d5.jpg', '2f/2f357c87.jpg', '85/85192c70.jpg']
Generating VQA for image: a1/a1b1d8d5.jpg
Generating VQA for image: 2f/2f357c87.jpg
Generating VQA for image: 85/85192c70.jpg


  0%|          | 3/8100 [00:35<23:44:37, 10.56s/it]

Image paths: ['c9/c9ce8c90.jpg', 'ee/ee8ee952.jpg', '49/49b1b22b.jpg', '66/66f3a68c.jpg', '10/10146912.jpg']
Generating VQA for image: c9/c9ce8c90.jpg
Generating VQA for image: ee/ee8ee952.jpg
Generating VQA for image: 49/49b1b22b.jpg
Generating VQA for image: 66/66f3a68c.jpg
Generating VQA for image: 10/10146912.jpg


  0%|          | 4/8100 [00:56<31:54:54, 14.19s/it]

Image paths: ['ca/ca155f75.jpg', 'c0/c0229ce0.jpg', '9e/9eb555dd.jpg']
Generating VQA for image: ca/ca155f75.jpg
Generating VQA for image: c0/c0229ce0.jpg
Generating VQA for image: 9e/9eb555dd.jpg


  0%|          | 5/8100 [01:08<30:15:49, 13.46s/it]

Image paths: ['dd/dd8d8182.jpg', '2f/2f357c87.jpg', '85/85192c70.jpg']
Generating VQA for image: dd/dd8d8182.jpg
Generating VQA for image: 2f/2f357c87.jpg
Generating VQA for image: 85/85192c70.jpg


  0%|          | 6/8100 [01:20<29:26:39, 13.10s/it]

Image paths: ['8e/8e7480df.jpg', 'ee/ee8ee952.jpg', '49/49b1b22b.jpg', '66/66f3a68c.jpg', '89/8980e982.jpg']
Generating VQA for image: 8e/8e7480df.jpg
Generating VQA for image: ee/ee8ee952.jpg
Generating VQA for image: 49/49b1b22b.jpg
Generating VQA for image: 66/66f3a68c.jpg
Generating VQA for image: 89/8980e982.jpg


  0%|          | 7/8100 [01:40<34:04:15, 15.16s/it]

Image paths: ['7f/7f29db56.jpg', '1f/1f3d7ced.jpg', 'a8/a82dcccc.jpg', '8a/8a4fc2da.jpg', 'f3/f32a9034.jpg', '88/88dbb009.jpg']
Generating VQA for image: 7f/7f29db56.jpg
Generating VQA for image: 1f/1f3d7ced.jpg
Generating VQA for image: a8/a82dcccc.jpg
Generating VQA for image: 8a/8a4fc2da.jpg
Generating VQA for image: f3/f32a9034.jpg
Generating VQA for image: 88/88dbb009.jpg


  0%|          | 8/8100 [02:04<40:03:56, 17.82s/it]

Image paths: ['b4/b4045219.jpg', '6e/6ef274c7.jpg', '2c/2c7b280e.jpg', '5a/5ac3ea23.jpg', '8c/8c48826c.jpg', 'b2/b2d6f4d8.jpg', '13/13c410d3.jpg']
Generating VQA for image: b4/b4045219.jpg
Generating VQA for image: 6e/6ef274c7.jpg


  0%|          | 8/8100 [02:09<36:19:54, 16.16s/it]

An error occurred: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-001:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).



