In [2]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
from io import StringIO
os.chdir('/home/danish/VR_PROJECT')

In [3]:
import os
import google.generativeai as genai

# Set the GOOGLE_API_KEY in the environment first
os.environ["GOOGLE_API_KEY"] = "AIzaSyBdhaEYAluqm91IFNB7STuIppmfiC7sY2s"

# Then configure the genai module with the API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Now you can create the GenerativeModel instance
model = genai.GenerativeModel('gemini-2.0-flash-001')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
listing_file = "dataset/abo-listings/listings/metadata/listings_3.json"
image_metadata = pd.read_csv("dataset/abo-images-small/images/metadata/images.csv")
image_dataset_path = "dataset/abo-images-small/images/small/"

In [5]:
#create output csv file
output_file = f"dataset/VQA-dataset/{listing_file.split('/')[-1].split('.')[0]}_VQA.csv"

In [6]:
def get_listing_lines(listing_file):
    """
    Read the listing file and return a list of lines.
    """
    with open(listing_file, 'r') as file:
        lines = file.readlines()
    return lines





In [7]:
def preprocess_product_json(product_json):
        """
        Preprocess the product JSON to ensure it is in the correct format.
        """
        list_of_keys_to_remove = ['main_image_id','node','other_image_id','spin_id','3dmodel_id']
        # Convert JSON string to dictionary
        product_dict = json.loads(product_json)
        # Remove unnecessary keys
        for key in list_of_keys_to_remove:
            if key in product_dict:
                del product_dict[key]
        
        # Convert dictionary to JSON string with indentation for better readability
        return json.dumps(product_dict, indent=4)
def prompt_for_product(product_json):
        """
        Generate a prompt for the given product JSON.
        """
        product_json = preprocess_product_json(product_json)
        prompt = f"""
        You are a QA dataset generator that creates short, factual, and human-readable question-answer pairs from Amazon product metadata and image. Each question must target a specific field from the metadata and be answerable with a **single word only**.

        Below is the product metadata in structured format. Generate **5 to 10 diverse QA pairs**, where:
        - Each question is clear and unambiguous.
        - Each answer is strictly a **single word** (no phrases, no multi-word answers).
        - Avoid repeating the same field.
        - Prefer commonly relevant fields like: brand, bullet_points, color, material, product type, model name, style, fabric type, finish type, pattern, item shape, product description and color code.
        - Questions should be such a way that they can be answered by looking at the image.
        - The output should be in CSV format with columns: question, answer.

        If a value is not meaningful or not present, skip that field. Ensure that QA pairs are diverse and aligned with the data provided.

        ---
        {product_json}
        ---
        """
        
        return prompt

In [8]:
def get_images_paths(image_ids):
    """
    Get the paths of images based on their IDs.
    """
    image_paths = []
    for image_id in image_ids:
        image_path = image_metadata[image_metadata['image_id'] == image_id]['path'].values
        if len(image_path) > 0:
            if os.path.exists(f"{image_dataset_path}/{image_path[0]}"):
                image_paths.append(f"{image_path[0]}")
    if len(image_paths) > 2:
        image_paths = image_paths[:2]
    return image_paths

def generate_VQA(prompt, image_path):
    img = Image.open(f"{image_dataset_path}/{image_path}")
    img = img.convert("RGB")
    # Generate the VQA using the model
    response = model.generate_content([prompt, img])
    # Extract the generated text from the response
    generated_text = response.text
    #read csv from the generated text
    csv_data = pd.read_csv(StringIO(generated_text.strip("`").replace("csv\n", "", 1).strip()))
    return csv_data

def get_VQA_for_product(product_json):
    df = pd.DataFrame(columns=["image_path","question", "answer"])
    list_of_image_ids = []
    
    prompt = prompt_for_product(product_json)
    product_dict = json.loads(product_json)
    if "main_image_id" in product_dict.keys():
        list_of_image_ids.append(product_dict['main_image_id'])
    if "other_image_id" in product_dict.keys():
        # Check if the key exists in the dictionary
        if isinstance(product_dict['other_image_id'], list):
            # If it's a list, extend it to the list_of_image_ids
            list_of_image_ids.extend(product_dict['other_image_id'])
        else:
            # If it's not a list, append it directly
            list_of_image_ids.append(product_dict['other_image_id'])
    image_paths = get_images_paths(list_of_image_ids)

    # print(f"Image paths: {image_paths}")
    # Generate the VQA using the model
    if len(image_paths) == 0:
        print("No images found for this product.")
        return df
    for image_path in image_paths:
        # print(f"Generating VQA for image: {image_path}")
        # Generate the VQA using the model
        csv_data = generate_VQA(prompt, image_path)
        # Append the generated data to the dataframe
        csv_data['image_path'] = image_path
        df = pd.concat([df, csv_data], ignore_index=True)
    return df



    
    
    

In [9]:
import time
from tqdm import tqdm
print(output_file)
if not os.path.exists(output_file):
    print(f"Output file {output_file} does not exist. Creating a new one.")
    output_df = pd.DataFrame(columns=["image_path","question", "answer"])
else:
    output_df = pd.read_csv(output_file)


lines = get_listing_lines(listing_file)

# subset_lines = lines[556:]  # to process a subset of lines
start_index = 5000
try:
    for i in tqdm(range(start_index, len(lines)), initial=start_index, total=len(lines)):
        line = lines[i]
        if "\"en_" not in line:
            continue
        try:
            df = get_VQA_for_product(line)
            # time.sleep(2)
        except Exception as e:
            print(f"Error processing line {i}: {e}")
            if "Resource has been exhausted" in str(e):
                print("Resource has been exhausted. Please try again later.")
                time.sleep(300)
                try:
                    df = get_VQA_for_product(line)
                except Exception as e:
                    time.sleep(900)
                    df = get_VQA_for_product(line)
            else:
                continue
        output_df = pd.concat([output_df, df], ignore_index=True)
        if i % 50 == 0:
            # Save the output DataFrame to a CSV file every 50 iterations
            output_df.to_csv(output_file, index=False)
            print(f"Progress saved at line {i}")
except Exception as e:
    print(f"An error occurred: {e}")
    # Handle the exception as needed
    pass
# Save the output DataFrame to a CSV file
output_df.to_csv(output_file, index=False)
print(f"VQA dataset saved to {output_file}") 
print(f"Progress saved at line {i}")


dataset/VQA-dataset/listings_3_VQA.csv
Output file dataset/VQA-dataset/listings_3_VQA.csv does not exist. Creating a new one.


 54%|█████▍    | 5001/9232 [00:04<4:51:06,  4.13s/it]

Progress saved at line 5000


 55%|█████▍    | 5051/9232 [02:49<4:20:22,  3.74s/it]

Progress saved at line 5050


 55%|█████▌    | 5101/9232 [05:16<4:10:26,  3.64s/it]

Progress saved at line 5100


 56%|█████▌    | 5151/9232 [07:34<3:26:34,  3.04s/it]

Progress saved at line 5150


 56%|█████▋    | 5201/9232 [10:00<3:25:29,  3.06s/it]

Progress saved at line 5200


 57%|█████▋    | 5251/9232 [12:18<3:41:49,  3.34s/it]

Progress saved at line 5250


 57%|█████▋    | 5301/9232 [14:48<3:08:12,  2.87s/it]

Progress saved at line 5300


 58%|█████▊    | 5351/9232 [17:02<3:23:37,  3.15s/it]

Progress saved at line 5350


 58%|█████▊    | 5391/9232 [18:58<1:50:27,  1.73s/it]

Error processing line 5390: Error tokenizing data. C error: Expected 2 fields in line 7, saw 4



 59%|█████▊    | 5401/9232 [19:33<3:48:38,  3.58s/it]

Progress saved at line 5400


 59%|█████▉    | 5451/9232 [22:06<3:30:32,  3.34s/it]

Progress saved at line 5450


 60%|█████▉    | 5501/9232 [24:32<3:35:10,  3.46s/it]

Progress saved at line 5500


 60%|██████    | 5551/9232 [26:53<2:50:37,  2.78s/it]

Progress saved at line 5550


 61%|██████    | 5601/9232 [29:27<3:01:05,  2.99s/it]

Progress saved at line 5600


 61%|██████    | 5651/9232 [31:34<3:13:09,  3.24s/it]

Progress saved at line 5650


 62%|██████▏   | 5701/9232 [33:55<2:39:56,  2.72s/it]

Progress saved at line 5700


 62%|██████▏   | 5751/9232 [36:32<3:14:46,  3.36s/it]

Progress saved at line 5750


 63%|██████▎   | 5801/9232 [38:55<2:43:17,  2.86s/it]

Progress saved at line 5800


 63%|██████▎   | 5851/9232 [41:18<2:26:17,  2.60s/it]

Progress saved at line 5850


 64%|██████▍   | 5901/9232 [43:54<3:10:18,  3.43s/it]

Progress saved at line 5900


 64%|██████▍   | 5951/9232 [47:12<2:42:34,  2.97s/it]

Progress saved at line 5950


 65%|██████▌   | 6001/9232 [49:56<2:45:59,  3.08s/it]

Progress saved at line 6000


 65%|██████▌   | 6035/9232 [51:45<3:02:51,  3.43s/it]

No images found for this product.


 66%|██████▌   | 6051/9232 [52:27<2:28:10,  2.79s/it]

Progress saved at line 6050


 66%|██████▌   | 6101/9232 [54:41<2:51:21,  3.28s/it]

Progress saved at line 6100


 67%|██████▋   | 6190/9232 [58:54<2:32:20,  3.00s/it]

No images found for this product.


 67%|██████▋   | 6201/9232 [59:23<2:31:02,  2.99s/it]

Progress saved at line 6200


 68%|██████▊   | 6251/9232 [1:02:15<4:31:37,  5.47s/it]

Progress saved at line 6250


 68%|██████▊   | 6301/9232 [1:04:49<2:31:24,  3.10s/it]

Progress saved at line 6300


 69%|██████▉   | 6351/9232 [1:07:21<2:39:21,  3.32s/it]

Progress saved at line 6350


 69%|██████▉   | 6401/9232 [1:09:26<2:15:12,  2.87s/it]

Progress saved at line 6400


 70%|██████▉   | 6451/9232 [1:11:32<2:35:16,  3.35s/it]

Progress saved at line 6450


 70%|███████   | 6501/9232 [1:13:21<1:09:26,  1.53s/it]

Progress saved at line 6500


 71%|███████   | 6551/9232 [1:15:49<2:20:29,  3.14s/it]

Progress saved at line 6550


 71%|███████▏  | 6579/9232 [1:17:05<1:52:58,  2.55s/it]

No images found for this product.


 72%|███████▏  | 6601/9232 [1:17:53<1:19:34,  1.81s/it]

Progress saved at line 6600


 72%|███████▏  | 6651/9232 [1:20:13<2:00:53,  2.81s/it]

Progress saved at line 6650


 73%|███████▎  | 6775/9232 [1:25:26<2:03:28,  3.02s/it]

No images found for this product.


 74%|███████▎  | 6801/9232 [1:26:34<1:58:44,  2.93s/it]

Progress saved at line 6800


 74%|███████▍  | 6851/9232 [1:29:11<1:49:38,  2.76s/it]

Progress saved at line 6850


 75%|███████▍  | 6901/9232 [1:31:28<2:05:58,  3.24s/it]

Progress saved at line 6900


 75%|███████▌  | 6951/9232 [1:34:05<1:11:28,  1.88s/it]

Progress saved at line 6950


 76%|███████▌  | 7001/9232 [1:36:03<1:16:27,  2.06s/it]

Progress saved at line 7000


 77%|███████▋  | 7101/9232 [1:40:56<1:26:24,  2.43s/it]

Progress saved at line 7100


 78%|███████▊  | 7201/9232 [1:45:25<1:26:03,  2.54s/it]

Progress saved at line 7200


 79%|███████▊  | 7251/9232 [1:47:38<1:34:43,  2.87s/it]

Progress saved at line 7250


 79%|███████▉  | 7301/9232 [1:49:54<1:12:57,  2.27s/it]

Progress saved at line 7300


 80%|███████▉  | 7351/9232 [1:52:29<1:46:31,  3.40s/it]

Progress saved at line 7350


 80%|████████  | 7401/9232 [1:54:54<1:27:29,  2.87s/it]

Progress saved at line 7400


 81%|████████  | 7451/9232 [1:57:12<1:11:39,  2.41s/it]

Progress saved at line 7450


 81%|████████▏ | 7501/9232 [1:59:26<1:17:43,  2.69s/it]

Progress saved at line 7500


 81%|████████▏ | 7510/9232 [1:59:56<1:39:33,  3.47s/it]

Error processing line 7509: Error tokenizing data. C error: Expected 2 fields in line 6, saw 3



 82%|████████▏ | 7551/9232 [2:02:01<1:34:08,  3.36s/it]

Progress saved at line 7550


 83%|████████▎ | 7651/9232 [2:07:10<1:30:20,  3.43s/it]

Progress saved at line 7650


 84%|████████▍ | 7742/9232 [2:11:40<1:34:02,  3.79s/it]

No images found for this product.


 84%|████████▍ | 7801/9232 [2:14:36<1:18:42,  3.30s/it]

Progress saved at line 7800


 85%|████████▌ | 7851/9232 [2:17:11<53:06,  2.31s/it]  

Progress saved at line 7850


 86%|████████▌ | 7951/9232 [2:22:00<1:03:27,  2.97s/it]

Progress saved at line 7950


 87%|████████▋ | 8051/9232 [2:26:17<43:30,  2.21s/it]  

Progress saved at line 8050


 88%|████████▊ | 8101/9232 [2:28:35<46:16,  2.45s/it]  

Progress saved at line 8100


 88%|████████▊ | 8151/9232 [2:31:01<1:05:51,  3.66s/it]

Progress saved at line 8150


 89%|████████▉ | 8201/9232 [2:33:41<46:14,  2.69s/it]  

Progress saved at line 8200


 89%|████████▉ | 8251/9232 [2:36:11<1:00:38,  3.71s/it]

Progress saved at line 8250


 90%|████████▉ | 8301/9232 [2:38:43<32:20,  2.08s/it]  

Progress saved at line 8300


 90%|█████████ | 8351/9232 [2:41:26<38:24,  2.62s/it]

Progress saved at line 8350


 91%|█████████ | 8401/9232 [2:44:11<48:26,  3.50s/it]

Progress saved at line 8400


 92%|█████████▏| 8451/9232 [2:46:17<30:30,  2.34s/it]

Progress saved at line 8450


 92%|█████████▏| 8501/9232 [2:48:55<39:09,  3.21s/it]

Progress saved at line 8500


 93%|█████████▎| 8551/9232 [2:51:33<22:34,  1.99s/it]

Progress saved at line 8550


 94%|█████████▎| 8651/9232 [2:56:35<29:17,  3.03s/it]

Progress saved at line 8650


 94%|█████████▍| 8701/9232 [2:59:00<27:25,  3.10s/it]

Progress saved at line 8700


 95%|█████████▍| 8751/9232 [3:01:15<21:19,  2.66s/it]

Progress saved at line 8750


 95%|█████████▌| 8780/9232 [3:02:45<19:09,  2.54s/it]

No images found for this product.


 95%|█████████▌| 8801/9232 [3:03:50<21:17,  2.96s/it]

Progress saved at line 8800


 96%|█████████▌| 8851/9232 [3:06:18<22:44,  3.58s/it]

Progress saved at line 8850


 97%|█████████▋| 8951/9232 [3:11:14<12:37,  2.70s/it]

Progress saved at line 8950


 97%|█████████▋| 9001/9232 [3:13:47<11:48,  3.07s/it]

Progress saved at line 9000


 98%|█████████▊| 9051/9232 [3:16:21<09:17,  3.08s/it]

Progress saved at line 9050


 99%|█████████▊| 9101/9232 [3:19:08<08:34,  3.92s/it]

Progress saved at line 9100


 99%|█████████▉| 9151/9232 [3:21:36<03:54,  2.89s/it]

Progress saved at line 9150


100%|█████████▉| 9201/9232 [3:23:53<01:33,  3.03s/it]

Progress saved at line 9200


100%|██████████| 9232/9232 [3:25:31<00:00,  2.91s/it]

VQA dataset saved to dataset/VQA-dataset/listings_3_VQA.csv
Progress saved at line 9231



