In [1]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
from io import StringIO
os.chdir('/home/danish/VR_PROJECT')

In [None]:
import os
import google.generativeai as genai

# Set the GOOGLE_API_KEY in the environment first

# Then configure the genai module with the API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Now you can create the GenerativeModel instance
model = genai.GenerativeModel('gemini-2.0-flash')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
listing_file = "dataset/abo-listings/listings/metadata/listings_0.json"
image_metadata = pd.read_csv("dataset/abo-images-small/images/metadata/images.csv")
image_dataset_path = "dataset/abo-images-small/images/small/"

In [4]:
#create output csv file
output_file = f"dataset/VQA-dataset/{listing_file.split('/')[-1].split('.')[0]}_VQA.csv"

In [5]:
def get_listing_lines(listing_file):
    """
    Read the listing file and return a list of lines.
    """
    with open(listing_file, 'r') as file:
        lines = file.readlines()
    return lines





In [6]:
def preprocess_product_json(product_json):
        """
        Preprocess the product JSON to ensure it is in the correct format.
        """
        list_of_keys_to_remove = ['main_image_id','node','other_image_id','spin_id','3dmodel_id']
        # Convert JSON string to dictionary
        product_dict = json.loads(product_json)
        # Remove unnecessary keys
        for key in list_of_keys_to_remove:
            if key in product_dict:
                del product_dict[key]
        
        # Convert dictionary to JSON string with indentation for better readability
        return json.dumps(product_dict, indent=4)
def prompt_for_product(product_json):
        """
        Generate a prompt for the given product JSON.
        """
        product_json = preprocess_product_json(product_json)
        prompt = f"""
        You are a QA dataset generator that creates short, factual, and human-readable question-answer pairs from Amazon product metadata and image. Each question must target a specific field from the metadata and be answerable with a **single word only**.

        Below is the product metadata in structured format. Generate **5 to 10 diverse QA pairs**, where:
        - Each question is clear and unambiguous.
        - Each answer is strictly a **single word** (no phrases, no multi-word answers).
        - Avoid repeating the same field.
        - Prefer commonly relevant fields like: brand, bullet_points, color, material, product type, model name, style, fabric type, finish type, pattern, item shape, product description and color code.
        - Questions should be such a way that they can be answered by looking at the image.
        - The output should be in CSV format with columns: question, answer.

        If a value is not meaningful or not present, skip that field. Ensure that QA pairs are diverse and aligned with the data provided.

        ---
        {product_json}
        ---
        """
        
        return prompt

In [7]:
def get_images_paths(image_ids):
    """
    Get the paths of images based on their IDs.
    """
    image_paths = []
    for image_id in image_ids:
        image_path = image_metadata[image_metadata['image_id'] == image_id]['path'].values
        if len(image_path) > 0:
            if os.path.exists(f"{image_dataset_path}/{image_path[0]}"):
                image_paths.append(f"{image_path[0]}")
    return image_paths

def generate_VQA(prompt, image_path):
    img = Image.open(f"{image_dataset_path}/{image_path}")
    img = img.convert("RGB")
    # Generate the VQA using the model
    response = model.generate_content([prompt, img])
    # Extract the generated text from the response
    generated_text = response.text
    #read csv from the generated text
    csv_data = pd.read_csv(StringIO(generated_text.strip("`").replace("csv\n", "", 1).strip()))
    return csv_data

def get_VQA_for_product(product_json):
    df = pd.DataFrame(columns=["image_path","question", "answer"])
    list_of_image_ids = []
    
    prompt = prompt_for_product(product_json)
    product_dict = json.loads(product_json)
    if "main_image_id" in product_dict.keys():
        list_of_image_ids.append(product_dict['main_image_id'])
    if "other_image_id" in product_dict.keys():
        # Check if the key exists in the dictionary
        if isinstance(product_dict['other_image_id'], list):
            # If it's a list, extend it to the list_of_image_ids
            list_of_image_ids.extend(product_dict['other_image_id'])
        else:
            # If it's not a list, append it directly
            list_of_image_ids.append(product_dict['other_image_id'])
    image_paths = get_images_paths(list_of_image_ids)

    # print(f"Image paths: {image_paths}")
    # Generate the VQA using the model
    if len(image_paths) == 0:
        print("No images found for this product.")
        return df
    for image_path in image_paths:
        # print(f"Generating VQA for image: {image_path}")
        # Generate the VQA using the model
        csv_data = generate_VQA(prompt, image_path)
        # Append the generated data to the dataframe
        csv_data['image_path'] = image_path
        df = pd.concat([df, csv_data], ignore_index=True)
    return df



    
    
    

In [9]:
import time
from tqdm import tqdm
if not os.path.exists(output_file):
    output_df = pd.DataFrame(columns=["image_path","question", "answer"])
else:
    output_df = pd.read_csv(output_file)


lines = get_listing_lines(listing_file)

# subset_lines = lines[556:]  # to process a subset of lines
start_index = 3900
try:
    for i in tqdm(range(start_index, len(lines)), initial=start_index, total=len(lines)):
        line = lines[i]
        if "\"en_" not in line:
            continue
        try:
            df = get_VQA_for_product(line)
            # time.sleep(1)  # Sleep for 1 second to avoid rate limiting
        except Exception as e:
            print(f"Error processing line {i}: {e}")
            if "Resource has been exhausted" in str(e):
                print("Resource has been exhausted. Please try again later.")
                time.sleep(600)
                try:
                    df = get_VQA_for_product(line)
                except Exception as e:
                    time.sleep(900)
                    df = get_VQA_for_product(line)
            else:
                continue
        
        output_df = pd.concat([output_df, df], ignore_index=True)
        if i % 50 == 0:
            # Save the output DataFrame to a CSV file every 50 iterations
            output_df.to_csv(output_file, index=False)
            print(f"Progress saved at line {i}")
except Exception as e:
    print(f"An error occurred: {e}")
    # Handle the exception as needed
    pass
# Save the output DataFrame to a CSV file
output_df.to_csv(output_file, index=False)
print(f"VQA dataset saved to {output_file}") 
print(f"Progress saved at line {i}")


 42%|████▏     | 3901/9232 [00:06<10:01:05,  6.77s/it]

Progress saved at line 3900


 43%|████▎     | 3951/9232 [05:40<9:26:32,  6.44s/it] 

Progress saved at line 3950


 43%|████▎     | 4001/9232 [10:58<5:15:05,  3.61s/it] 

Progress saved at line 4000


 44%|████▍     | 4101/9232 [23:45<11:24:05,  8.00s/it]

Progress saved at line 4100


 45%|████▍     | 4151/9232 [29:14<8:16:39,  5.86s/it] 

Progress saved at line 4150


 46%|████▌     | 4201/9232 [35:27<9:58:27,  7.14s/it] 

Progress saved at line 4200


 47%|████▋     | 4301/9232 [46:58<10:11:53,  7.45s/it]

Progress saved at line 4300


 47%|████▋     | 4351/9232 [53:13<10:35:37,  7.81s/it]

Progress saved at line 4350


 48%|████▊     | 4401/9232 [58:31<5:19:22,  3.97s/it] 

Progress saved at line 4400


 48%|████▊     | 4451/9232 [1:04:23<12:42:03,  9.56s/it]

Progress saved at line 4450


 49%|████▉     | 4501/9232 [1:11:05<13:18:39, 10.13s/it]

Progress saved at line 4500


 49%|████▉     | 4551/9232 [1:17:20<10:17:24,  7.91s/it]

Progress saved at line 4550


 50%|████▉     | 4601/9232 [1:22:29<10:44:46,  8.35s/it]

Progress saved at line 4600


 50%|█████     | 4651/9232 [1:28:16<6:37:08,  5.20s/it] 

Progress saved at line 4650


 51%|█████     | 4701/9232 [1:34:19<11:04:18,  8.80s/it]

Progress saved at line 4700


 51%|█████     | 4702/9232 [1:34:29<11:29:09,  9.13s/it]

Error processing line 4701: Error tokenizing data. C error: Expected 2 fields in line 4, saw 3



 51%|█████▏    | 4751/9232 [1:40:39<10:01:53,  8.06s/it]

Progress saved at line 4750


 53%|█████▎    | 4851/9232 [1:55:46<15:34:17, 12.80s/it]

Progress saved at line 4850


 53%|█████▎    | 4901/9232 [2:03:20<10:31:55,  8.75s/it]

Progress saved at line 4900


 54%|█████▎    | 4959/9232 [2:10:32<5:43:34,  4.82s/it] 

Error processing line 4958: Error tokenizing data. C error: Expected 2 fields in line 6, saw 4



 54%|█████▍    | 5001/9232 [2:16:56<7:15:52,  6.18s/it] 

Progress saved at line 5000


 55%|█████▍    | 5051/9232 [2:22:49<8:46:02,  7.55s/it] 

Progress saved at line 5050


 55%|█████▌    | 5097/9232 [2:28:09<11:11:53,  9.75s/it]

No images found for this product.


 55%|█████▌    | 5101/9232 [2:28:57<14:57:00, 13.03s/it]

Progress saved at line 5100


 56%|█████▌    | 5151/9232 [2:35:15<6:55:43,  6.11s/it] 

Progress saved at line 5150


 56%|█████▋    | 5201/9232 [2:41:31<6:00:42,  5.37s/it] 

Progress saved at line 5200


 57%|█████▋    | 5251/9232 [2:48:13<9:36:39,  8.69s/it] 

Progress saved at line 5250


 57%|█████▋    | 5301/9232 [2:54:46<11:24:43, 10.45s/it]

Progress saved at line 5300


 58%|█████▊    | 5351/9232 [3:01:25<10:41:51,  9.92s/it]

Progress saved at line 5350


 59%|█████▊    | 5401/9232 [3:07:41<7:08:02,  6.70s/it] 

Progress saved at line 5400


 59%|█████▉    | 5451/9232 [3:13:39<8:37:34,  8.21s/it]

Progress saved at line 5450


 60%|█████▉    | 5501/9232 [3:19:24<8:21:01,  8.06s/it] 

Progress saved at line 5500


 60%|██████    | 5551/9232 [3:24:25<3:44:47,  3.66s/it]

Progress saved at line 5550


 61%|██████    | 5601/9232 [3:29:35<7:38:38,  7.58s/it]

Progress saved at line 5600


 61%|██████    | 5651/9232 [3:34:59<5:32:20,  5.57s/it] 

Progress saved at line 5650


 62%|██████▏   | 5701/9232 [3:40:26<7:54:36,  8.06s/it]

Progress saved at line 5700


 62%|██████▏   | 5751/9232 [3:45:39<5:55:33,  6.13s/it]

Progress saved at line 5750


 63%|██████▎   | 5801/9232 [3:50:33<6:01:26,  6.32s/it]

Progress saved at line 5800


 63%|██████▎   | 5851/9232 [3:55:31<5:34:25,  5.93s/it] 

Progress saved at line 5850


 64%|██████▍   | 5901/9232 [4:00:49<7:22:28,  7.97s/it]

Progress saved at line 5900


 64%|██████▍   | 5918/9232 [4:02:24<5:09:25,  5.60s/it]

No images found for this product.


 64%|██████▍   | 5951/9232 [4:05:36<7:50:36,  8.61s/it]

Progress saved at line 5950


 65%|██████▍   | 5964/9232 [4:07:20<8:34:18,  9.44s/it]

Error processing line 5963: Error tokenizing data. C error: Expected 2 fields in line 4, saw 3



 65%|██████▌   | 6001/9232 [4:11:52<5:15:53,  5.87s/it] 

Progress saved at line 6000


 66%|██████▌   | 6051/9232 [4:19:13<8:06:32,  9.18s/it] 

Progress saved at line 6050


 66%|██████▌   | 6101/9232 [4:25:55<7:53:59,  9.08s/it] 

Progress saved at line 6100


 67%|██████▋   | 6151/9232 [4:31:49<6:44:44,  7.88s/it]

Progress saved at line 6150


 67%|██████▋   | 6201/9232 [4:38:26<6:32:57,  7.78s/it] 

Progress saved at line 6200


 68%|██████▊   | 6251/9232 [4:46:45<9:11:34, 11.10s/it] 

Progress saved at line 6250


 68%|██████▊   | 6301/9232 [4:53:14<6:23:43,  7.86s/it] 

Progress saved at line 6300


 69%|██████▉   | 6351/9232 [4:58:47<9:35:23, 11.98s/it]

Progress saved at line 6350


 69%|██████▉   | 6401/9232 [5:04:20<5:25:46,  6.90s/it]

Progress saved at line 6400


 70%|██████▉   | 6451/9232 [5:10:12<4:35:10,  5.94s/it]

Progress saved at line 6450


 70%|███████   | 6501/9232 [5:15:05<5:38:34,  7.44s/it]

Progress saved at line 6500


 71%|███████   | 6551/9232 [5:20:50<5:55:03,  7.95s/it]

Progress saved at line 6550


 72%|███████▏  | 6601/9232 [5:26:18<5:33:35,  7.61s/it]

Progress saved at line 6600


 72%|███████▏  | 6651/9232 [5:31:47<4:34:41,  6.39s/it]

Progress saved at line 6650


 73%|███████▎  | 6701/9232 [5:37:20<4:42:53,  6.71s/it]

Progress saved at line 6700


 73%|███████▎  | 6751/9232 [5:42:49<4:08:17,  6.00s/it]

Progress saved at line 6750


 74%|███████▎  | 6801/9232 [5:48:46<5:00:49,  7.42s/it]

Progress saved at line 6800


 74%|███████▍  | 6851/9232 [5:54:52<5:16:25,  7.97s/it]

Progress saved at line 6850


 75%|███████▍  | 6901/9232 [6:01:06<6:10:15,  9.53s/it]

Progress saved at line 6900


 75%|███████▌  | 6951/9232 [6:07:15<4:35:18,  7.24s/it]

Progress saved at line 6950


 76%|███████▌  | 7001/9232 [6:11:36<4:21:33,  7.03s/it]

Progress saved at line 7000


 76%|███████▋  | 7051/9232 [6:17:35<4:55:29,  8.13s/it]

Progress saved at line 7050


 77%|███████▋  | 7151/9232 [6:29:55<2:48:32,  4.86s/it]

Progress saved at line 7150


 78%|███████▊  | 7197/9232 [6:35:51<5:48:08, 10.26s/it]

No images found for this product.


 79%|███████▊  | 7251/9232 [6:41:23<5:09:19,  9.37s/it]

Progress saved at line 7250


 79%|███████▉  | 7301/9232 [6:46:57<4:37:05,  8.61s/it]

Progress saved at line 7300


 80%|████████  | 7401/9232 [6:59:27<4:01:30,  7.91s/it]

Progress saved at line 7400


 81%|████████  | 7451/9232 [7:04:43<2:34:16,  5.20s/it]

Progress saved at line 7450


 82%|████████▏ | 7601/9232 [7:21:59<1:50:37,  4.07s/it]

Progress saved at line 7600


 83%|████████▎ | 7651/9232 [7:27:14<2:57:03,  6.72s/it]

Progress saved at line 7650


 83%|████████▎ | 7701/9232 [7:33:34<2:59:58,  7.05s/it]

Progress saved at line 7700


 84%|████████▍ | 7742/9232 [7:38:01<2:04:43,  5.02s/it]

No images found for this product.


 84%|████████▍ | 7751/9232 [7:38:38<1:42:33,  4.16s/it]

Progress saved at line 7750


 84%|████████▍ | 7801/9232 [7:44:39<3:59:51, 10.06s/it]

Progress saved at line 7800


 85%|████████▍ | 7824/9232 [7:47:30<1:58:26,  5.05s/it]

No images found for this product.


 85%|████████▌ | 7851/9232 [7:50:23<3:58:24, 10.36s/it]

Progress saved at line 7850


 87%|████████▋ | 8001/9232 [8:05:53<1:30:43,  4.42s/it]

Progress saved at line 8000


 87%|████████▋ | 8051/9232 [8:11:39<2:42:38,  8.26s/it]

Progress saved at line 8050


 88%|████████▊ | 8101/9232 [8:18:21<2:56:29,  9.36s/it]

Progress saved at line 8100


 88%|████████▊ | 8151/9232 [8:26:29<2:28:49,  8.26s/it]

Progress saved at line 8150


 89%|████████▉ | 8201/9232 [8:35:04<3:06:33, 10.86s/it]

Progress saved at line 8200


 89%|████████▉ | 8251/9232 [8:42:49<2:29:32,  9.15s/it]

Progress saved at line 8250


 90%|████████▉ | 8301/9232 [8:50:10<2:14:20,  8.66s/it]

Progress saved at line 8300


 90%|█████████ | 8351/9232 [8:57:56<3:38:19, 14.87s/it]

Progress saved at line 8350


 91%|█████████ | 8401/9232 [9:05:09<1:53:24,  8.19s/it]

Progress saved at line 8400


 92%|█████████▏| 8451/9232 [9:10:44<2:00:27,  9.25s/it]

Progress saved at line 8450


 92%|█████████▏| 8501/9232 [9:17:23<1:37:53,  8.04s/it]

Progress saved at line 8500


 93%|█████████▎| 8551/9232 [9:24:17<1:13:26,  6.47s/it]

Progress saved at line 8550


 94%|█████████▍| 8701/9232 [9:44:31<1:09:42,  7.88s/it]

Progress saved at line 8700


 95%|█████████▍| 8751/9232 [9:51:29<54:21,  6.78s/it]  

Progress saved at line 8750


 95%|█████████▌| 8801/9232 [9:57:15<40:19,  5.61s/it]  

Progress saved at line 8800


 96%|█████████▋| 8901/9232 [10:08:07<47:30,  8.61s/it] 

Progress saved at line 8900


 97%|█████████▋| 9001/9232 [10:19:02<27:52,  7.24s/it]

Progress saved at line 9000


 98%|█████████▊| 9010/9232 [10:20:00<24:10,  6.53s/it]

Error processing line 9009: Error tokenizing data. C error: Expected 2 fields in line 8, saw 3



 98%|█████████▊| 9051/9232 [10:25:21<20:31,  6.81s/it]

Progress saved at line 9050


 99%|█████████▊| 9101/9232 [10:31:09<16:33,  7.58s/it]

Progress saved at line 9100


 99%|█████████▉| 9151/9232 [10:37:12<07:52,  5.83s/it]

Progress saved at line 9150


100%|█████████▉| 9201/9232 [10:43:13<03:59,  7.73s/it]

Progress saved at line 9200


100%|██████████| 9232/9232 [10:46:50<00:00,  7.28s/it]


VQA dataset saved to dataset/VQA-dataset/listings_0_VQA.csv
Progress saved at line 9231
