In [1]:
import pandas as pd
import numpy as np
import os
import json
from PIL import Image
from io import StringIO
# Every dataset path used later in this notebook is relative, so the working
# directory must be the project root. The root can be overridden with the
# VR_PROJECT_DIR environment variable; the original location stays the default
# so existing setups keep working unchanged.
os.chdir(os.environ.get('VR_PROJECT_DIR', '/home/danish/VR_PROJECT'))

In [2]:
import os
import google.generativeai as genai

# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in source ends up in version control and in every
# shared copy of the file. The key that was previously written here should be
# treated as leaked and rotated.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment that starts "
        "the notebook kernel before running this cell."
    )

# Configure the genai module with the API key
genai.configure(api_key=api_key)

# Model handle used by generate_VQA for every request
model = genai.GenerativeModel('gemini-2.0-flash-001')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Listings file to process. It is read line by line and each processed line is
# parsed with json.loads as one product record.
listing_file = "dataset/abo-listings/listings/metadata/listings_8.json"
# Image metadata table; the image lookup relies on its 'image_id' and 'path' columns.
image_metadata = pd.read_csv("dataset/abo-images-small/images/metadata/images.csv")
# Directory that the metadata 'path' values are appended to when image files
# are checked for existence and opened.
image_dataset_path = "dataset/abo-images-small/images/small/"

In [4]:
# Output CSV path: "<listing file name up to its first dot>_VQA.csv" inside dataset/VQA-dataset/
output_file = "dataset/VQA-dataset/" + listing_file.rsplit('/', 1)[-1].partition('.')[0] + "_VQA.csv"

In [5]:
def get_listing_lines(listing_file):
    """
    Read the listing file and return a list of lines.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII text in the records is read the same way
    on every machine.

    Args:
        listing_file: Path of the text file to read.

    Returns:
        list[str]: Every line of the file, in order, with line endings kept.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(listing_file, 'r', encoding='utf-8') as file:
        return file.readlines()





In [6]:
def preprocess_product_json(product_json):
    """
    Strip identifier-only fields from a product record and pretty-print it.

    Args:
        product_json: JSON text of a single product record.

    Returns:
        str: The record re-serialised with a 4-space indent, without the
        'main_image_id', 'node', 'other_image_id', 'spin_id' and
        '3dmodel_id' entries. All other entries keep their original order.
    """
    unwanted_fields = ('main_image_id', 'node', 'other_image_id', 'spin_id', '3dmodel_id')

    record = json.loads(product_json)

    # Work out which unwanted fields are actually present, then drop them.
    fields_to_drop = [field for field in unwanted_fields if field in record]
    for field in fields_to_drop:
        del record[field]

    return json.dumps(record, indent=4)
def prompt_for_product(product_json):
    """
    Build the question-generation prompt for one product.

    The product JSON is first cleaned with preprocess_product_json and the
    result is embedded between '---' markers at the end of a fixed
    instruction text. That text asks for 5 to 10 question-answer pairs with
    single-word answers, returned as CSV with the columns question, answer.

    Args:
        product_json: JSON text of a single product record.

    Returns:
        str: The complete prompt text.
    """
    cleaned_metadata = preprocess_product_json(product_json)
    return f"""
        You are a QA dataset generator that creates short, factual, and human-readable question-answer pairs from Amazon product metadata and image. Each question must target a specific field from the metadata and be answerable with a **single word only**.

        Below is the product metadata in structured format. Generate **5 to 10 diverse QA pairs**, where:
        - Each question is clear and unambiguous.
        - Each answer is strictly a **single word** (no phrases, no multi-word answers).
        - Avoid repeating the same field.
        - Prefer commonly relevant fields like: brand, bullet_points, color, material, product type, model name, style, fabric type, finish type, pattern, item shape, product description and color code.
        - Questions should be such a way that they can be answered by looking at the image.
        - The output should be in CSV format with columns: question, answer.

        If a value is not meaningful or not present, skip that field. Ensure that QA pairs are diverse and aligned with the data provided.

        ---
        {cleaned_metadata}
        ---
        """

In [7]:
def get_images_paths(image_ids):
    """
    Resolve image IDs to the image paths that exist on disk.

    Each ID is looked up in the module-level image_metadata table. The first
    matching 'path' value is kept only when the corresponding file exists
    under image_dataset_path. IDs with no metadata row or no file are
    skipped silently.

    Args:
        image_ids: Iterable of image IDs to resolve.

    Returns:
        list[str]: Paths relative to image_dataset_path, in input order.
    """
    existing_paths = []
    for image_id in image_ids:
        matches = image_metadata.loc[image_metadata['image_id'] == image_id, 'path'].values
        if len(matches) == 0:
            continue
        relative_path = matches[0]
        if not os.path.exists(f"{image_dataset_path}/{relative_path}"):
            continue
        existing_paths.append(f"{relative_path}")
    return existing_paths

def _extract_csv_text(generated_text):
    """
    Return the CSV payload of a model reply without a Markdown code fence.

    Handles replies of the form ```csv ... ```, ``` ... ``` and plain CSV.
    Surrounding whitespace is removed before the backticks are stripped;
    otherwise a reply with whitespace outside the fence (for example a
    newline after the closing fence) keeps its fence and the fence is parsed
    as a data row.
    """
    text = generated_text.strip().strip("`")
    # After the opening backticks are gone, a fenced reply may start with a
    # bare language tag line ("csv"); drop that line only.
    first_line, _, remainder = text.partition("\n")
    if first_line.strip().lower() == "csv":
        text = remainder
    return text.strip()


def generate_VQA(prompt, image_path):
    """
    Ask the model for question-answer pairs about one image.

    Args:
        prompt: Prompt text sent together with the image.
        image_path: Image path relative to the module-level image_dataset_path.

    Returns:
        pandas.DataFrame: The model's reply parsed as CSV. The columns are
        whatever header the model produced; the prompt requests
        question and answer.

    Raises:
        Exception: Errors from opening the image, from the model call, or
        from CSV parsing are not handled here and propagate to the caller.
    """
    # Close the image file as soon as the RGB copy exists, so handles do not
    # accumulate while thousands of images are processed.
    with Image.open(f"{image_dataset_path}/{image_path}") as source_image:
        img = source_image.convert("RGB")
    # Generate the VQA using the model
    response = model.generate_content([prompt, img])
    # Extract the generated text from the response
    generated_text = response.text
    # The prompt names the columns as "question, answer", so the header may
    # come back with a space after the comma; skipinitialspace keeps the
    # column from being named " answer".
    csv_data = pd.read_csv(StringIO(_extract_csv_text(generated_text)), skipinitialspace=True)
    return csv_data

def get_VQA_for_product(product_json):
    """
    Generate question-answer rows for every available image of one product.

    The image IDs are taken from the record's 'main_image_id' and
    'other_image_id' entries (the latter may be a list or a single value).
    The same prompt is sent once per image that get_images_paths resolves.

    Args:
        product_json: JSON text of a single product record.

    Returns:
        pandas.DataFrame: Rows with the columns image_path, question and
        answer. The frame is empty when no image could be resolved.
    """
    collected = pd.DataFrame(columns=["image_path","question", "answer"])
    image_ids = []

    prompt = prompt_for_product(product_json)
    product = json.loads(product_json)

    if "main_image_id" in product.keys():
        image_ids.append(product['main_image_id'])
    if "other_image_id" in product.keys():
        extra_ids = product['other_image_id']
        # A list contributes all of its IDs; any other value counts as one ID.
        if isinstance(extra_ids, list):
            image_ids.extend(extra_ids)
        else:
            image_ids.append(extra_ids)

    image_paths = get_images_paths(image_ids)
    if len(image_paths) == 0:
        print("No images found for this product.")
        return collected

    for image_path in image_paths:
        qa_frame = generate_VQA(prompt, image_path)
        # Tag every generated row with the image it was produced from.
        qa_frame['image_path'] = image_path
        collected = pd.concat([collected, qa_frame], ignore_index=True)
    return collected



    
    
    

In [None]:
import time
from tqdm import tqdm

# Waits (in seconds) before the first and second retry after a quota error.
RATE_LIMIT_BACKOFF_SECONDS = (300, 900)
# A checkpoint is written whenever the line index is a multiple of this value.
CHECKPOINT_EVERY = 50


def _generate_with_backoff(line, line_index):
    """
    Generate the QA rows for one listing line, retrying on quota errors.

    Returns the DataFrame from get_VQA_for_product, or None when the line
    fails with an error that is not a quota error (the line is then skipped).
    After a quota error the call is retried once per entry in
    RATE_LIMIT_BACKOFF_SECONDS; an exception from the last retry propagates.
    """
    try:
        product_df = get_VQA_for_product(line)
        time.sleep(2)  # short pause between successful products to pace requests
        return product_df
    except Exception as e:
        print(f"Error processing line {line_index}: {e}")
        if "Resource has been exhausted" not in str(e):
            return None
        print("Resource has been exhausted. Please try again later.")

    last_attempt = len(RATE_LIMIT_BACKOFF_SECONDS)
    for attempt, delay in enumerate(RATE_LIMIT_BACKOFF_SECONDS, start=1):
        time.sleep(delay)
        try:
            return get_VQA_for_product(line)
        except Exception:
            if attempt == last_attempt:
                raise


print(output_file)
if not os.path.exists(output_file):
    print(f"Output file {output_file} does not exist. Creating a new one.")
    output_df = pd.DataFrame(columns=["image_path","question", "answer"])
else:
    output_df = pd.read_csv(output_file)

# Make sure the destination folder exists before hours of work are spent;
# otherwise the first checkpoint write would fail.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

lines = get_listing_lines(listing_file)

# Manual resume point: index of the first listing line to process in this run.
start_index = 650
last_index = None  # stays None when the loop body never runs
try:
    for i in tqdm(range(start_index, len(lines)), initial=start_index, total=len(lines)):
        last_index = i
        line = lines[i]
        # Only lines carrying an English language tag are sent to the model.
        if "\"en_" in line:
            df = _generate_with_backoff(line, i)
            if df is not None and not df.empty:
                output_df = pd.concat([output_df, df], ignore_index=True)
        # The checkpoint test runs for every index, including skipped and
        # failed lines, so no scheduled save is missed.
        if i % CHECKPOINT_EVERY == 0:
            output_df.to_csv(output_file, index=False)
            print(f"Progress saved at line {i}")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Always persist what has been collected, including when the run is
    # interrupted from the keyboard (which `except Exception` does not catch).
    output_df.to_csv(output_file, index=False)
    print(f"VQA dataset saved to {output_file}")
    if last_index is not None:
        print(f"Progress saved at line {last_index}")


dataset/VQA-dataset/listings_8_VQA.csv


  7%|▋         | 651/9232 [00:12<30:41:07, 12.87s/it]

Progress saved at line 650


  7%|▋         | 654/9232 [00:49<29:25:24, 12.35s/it]

Error processing line 654: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  7%|▋         | 680/9232 [10:00<23:19:47,  9.82s/it]  

Error processing line 680: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  7%|▋         | 682/9232 [15:22<173:01:00, 72.85s/it] 

Error processing line 682: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  8%|▊         | 710/9232 [25:13<20:51:38,  8.81s/it]  

Error processing line 710: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  8%|▊         | 746/9232 [50:10<20:13:53,  8.58s/it]  

Error processing line 746: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  8%|▊         | 747/9232 [55:18<193:49:59, 82.24s/it]

Error processing line 747: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  8%|▊         | 751/9232 [1:01:15<145:53:02, 61.92s/it] 

Progress saved at line 750


  8%|▊         | 779/9232 [1:05:36<24:31:06, 10.44s/it] 

Error processing line 779: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  9%|▊         | 801/9232 [1:14:33<24:57:41, 10.66s/it]  

Progress saved at line 800


  9%|▉         | 809/9232 [1:15:41<17:52:07,  7.64s/it]

Error processing line 809: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  9%|▉         | 821/9232 [1:37:50<43:22:35, 18.57s/it]  

Error processing line 821: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  9%|▉         | 845/9232 [1:46:42<22:31:33,  9.67s/it] 

Error processing line 846: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


  9%|▉         | 851/9232 [1:52:38<69:22:06, 29.80s/it] 

Progress saved at line 850


 10%|▉         | 885/9232 [1:58:07<26:15:02, 11.32s/it]

Error processing line 885: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 10%|▉         | 901/9232 [2:05:46<30:00:29, 12.97s/it] 

Progress saved at line 900


 10%|▉         | 912/9232 [2:07:32<23:24:56, 10.13s/it]

Error processing line 912: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 10%|█         | 938/9232 [2:17:45<22:53:09,  9.93s/it]  

Error processing line 938: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 10%|█         | 951/9232 [2:24:37<28:01:31, 12.18s/it]  

Progress saved at line 950


 10%|█         | 952/9232 [2:24:44<24:38:54, 10.72s/it]

Error processing line 953: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 10%|█         | 954/9232 [2:45:04<638:28:35, 277.67s/it]

Error processing line 954: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 11%|█         | 996/9232 [2:56:48<17:47:00,  7.77s/it]  

Error processing line 996: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 11%|█         | 1001/9232 [3:03:06<81:52:06, 35.81s/it]

Progress saved at line 1000


 11%|█         | 1024/9232 [3:06:43<22:08:35,  9.71s/it]

Error processing line 1026: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 11%|█▏        | 1051/9232 [3:14:57<18:03:31,  7.95s/it] 

Progress saved at line 1050


 11%|█▏        | 1061/9232 [3:16:38<22:57:46, 10.12s/it]

Error processing line 1061: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 12%|█▏        | 1077/9232 [3:24:31<28:22:40, 12.53s/it]  

Error processing line 1077: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 12%|█▏        | 1101/9232 [3:32:56<19:34:55,  8.67s/it]  

Progress saved at line 1100


 12%|█▏        | 1114/9232 [3:34:39<19:44:08,  8.75s/it]

Error processing line 1114: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 12%|█▏        | 1151/9232 [3:45:03<13:53:43,  6.19s/it] 

Progress saved at line 1150


 12%|█▏        | 1153/9232 [3:45:15<14:07:38,  6.30s/it]

Error processing line 1153: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 13%|█▎        | 1173/9232 [3:52:48<18:48:03,  8.40s/it] 

No images found for this product.


 13%|█▎        | 1174/9232 [3:52:50<14:44:11,  6.58s/it]

Error processing line 1174: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 13%|█▎        | 1175/9232 [3:58:05<213:55:03, 95.58s/it]

Error processing line 1175: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 13%|█▎        | 1185/9232 [4:04:29<40:25:24, 18.08s/it]  

Error processing line 1186: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 13%|█▎        | 1201/9232 [4:11:38<17:26:39,  7.82s/it] 

Progress saved at line 1200


 13%|█▎        | 1203/9232 [4:12:03<23:38:37, 10.60s/it]

Error processing line 1203: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 14%|█▎        | 1251/9232 [4:39:21<18:40:59,  8.43s/it]  

Progress saved at line 1250


 14%|█▍        | 1305/9232 [4:47:41<19:01:26,  8.64s/it]

Error processing line 1305: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.


 14%|█▍        | 1326/9232 [5:11:28<21:40:10,  9.87s/it]  

Error processing line 1326: 429 Resource has been exhausted (e.g. check quota).
Resource has been exhausted. Please try again later.
