In [30]:
import json

file_path = "listings/metadata/listings_e.json"

# The file is read as JSON Lines: every non-empty line is parsed as its own
# JSON object and the parsed objects are collected into the list `data`.
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank/whitespace-only lines (e.g. a trailing newline at EOF);
    # json.loads raises JSONDecodeError on them.
    data = [json.loads(line) for line in f if line.strip()]

print(len(data))  # Number of JSON objects in the file

9232


In [31]:
def format_for_rag(data):
    """Format one listing record as a single-line, RAG-friendly text string.

    The record is read as a dict. Single-valued attributes (``item_name``,
    ``brand``, ``color``, ``style``, ``product_type``, ``model_number``,
    ``model_year``) are looked up as a list of dicts and the ``"value"`` of
    the first entry is used. ``bullet_point`` and ``item_keywords`` are
    lists of dicts whose ``"value"`` fields are joined with ", ".
    ``node`` is a list of dicts whose ``"node_name"`` fields are joined
    with " > ". ``marketplace`` is used as-is.

    Missing keys, empty lists, ``None`` values and entries lacking the
    expected field fall back to an "Unknown ..." placeholder (single-valued
    attributes) or are skipped (multi-valued attributes) instead of raising.

    Args:
        data: Mapping holding the listing attributes described above.

    Returns:
        A string of ``**Label:** value.`` segments separated by spaces.
    """

    def first_value(key, default):
        """Return the "value" of the first entry under ``key``, else ``default``."""
        entries = data.get(key)
        # An absent key, None, a non-list or an empty list all mean "no value".
        if not isinstance(entries, list) or not entries:
            return default
        first = entries[0]
        if not isinstance(first, dict):
            return default
        return first.get("value", default)

    def collect(key, field):
        """Return ``field`` (as text) from every dict entry under ``key`` that has it."""
        entries = data.get(key) or []
        # str() keeps the later join from failing on non-string values.
        return [
            str(entry[field])
            for entry in entries
            if isinstance(entry, dict) and field in entry
        ]

    # Extract key fields
    item_name = first_value("item_name", "Unknown Item")
    brand = first_value("brand", "Unknown Brand")
    color = first_value("color", "Unknown Color")
    style = first_value("style", "Unknown Style")
    product_type = first_value("product_type", "Unknown Type")
    model_number = first_value("model_number", "Unknown Model")
    model_year = first_value("model_year", "Unknown Year")
    marketplace = data.get("marketplace", "Unknown Marketplace")

    # Extract bullet points and keywords
    bullet_points = collect("bullet_point", "value")
    keywords = collect("item_keywords", "value")

    # Extract category info
    category_path = " > ".join(collect("node", "node_name"))

    # Construct the formatted string
    formatted_string = (
        f"**Item Name:** {item_name}. "
        f"**Brand:** {brand}. "
        f"**Color:** {color}. "
        f"**Style:** {style}. "
        f"**Key Features:** {', '.join(bullet_points)}. "
        f"**Keywords:** {', '.join(keywords)}. "
        f"**Product Type:** {product_type}. "
        f"**Model Number:** {model_number}. "
        f"**Model Year:** {model_year}. "
        f"**Marketplace:** {marketplace}. "
        f"**Category Path:** {category_path}."
    )

    return formatted_string


In [32]:
# Walk the records in order and report, for every record after the first,
# each key that has not appeared in any earlier record. The first record's
# keys seed the set silently.
all_keys = set()
for idx, item in enumerate(data):
    unseen_keys = [key for key in item if key not in all_keys]
    if idx > 0:
        for key in unseen_keys:
            print(f"idx {idx}, new key is {key}")
    all_keys |= set(unseen_keys)

idx 1, new key is item_dimensions
idx 1, new key is fabric_type
idx 1, new key is material
idx 1, new key is pattern
idx 1, new key is style
idx 2, new key is item_shape
idx 2, new key is product_description
idx 4, new key is color_code
idx 56, new key is model_year
idx 65, new key is spin_id
idx 65, new key is 3dmodel_id
idx 204, new key is finish_type


In [36]:
import os
import json

# Directory containing JSON/JSONL files
# NOTE(review): no cell visible in this notebook reads json_folder; the
# loader cell uses its own hard-coded file_path under the same directory.
json_folder = "listings/metadata"

def format_for_rag(data, max_value_chars=None):
    """Format a record into newline-separated ``Key: value`` lines for RAG.

    Works on arbitrary keys: every top-level key becomes one output line
    whose label is ``key.capitalize()`` and whose value is the flattened
    text of whatever is stored under that key.

    Flattening rules (applied recursively):
      * dict  -> ``"k: v"`` pairs joined with ", "
      * list  -> flattened elements joined with ", "
      * other -> ``str(value)``

    Args:
        data: Mapping of attribute name (str) to a scalar, list or dict.
        max_value_chars: Optional cap on the length of each flattened value.
            Values longer than this are cut to ``max_value_chars`` characters
            and suffixed with "...". ``None`` (the default) disables
            truncation, so existing single-argument calls are unaffected.

    Returns:
        One ``"Key: value"`` line per top-level key, joined with newlines
        (an empty string for an empty mapping).

    Raises:
        ValueError: If ``max_value_chars`` is negative.
    """
    if max_value_chars is not None and max_value_chars < 0:
        raise ValueError("max_value_chars must be non-negative or None")

    def extract_values(value):
        """Extract values recursively from dicts and lists."""
        if isinstance(value, dict):
            return ", ".join(f"{k}: {extract_values(v)}" for k, v in value.items())
        if isinstance(value, list):
            # extract_values always returns a str, so no extra str() is needed.
            return ", ".join(extract_values(v) for v in value)
        return str(value)

    def clip(text):
        """Shorten ``text`` to the configured limit, marking the cut with "..."."""
        if max_value_chars is None or len(text) <= max_value_chars:
            return text
        return text[:max_value_chars] + "..."

    parts = [
        f"{key.capitalize()}: {clip(extract_values(value))}"
        for key, value in data.items()
    ]

    return "\n".join(parts)


In [50]:
formatted_data_sample = format_for_rag(data[1342]) # 13, 1342
# Some formatted strings contain too many characters.
# For example, data[1342] of listings_e.json
# contains over 7000 characters in Item_keywords, which are important.
# There may be smarter ways to generate the formatted string,
# such as truncating the string or using a more concise representation.
# Also, some descriptions are not in English; one idea may be to translate them to English.

In [51]:
# Display the raw record at index 1342, the same record passed to format_for_rag above.
data[1342]

{'brand': [{'language_tag': 'en_IN', 'value': 'Amazon Brand - Solimo'}],
 'bullet_point': [{'language_tag': 'en_IN',
   'value': '3D Printed Hard Back Case Mobile Cover for Oppo F3'},
  {'language_tag': 'en_IN',
   'value': 'Easy to put & take off with perfect cutouts for volume buttons, audio & charging ports.'},
  {'language_tag': 'en_IN',
   'value': 'Stylish design and appearance, express your unique personality.'},
  {'language_tag': 'en_IN',
   'value': 'Extreme precision design allows easy access to all buttons and ports while featuring raised bezel to life screen and camera off flat surface.'},
  {'language_tag': 'en_IN', 'value': 'Slim Hard Back Cover'},
  {'language_tag': 'en_IN', 'value': 'No Warranty'}],
 'color': [{'language_tag': 'en_IN',
   'standardized_values': ['multi-colored'],
   'value': 'Others'}],
 'item_id': 'B07TH3DNQ4',
 'item_name': [{'language_tag': 'en_IN',
   'value': 'Amazon Brand - Solimo Designer Sea Seen 3D Printed Hard Back Case Mobile Cover for Oppo 

In [49]:
# Total character count of the formatted sample string.
# NOTE(review): this cell's execution count (49) precedes the cell that assigns
# formatted_data_sample (50), so the displayed length may come from an earlier
# run of that cell — confirm with Restart & Run All.
len(formatted_data_sample)

8379

In [53]:
# Print (rather than echo) the sample so embedded newlines render as line breaks.
print(formatted_data_sample)

Brand: language_tag: en_IN, value: Amazon Brand - Solimo
Bullet_point: language_tag: en_IN, value: 3D Printed Hard Back Case Mobile Cover for Oppo F3, language_tag: en_IN, value: Easy to put & take off with perfect cutouts for volume buttons, audio & charging ports., language_tag: en_IN, value: Stylish design and appearance, express your unique personality., language_tag: en_IN, value: Extreme precision design allows easy access to all buttons and ports while featuring raised bezel to life screen and camera off flat surface., language_tag: en_IN, value: Slim Hard Back Cover, language_tag: en_IN, value: No Warranty
Color: language_tag: en_IN, standardized_values: multi-colored, value: Others
Item_id: B07TH3DNQ4
Item_name: language_tag: en_IN, value: Amazon Brand - Solimo Designer Sea Seen 3D Printed Hard Back Case Mobile Cover for Oppo F3
Item_weight: normalized_value: unit: pounds, value: 0.110231131, unit: grams, value: 50
Model_name: language_tag: en_IN, value: Oppo F3
Model_number: 

In [55]:
# Rebuild the "Item_keywords: ..." line from the loaded data instead of keeping
# a ~7 KB copy of cell output pasted in as a string literal. format_for_rag on a
# one-key dict yields exactly that single "Item_keywords: <flattened values>" line.
# NOTE(review): assumes the pasted literal was the Item_keywords line of
# data[1342] (as the notes next to formatted_data_sample state) — confirm that
# len(s) still matches the previously recorded value.
s = format_for_rag({"item_keywords": data[1342]["item_keywords"]})

In [56]:
# Character count of the Item_keywords line held in `s`.
len(s)

7040

In [None]:
# TODO: ChromaDB with chunking — see https://chatgpt.com/share/67c66fd2-3b00-800f-9b27-ad2a00ec8ae6