In [None]:
import json
import os
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import numpy as np

file_path = "listings/metadata/listings_0.json"

# The file is read as JSON Lines: every non-empty line is parsed as one JSON object.
with open(file_path, "r", encoding="utf-8") as f:
    # Skip blank / whitespace-only lines (e.g. a stray empty line at the end of
    # the file); json.loads("") would otherwise raise JSONDecodeError.
    data = [json.loads(line) for line in f if line.strip()]

n_data_samples = len(data)
print(n_data_samples)  # Number of JSON objects in the file

In [None]:
# Walk through every record and report each key that has not been seen in any
# earlier record. The first record (idx 0) only seeds the set of known keys,
# so nothing is printed for it.
all_keys = set()
for idx, item in enumerate(data):
    if idx > 0:
        for key in item.keys():
            if key in all_keys:
                continue
            print(f"idx {idx}, new key is {key}")
    all_keys |= set(item.keys())

In [None]:
import os
import json

# Directory containing the JSON / JSON Lines metadata files.
# NOTE(review): this name is not referenced by any other visible cell — confirm whether it is still needed.
json_folder = "listings/metadata"

def format_for_rag(data):
    """Render one record (a dict) as a multi-line string suitable for RAG.

    Each top-level key becomes one line of the form ``"<Key>: <value>"``,
    where the key is passed through ``str.capitalize()`` and the value is
    flattened recursively:

    * dicts become ``"k: v"`` pairs joined by ``", "``; a key literally named
      ``"value"`` is dropped and only its content is kept,
    * lists become their flattened elements joined by ``", "``,
    * anything else is converted with ``str()``.

    Returns the lines joined by newlines (an empty string for an empty dict).
    """

    def _flatten(node):
        """Turn an arbitrarily nested dict/list/scalar into a flat string."""
        if isinstance(node, dict):
            rendered_pairs = []
            for name, child in node.items():
                rendered = f"{_flatten(child)}"
                # The "value" label carries no information, so only its content is kept.
                rendered_pairs.append(rendered if name == "value" else f"{name}: {rendered}")
            return ", ".join(rendered_pairs)
        if isinstance(node, list):
            return ", ".join(str(_flatten(element)) for element in node)
        return str(node)

    return "\n".join(
        f"{field.capitalize()}: {_flatten(content)}" for field, content in data.items()
    )


In [None]:
# Format a single record (index 1342) to inspect what the generated text looks like.
formatted_data_sample = format_for_rag(data[1342])
# Some strings contain too many characters.
# There may be smarter ways to generate the formatted string...
# Also, some descriptions are not in English; one idea may be to translate them to English.

print(formatted_data_sample)

In [None]:
# ChromaDB with chunking: https://chatgpt.com/share/67c66fd2-3b00-800f-9b27-ad2a00ec8ae6

In [None]:
# Format every record once. dtype=object keeps each entry as a regular Python
# str; without it NumPy builds a fixed-width unicode array in which every
# element is padded to the length of the longest string, which wastes a lot of
# memory when a few listings are very long.
formatted_data = np.array(
    [format_for_rag(data[i]) for i in range(n_data_samples)],
    dtype=object,
)

In [None]:
# Initialize ChromaDB client
# The client is pointed at the local path "./chroma_db" (presumably where the
# data is persisted between runs — confirm against the ChromaDB docs).
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Fetch the "products" collection, creating it if it does not exist yet (per the method name).
collection = chroma_client.get_or_create_collection(name="products")

# Load embedding model (Sentence Transformers is the one actually used here)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Tokenizer for the same checkpoint; chunk_text uses it to split text by token count.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# By default, input text longer than 256 word pieces is truncated.

In [None]:
# The default of 128 tokens was chosen because of the way the model was trained.
def chunk_text(text, max_tokens=128):
    """Cut ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is encoded with the module-level ``tokenizer`` (without special
    tokens), the token ids are sliced into windows of ``max_tokens`` and each
    window is decoded back into a string.

    Returns a list of decoded strings, one per window; the list is empty when
    the text produces no tokens.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window))
    return pieces

In [None]:
# Process each text entry: chunk it, embed the chunks and store them in ChromaDB.
n_samples = 100

for i, text in enumerate(formatted_data[:n_samples]):
    text_chunks = chunk_text(text)  # Split text into chunks

    # An entry that yields no chunks has nothing to embed or store, so the
    # encode / write step is skipped and no empty batch is sent to ChromaDB.
    if text_chunks:
        # Generate embeddings for all chunks of this entry in one call
        embeddings = embedding_model.encode(text_chunks)

        # Store all chunks of this entry with a single call instead of one call
        # per chunk. upsert (rather than add) keeps this cell re-runnable: the
        # collection is persistent, so on a second run the same IDs already
        # exist, and add would reject or ignore them depending on the ChromaDB
        # version, leaving stale data behind.
        collection.upsert(
            ids=[f"item_{i}_chunk_{j}" for j in range(len(text_chunks))],  # Unique ID per chunk
            embeddings=embeddings.tolist(),  # Convert NumPy array to nested lists
            metadatas=[
                {"original_id": i, "chunk_index": j, "text": chunk}
                for j, chunk in enumerate(text_chunks)
            ],
        )

    print(f"✅ Processed {len(text_chunks)} chunks for text entry {i}")

print("🚀 All data stored in ChromaDB successfully!")

In [None]:
def query_chroma(query_text, collection, top_k=3):
    """Embed a query, search a ChromaDB collection and print the best matches.

    Args:
        query_text: Free-text query; embedded with the module-level
            ``embedding_model``.
        collection: Collection object whose ``query`` method is called.
        top_k: Number of results requested from the collection.

    Returns:
        None. For every match the ``original_id``, ``chunk_index`` and
        ``text`` metadata fields are printed.
    """
    # encode() receives a one-element list, so exactly one query vector is sent.
    query_vectors = embedding_model.encode([query_text]).tolist()

    hits = collection.query(query_embeddings=query_vectors, n_results=top_k)

    print(f"\n🔍 Query: {query_text}\n")

    # Only one query was issued, so its matches are the first metadata list.
    matches = hits["metadatas"][0]
    for rank, meta in enumerate(matches, start=1):
        print(f"✨ Match {rank}:")
        print(f"🔹 Original ID: {meta['original_id']}")
        print(f"🔹 Chunk Index: {meta['chunk_index']}")
        print(f"🔹 Text: {meta['text']}\n")
        print("=" * 80)

In [None]:
# Example query: request the 5 most relevant stored chunks for a free-text
# shopping request and print them.
query_text = "I love cats and I want a mobile phone cover with multiple colors for samsung"
query_chroma(query_text, collection, top_k=5)

In [None]:
# Print the formatted text of record 147 for manual inspection.
# NOTE(review): the ingestion cell only stores the first n_samples (100)
# records, so record 147 is not added to the collection by this notebook.
print(formatted_data[147])