In [1]:
import os
from typing import List

# LangChain + Chroma
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredMarkdownLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.chat_models import ChatOllama
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import numpy as np
# LLMs
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# UI
import gradio as gr

In [2]:
CATALOG_DIR = os.environ.get("CATALOG_DIR", "./catalog_docs")
PERSIST_DIR = os.environ.get("CHROMA_PERSIST", "./chroma_products")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MAX_NEW_TOKENS = 512

In [3]:
def load_documents(folder: str) -> List[Document]:
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
        raise FileNotFoundError(f"No catalog docs found in {folder}. Please add .txt, .md, or .pdf files.")

    docs = []
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        if file.endswith(".txt"):
            loader = TextLoader(path)
            docs.extend(loader.load())
        elif file.endswith(".md"):
            loader = UnstructuredMarkdownLoader(path)
            docs.extend(loader.load())
    return docs

# ---------- Embeddings + Vector Store ----------

def build_vectorstore(docs: List[Document]):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vectordb = Chroma.from_documents(splits, embeddings, persist_directory=PERSIST_DIR)
    vectordb.persist()
    return vectordb

In [4]:
docs = load_documents(CATALOG_DIR)
vectordb = build_vectorstore(docs)

  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
  return forward_call(*args, **kwargs)
  vectordb.persist()


In [5]:
collection = vectordb._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [8]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['source'] for metadata in result['metadatas']]
print(doc_types)

['./catalog_docs\\headphones-airpods-pro-2.md', './catalog_docs\\headphones-airpods-pro-2.md', './catalog_docs\\headphones-akg-n700nc-m3.md', './catalog_docs\\headphones-akg-n700nc-m3.md', './catalog_docs\\headphones-anker-soundcore-space-one.txt', './catalog_docs\\headphones-anker-soundcore-space-one.txt', './catalog_docs\\headphones-apple-airpods-max.txt', './catalog_docs\\headphones-apple-airpods-max.txt', './catalog_docs\\headphones-audio-technica-ath-m50xbt2.md', './catalog_docs\\headphones-audio-technica-ath-m50xbt2.md', './catalog_docs\\headphones-bang-and-olufsen-h95.md', './catalog_docs\\headphones-bang-and-olufsen-h95.md', './catalog_docs\\headphones-beats-studio-pro.md', './catalog_docs\\headphones-beats-studio-pro.md', './catalog_docs\\headphones-beyerdynamic-dt-900-pro-x.txt', './catalog_docs\\headphones-beyerdynamic-dt-900-pro-x.txt', './catalog_docs\\headphones-bose-quietcomfort-ultra.md', './catalog_docs\\headphones-bose-quietcomfort-ultra.md', './catalog_docs\\headphon

In [10]:

category_colors = {
    "headphones": "blue",
    "laptops": "green",
    "smartphones": "red",
    "smartwatches": "orange"
}

# Extract category from each filepath and map to color
colors = [
     category_colors[os.path.basename(path).split("-")[0]]
    for path in doc_types
]

In [11]:

from sklearn.manifold import TSNE
import plotly.graph_objects as go

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [12]:
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()