# L4 — Q&A / RAG over a CSV (Chroma + OpenAI Embeddings)

# Setup

This notebook uses **OpenAI (Python SDK v2) + LangChain v1**.

## Prereqs
1. Set your API key in the environment:

```bash
export OPENAI_API_KEY="..."
```

2. Restart the kernel after setting env vars.


In [None]:
import os

# Stop early, with an actionable message, when the API key is not available.
assert os.environ.get("OPENAI_API_KEY"), "Set OPENAI_API_KEY in your environment before running."

# Name of the chat model handed to ChatOpenAI in the RAG section below.
MODEL = "gpt-5-mini"


We'll index the provided `OutdoorClothingCatalog_1000.csv` and answer questions using retrieval-augmented generation.

In [None]:
from pathlib import Path
import pandas as pd

# Locations checked, in order, for the catalog CSV: the current working
# directory first, then the /mnt/data fallback. Add your own path to this
# list if the file lives somewhere else.
csv_candidates = [
    Path("OutdoorClothingCatalog_1000.csv"),
    Path("/mnt/data/OutdoorClothingCatalog_1000.csv"),
]

# Pick the first candidate that exists; None means the file was not found.
csv_path = next((candidate for candidate in csv_candidates if candidate.exists()), None)
if csv_path is None:
    # Fail here with every location we tried, instead of letting read_csv
    # report only the last fallback path.
    searched = ", ".join(str(candidate.resolve()) for candidate in csv_candidates)
    raise FileNotFoundError(
        f"OutdoorClothingCatalog_1000.csv not found. Looked in: {searched}. "
        "Place the file next to this notebook or add its location to csv_candidates."
    )

df = pd.read_csv(csv_path)
df.head()


## 1) Convert rows to Documents

In [None]:
from langchain_core.documents import Document

# Build one Document per catalog row. The page content lists every column as
# a "name: value" line; all documents share the same "catalog" source tag.
column_names = list(df.columns)
docs = [
    Document(
        page_content="\n".join(f"{name}: {record[name]}" for name in column_names),
        metadata={"source": "catalog"},
    )
    for _, record in df.iterrows()
]

# Quick sanity check: number of documents and the start of the first one.
len(docs), docs[0].page_content[:200]


## 2) Split + Embed + Store (Chroma)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Break the catalog documents into overlapping chunks (size 800, overlap 100)
# before embedding them.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
splits = splitter.split_documents(docs)

emb = OpenAIEmbeddings(model="text-embedding-3-small")

persist_dir = ".chroma_outdoor_catalog"

# Give every chunk a stable, position-based id. Without explicit ids each run
# of this cell stores a fresh copy of all chunks in the persisted collection,
# so re-running the notebook would fill the top-k results with duplicates.
# With fixed ids a re-run writes to the same entries instead.
# NOTE: a directory created by an earlier run without ids still holds the old
# copies; delete `persist_dir` once to start clean.
split_ids = [f"catalog-chunk-{position}" for position in range(len(splits))]

vs = Chroma.from_documents(
    documents=splits,
    embedding=emb,
    ids=split_ids,
    persist_directory=persist_dir,
)

# Retriever over the store; k=4 is forwarded as a search argument.
retriever = vs.as_retriever(search_kwargs={"k": 4})


## 3) RAG chain (retriever + prompt + model)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Chat model for answer generation, configured with the notebook-wide MODEL.
llm = ChatOpenAI(model=MODEL)
# Output parser placed as the last stage of the chain below.
to_text = StrOutputParser()

# System instruction: restrict the model to the supplied context.
system_message = (
    "Answer using ONLY the provided context. "
    "If the answer is not in the context, say you don't know."
)
# User turn template; {question} and {context} are filled in by the chain.
user_message = "Question: {question}\n\nContext:\n{context}"

rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message),
        ("user", user_message),
    ]
)

def format_docs(docs):
    """Combine the page contents of ``docs`` into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents in their original order, separated by a ``---``
        line with a blank line on either side. An empty iterable yields
        an empty string.
    """
    separator = "\n\n---\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Inputs for the prompt: "context" is the retriever output run through
# format_docs, "question" is the chain input handed on via RunnablePassthrough.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full pipeline: build inputs -> fill prompt -> call model -> parse output.
rag_chain = chain_inputs | rag_prompt | llm | to_text

# Demo query against the indexed catalog.
demo_question = "What are some waterproof jackets in the catalog?"
demo_answer = rag_chain.invoke(demo_question)
print(demo_answer)
