In [None]:
# LOAD EXISTING VECTOR DATABASE
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import chromadb

# 1. Define the path to the existing database (relative to the notebook's
#    current working directory).
persist_dir = os.path.join(os.getcwd(), "../models/tal_chroma")
print(f"Loading ChromaDB from: {persist_dir}")

# Fail fast when the directory is missing: chromadb.PersistentClient would
# otherwise create a brand-new, empty database at this path and this cell would
# still report success, leaving every later search with nothing to retrieve.
if not os.path.isdir(persist_dir):
    raise FileNotFoundError(
        f"No ChromaDB directory found at {persist_dir}; build the index first "
        "or start the notebook from the expected working directory."
    )

# 2. Initialize the embedding function.
# NOTE(review): the key is read from the process environment here, but
# load_dotenv() is only called in a later cell — on a fresh kernel the key must
# already be exported, or that cell has to run first. Confirm the cell order.
# Presumably the model must match the one used when the index was built — verify.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# 3. Connect to the persistent client
client = chromadb.PersistentClient(path=persist_dir)

# 4. Load the vector store
vector_store = Chroma(
    client=client,
    collection_name="tal_collection",
    embedding_function=embeddings,
)

print("Vector store loaded successfully.")

In [None]:
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.documents import Document
from typing import List, Dict, Any
import os
from dotenv import load_dotenv
import tiktoken
import chromadb

# Read key=value pairs from a .env file (if one is found) into the process
# environment so the os.getenv("OPENAI_API_KEY") lookups in this notebook resolve.
load_dotenv()

In [None]:
!pip install langchain openai chromadb tiktoken jq python-dotenv

In [13]:
# Peek at the first document. `docs` must already exist in the kernel; it is not
# assigned in any cell visible above this one.
print(docs[0])

page_content='Joe Franklin?' metadata={'source': '/Users/matthewlu/Downloads/podcast-RAG/data/transcripts_full.json', 'seq_num': 1, 'episode': 'ep-1', 'role': 'interviewer', 'speaker': 'ira glass', 'act': 'prologue', 'utterance_start': 0.17, 'utterance_end': 0.58}


In [14]:
# Show the second document in `docs`.
print(docs[1])

page_content='I'm ready.' metadata={'source': '/Users/matthewlu/Downloads/podcast-RAG/data/transcripts_full.json', 'seq_num': 2, 'episode': 'ep-1', 'role': 'subject', 'speaker': 'joe franklin', 'act': 'prologue', 'utterance_start': 0.58, 'utterance_end': 1.39}


In [16]:
# Retrieval smoke test: fetch the three nearest chunks for a sample question and
# preview each hit's metadata plus the first 500 characters of its text.
docs = vector_store.similarity_search(
    "In episode 462, what did Ira Glass and Steve Blass talk about?",
    k=3,
)
for d in docs:
    print(d.metadata)
    print(f"{d.page_content[:500]} ...\n")


NameError: name 'vector_store' is not defined

In [20]:
# Count tokens over the whole corpus using the encoding tiktoken associates with
# the embedding model. encode() works on strings, so each chunk contributes the
# token count of its .page_content text.
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")

total_tokens = sum(
    len(tokenizer.encode(chunk.page_content)) for chunk in chunked_docs
)

print(f"Total tokens across {len(chunked_docs)} chunks: {total_tokens}")

Total tokens across 28642 chunks: 10993831


In [10]:
# Chat model used by both ask_podcast_rag() and ask_vanilla_llm().
# NOTE(review): temperature=1 samples non-deterministically, so repeated runs of
# the evaluation can score differently — confirm this is intended.
llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model_name="gpt-4o-mini",
    temperature=1,
    streaming=False,
    verbose=False,
)

  llm = ChatOpenAI(


In [7]:
import re

def ask_podcast_rag(question: str):
    """Answer a question with transcript chunks retrieved from the vector store.

    When the question contains "episode <number>" (case-insensitive), retrieval
    is restricted to chunks whose ``episode`` metadata equals ``"ep-<number>"``.

    Args:
        question: Natural-language question to answer.

    Returns:
        A ``(response, docs, filter_used)`` tuple:
        ``response`` is whatever ``llm.predict`` returns for the built prompt,
        ``docs`` is the list returned by ``vector_store.similarity_search``
        (requested with ``k=10``), and ``filter_used`` is the metadata filter
        dict that was applied, or ``None`` when no episode number was found.
    """
    # 1. Try to extract an episode number for metadata filtering
    search_kwargs = {"k": 10}
    filter_used = None

    match = re.search(r"episode\s+(\d+)", question, re.IGNORECASE)
    if match:
        # int() drops leading zeros, so "episode 007" becomes "ep-7" rather
        # than "ep-007", which could never match ids shaped like "ep-1".
        ep_num = int(match.group(1))
        filter_used = {"episode": f"ep-{ep_num}"}
        search_kwargs["filter"] = filter_used

    # 2. Retrieve documents
    docs = vector_store.similarity_search(question, **search_kwargs)

    # 3. Format context: one numbered section per retrieved document
    context = "".join(
        f"\nDocument {i+1} (Episode {doc.metadata.get('episode')}):\n{doc.page_content}\n"
        for i, doc in enumerate(docs)
    )

    # 4. Build prompt
    prompt = f"""You are a helpful assistant answering questions about podcast transcripts.
Use the following context, and also general knowledge to answer the question.

Context:
{context}

Question: {question}
"""

    # 5. Get answer
    response = llm.predict(prompt)

    return response, docs, filter_used

# Example usage


In [9]:
# Run one question end to end, then show the answer and the retrieved context.
query = "In Episode 500, what object does Ira Glass compare the milestone to clicking over?"
answer, source_docs, filter_used = ask_podcast_rag(query)

if filter_used:
    print(f"Applying metadata filter: {filter_used}")

# Normalise the reply: trim whitespace and drop a leading "Answer:" label when
# the model added one, so the heading printed below is not duplicated.
final_answer = answer.strip()
if final_answer.startswith("Answer:"):
    final_answer = final_answer.partition("Answer:")[2].strip()

print("Answer:", final_answer, "", "-" * 50, sep="\n")

print("Context Used:")
for i, doc in enumerate(source_docs):
    print(
        f"Document {i+1} (Episode {doc.metadata.get('episode')}):",
        doc.page_content,
        "",
        sep="\n",
    )
print("-" * 50)


NameError: name 'vector_store' is not defined

In [105]:
# Evaluation set: each entry pairs a "question" with the short "ground_truth"
# string that the evaluation loop passes to grade_answer() together with the
# model's reply.
# NOTE(review): the two 'In Defense of Ignorance' entries check the same fact,
# so it counts twice in the accuracy figures — confirm this is intended.
golden_set = [
    {
        "question": "What is the title of episode 462 of This American Life?",
        "ground_truth": "Own Worst Enemy"
    },
    {
        "question": "What is the title of episode 449 of This American Life?",
        "ground_truth": "Middle School"
    },
    {
        "question": "Which episode is titled 'In Defense of Ignorance'?",
        "ground_truth": "585"
    },
    {
        "question": "Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?",
        "ground_truth": "This American Life"
    },
    {
        "question": "What is the original name of 'This American Life' when it first aired in 1995?",
        "ground_truth": "Your Radio Playhouse"
    },
    {
        "question": "As of 2025, who is listed as the host of This American Life?",
        "ground_truth": "Ira Glass"
    },
    {
        "question": "What is the episode number of 'In Defense of Ignorance'?",
        "ground_truth": "585"
    },
    {
        "question": "Which This American Life episode archive page indicates that transcripts become available the week after broadcast?",
        "ground_truth": "FAQ page"
    },
    {
        "question": "Who is the interviewee in the prologue of Episode 1?",
        "ground_truth": "Joe Franklin"
    },
    {
        "question": "In Episode 200, which government department hired a former ad executive to run an information campaign?",
        "ground_truth": "US State Department"
    },
    {
        "question": "In Episode 500, what object does Ira Glass compare the milestone to clicking over?",
        "ground_truth": "odometer"
    }
]

def ask_vanilla_llm(question):
    """Baseline: send the question to the LLM on its own, with no retrieved context.

    Returns whatever ``llm.predict`` returns for the prompt.
    """
    instruction = "You are a helpful assistant. Answer the following question to the best of your ability."
    # The explicit escapes reproduce the indented, multi-line layout of the prompt.
    bare_prompt = f"{instruction}\n    \n    Question: {question}\n    "
    return llm.predict(bare_prompt)

def grade_answer(question, ground_truth, prediction):
    """
    Grade a prediction against the ground truth with lenient string matching.

    Both strings are lower-cased and stripped first. The prediction is accepted
    when any of these rules holds:
    1. Exact match
    2. Ground truth is a substring of prediction
    3. Prediction is a non-empty substring of ground truth
    4. All words in ground truth appear in prediction (punctuation ignored)

    An empty or whitespace-only prediction is only correct when the ground
    truth is empty too; otherwise it would pass rule 3 vacuously, since the
    empty string is a substring of every string.

    Args:
        question: The question that was asked (unused; kept for the call signature).
        ground_truth: Expected answer text.
        prediction: Model output to grade.

    Returns:
        True if the prediction is accepted, else False.
    """
    import re  # local import so the function does not depend on cell order

    gt_norm = ground_truth.lower().strip()
    pred_norm = prediction.lower().strip()

    # 1. Exact match
    if gt_norm == pred_norm:
        return True

    # A blank answer contains no information and must not be scored as correct.
    if not pred_norm:
        return False

    # 2./3. One string contains the other
    if gt_norm in pred_norm or pred_norm in gt_norm:
        return True

    # 4. All words in ground truth appear in prediction. Tokenising on word
    # characters keeps trailing punctuation ("us," / "department.") from
    # hiding a match.
    gt_words = set(re.findall(r"\w+", gt_norm))
    pred_words = set(re.findall(r"\w+", pred_norm))
    return bool(gt_words) and gt_words <= pred_words

# 2. Run Evaluation
# Every golden-set question is answered twice — once with retrieval and once
# without — and each answer is graded against the ground truth.
results = []
print(f"Starting evaluation on {len(golden_set)} questions...")

for i, item in enumerate(golden_set):
    q, gt = item["question"], item["ground_truth"]

    print(f"\nTest {i+1}: {q}")

    # Retrieval-augmented answer: ask_podcast_rag returns
    # (response, docs, filter_used); only the response text is graded.
    rag_ans, _, _ = ask_podcast_rag(q)
    rag_correct = grade_answer(q, gt, rag_ans)
    print("  RAG: " + ("✅" if rag_correct else "❌"))

    # Baseline answer with no retrieved context.
    vanilla_ans = ask_vanilla_llm(q)
    vanilla_correct = grade_answer(q, gt, vanilla_ans)
    print("  Vanilla: " + ("✅" if vanilla_correct else "❌"))

    results.append(dict(
        question=q,
        ground_truth=gt,
        rag_correct=rag_correct,
        vanilla_correct=vanilla_correct,
        rag_ans=rag_ans,
        vanilla_ans=vanilla_ans,
    ))

# 3. Calculate Metrics
# Accuracy = share of questions graded correct, expressed as a percentage.
rag_accuracy = len([r for r in results if r["rag_correct"]]) / len(results) * 100
vanilla_accuracy = len([r for r in results if r["vanilla_correct"]]) / len(results) * 100

print("-" * 50)
print("Final Results:")
print(f"RAG Accuracy:     {rag_accuracy:.1f}%")
print(f"Vanilla Accuracy: {vanilla_accuracy:.1f}%")
print("-" * 50)

# Optional: Print failures to analyze
# print("\nAnalysis of RAG Failures:")
# for r in results:
#     if not r["rag_correct"]:
#         print(f"Q: {r['question']}")
#         print(f"Expected: {r['ground_truth']}")
#         print(f"Got: {r['rag_ans']}\n")

Starting evaluation on 11 questions...

Test 1: What is the title of episode 462 of This American Life?
  RAG: ✅
  RAG: ✅
  Vanilla: ❌

Test 2: What is the title of episode 449 of This American Life?
  Vanilla: ❌

Test 2: What is the title of episode 449 of This American Life?
  RAG: ❌
  RAG: ❌
  Vanilla: ❌

Test 3: Which episode is titled 'In Defense of Ignorance'?
  Vanilla: ❌

Test 3: Which episode is titled 'In Defense of Ignorance'?
  RAG: ✅
  RAG: ✅
  Vanilla: ❌

Test 4: Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?
  Vanilla: ❌

Test 4: Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?
  RAG: ✅
  RAG: ✅
  Vanilla: ✅

Test 5: What is the original name of 'This American Life' when it first aired in 1995?
  Vanilla: ✅

Test 5: What is the original name of 'This American Life' when it first aired in 1995?
  RAG: ❌
  RAG: ❌
  Va