In [16]:
!pip install langchain openai chromadb tiktoken jq python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [39]:
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import JSONLoader
from langchain_core.documents import Document
from collections import defaultdict
from typing import Dict, List, Tuple, Any
import os
from dotenv import load_dotenv
import tiktoken
import time
import chromadb


In [26]:
def group_by_episode_act(docs: List[Document]) -> Dict[Tuple[str, str], List[Document]]:
    """
    Group documents by (episode, act).
    """
    grouped: Dict[Tuple[str, str], List[Document]] = defaultdict(list)
    for doc in docs:
        episode = doc.metadata.get("episode", "unknown_episode")
        act = doc.metadata.get("act", "unknown_act")
        grouped[(episode, act)].append(doc)
    return grouped


def sort_group_by_utterance_start(group: List[Document]) -> List[Document]:
    """
    Sort a list of docs by utterance_start (ascending).
    Missing values are treated as 0.
    """
    return sorted(
        group,
        key=lambda d: (d.metadata.get("utterance_start")
                       if d.metadata.get("utterance_start") is not None
                       else 0.0)
    )




In [27]:
class TranscriptChunker:
    """
    Chunk This American Life–style transcripts into smaller text blocks,
    grouped by episode + act and ordered by utterance_start.

    Chunks are formed by concatenating utterances until `max_words`
    is reached, with optional overlap in terms of utterances.
    """

    def __init__(self,
                 max_words: int = 350,
                 overlap_utterances: int = 2):
        """
        Args:
            max_words: Target maximum words per chunk.
            overlap_utterances: How many utterances from the end of the
                                previous chunk to repeat at the start
                                of the next chunk.
        """
        self.max_words = max_words
        self.overlap_utterances = overlap_utterances

    def _make_chunk_document(
        self,
        episode: str,
        act: str,
        chunk_index: int,
        docs_in_chunk: List[Document]
    ) -> Document:
        """
        Create a new Document representing one chunk, with aggregated metadata.
        """
        text_parts = [d.page_content.strip() for d in docs_in_chunk if d.page_content]
        chunk_text = " ".join(text_parts)

        # Aggregate metadata
        speakers = {d.metadata.get("speaker") for d in docs_in_chunk if d.metadata.get("speaker")}
        roles = {d.metadata.get("role") for d in docs_in_chunk if d.metadata.get("role")}

        starts = [d.metadata.get("utterance_start") for d in docs_in_chunk
                  if d.metadata.get("utterance_start") is not None]
        ends = [d.metadata.get("utterance_end") for d in docs_in_chunk
                if d.metadata.get("utterance_end") is not None]

        chunk_metadata: Dict[str, Any] = {
            "episode": episode,
            "act": act,
            "chunk_index": chunk_index,
            "num_utterances": len(docs_in_chunk),
            "num_words": len(chunk_text.split()),
            "speakers": ", ".join(sorted(list(speakers))),
            "roles": ", ".join(sorted(list(roles))),
            "chunk_utterance_start": min(starts) if starts else None,
            "chunk_utterance_end": max(ends) if ends else None,
        }

        return Document(page_content=chunk_text, metadata=chunk_metadata)

    def chunk_group(self,
                    episode: str,
                    act: str,
                    docs_in_group: List[Document]) -> List[Document]:
        """
        Chunk all utterances for a single (episode, act) group.
        Assumes docs_in_group are already sorted by utterance_start.
        """
        chunks: List[Document] = []
        current_docs: List[Document] = []
        current_word_count = 0
        chunk_index = 0

        for doc in docs_in_group:
            words = doc.page_content.split()
            n_words = len(words)

            # If adding this utterance would exceed the max_words, flush current chunk
            if current_docs and (current_word_count + n_words > self.max_words):
                chunk_doc = self._make_chunk_document(
                    episode, act, chunk_index, current_docs
                )
                chunks.append(chunk_doc)
                chunk_index += 1

                # prepare next chunk, with overlap
                if self.overlap_utterances > 0:
                    overlap_docs = current_docs[-self.overlap_utterances:]
                else:
                    overlap_docs = []

                current_docs = list(overlap_docs)
                current_word_count = sum(
                    len(d.page_content.split()) for d in current_docs
                )

            # add current utterance
            current_docs.append(doc)
            current_word_count += n_words

        # flush final chunk
        if current_docs:
            chunk_doc = self._make_chunk_document(
                episode, act, chunk_index, current_docs
            )
            chunks.append(chunk_doc)

        return chunks

    def chunk_documents(self, docs: List[Document]) -> List[Document]:
        """
        High-level method: groups by (episode, act), sorts within each group,
        and returns a flat list of chunk Documents.
        """
        grouped = group_by_episode_act(docs)
        all_chunks: List[Document] = []

        for (episode, act), group_docs in grouped.items():
            sorted_docs = sort_group_by_utterance_start(group_docs)
            group_chunks = self.chunk_group(episode, act, sorted_docs)
            all_chunks.extend(group_chunks)

        return all_chunks

In [28]:
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["episode"] = record.get("episode")
    metadata["role"] = record.get("role")
    metadata["speaker"] = record.get("speaker")
    metadata["act"] = record.get("act")
    metadata["utterance_start"] = record.get("utterance_start")
    metadata["utterance_end"] = record.get("utterance_end")
    
    return metadata


In [29]:
loader = JSONLoader(
    file_path="data/transcripts_full.json",
    jq_schema=".[].[]",     
    content_key="utterance",  
    metadata_func=metadata_func
)

docs = loader.load()

In [30]:
print(docs[0]
)

page_content='Joe Franklin?' metadata={'source': '/Users/matthewlu/Downloads/podcast-RAG/data/transcripts_full.json', 'seq_num': 1, 'episode': 'ep-1', 'role': 'interviewer', 'speaker': 'ira glass', 'act': 'prologue', 'utterance_start': 0.17, 'utterance_end': 0.58}


In [31]:
print(docs[1])

page_content='I'm ready.' metadata={'source': '/Users/matthewlu/Downloads/podcast-RAG/data/transcripts_full.json', 'seq_num': 2, 'episode': 'ep-1', 'role': 'subject', 'speaker': 'joe franklin', 'act': 'prologue', 'utterance_start': 0.58, 'utterance_end': 1.39}


In [32]:
#Combine docs into larger chunks
chunker = TranscriptChunker(
    max_words=350,          # you can tune this
    overlap_utterances=2    # you can tune this too
)

chunked_docs = chunker.chunk_documents(docs)

print(f"Original utterances: {len(docs)}")
print(f"Chunked docs: {len(chunked_docs)}")
print(chunked_docs[0].metadata)
print(chunked_docs[0].page_content[:400], "...")


Original utterances: 163808
Chunked docs: 28642
{'episode': 'ep-1', 'act': 'prologue', 'chunk_index': 0, 'num_utterances': 15, 'num_words': 342, 'speakers': 'ira glass, joe franklin', 'roles': 'host, interviewer, subject', 'chunk_utterance_start': 0.17, 'chunk_utterance_end': 132.75}
Joe Franklin? I'm ready. It's Ira Glass here. Oh, you're the emcee on the show, Ira. I am the emcee on the show. Yes. Oh great. Ira? I-R-A, Ira? Ira, I-R-A. Oh, great. Now hold on one second, Ira. Don't go away. Hello? [UNINTELLIGIBLE]. Call me after 3 o'clock. I have great news for you. Ira. Yes. So listen, Tony. If the phone rings, take it in the back. And then come out and tell me who it is. Ju ...


In [15]:
print(chunked_docs[1])

page_content='Nobody hearing my words right now is thinking, "Oh, man, remember that show, back when it used to be good? That show, I never missed that show back in the old days, back in the first couple years before it got so-called popular. Back when it was still good." No, actually, I think that force, that human desire to say that is so strong, to say that "I was there back when that show was good," that force is so strong, it is so basic to who we are as people that I know-- OK, what are we? We are two minutes into the program-- I know that somewhere out there, one or two of you are saying, "Oh, sure. I used to listen to that show back in the first 30 seconds, back when it used to be really good. Remember back when they used to do all that crazy stuff? When they had that guy on the phone? Remember back then?" Well, from WBEZ, in the glorious city of Chicago, Illinois. The name of this show is Your Radio Playhouse. I'm your emcee. I'm your emcee, Ira Glass. OK, the idea of this sho

In [15]:
print(len(chunked_docs))

28642


In [38]:
# 1. Initialize embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# 2. Extract texts + metadata
texts = [doc.page_content for doc in chunked_docs]
metadatas = [doc.metadata for doc in chunked_docs]
ids = [str(i) for i in range(len(texts))]

# 3. Manual embedding with rate-limit protection
batch_size = 20     # safe
sleep_time = 0.15   # ~150 ms between batches

vectors = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    vec = embeddings.embed_documents(batch)
    vectors.extend(vec)

    print(f"Embedded {i + len(batch)} / {len(texts)}", end="\r")
    time.sleep(sleep_time)

print("\nFinished embedding all chunks.")

# 4. Build the Chroma DB
vector_store = Chroma.from_embeddings(
    embeddings=vectors,
    metadatas=metadatas,
    ids=ids
)

print("Chroma vector store created successfully.")

Using existing embeddings from memory.


  vector_store = Chroma(


InternalError: ValueError: Batch size of 28642 is greater than max batch size of 5461

In [42]:
import gc

# Force garbage collection to release any dangling file locks
gc.collect()

persist_dir = os.path.join(os.getcwd(), "tal_chroma")
print(f"Using ChromaDB persistence directory: {persist_dir}")

try:
    client = chromadb.PersistentClient(path=persist_dir)
    collection = client.get_or_create_collection("tal_collection")
except Exception as e:
    print(f"Error initializing ChromaDB: {e}")
    print("If you see 'attempt to write a readonly database', please RESTART THE KERNEL.")
    raise e

max_batch_size = 5000  # below 5461 limit

for start in range(0, len(ids), max_batch_size):
    end = start + max_batch_size
    batch_ids = ids[start:end]
    batch_vectors = vectors[start:end]
    batch_metas = metadatas[start:end]
    batch_docs = texts[start:end]
    
    collection.upsert(
        ids=batch_ids,
        embeddings=batch_vectors,
        metadatas=batch_metas,
        documents=batch_docs,
    )
    print(f"Upserted {end} / {len(ids)}", end="\r")

print("\nDone writing to Chroma.")


embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

vector_store = Chroma(
    client=client,
    collection_name="tal_collection",
    embedding_function=embeddings,
)

print("Chroma vector store created successfully.")

Using ChromaDB persistence directory: /Users/matthewlu/Downloads/podcast-RAG/tal_chroma
Upserted 30000 / 28642
Done writing to Chroma.
Chroma vector store created successfully.
Upserted 30000 / 28642
Done writing to Chroma.
Chroma vector store created successfully.


In [64]:
docs = vector_store.similarity_search("In episode 462, what did Ira Glass and Steve Blass talk about?", k=3)
for d in docs:
    print(d.metadata)
    print(d.page_content[:500], "...\n")


{'num_words': 219, 'chunk_index': 0, 'chunk_utterance_end': 3540.00052154195, 'num_utterances': 14, 'chunk_utterance_start': 3456.93, 'speakers': 'adam davidson, alex blumberg, announcer, glen pizzolorusso, ira glass', 'act': 'credits', 'episode': 'ep-355', 'roles': 'host, subject'}
Alex Blumberg and Adam Davidson. Alex, my voice is so bad, I think maybe you should read the credits. Why don't you do it? All right, Ira. I hope you feel better. I'm going to bring Adam in here to help me to, since I've never done this before. Thanks today to Ellen Weiss at NPR who made this collaboration happen this week between the news division at NPR, where I work, All Things Considered, and This American Life. Where I work. Thanks also to Mary Ann Casavant, Anna Chai, Kevin Byers, the fant ...

{'episode': 'ep-73', 'act': 'prologue', 'chunk_utterance_start': 200.18, 'roles': 'host, interviewer, subject', 'speakers': 'ira glass, jim biederman', 'num_utterances': 6, 'num_words': 224, 'chunk_utterance_en

In [20]:
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")

# chunked_docs is a list of Document objects, but encode expects a string.
# We need to iterate and encode the page_content of each doc.
total_tokens = 0
for doc in chunked_docs:
    total_tokens += len(tokenizer.encode(doc.page_content))

print(f"Total tokens across {len(chunked_docs)} chunks: {total_tokens}")

Total tokens across 28642 chunks: 10993831


In [79]:
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=1,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    verbose=False,
    streaming=False

)

In [100]:
import re

def ask_podcast_rag(question: str):
    # 1. Try to extract an episode number for metadata filtering
    # This helps the vector store narrow down to the specific episode
    search_kwargs = {"k": 10}
    
    # Regex to find "episode <number>"
    match = re.search(r"episode\s+(\d+)", question, re.IGNORECASE)
    filter_used = None
    if match:
        ep_num = match.group(1)
        # Construct the ID format used in your metadata (e.g., "ep-462")
        filter_dict = {"episode": f"ep-{ep_num}"}
        search_kwargs["filter"] = filter_dict
        filter_used = filter_dict
    
    # 2. Retrieve documents
    docs = vector_store.similarity_search(question, **search_kwargs)
    
    # 3. Format context
    context = ""
    for i, doc in enumerate(docs):
        context += f"\nDocument {i+1} (Episode {doc.metadata.get('episode')}):\n{doc.page_content}\n"
    
    # 4. Build prompt
    prompt = f"""You are a helpful assistant answering questions about podcast transcripts.
Use the following context, and also general knowledge to answer the question.

Context:
{context}

Question: {question}
"""

    # 5. Get answer
    response = llm.predict(prompt)
    
    return response, docs, filter_used

# Example usage


In [103]:
query = "In Episode 500, what object does Ira Glass compare the milestone to clicking over?"
answer, source_docs, filter_used = ask_podcast_rag(query)

if filter_used:
    print(f"Applying metadata filter: {filter_used}")

# Clean up the response to ensure consistent formatting
final_answer = answer.strip()
if final_answer.startswith("Answer:"):
    final_answer = final_answer[len("Answer:"):].strip()

print("Answer:")
print(final_answer)
print()
print("-" * 50)

print("Context Used:")
for i, doc in enumerate(source_docs):
    print(f"Document {i+1} (Episode {doc.metadata.get('episode')}):")
    print(doc.page_content)
    print()
print("-" * 50)


Applying metadata filter: {'episode': 'ep-500'}
Answer:
Ira Glass compares the milestone of the 500th episode to an odometer clicking over.

--------------------------------------------------
Context Used:
Document 1 (Episode ep-500):
From WBEZ Chicago, it's This American Life, distributed by Public Radio International. I'm Ira Glass, and this is our 500th episode. And what does that feel like? Well, it feels like both a milestone and it feels like nothing. It fills like an odometer clicking over. I was talking to the show's senior producer, Julie Snyder, about this. She's been here for 15 of the show's 17 years, since episode number 58. Five hun-- what-- I just-- I just come to work. I do my job. I go home. Play each ballgame the best I can. Yeah, exactly. No, it really is a blur. It is a blur. And over the last few weeks here at the radio show, we talked about what we should do for the 500th episode. And like, first of all, should we mark it at all? You know what I mean? Like 500 sho

In [104]:
golden_set = [
    {
        "question": "What is the title of episode 462 of This American Life?",
        "ground_truth": "Own Worst Enemy"
    },
    {
        "question": "What is the title of episode 449 of This American Life?",
        "ground_truth": "Middle School"
    },
    {
        "question": "Which episode is titled 'In Defense of Ignorance'?",
        "ground_truth": "585"
    },
    {
        "question": "Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?",
        "ground_truth": "This American Life"
    },
    {
        "question": "What is the original name of 'This American Life' when it first aired in 1995?",
        "ground_truth": "Your Radio Playhouse"
    },
    {
        "question": "As of 2025, who is listed as the host of This American Life?",
        "ground_truth": "Ira Glass"
    },
    {
        "question": "What is the episode number of 'In Defense of Ignorance'?",
        "ground_truth": "585"
    },
    {
        "question": "Which This American Life episode archive page indicates that transcripts become available the week after broadcast?",
        "ground_truth": "FAQ page"
    },
    {
        "question": "Who is the interviewee in the prologue of Episode 1?",
        "ground_truth": "Joe Franklin"
    },
    {
        "question": "In Episode 200, which government department hired a former ad executive to run an information campaign?",
        "ground_truth": "US State Department"
    },
    {
        "question": "In Episode 500, what object does Ira Glass compare the milestone to clicking over?",
        "ground_truth": "odometer"
    }
]

def ask_vanilla_llm(question):
    """Asks the LLM without any retrieved context."""
    prompt = f"""You are a helpful assistant. Answer the following question to the best of your ability.
    
    Question: {question}
    """
    return llm.predict(prompt)

def grade_answer(question, ground_truth, prediction):
    """
    Grades the answer based on string matching rules:
    1. Exact match (case-insensitive, stripped)
    2. Ground truth is a substring of prediction
    3. Prediction is a substring of ground truth
    4. All words in ground truth appear in prediction
    """
    gt_norm = ground_truth.lower().strip()
    pred_norm = prediction.lower().strip()
    
    # 1. Exact match
    if gt_norm == pred_norm:
        return True
        
    # 2. Ground truth is substring of prediction
    if gt_norm in pred_norm:
        return True
        
    # 3. Prediction is substring of ground truth
    if pred_norm in gt_norm:
        return True
        
    # 4. All words in ground truth appear in prediction
    gt_words = set(gt_norm.split())
    pred_words = set(pred_norm.split())
    if gt_words.issubset(pred_words):
        return True
        
    return False

# 2. Run Evaluation
results = []
print(f"Starting evaluation on {len(golden_set)} questions...")

for i, item in enumerate(golden_set):
    q = item["question"]
    gt = item["ground_truth"]
    
    print(f"\nTest {i+1}: {q}")
    
    # Test RAG
    # Note: ask_podcast_rag returns (response, docs, filter_used)
    rag_ans, _, _ = ask_podcast_rag(q)
    rag_correct = grade_answer(q, gt, rag_ans)
    print(f"  RAG: {'✅' if rag_correct else '❌'}")
    
    # Test Vanilla
    vanilla_ans = ask_vanilla_llm(q)
    vanilla_correct = grade_answer(q, gt, vanilla_ans)
    print(f"  Vanilla: {'✅' if vanilla_correct else '❌'}")
    
    results.append({
        "question": q,
        "ground_truth": gt,
        "rag_correct": rag_correct,
        "vanilla_correct": vanilla_correct,
        "rag_ans": rag_ans,
        "vanilla_ans": vanilla_ans
    })

# 3. Calculate Metrics
rag_accuracy = sum(1 for r in results if r["rag_correct"]) / len(results) * 100
vanilla_accuracy = sum(1 for r in results if r["vanilla_correct"]) / len(results) * 100

print("-" * 50)
print(f"Final Results:")
print(f"RAG Accuracy:     {rag_accuracy:.1f}%")
print(f"Vanilla Accuracy: {vanilla_accuracy:.1f}%")
print("-" * 50)

# Optional: Print failures to analyze
# print("\nAnalysis of RAG Failures:")
# for r in results:
#     if not r["rag_correct"]:
#         print(f"Q: {r['question']}")
#         print(f"Expected: {r['ground_truth']}")
#         print(f"Got: {r['rag_ans']}\n")

Starting evaluation on 11 questions...

Test 1: What is the title of episode 462 of This American Life?
  RAG: ✅
  RAG: ✅
  Vanilla: ❌

Test 2: What is the title of episode 449 of This American Life?
  Vanilla: ❌

Test 2: What is the title of episode 449 of This American Life?
  RAG: ❌
  RAG: ❌
  Vanilla: ❌

Test 3: Which episode is titled 'In Defense of Ignorance'?
  Vanilla: ❌

Test 3: Which episode is titled 'In Defense of Ignorance'?
  RAG: ✅
  RAG: ✅
  Vanilla: ❌

Test 4: Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?
  Vanilla: ❌

Test 4: Which show is hosted by the program described as a weekly public radio program produced by WBEZ Chicago and syndicated by PRX?
  RAG: ✅
  RAG: ✅
  Vanilla: ✅

Test 5: What is the original name of 'This American Life' when it first aired in 1995?
  Vanilla: ✅

Test 5: What is the original name of 'This American Life' when it first aired in 1995?
  RAG: ❌
  RAG: ❌
  Va