In [1]:
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
from typing import List, Dict, Any
import json
import re
import zipfile
import os
from dotenv import load_dotenv

In [2]:
import yaml

# Load config.yaml from the parent directory; later cells read the Ollama
# model names (OLLAMA_EMBEDDING_MODEL, OLLAMA_LLM_MODEL) from this mapping.
with open("../config.yaml", "r") as f:
	config = yaml.safe_load(f)

In [3]:
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding model served by Ollama. The server URL may be overridden with an
# optional OLLAMA_BASE_URL entry in config.yaml; when that key is absent the
# local default endpoint is used.
embedding_model = OllamaEmbedding(
    model_name=config["OLLAMA_EMBEDDING_MODEL"],
    base_url=config.get("OLLAMA_BASE_URL", "http://localhost:11434"),
)


In [4]:
def load_simpsons_data(data_dir: str = '../data/simpsons'):
    """Load the four Simpsons CSV files into DataFrames.

    Args:
        data_dir: Directory containing ``simpsons_characters.csv``,
            ``simpsons_episodes.csv``, ``simpsons_locations.csv`` and
            ``simpsons_script_lines.csv``. Defaults to ``../data/simpsons``.

    Returns:
        Tuple ``(characters_df, episodes_df, locations_df, script_lines_df)``.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is missing.
    """
    def _read(table: str) -> pd.DataFrame:
        # All four files share the "simpsons_<table>.csv" naming scheme.
        return pd.read_csv(os.path.join(data_dir, f'simpsons_{table}.csv'))

    characters_df = _read('characters')
    episodes_df = _read('episodes')
    locations_df = _read('locations')
    # NOTE(review): the recorded run shows a pandas warning pointing at the
    # script-lines read; consider passing explicit dtype= once the column
    # types have been confirmed.
    script_lines_df = _read('script_lines')

    print(f"Loaded {len(characters_df)} characters, {len(episodes_df)} episodes, {len(locations_df)} locations, {len(script_lines_df)} script lines")

    return characters_df, episodes_df, locations_df, script_lines_df

In [5]:
def create_episode_documents(episodes_df: pd.DataFrame) -> List[Dict]:
    """Turn each row of the episodes table into one document.

    Every document is a dict with a multi-line text summary under
    ``'content'`` and a ``'metadata'`` dict tagged ``'type': 'episode'``.
    """
    # Continuation lines of the summary are indented by eight spaces.
    line_separator = "\n        "

    def _to_document(row) -> Dict:
        summary_lines = [
            f"Episode: {row['title']}",
            f"Season: {row['season']}",
            f"Episode Number: {row['number_in_season']}",
            f"Series Number: {row['number_in_series']}",
            f"Air Date: {row['original_air_date']}",
            f"IMDB Rating: {row['imdb_rating']}",
            f"US Viewers: {row['us_viewers_in_millions']} million",
            f"Production Code: {row['production_code']}",
        ]
        return {
            'content': line_separator.join(summary_lines).strip(),
            'metadata': {
                'type': 'episode',
                'episode_id': row['id'],
                'title': row['title'],
                'season': row['season'],
                'imdb_rating': row['imdb_rating'],
                'air_date': row['original_air_date'],
            },
        }

    return [_to_document(row) for _, row in episodes_df.iterrows()]

In [6]:
def create_character_documents(characters_df: pd.DataFrame) -> List[Dict]:
    """Turn each row of the characters table into one document.

    Every document is a dict with a short text description under
    ``'content'`` and a ``'metadata'`` dict tagged ``'type': 'character'``.
    """
    # Continuation lines of the description are indented by eight spaces.
    line_separator = "\n        "

    def _to_document(row) -> Dict:
        description = line_separator.join([
            f"Character: {row['name']}",
            f"Gender: {row['gender']}",
            f"Normalized Name: {row['normalized_name']}",
        ])
        return {
            'content': description.strip(),
            'metadata': {
                'type': 'character',
                'character_id': row['id'],
                'name': row['name'],
                'gender': row['gender'],
            },
        }

    return [_to_document(row) for _, row in characters_df.iterrows()]

In [7]:
def create_location_documents(locations_df: pd.DataFrame) -> List[Dict]:
    """Turn each row of the locations table into one document.

    Every document is a dict with a two-line text description under
    ``'content'`` and a ``'metadata'`` dict tagged ``'type': 'location'``.
    """
    result: List[Dict] = []

    for _, row in locations_df.iterrows():
        # The second line keeps an eight-space indent after the line break.
        description = (
            f"Location: {row['name']}"
            "\n        "
            f"Normalized Name: {row['normalized_name']}"
        )
        meta = {
            'type': 'location',
            'location_id': row['id'],
            'name': row['name'],
        }
        result.append({'content': description.strip(), 'metadata': meta})

    return result

In [8]:
def _resolve_name(lookup: Dict, raw_id: Any) -> str:
    """Map a raw id cell to its display name, or ``'Unknown'`` when unresolved."""
    if pd.isna(raw_id):
        return 'Unknown'
    try:
        # Ids may arrive as floats (column contains NaN) or numeric strings.
        key = int(raw_id)
    except (TypeError, ValueError, OverflowError):
        key = raw_id
    if key in lookup:
        return lookup[key]
    # Fall back to the untouched value in case the lookup is keyed by it.
    return lookup.get(raw_id, 'Unknown')


def create_script_documents(script_lines_df: pd.DataFrame, episodes_df: pd.DataFrame, 
                          characters_df: pd.DataFrame, locations_df: pd.DataFrame) -> List[Dict]:
    """Build one script document per episode from the individual script lines.

    Lines with missing or blank ``spoken_words`` are skipped. Each remaining
    line is rendered as ``"<character>: <words>"`` and, when the location is
    known, suffixed with ``" [Location: <name>]"``. Character and location ids
    are resolved the same way, so float or numeric-string ids match integer
    keys, and ids that cannot be resolved yield ``'Unknown'`` instead of
    raising.

    Args:
        script_lines_df: Must provide ``episode_id``, ``character_id``,
            ``location_id`` and ``spoken_words`` columns.
        episodes_df: Must provide ``id`` and ``title`` columns.
        characters_df: Must provide ``id`` and ``name`` columns.
        locations_df: Must provide ``id`` and ``name`` columns.

    Returns:
        A list of dicts with ``'content'`` (episode title plus joined script)
        and ``'metadata'`` (``type``, ``episode_id``, ``episode_title``,
        ``line_count``). Episodes without any usable line produce no document.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    documents = []

    # Create lookup dictionaries for faster access
    episode_lookup = episodes_df.set_index('id')['title'].to_dict()
    character_lookup = characters_df.set_index('id')['name'].to_dict()
    location_lookup = locations_df.set_index('id')['name'].to_dict()

    # Group script lines by episode for context
    for episode_id, episode_lines in script_lines_df.groupby('episode_id'):
        episode_title = episode_lookup.get(episode_id, f"Episode {episode_id}")

        # Iterate the three needed columns directly rather than building a
        # Series per row.
        rows = zip(
            episode_lines['spoken_words'],
            episode_lines['character_id'],
            episode_lines['location_id'],
        )

        episode_script = []
        for spoken_words, character_id, location_id in rows:
            # str() guards against non-string cells (e.g. a purely numeric line).
            if pd.isna(spoken_words) or not str(spoken_words).strip():
                continue

            character_name = _resolve_name(character_lookup, character_id)
            location_name = _resolve_name(location_lookup, location_id)

            script_text = f"{character_name}: {spoken_words}"
            if location_name != 'Unknown':
                script_text += f" [Location: {location_name}]"

            episode_script.append(script_text)

        # Join all lines for the episode
        full_script = "\n".join(episode_script)

        if full_script.strip():
            documents.append({
                'content': f"Episode: {episode_title}\n\nScript:\n{full_script}",
                'metadata': {
                    'type': 'script',
                    'episode_id': episode_id,
                    'episode_title': episode_title,
                    'line_count': len(episode_script)
                }
            })

    return documents

# Chunking

In [9]:
def split_documents(documents: List[Dict], chunk_size: int = 1000,
                    chunk_overlap: int = 200) -> List[Dict]:
    """Split documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: Dicts with ``'content'`` (text) and ``'metadata'`` (dict).
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        One dict per chunk. Its ``'metadata'`` is a copy of the parent
        document's metadata plus ``'chunk_id'`` (position within the parent)
        and ``'total_chunks'`` (number of chunks the parent produced).
    """
    # Nothing to split: skip constructing the splitter altogether.
    if not documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ":", ".", "!", "?", ",", " ", ""]
    )

    split_docs = []

    for doc in documents:
        chunks = text_splitter.split_text(doc['content'])
        total_chunks = len(chunks)

        split_docs.extend(
            {
                'content': chunk,
                'metadata': {
                    **doc['metadata'],
                    'chunk_id': i,
                    'total_chunks': total_chunks
                }
            }
            for i, chunk in enumerate(chunks)
        )

    return split_docs

# Embeddings

In [10]:
from typing import List, Dict, Any

def create_embeddings(documents: List[Dict]) -> tuple:
    """Embed the ``'content'`` of every document with ``embedding_model``.

    Returns a 3-tuple ``(embeddings, texts, metadata)``: the embeddings as a
    float32 NumPy array, the list of embedded texts, and the list of the
    documents' ``'metadata'`` dicts, all in input order.
    """
    texts = [entry['content'] for entry in documents]
    raw_vectors = embedding_model.get_text_embedding_batch(texts, show_progress=True)
    matrix = np.array(raw_vectors, dtype=np.float32)
    metadata = [entry['metadata'] for entry in documents]
    return matrix, texts, metadata


In [11]:
from faiss import write_index, read_index
def build_faiss_index(embeddings: np.ndarray, output_path: str = "output/simpsons.index") -> faiss.Index:
    """Build (or load from disk) a FAISS inner-product index over the embeddings.

    Vectors are L2-normalised before being added, so inner-product search
    behaves as cosine similarity.

    Args:
        embeddings: 2-D array with one row per vector. May be ``None`` when
            only a previously saved index should be loaded.
        output_path: File the index is read from or written to. Defaults to
            ``output/simpsons.index``.

    Returns:
        The loaded or freshly built index.
    """
    if os.path.exists(output_path):
        print(f"Loading existing index from {output_path}")
        index = read_index(output_path)
        # Only reuse the cached index when it matches the embeddings supplied;
        # a stale index would misalign search hits with texts/metadata.
        if embeddings is None or index.ntotal == len(embeddings):
            return index
        print(
            f"Cached index holds {index.ntotal} vectors but {len(embeddings)} "
            "embeddings were given; rebuilding"
        )

    # Work on a contiguous float32 copy: normalize_L2 operates in place and
    # must not modify the caller's array.
    vectors = np.array(embeddings, dtype=np.float32, order='C')
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(vectors)
    index.add(vectors)

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    write_index(index, output_path)
    return index

In [12]:
def search_similar_documents(query: str, index: faiss.Index, texts: List[str], 
                           metadata: List[Dict], k: int = 5) -> List[Dict]:
    """Return up to ``k`` documents most similar to ``query``.

    The query is embedded with ``embedding_model``, L2-normalised, and searched
    against ``index``.

    Args:
        query: Free-text query.
        index: FAISS index whose row order matches ``texts`` and ``metadata``.
        texts: Chunk texts, indexed by FAISS row id.
        metadata: Chunk metadata dicts, indexed by FAISS row id.
        k: Maximum number of hits to return.

    Returns:
        A list of dicts with ``'content'``, ``'metadata'`` and ``'score'``,
        in the order FAISS returned them. May be shorter than ``k`` when the
        index holds fewer vectors.
    """
    query_embedding = embedding_model.get_text_embedding_batch([query])
    query_vector = np.array(query_embedding, dtype=np.float32)
    faiss.normalize_L2(query_vector)

    scores, indices = index.search(query_vector, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads with -1 when fewer than k neighbours exist; indexing with
        # -1 would silently return the last document.
        if idx < 0:
            continue
        position = int(idx)
        results.append({
            'content': texts[position],
            'metadata': metadata[position],
            'score': float(score)
        })

    return results

In [13]:
from llama_index.llms.ollama import Ollama
# Initialize the Ollama chat LLM with the model name taken from config.yaml.
# API reference: https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
# NOTE: `llm` is assigned again in a later cell (no temperature, long
# request_timeout); on a top-to-bottom run that later instance replaces this one.
llm = Ollama(
    model=config["OLLAMA_LLM_MODEL"],
    temperature=0.7,
)

In [14]:
from llama_index.core.llms import ChatMessage

def generate_response(llm, query: str, context_docs: List[Dict]) -> str:
	"""Answer ``query`` with the given chat LLM, using retrieved documents as context.

	Args:
		llm: Chat model exposing ``chat(messages)``.
		query: The user's question.
		context_docs: Retrieved documents; each needs ``'content'`` and a
			``'metadata'`` dict containing ``'type'``.

	Returns:
		The assistant's reply text (empty string if the reply has no content).
	"""
	# Prepare context from retrieved documents, one "[TYPE] text" entry each
	context = "".join(
		f"[{doc['metadata']['type'].upper()}] {doc['content']}\n\n"
		for doc in context_docs
	)

	# Create prompt
	prompt = f"""
	You are a knowledgeable assistant about The Simpsons TV show. Use the following context to answer the user's question.

	Context:
	{context}

	Question: {query}

	Please provide a comprehensive answer based on the context provided. If the context doesn't contain enough information to fully answer the question, mention what information is available and what might be missing.

	Answer:
	"""

	messages = [
		ChatMessage(
			role="system",
			content="You are a helpful assistant specializing in The Simpsons TV show.",
		),
		ChatMessage(
			role="user",
			content=prompt,
		),
	]
	response = llm.chat(messages)
	# llm.chat returns a chat-response object; hand back only the assistant
	# text so the result matches the declared ``str`` return type.
	return response.message.content or ""

In [15]:
# Run the ingestion stages at notebook level: read the four CSV tables, then
# turn each table into a list of {'content', 'metadata'} documents.
print("Loading Simpsons data...")
characters_df, episodes_df, locations_df, script_lines_df = load_simpsons_data()

print("Creating documents...")
episode_docs = create_episode_documents(episodes_df)
character_docs = create_character_documents(characters_df)
location_docs = create_location_documents(locations_df)
script_docs = create_script_documents(script_lines_df, episodes_df, characters_df, locations_df)

# Combine all documents
all_documents = episode_docs + character_docs + location_docs + script_docs

print(f"Total documents before splitting: {len(all_documents)}")

Loading Simpsons data...


  script_lines_df = pd.read_csv('../data/simpsons/simpsons_script_lines.csv')


Loaded 6722 characters, 600 episodes, 4459 locations, 158271 script lines
Creating documents...
Total documents before splitting: 12345


In [16]:
# Chunk every document, then embed every chunk.
# NOTE: the recorded output shows this cell was interrupted (KeyboardInterrupt)
# while embeddings were being generated, so `embeddings`, `texts` and
# `metadata` were not assigned by that run.
print("Splitting documents...")
split_docs = split_documents(all_documents)

print(f"Total document chunks: {len(split_docs)}")

print("Creating embeddings...")
embeddings, texts, metadata = create_embeddings(split_docs)

Splitting documents...
Total document chunks: 28544
Creating embeddings...


Generating embeddings:   0%|          | 0/28544 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def setup_simpsons_rag():
    """Run the full ingestion pipeline and return everything needed to query.

    Stages: load the CSV tables, build documents per table, split them into
    chunks, embed the chunks, and build (or load) the FAISS index.

    Returns:
        Tuple ``(index, texts, metadata)`` where ``texts`` and ``metadata``
        are the chunk texts and chunk metadata returned by
        ``create_embeddings``.
    """
    print("Loading Simpsons data...")
    characters_df, episodes_df, locations_df, script_lines_df = load_simpsons_data()

    print("Creating documents...")
    episode_docs = create_episode_documents(episodes_df)
    character_docs = create_character_documents(characters_df)
    location_docs = create_location_documents(locations_df)
    script_docs = create_script_documents(script_lines_df, episodes_df, characters_df, locations_df)

    # Combine all documents
    all_documents = episode_docs + character_docs + location_docs + script_docs

    print(f"Total documents before splitting: {len(all_documents)}")

    print("Splitting documents...")
    split_docs = split_documents(all_documents)

    print(f"Total document chunks: {len(split_docs)}")

    print("Creating embeddings...")
    embeddings, texts, metadata = create_embeddings(split_docs)

    print("Building FAISS index...")
    # build_faiss_index takes the index file path as its second argument;
    # omitting it raises a TypeError.
    index = build_faiss_index(embeddings, "output/simpsons.index")

    print("RAG system setup complete!")

    return index, texts, metadata

In [17]:
def query_simpsons_rag(llm, query: str, index: faiss.Index, texts: List[str], metadata: List[Dict],
                       k: int = 5) -> str:
    """Answer ``query`` by retrieving similar chunks and passing them to the LLM.

    Args:
        llm: Chat model forwarded to ``generate_response``.
        query: The user's question.
        index: FAISS index over the chunk embeddings.
        texts: Chunk texts aligned with the index rows.
        metadata: Chunk metadata aligned with the index rows.
        k: Number of chunks to retrieve as context. Defaults to 5.

    Returns:
        Whatever ``generate_response`` returns for the retrieved context.
    """
    print(f"Searching for: {query}")

    # Retrieve relevant documents
    relevant_docs = search_similar_documents(query, index, texts, metadata, k=k)
    print(f"Found {len(relevant_docs)} relevant documents")

    # Generate response
    return generate_response(llm, query, relevant_docs)

In [18]:
from llama_index.llms.ollama import Ollama

# Re-create the LLM with a 7200-second request timeout. This rebinds `llm`,
# replacing the instance created earlier with temperature=0.7; no temperature
# is passed here.
llm = Ollama(
    model=config["OLLAMA_LLM_MODEL"], 
    request_timeout=7200.0
)

In [None]:
# Example queries
# NOTE: this cell needs `index`, `texts` and `metadata` in the kernel. In the
# visible cells `index` is only ever bound inside setup_simpsons_rag(), never
# at notebook level — confirm how it is expected to be created before running.
sample_queries = [
    "Which episode has the most lines by Lisa?",
    "In what season did Milhouse appear most?",
    "Which characters were in the same location most often as Mr. Burns?",
]

for query in sample_queries:
    print(f"\n{'='*50}")
    print(f"Query: {query}")
    print(f"{'='*50}")
    
    # Full RAG call (retrieval + generation), currently disabled:
    # response = query_simpsons_rag(llm, query, index, texts, metadata)
    # Retrieval only, top 10 hits; the returned list is not stored or shown.
    search_similar_documents(query, index, texts, metadata, 10)
    # print(f"Response: {response}")


Query: Which episode has the most lines by Lisa?
(1, 1024)

Query: In what season did Milhouse appear most?
(1, 1024)

Query: Which characters were in the same location most often as Mr. Burns?
(1, 1024)
