In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
import openai
from typing import List, Dict, Any
import json
import re
import zipfile
import os
from dotenv import load_dotenv

In [None]:
# Shared sentence-embedding model, created once at module level.
# create_embeddings() (document side) and search_similar_documents() (query side)
# both encode with this same instance, so indexed vectors and query vectors
# come from one model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
def setup_openai_client():
    """Locate the OpenAI API key, assign it to ``openai.api_key`` and return it.

    The key is looked up in this order:

    1. the ``OPENAI_API_KEY`` environment variable;
    2. a ``.env`` file, loaded through python-dotenv when that package can be imported;
    3. an interactive prompt.

    The interactive prompt goes through ``getpass`` rather than ``input`` so the
    key is not echoed back and does not end up in the notebook's saved output.
    A value consisting only of whitespace is treated as "not set".

    Returns:
        The API key string that was assigned to ``openai.api_key``.

    Raises:
        ValueError: If none of the three sources yields a non-empty key.
    """
    import getpass

    def _read_env_key() -> str:
        """Return the stripped ``OPENAI_API_KEY`` value, or '' when unset/blank."""
        return (os.getenv('OPENAI_API_KEY') or '').strip()

    # Method 1: Environment variable (recommended)
    api_key = _read_env_key()

    # Method 2: From .env file (if not found in environment)
    if not api_key:
        try:
            from dotenv import load_dotenv
            load_dotenv()
            api_key = _read_env_key()
        except ImportError:
            print("python-dotenv not installed. Install with: pip install python-dotenv")

    # Method 3: Interactive input (fallback)
    if not api_key:
        print("OpenAI API key not found in environment variables.")
        print("You can set it by:")
        print("1. Creating a .env file with: OPENAI_API_KEY=your_key_here")
        print("2. Setting environment variable: export OPENAI_API_KEY=your_key_here")
        print("3. Enter it now (not recommended for production):")
        # getpass hides the typed secret; input() would echo it into the cell output.
        api_key = getpass.getpass("Enter your OpenAI API key: ").strip()

    if not api_key:
        raise ValueError("OpenAI API key is required. Please set OPENAI_API_KEY environment variable.")

    # Set the API key on the module-level client used by generate_response()
    openai.api_key = api_key
    print("OpenAI API key configured successfully!")

    return api_key

In [None]:
def extract_zip_file(zip_path: str, extract_to: str = "data/extracted"):
    """Unpack a zip archive into a directory, creating the directory if needed.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Destination directory. It is created (including parents)
            when nothing exists at that path yet.

    Returns:
        The ``extract_to`` path that was passed in.
    """
    destination_exists = os.path.exists(extract_to)
    if not destination_exists:
        os.makedirs(extract_to)

    archive = zipfile.ZipFile(zip_path, 'r')
    with archive:
        # Everything in the archive lands under the destination directory.
        archive.extractall(extract_to)

    print(f"Extracted {zip_path} to {extract_to}")
    return extract_to



In [None]:
def load_simpsons_data():
    """Load the four Simpsons CSV files into DataFrames.

    The CSVs are searched for (recursively) under ``data/extracted``. When any
    of the required files is missing there — the directory does not exist yet,
    or an earlier extraction was interrupted — and ``data/thesimpsons.zip`` is
    present, the archive is (re-)extracted and the search is repeated.

    Returns:
        A tuple ``(characters_df, episodes_df, locations_df, script_lines_df)``.

    Raises:
        FileNotFoundError: If a required CSV is still missing after the
            extraction attempt; the message lists the missing file names.
    """
    zip_path = "data/thesimpsons.zip"
    extract_path = "data/extracted"
    # Order matters: it is the order of the returned tuple.
    required_files = (
        'simpsons_characters.csv',
        'simpsons_episodes.csv',
        'simpsons_locations.csv',
        'simpsons_script_lines.csv',
    )

    def _find_csv_files() -> Dict[str, str]:
        """Map every CSV file name found under ``extract_path`` to its full path."""
        found = {}
        # os.walk yields nothing for a non-existent directory, so no pre-check is needed.
        for root, _dirs, files in os.walk(extract_path):
            for file in files:
                if file.endswith('.csv'):
                    found[file] = os.path.join(root, file)
        return found

    csv_files = _find_csv_files()
    missing = [name for name in required_files if name not in csv_files]

    # Extract when needed — including when the folder exists but is incomplete.
    if missing and os.path.exists(zip_path):
        extract_zip_file(zip_path, extract_path)
        csv_files = _find_csv_files()
        missing = [name for name in required_files if name not in csv_files]

    if missing:
        raise FileNotFoundError(
            f"Missing Simpsons CSV file(s) under {extract_path}: {', '.join(missing)}. "
            f"They are expected to come from {zip_path}."
        )

    # Load the CSV files
    characters_df, episodes_df, locations_df, script_lines_df = (
        pd.read_csv(csv_files[name]) for name in required_files
    )

    print(f"Loaded {len(characters_df)} characters, {len(episodes_df)} episodes, {len(locations_df)} locations, {len(script_lines_df)} script lines")

    return characters_df, episodes_df, locations_df, script_lines_df

In [None]:
def create_episode_documents(episodes_df: pd.DataFrame) -> List[Dict]:
    """Turn each episode row into a text document plus metadata.

    The text is one ``Label: value`` pair per line with no leading indentation,
    so the source file's indentation does not leak into the indexed text.
    Missing values (NaN/None) are written as ``Unknown`` instead of ``nan``.

    Args:
        episodes_df: Episodes table. The columns read are ``id``, ``title``,
            ``season``, ``number_in_season``, ``number_in_series``,
            ``original_air_date``, ``imdb_rating``, ``us_viewers_in_millions``
            and ``production_code``.

    Returns:
        One dict per row with keys ``content`` (the text) and ``metadata``
        (``type='episode'`` plus id, title, season, rating and air date taken
        unchanged from the row).
    """
    def _text(value) -> str:
        """Render a cell value, using 'Unknown' for missing entries."""
        return 'Unknown' if pd.isna(value) else str(value)

    documents = []

    for _, episode in episodes_df.iterrows():
        viewers = episode['us_viewers_in_millions']
        # Only append the unit when there is a number to attach it to.
        viewers_line = (
            "US Viewers: Unknown" if pd.isna(viewers) else f"US Viewers: {viewers} million"
        )

        content = "\n".join([
            f"Episode: {_text(episode['title'])}",
            f"Season: {_text(episode['season'])}",
            f"Episode Number: {_text(episode['number_in_season'])}",
            f"Series Number: {_text(episode['number_in_series'])}",
            f"Air Date: {_text(episode['original_air_date'])}",
            f"IMDB Rating: {_text(episode['imdb_rating'])}",
            viewers_line,
            f"Production Code: {_text(episode['production_code'])}",
        ])

        documents.append({
            'content': content.strip(),
            'metadata': {
                'type': 'episode',
                'episode_id': episode['id'],
                'title': episode['title'],
                'season': episode['season'],
                'imdb_rating': episode['imdb_rating'],
                'air_date': episode['original_air_date']
            }
        })

    return documents

In [None]:
def create_character_documents(characters_df: pd.DataFrame) -> List[Dict]:
    """Turn each character row into a text document plus metadata.

    The text is one ``Label: value`` pair per line with no leading indentation,
    so the source file's indentation does not leak into the indexed text.
    Missing values (NaN/None) are written as ``Unknown`` instead of ``nan``.

    Args:
        characters_df: Characters table. The columns read are ``id``, ``name``,
            ``gender`` and ``normalized_name``.

    Returns:
        One dict per row with keys ``content`` (the text) and ``metadata``
        (``type='character'`` plus id, name and gender taken unchanged from
        the row).
    """
    def _text(value) -> str:
        """Render a cell value, using 'Unknown' for missing entries."""
        return 'Unknown' if pd.isna(value) else str(value)

    documents = []

    for _, character in characters_df.iterrows():
        content = "\n".join([
            f"Character: {_text(character['name'])}",
            f"Gender: {_text(character['gender'])}",
            f"Normalized Name: {_text(character['normalized_name'])}",
        ])

        documents.append({
            'content': content.strip(),
            'metadata': {
                'type': 'character',
                'character_id': character['id'],
                'name': character['name'],
                'gender': character['gender']
            }
        })

    return documents

In [None]:
def create_location_documents(locations_df: pd.DataFrame) -> List[Dict]:
    """Turn each location row into a text document plus metadata.

    The text is one ``Label: value`` pair per line with no leading indentation,
    so the source file's indentation does not leak into the indexed text.
    Missing values (NaN/None) are written as ``Unknown`` instead of ``nan``.

    Args:
        locations_df: Locations table. The columns read are ``id``, ``name``
            and ``normalized_name``.

    Returns:
        One dict per row with keys ``content`` (the text) and ``metadata``
        (``type='location'`` plus id and name taken unchanged from the row).
    """
    def _text(value) -> str:
        """Render a cell value, using 'Unknown' for missing entries."""
        return 'Unknown' if pd.isna(value) else str(value)

    documents = []

    for _, location in locations_df.iterrows():
        content = "\n".join([
            f"Location: {_text(location['name'])}",
            f"Normalized Name: {_text(location['normalized_name'])}",
        ])

        documents.append({
            'content': content.strip(),
            'metadata': {
                'type': 'location',
                'location_id': location['id'],
                'name': location['name']
            }
        })

    return documents

In [None]:
def create_script_documents(script_lines_df: pd.DataFrame, episodes_df: pd.DataFrame, 
                          characters_df: pd.DataFrame, locations_df: pd.DataFrame) -> List[Dict]:
    """Build one script document per episode from the spoken lines.

    Lines are grouped by ``episode_id``; each line with non-blank
    ``spoken_words`` becomes ``"<character>: <words>"`` with
    ``" [Location: <name>]"`` appended when the location is known. Characters
    and locations are resolved through the ``id`` -> ``name`` columns of their
    tables, falling back to ``'Unknown'``.

    Character and location ids go through the same tolerant lookup: missing
    ids, and ids that are not numeric, resolve to ``'Unknown'`` instead of
    raising. ``spoken_words`` values that are not strings are accepted too.

    Args:
        script_lines_df: Script lines; columns read are ``episode_id``,
            ``character_id``, ``location_id`` and ``spoken_words``.
        episodes_df: Episodes table; columns read are ``id`` and ``title``.
        characters_df: Characters table; columns read are ``id`` and ``name``.
        locations_df: Locations table; columns read are ``id`` and ``name``.

    Returns:
        One dict per episode that has at least one usable line, with keys
        ``content`` and ``metadata`` (``type='script'``, ``episode_id``,
        ``episode_title``, ``line_count``). Episodes missing from
        ``episodes_df`` are titled ``"Episode <episode_id>"``.
    """
    def _lookup_name(lookup: Dict, raw_id, default: str = 'Unknown') -> str:
        """Resolve a raw id cell to a name, tolerating NaN, floats and strings."""
        if pd.isna(raw_id):
            return default
        try:
            # Ids may arrive as 5, 5.0 or "5" depending on how pandas typed the column.
            key = int(raw_id)
        except (TypeError, ValueError, OverflowError):
            key = raw_id
        return lookup.get(key, lookup.get(raw_id, default))

    documents = []

    # Create lookup dictionaries for faster access
    episode_lookup = episodes_df.set_index('id')['title'].to_dict()
    character_lookup = characters_df.set_index('id')['name'].to_dict()
    location_lookup = locations_df.set_index('id')['name'].to_dict()

    # Group script lines by episode for context
    for episode_id, episode_lines in script_lines_df.groupby('episode_id'):
        episode_title = episode_lookup.get(episode_id, f"Episode {episode_id}")

        # Walk the three needed columns in parallel; this keeps row order and
        # avoids building a Series per row as iterrows() does.
        rows = zip(
            episode_lines['spoken_words'],
            episode_lines['character_id'],
            episode_lines['location_id'],
        )

        episode_script = []
        for spoken_words, character_id, location_id in rows:
            # str() guards against cells pandas parsed as numbers.
            if pd.isna(spoken_words) or not str(spoken_words).strip():
                continue

            character_name = _lookup_name(character_lookup, character_id)
            location_name = _lookup_name(location_lookup, location_id)

            script_text = f"{character_name}: {spoken_words}"
            if location_name != 'Unknown':
                script_text += f" [Location: {location_name}]"

            episode_script.append(script_text)

        # Join all lines for the episode
        full_script = "\n".join(episode_script)

        if full_script.strip():
            documents.append({
                'content': f"Episode: {episode_title}\n\nScript:\n{full_script}",
                'metadata': {
                    'type': 'script',
                    'episode_id': episode_id,
                    'episode_title': episode_title,
                    'line_count': len(episode_script)
                }
            })

    return documents

In [None]:
# NOTE(review): this notebook defines split_documents again in a later cell (under the
# "# Chunking" heading). When cells run top to bottom that later definition replaces
# this one — keep only one of the two.
def split_documents(documents: List[Dict], chunk_size: int = 1000,
                    chunk_overlap: int = 200) -> List[Dict]:
    """Split each document's text into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with character-count lengths and the
    separator list below.

    Args:
        documents: Dicts with a ``content`` string and a ``metadata`` dict.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 1000.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 200.

    Returns:
        One dict per chunk, in input order. ``content`` is the chunk text and
        ``metadata`` is a copy of the parent's metadata extended with
        ``chunk_id`` (0-based position within the parent) and ``total_chunks``.
        The input dicts are not modified.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ":", ".", "!", "?", ",", " ", ""]
    )

    split_docs = []

    for doc in documents:
        chunks = text_splitter.split_text(doc['content'])
        total_chunks = len(chunks)

        split_docs.extend(
            {
                'content': chunk,
                # Fresh dict per chunk so chunks never share a metadata object.
                'metadata': {
                    **doc['metadata'],
                    'chunk_id': chunk_id,
                    'total_chunks': total_chunks
                }
            }
            for chunk_id, chunk in enumerate(chunks)
        )

    return split_docs

# Chunking

In [None]:
# NOTE(review): an earlier cell in this notebook also defines split_documents; running
# this cell replaces that earlier definition — keep only one of the two.
def split_documents(documents: List[Dict], chunk_size: int = 1000,
                    chunk_overlap: int = 200) -> List[Dict]:
    """Split each document's text into overlapping chunks.

    Uses ``RecursiveCharacterTextSplitter`` with character-count lengths and the
    separator list below.

    Args:
        documents: Dicts with a ``content`` string and a ``metadata`` dict.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 1000.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 200.

    Returns:
        One dict per chunk, in input order. ``content`` is the chunk text and
        ``metadata`` is a copy of the parent's metadata extended with
        ``chunk_id`` (0-based position within the parent) and ``total_chunks``.
        The input dicts are not modified.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ":", ".", "!", "?", ",", " ", ""]
    )

    split_docs = []

    for doc in documents:
        chunks = text_splitter.split_text(doc['content'])
        total_chunks = len(chunks)

        split_docs.extend(
            {
                'content': chunk,
                # Fresh dict per chunk so chunks never share a metadata object.
                'metadata': {
                    **doc['metadata'],
                    'chunk_id': chunk_id,
                    'total_chunks': total_chunks
                }
            }
            for chunk_id, chunk in enumerate(chunks)
        )

    return split_docs

# Embeddings

In [None]:
def create_embeddings(documents: List[Dict]) -> tuple:
    """Encode every document's text with the shared ``embedding_model``.

    Args:
        documents: Dicts with ``content`` and ``metadata`` keys.

    Returns:
        A 3-tuple ``(embeddings, texts, metadata)``: whatever
        ``embedding_model.encode`` returns for the texts, the list of
        ``content`` strings, and the list of ``metadata`` dicts — the two
        lists are in input order.
    """
    texts = []
    metadata = []
    for entry in documents:
        texts.append(entry['content'])
        metadata.append(entry['metadata'])

    vectors = embedding_model.encode(texts, convert_to_tensor=False)

    return vectors, texts, metadata



# Optional ad-hoc export: embed a pre-chunked DataFrame (`chunk_df`, text in its "chunk"
# column) and save a flat L2 FAISS index to "simpsons_faiss.index".
# `chunk_df` is not created by any earlier cell, so on a fresh kernel these statements
# used to stop "Run All" with a NameError before the RAG pipeline below could run.
# The export now runs only when `chunk_df` already exists in the session.
# NOTE(review): no visible cell reads "simpsons_faiss.index", and setup_simpsons_rag()
# builds its own index — consider deleting this cell.
model = embedding_model  # reuse the model loaded above instead of loading the same weights again

if "chunk_df" in globals():
    embeddings = model.encode(chunk_df["chunk"].tolist(), show_progress_bar=True)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    # Hand FAISS float32 data explicitly, as build_faiss_index() does.
    index.add(np.asarray(embeddings, dtype="float32"))

    faiss.write_index(index, "simpsons_faiss.index")
else:
    print("Skipping ad-hoc FAISS export: `chunk_df` is not defined.")

In [None]:
def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """Build an inner-product FAISS index over L2-normalised embeddings.

    With unit-length vectors the inner product equals cosine similarity, so
    the scores returned by searching this index are cosine similarities.

    The input is first copied into a C-contiguous float32 array and only that
    copy is normalised. ``faiss.normalize_L2`` works in place, so normalising
    the argument directly would silently rewrite the caller's array, and it
    would be applied before the float32 conversion.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.

    Returns:
        A ``faiss.IndexFlatIP`` containing the normalised vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-dimensional (for example the
            result of encoding an empty list of texts).
    """
    vectors = np.array(embeddings, dtype='float32', order='C', copy=True)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_vectors, dimension); got shape {vectors.shape}"
        )

    # Normalize embeddings for cosine similarity (in place, on the private copy)
    faiss.normalize_L2(vectors)

    index = faiss.IndexFlatIP(vectors.shape[1])  # Inner product for cosine similarity
    index.add(vectors)

    return index

In [None]:
def search_similar_documents(query: str, index: faiss.Index, texts: List[str], 
                           metadata: List[Dict], k: int = 5) -> List[Dict]:
    """Return up to ``k`` indexed chunks most similar to ``query``.

    The query is encoded with the shared ``embedding_model``, converted to
    float32 and L2-normalised before it is passed to ``index.search``.

    When the index cannot supply ``k`` neighbours it pads the result with
    index ``-1``. Those slots are skipped; indexing ``texts[-1]`` with them
    would otherwise return the last chunk as a bogus hit.

    Args:
        query: Free-text question.
        index: Index to search (as built by ``build_faiss_index``).
        texts: Chunk texts, positionally aligned with the vectors in ``index``.
        metadata: Chunk metadata, aligned with ``texts``.
        k: Maximum number of results; values <= 0 give an empty list.

    Returns:
        A list of dicts with ``content``, ``metadata`` and ``score`` (float),
        in the order returned by the index.
    """
    if k <= 0:
        return []

    # Convert to contiguous float32 *before* normalising.
    query_embedding = np.ascontiguousarray(
        embedding_model.encode([query], convert_to_tensor=False), dtype='float32'
    )
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:
            # Padding slot: fewer than k vectors were available.
            continue
        results.append({
            'content': texts[idx],
            'metadata': metadata[idx],
            'score': float(score)
        })

    return results

In [None]:
def generate_response(query: str, context_docs: List[Dict],
                      model: str = "gpt-3.5-turbo", max_tokens: int = 500,
                      temperature: float = 0.7) -> str:
    """Answer ``query`` with an OpenAI chat model, grounded in retrieved documents.

    Each retrieved document is rendered as ``[TYPE] content`` followed by a
    blank line, and the concatenation is placed in the prompt's context section.

    Args:
        query: The user's question.
        context_docs: Retrieved documents; each needs ``content`` and
            ``metadata['type']``.
        model: Chat model name sent to the API. Defaults to ``"gpt-3.5-turbo"``.
        max_tokens: Completion token limit sent to the API. Defaults to 500.
        temperature: Sampling temperature sent to the API. Defaults to 0.7.

    Returns:
        The text of the first choice, or ``""`` when the API returns no text
        content for it, so the result is always a ``str``.
    """
    # Prepare context from retrieved documents
    context = "".join(
        f"[{doc['metadata']['type'].upper()}] {doc['content']}\n\n"
        for doc in context_docs
    )

    # Create prompt
    prompt = f"""
    You are a knowledgeable assistant about The Simpsons TV show. Use the following context to answer the user's question.
    
    Context:
    {context}
    
    Question: {query}
    
    Please provide a comprehensive answer based on the context provided. If the context doesn't contain enough information to fully answer the question, mention what information is available and what might be missing.
    
    Answer:
    """

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant specializing in The Simpsons TV show."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    answer = response.choices[0].message.content
    # The message content can be None; normalise to keep the declared str return type.
    return answer if answer is not None else ""

In [None]:
def setup_simpsons_rag():
    """Run the whole indexing pipeline and return what is needed to query it.

    Steps, in order: configure the OpenAI key, load the four tables, build
    episode / character / location / script documents, split them into
    chunks, embed the chunks and build the FAISS index. Progress is printed
    along the way.

    Returns:
        A tuple ``(index, texts, metadata)`` where ``texts`` and ``metadata``
        are the chunk texts and chunk metadata returned by
        ``create_embeddings`` and ``index`` is built from their embeddings.
    """
    print("Setting up OpenAI client...")
    setup_openai_client()

    print("Loading Simpsons data...")
    tables = load_simpsons_data()
    characters, episodes, locations, script_lines = tables

    print("Creating documents...")
    # Document order: episodes, characters, locations, then scripts.
    corpus = [
        *create_episode_documents(episodes),
        *create_character_documents(characters),
        *create_location_documents(locations),
        *create_script_documents(script_lines, episodes, characters, locations),
    ]
    print(f"Total documents before splitting: {len(corpus)}")

    print("Splitting documents...")
    chunks = split_documents(corpus)
    print(f"Total document chunks: {len(chunks)}")

    print("Creating embeddings...")
    vectors, texts, metadata = create_embeddings(chunks)

    print("Building FAISS index...")
    faiss_index = build_faiss_index(vectors)

    print("RAG system setup complete!")
    return faiss_index, texts, metadata

In [None]:
def query_simpsons_rag(query: str, index: faiss.Index, texts: List[str], metadata: List[Dict]) -> str:
    """Answer one question: retrieve matching chunks, then generate a reply.

    The five best matches from ``search_similar_documents`` are handed to
    ``generate_response`` as context. The query and the number of retrieved
    documents are printed.

    Args:
        query: The user's question.
        index: FAISS index to search.
        texts: Chunk texts aligned with the index.
        metadata: Chunk metadata aligned with ``texts``.

    Returns:
        Whatever ``generate_response`` returns for the query and the hits.
    """
    print(f"Searching for: {query}")

    # Retrieval step
    hits = search_similar_documents(query, index, texts, metadata, k=5)
    print(f"Found {len(hits)} relevant documents")

    # Generation step
    return generate_response(query, hits)

In [None]:
# Build the retrieval pipeline once; the three returned objects are reused for every query.
index, texts, metadata = setup_simpsons_rag()

# Example queries
sample_queries = [
    "Which episode has the most lines by Lisa?",
    "In what season did Milhouse appear most?",
    "Which characters were in the same location most often as Mr. Burns?",
]

# Run each question through the pipeline and print it under a banner.
for query in sample_queries:
    print(f"\n{'='*50}\nQuery: {query}\n{'='*50}")

    response = query_simpsons_rag(query, index, texts, metadata)
    print(f"Response: {response}")