In [None]:
import pandas as pd
import numpy as np
import faiss
from typing import List, Dict, Any
import json
import re
import zipfile
import os
from dotenv import load_dotenv
from faiss import write_index, read_index
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.ollama import OllamaEmbedding
import yaml
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

In [81]:
# load cofig.yaml
with open("../config.yaml", "r") as f:
	config = yaml.safe_load(f)

In [82]:
embedding_model = OllamaEmbedding(
    model_name=config["OLLAMA_EMBEDDING_MODEL"],
    base_url="http://localhost:11434",
)

In [None]:
"""Load all Simpsons CSV files into dataframes from data/simpsons/"""
files_path = "../data/simpsons/"

characters_df = pd.read_csv(os.path.join(files_path, 'simpsons_characters.csv'))
episodes_df = pd.read_csv(os.path.join(files_path, 'simpsons_episodes.csv'))
locations_df = pd.read_csv(os.path.join(files_path, 'simpsons_locations.csv'))
script_lines_df = pd.read_csv(
    os.path.join(files_path, 'simpsons_script_lines.csv'),
    quotechar='"',
    dtype={
        "speaking_line": "string", 
        "character_id": "string", 
    },
    na_values=["", "NaN"],
    low_memory=False
)

# speaking_line to boolean
script_lines_df["speaking_line"] = script_lines_df["speaking_line"].map({"TRUE": True, "FALSE": False})

# character_id to int64 with support for NaN
script_lines_df["character_id"] = pd.to_numeric(script_lines_df["character_id"], errors="coerce").astype("Int64")


print(f"Loaded {len(characters_df)} characters")
print(f"Loaded {len(episodes_df)} episodes")
print(f"Loaded {len(locations_df)} locations")
print(f"Loaded {len(script_lines_df)} script lines")

Loaded 6722 characters
Loaded 600 episodes
Loaded 4459 locations
Loaded 158271 script lines


In [84]:
episodes_df.sort_values(by=["season", "id"], inplace=True)
script_lines_df.sort_values(by=["episode_id", "number"], inplace=True)

# Use the episode_id from script_lines_df to get the episode title season and the number_in_season from episodes_df
merged_lines_df = script_lines_df.merge(
  episodes_df[["id", "title", "season", "number_in_season", "number_in_series"]],
  left_on="episode_id",
  right_on="id",
  how="left",
  suffixes=("", "_episode"),
)
# use the location_id from script_lines_df to get the location name from locations_df
merged_lines_df = merged_lines_df.merge(
  locations_df[["id", "normalized_name"]],
  left_on="location_id",
  right_on="id",
  how="left",
  suffixes=("", "_location"),
)
# rename the column to "location_name"
merged_lines_df.rename(columns={"normalized_name": "location_name"}, inplace=True)
# use the character_id from script_lines_df to get the character name from characters_df

# concatenate all the raw_text when speaking_line == True or true into a single string
# for a given episode_id
merged_lines_df["speaking_line"] = merged_lines_df["speaking_line"].astype(bool)

In [100]:
merged_lines_df.head(3)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count,id_episode,title,season,number_in_season,number_in_series,id_location,location_name
0,1,1,0,(Street: ext. street - establishing - night),8000,True,,1.0,,Street,,,,1,Simpsons Roasting on an Open Fire,1,1,1,1.0,street
1,2,1,1,(Car: int. car - night),8000,True,,2.0,,Car,,,,1,Simpsons Roasting on an Open Fire,1,1,1,2.0,car
2,3,1,2,"Marge Simpson: Ooo, careful, Homer.",8000,True,1.0,2.0,Marge Simpson,Car,"Ooo, careful, Homer.",ooo careful homer,3.0,1,Simpsons Roasting on an Open Fire,1,1,1,2.0,car


In [86]:
def get_episode_text(episode_id):
	episode_lines = merged_lines_df[merged_lines_df["episode_id"] == episode_id]
	episode_lines = episode_lines[~episode_lines["normalized_text"].isna()]
	# drop those where normalized_name is NaN
	episode_lines = episode_lines.dropna()
	speaking_lines = episode_lines[episode_lines["speaking_line"]]
	locations = speaking_lines["location_name"].tolist()
	characters = speaking_lines["raw_character_text"].str.lower().values
	text_lines = speaking_lines["normalized_text"].tolist()
	# Concatenate every location name from locations list with the corresponding speaking line from text_lines list and character from characters list
	# such as: "[location] character_name: speaking line"
	text_lines = [f"[{loc}] ({char}): {text}" for loc, char, text in zip(locations, characters, text_lines)]
	# Join all the text lines into a single string, separated by newlines
	return f"\n".join(text_lines)

In [99]:
output_dir = "../output"
os.makedirs(output_dir, exist_ok=True)
# Episodes ids to generate scripts for

written_episodes = []
skipped_episodes = []

episode_ids = episodes_df['id'].to_list()
#episode_ids = [128, 129]
for episode_id in episode_ids:
    if episode_id not in merged_lines_df["episode_id"].values:
        skipped_episodes.append({
            "episode_id": episode_id,
            "reason": "no script lines"
        })
        continue

    episode_text = get_episode_text(episode_id)

    title_row = merged_lines_df[merged_lines_df["episode_id"] == episode_id]

    # skip if no usable lines (e.g. only NaNs)
    if title_row.empty or not episode_text.strip():
        print(f"Skipping episode {episode_id} (no valid lines)")
        skipped_episodes.append({
            "episode_id": episode_id,
            "reason": "no valid lines"
        })
        continue

    title = title_row["title"].iloc[0]
    season = title_row["season"].iloc[0]
    number_in_season = title_row["number_in_season"].iloc[0]
    number_in_series = title_row["number_in_series"].iloc[0]

    episode_text = f"Season: {season}, Episode: {number_in_season}, Episode in series: {number_in_series}\n\n{episode_text}"
    episode_text = f"Title: {title}\n{episode_text}"
    written_episodes.append(episode_id)

# save into a file	
    with open(f"{output_dir}/scripts/season_{season}_episode_{episode_id}_text.txt", "w") as f:
            f.write(episode_text)

with open(f"{output_dir}/skipped_episodes.json", "w") as f:
    json.dump(skipped_episodes, f, indent=2)
    
print(len(skipped_episodes),"episodes not written, check 'skipped_episodes.json' file for more info")

36 episodes not written, check 'skipped_episodes.json' file for more info


In [None]:
documents = []
all_docs_paths = os.listdir(f"../output/scripts")
# all_docs_paths = ["season_1_episode_1_text.txt"]
for doc_path in all_docs_paths:
	with open(f"../output/scripts/{doc_path}", "r") as f:
		text = f.read()
		documents.append(Document(text=text))

season_9_episode_179_text.txt
season_18_episode_391_text.txt
season_18_episode_390_text.txt
season_8_episode_162_text.txt
season_21_episode_462_text.txt
season_21_episode_463_text.txt
season_8_episode_163_text.txt
season_26_episode_563_text.txt
season_26_episode_562_text.txt
season_20_episode_431_text.txt
season_13_episode_289_text.txt
season_20_episode_430_text.txt
season_13_episode_288_text.txt
season_2_episode_29_text.txt
season_2_episode_28_text.txt
season_6_episode_104_text.txt
season_6_episode_105_text.txt
season_12_episode_249_text.txt
season_22_episode_470_text.txt
season_22_episode_471_text.txt
season_24_episode_522_text.txt
season_24_episode_523_text.txt
season_10_episode_209_text.txt
season_10_episode_208_text.txt
season_25_episode_538_text.txt
season_19_episode_415_text.txt
season_19_episode_414_text.txt
season_25_episode_539_text.txt
season_5_episode_98_text.txt
season_5_episode_99_text.txt
season_2_episode_17_text.txt
season_2_episode_16_text.txt
season_14_episode_310_tex

# Chunking

In [90]:
splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
# Nodes represent chunks of source documents in Llamaindex
nodes = splitter.get_nodes_from_documents(documents)

# Embeddings

In [101]:
texts = [node.text for node in nodes]

embeddings: List[List] = embedding_model.get_text_embedding_batch(texts, show_progress=True)
embeddings: np.ndarray = np.array(embeddings, dtype=np.float32)

Generating embeddings:   0%|          | 0/3399 [00:00<?, ?it/s]

In [103]:
np.savez_compressed(
    f"../output/embeddings/embeddings_bge_m3.npz",
    texts=np.array(texts),
    embeddings=embeddings
)

In [None]:
data = np.load(f"../output/embeddings/embeddings_bge_m3.npz", allow_pickle=True)
texts = data["texts"]
embeddings = data["embeddings"]

In [104]:
"""Build FAISS index for similarity search"""
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity

# Normalize embeddings for cosine similarity
faiss.normalize_L2(embeddings)
index.add(embeddings.astype('float32'))

output_faiss_path = f"../output/faiss_index_bge_m3.index"
faiss.write_index(index, output_faiss_path)

In [136]:
# Initialize Ollama LLM
# https://docs.llamaindex.ai/en/stable/api_reference/llms/ollama/
llm = Ollama(
    model=config["OLLAMA_LLM_MODEL"],
    temperature=0.7,
    request_timeout = 7200,
)

In [137]:
query = "Who are all the relatives of Bart Simpson, including their relationships to each other?"

In [138]:
query_embedding = embedding_model.get_text_embedding_batch([query])
query_embedding: np.ndarray = np.array(query_embedding, dtype=np.float32)
faiss.normalize_L2(query_embedding)

scores, indices = index.search(query_embedding.astype('float32'), k=10)

results = []

for i, idx in enumerate(indices[0]):
    results.append({
        'content': texts[idx],
        'score': float(scores[0][i])
    })
   

In [None]:
query_1 = "Who are all the relatives of Bart Simpson, including their relationships to each other?"

In [130]:
query_2 = "Which characters interacted directly with Mr. Burns on the day of the shooting?"

In [141]:
query_3 = "What are the top 5 themes discussed in the episodes?"

In [142]:
query = query_2
"""Generate response using OpenAI with retrieved context"""
# Prepare context from retrieved documents

context = ""
for doc in results:
    context += f"{doc['content']}\n\n"

# Create prompt
prompt = f"""
You are a knowledgeable assistant about The Simpsons TV show. Use the following context to answer the user's question.

Context:
{context}

Question: {query}

Please provide a comprehensive answer based ONLY on the context provided. If the context doesn't contain enough information to fully answer the question, mention what information is available and what might be missing.

Answer:
"""

messages = [
    ChatMessage(
        role="system", 
        content="You are a helpful assistant specializing in The Simpsons TV show.",
    ),
    ChatMessage(
        role="user",
        content=prompt
    )
]
response = llm.chat(messages)

In [134]:
response

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={'tool_calls': None, 'thinking': None}, blocks=[TextBlock(block_type='text', text='Based on the provided context, the following characters interacted directly with Mr. Burns on the day of the shooting:\n\n*   **Montgomery Burns:** He initiated the meeting at the Simpson home.\n*   **Homer Simpson:** He was present at the Simpson home during Mr. Burns\' visit.\n*   **Marge Simpson:** She offered Mr. Burns tea and marshmallow squares.\n*   **Waylon Smithers:** He provided information about the jade monkey and maps.\n*   **Bart Simpson:** He made comments to Mr. Burns.\n*   **Lisa Simpson:** She was present during the interaction.\n*   **Edna Krabappel:** She was present during the interaction.\n*   **Seymour Skinner:** He was present during the interaction.\n\nThe context doesn\'t provide details about the shooting itself, only Mr. Burns\' visit to the Simpson home. Therefore, we cannot determin

In [140]:
response.message.blocks[0].text

"Based on the provided context, the following characters interacted directly with Mr. Burns on the day of the shooting:\n\n*   **Waylon Smithers:** He informed Mr. Burns that the jade monkey and maps were found in Mr. Burns' glove compartment.\n*   **Charles Montgomery Burns:** He directly addressed Waylon Smithers and gave instructions regarding the jade monkey and the next full moon.\n\n\n\nThe context does not provide information about any other characters interacting with Mr. Burns on that day."