In [2]:
import os

import pandas as pd

In [3]:
# Read the four Simpsons CSV tables (paths are relative to the working
# directory) and bind each resulting DataFrame to its own name, in order.
characters_df, episodes_df, locations_df, script_lines_df = map(
	pd.read_csv,
	(
		"data/simpsons_characters.csv",
		"data/simpsons_episodes.csv",
		"data/simpsons_locations.csv",
		"data/simpsons_script_lines.csv",
	),
)

  script_lines_df = pd.read_csv("data/simpsons_script_lines.csv")


In [6]:
# Order the episode table by season, breaking ties with the episode id.
episodes_df = episodes_df.sort_values(["season", "id"])

In [9]:
# Order the script lines by episode, then by their line number within it.
script_lines_df = script_lines_df.sort_values(["episode_id", "number"])

In [12]:
# Enrich every script line with episode, location and character metadata.
# In each merge the lookup table's "id" column is suffixed (e.g. "id_episode")
# whenever script_lines_df already carries a column named "id".

# Episode title, season and numbering, looked up through episode_id.
# Default (inner) join: lines without a matching episode are dropped.
script_lines_df = script_lines_df.merge(
	episodes_df[["id", "title", "season", "number_in_season", "number_in_series"]],
	left_on="episode_id",
	right_on="id",
	suffixes=("", "_episode"),
)
# Location name, looked up through location_id.
# Default (inner) join: lines without a matching location are dropped.
script_lines_df = script_lines_df.merge(
	locations_df[["id", "normalized_name"]],
	left_on="location_id",
	right_on="id",
	suffixes=("", "_location"),
)
# The looked-up name becomes "location_name" so the character merge below can
# bring in its own "normalized_name" column without a clash.
script_lines_df.rename(columns={"normalized_name": "location_name"}, inplace=True)
# Character name, looked up through character_id.
# The lookup key is cast to str before joining.
# NOTE(review): presumably character_id is read as strings -- confirm both key
# columns really hold the same representation, otherwise nothing matches.
characters_df["id"] = characters_df["id"].astype(str)
# character_id can be NaN, so this has to be a LEFT join: the default inner
# join would silently discard every line that has no matching character.
script_lines_df = script_lines_df.merge(
	characters_df[["id", "normalized_name"]],
	how="left",
	left_on="character_id",
	right_on="id",
	suffixes=("", "_character"),
)
# Lines without a character end up with character_name = NaN.
script_lines_df.rename(columns={"normalized_name": "character_name"}, inplace=True)

In [14]:
# get_episode_text (defined right below) concatenates, for a given episode_id,
# the text of every row whose speaking_line flag is set, so the flag has to be
# a genuine boolean column first.
# The flag may be stored as a real boolean or as text ("True" / "true" /
# "false"). A plain astype(bool) maps every non-empty string -- including
# "false" -- and NaN to True, so compare the lower-cased text with "true"
# instead. Missing values become False. Re-running this cell is harmless.
script_lines_df["speaking_line"] = (
	script_lines_df["speaking_line"].astype(str).str.strip().str.lower().eq("true")
)
def get_episode_text(episode_id, lines_df=None):
	"""Render the spoken lines of one episode as a single block of text.

	Each spoken line becomes "[location_name] (character_name): normalized_text"
	and the lines are joined with newlines, in the row order of the frame.

	Parameters
	----------
	episode_id
		Value matched against the "episode_id" column.
	lines_df : pandas.DataFrame, optional
		Frame to read from. It needs the columns "episode_id", "speaking_line"
		(boolean), "location_name", "character_name" and "normalized_text".
		Defaults to the notebook-level ``script_lines_df``.

	Returns
	-------
	str
		The formatted episode text; an empty string when no row qualifies.
	"""
	source = script_lines_df if lines_df is None else lines_df
	# Only these three columns are rendered, so only they have to be present.
	# A bare dropna() would also throw away lines that merely lack an unrelated
	# value somewhere else in the row.
	rendered_columns = ["location_name", "character_name", "normalized_text"]
	episode_lines = source[source["episode_id"] == episode_id]
	episode_lines = episode_lines.dropna(subset=rendered_columns)
	speaking_lines = episode_lines[episode_lines["speaking_line"]]
	locations = speaking_lines["location_name"].tolist()
	characters = speaking_lines["character_name"].tolist()
	texts = speaking_lines["normalized_text"].tolist()
	formatted = [
		f"[{loc}] ({char}): {text}"
		for loc, char, text in zip(locations, characters, texts)
	]
	return "\n".join(formatted)

Title: Simpsons Roasting on an Open Fire
Season: 1, Episode: 1, Episode in series: 1

[car] (marge simpson): ooo careful homer
[car] (homer simpson): theres no time to be careful
[car] (homer simpson): were late
[auditorium] (marge simpson): sorry excuse us pardon me
[auditorium] (homer simpson): hey norman hows it going so you got dragged down here too heh heh how ya doing fred excuse me fred
[auditorium] (homer simpson): pardon my galoshes
[auditorium] (seymour skinner): wasnt that wonderful and now santas of many lands as presented by the entire second grade class
[auditorium] (marge simpson): oh lisas class
[auditorium] (janey): frohlich weihnachten -- thats german for merry christmas in germany santas servant ruprecht gives presents to good children and whipping rods to the parents of bad ones
[auditorium] (todd flanders): meri kurimasu i am hotseiosha a japanese priest who acts like santa claus i have eyes in the back of my head so children better behave when im nearby
[auditoriu

In [17]:
# Write one plain-text file per episode into scripts/: a two-line header
# (title, then season / episode numbering), a blank line, then the spoken lines
# produced by get_episode_text.
os.makedirs("scripts", exist_ok=True)
episode_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]
for episode_id in episode_ids:
	# Filter once per episode instead of once per metadata field.
	episode_rows = script_lines_df[script_lines_df["episode_id"] == episode_id]
	if episode_rows.empty:
		# Without rows there is no metadata to read; .iloc[0] would raise an
		# opaque IndexError, so report the episode and move on.
		print(f"Skipping episode {episode_id}: no script lines found")
		continue
	# The episode columns were merged onto every script line, so the first row
	# of each column carries the episode's metadata.
	title, season, number_in_season, number_in_series = (
		episode_rows[column].iloc[0]
		for column in ("title", "season", "number_in_season", "number_in_series")
	)
	episode_text = get_episode_text(episode_id)
	episode_text = f"Season: {season}, Episode: {number_in_season}, Episode in series: {number_in_series}\n\n{episode_text}"
	episode_text = f"Title: {title}\n{episode_text}"
	# Explicit UTF-8 so the result does not depend on the platform's default
	# encoding (non-ASCII characters would otherwise fail on some systems).
	with open(f"scripts/season_{season}_episode_{episode_id}_text.txt", "w", encoding="utf-8") as f:
		f.write(episode_text)