# Setup

## Python version

In [None]:
# Print the version of the Python interpreter executing this notebook,
# so that results can be tied to a specific Python release.
from platform import python_version
print(python_version())

## Libraries

In [None]:
from bs4 import BeautifulSoup as bs
from warnings import warn

import souphelper 
from souphelper import *

import random
from pprint import pprint

# Episodes Scraping from Fandom 

We want to scrape every episode of The Simpsons from the Fandom wiki, collecting the following fields for each one:

* Number total [Integer]
* Number in season [Integer]
* Season number [Integer]
* Title [String]
* Air date [Date with no time]
* Production code [string]
* Main characters [list of strings]
* Written by [String]
* Directed by [String]

In [None]:
# Root of the wiki; prepended to the site-relative hrefs collected by episodesURLs.
BASE_URL = "https://simpsons.fandom.com"

In [None]:
# Separator handed to souphelper's handleLinebreaks by maincharacters().
# NOTE(review): presumably replaces line breaks between character names -- confirm in souphelper.
STR_SEPARATOR = ","

def title(episodeInfobox: bs):
    """Return the episode title taken from the infobox's first ``<h2>``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The heading text with surrounding whitespace removed, or ``None``
        when the infobox is missing, has no ``<h2>``, or the heading does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    titleTag = episodeInfobox.find("h2")
    if not titleTag:
        return None
    text = titleTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def image(episodeInfobox: bs):
    """Return the ``src`` of the image inside the infobox's ``<figure>``.

    The image is looked up as ``figure > a > img``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The value of the image's ``src`` attribute, or ``None`` when the
        infobox, the figure, the link, the image or the attribute is missing.
    """
    if not episodeInfobox:
        return None
    figureTag = episodeInfobox.figure
    if not figureTag:
        return None
    # Each step of the figure > a > img chain may be absent; stop at the first gap
    # instead of raising on a None attribute access.
    linkTag = figureTag.a
    imgTag = linkTag.img if linkTag else None
    if not imgTag:
        return None
    # .get() yields None for an <img> without a src rather than raising KeyError.
    return imgTag.get("src")

def episode_number(episodeInfobox: bs):
    """Return the text of the infobox entry with ``data-source="Episode Number"``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The entry's string with surrounding whitespace removed, or ``None``
        when the infobox or the entry is missing, or when the entry does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    episode_numberTag = episodeInfobox.find(attrs={"data-source": "Episode Number"})
    if not episode_numberTag:
        return None
    text = episode_numberTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def production_code(episodeInfobox: bs):
    """Return the text of the infobox entry with ``data-source="productionCode"``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The entry's string with surrounding whitespace removed, or ``None``
        when the infobox or the entry is missing, or when the entry does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    production_codeTag = episodeInfobox.find(attrs={"data-source": "productionCode"})
    if not production_codeTag:
        return None
    text = production_codeTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def airdate(episodeInfobox: bs):
    """Return the text of the infobox entry with ``data-source="originalAirdate"``.

    The value is returned as found on the page; no date parsing is done here.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The entry's string with surrounding whitespace removed, or ``None``
        when the infobox or the entry is missing, or when the entry does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    airdateTag = episodeInfobox.find(attrs={"data-source": "originalAirdate"})
    if not airdateTag:
        return None
    text = airdateTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def maincharacters(episodeInfobox: bs):
    """Return the main characters listed in the infobox.

    Locates the entry with ``data-source="main_character(s)"``, takes its
    first ``<div>`` and passes it to ``handleLinebreaks`` together with
    ``STR_SEPARATOR``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The value produced from the entry's ``<div>`` (see the review note in
        the body), or ``None`` when the infobox, the entry or its ``<div>``
        is missing.
    """
    if not episodeInfobox:
        return None
    maincharactersTag = episodeInfobox.find(attrs={"data-source": "main_character(s)"})
    if not maincharactersTag:
        return None
    maincharactersContent = maincharactersTag.div
    if not maincharactersContent:
        return None
    # The computed value used to be dropped, so this function always returned None.
    flattened = handleLinebreaks(maincharactersContent, STR_SEPARATOR)
    if flattened is None:
        # NOTE(review): handleLinebreaks lives in souphelper and is not visible here.
        # If it returns nothing it is assumed to have rewritten the tag in place,
        # so the text is read back from the tag -- confirm against souphelper.
        flattened = maincharactersContent.get_text()
    if isinstance(flattened, str):
        return flattened.strip()
    return flattened

def writtenby(episodeInfobox: bs):
    """Return the text of the infobox entry with ``data-source="Written By"``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The entry's string with surrounding whitespace removed, or ``None``
        when the infobox or the entry is missing, or when the entry does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    writtenbyTag = episodeInfobox.find(attrs={"data-source": "Written By"})
    if not writtenbyTag:
        return None
    text = writtenbyTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def directedby(episodeInfobox: bs):
    """Return the text of the infobox entry with ``data-source="Directed By"``.

    Args:
        episodeInfobox: The infobox element of an episode page, or ``None``.

    Returns:
        The entry's string with surrounding whitespace removed, or ``None``
        when the infobox or the entry is missing, or when the entry does not
        hold a single string (``.string`` is ``None``).
    """
    if not episodeInfobox:
        return None
    directedbyTag = episodeInfobox.find(attrs={"data-source": "Directed By"})
    if not directedbyTag:
        return None
    text = directedbyTag.string
    # str(None) would yield the literal text "None"; report a missing value instead.
    if text is None:
        return None
    return str(text).strip()

def episodeAttrs(episodePage: bs, **moreAttributes):
    """Build the attribute dictionary for one episode page.

    The title comes from the element with ``id="firstHeading"``; every other
    field is read from the element with class ``portable-infobox`` through
    the field helpers, each of which yields ``None`` when its data is absent.

    Args:
        episodePage: Parsed episode page.
        **moreAttributes: Extra key/value pairs placed first in the result.
            A scraped field with the same key overrides the extra value.

    Returns:
        A dict holding the extra attributes followed by ``"title"``,
        ``"image"``, ``"episode number"``, ``"production code"``,
        ``"airdate"``, ``"main character(s)"``, ``"written by"`` and
        ``"directed by"``.
    """
    infobox = episodePage.find(class_="portable-infobox")
    heading = episodePage.find(id="firstHeading")
    # A missing heading yields None instead of an AttributeError; get_text()
    # also copes with headings containing nested markup, where .string is None.
    pageTitle = heading.get_text().strip() if heading else None
    return {
        **moreAttributes,
        "title": pageTitle,
        "image": image(infobox),
        "episode number": episode_number(infobox),
        "production code": production_code(infobox),
        "airdate": airdate(infobox),
        "main character(s)": maincharacters(infobox),
        "written by": writtenby(infobox),
        "directed by": directedby(infobox),
    }

In [None]:
def esepisodesPage(page: bs):
    """Tell whether *page* is the wiki's "List of Episodes" page.

    The check compares the stripped string of the element with
    ``id="firstHeading"`` against ``"List of Episodes"``.

    Args:
        page: Parsed wiki page.

    Returns:
        ``True`` when the heading matches; ``False`` otherwise, including
        when the page has no such heading or the heading holds no single
        string.
    """
    heading = page.find(id="firstHeading")
    if not heading:
        return False
    headingText = heading.string
    # .string is None for headings with nested markup; that cannot be a match.
    if headingText is None:
        return False
    return headingText.strip() == "List of Episodes"

def episodesURLs(episodesPage: bs):
    """Collect the absolute URLs of the episodes linked from an episodes page.

    Takes the first element with class ``oLeft`` as the episodes section and
    reads the first ``<a>`` of every ``oLeft`` element nested inside it.

    Args:
        episodesPage: Parsed "List of Episodes" page.

    Returns:
        A list of URLs, each built as ``BASE_URL`` plus the link's ``href``.
        Entries without a link or without an ``href`` are skipped.

    Raises:
        ValueError: If *episodesPage* is not an episodes page.
        IndexError: If the page contains no element with class ``oLeft``.
    """
    if not esepisodesPage(episodesPage):
        raise ValueError("Soup received is not a episodes page")

    episodesSection = episodesPage.find_all(class_="oLeft")[0]

    links = []
    for episode in episodesSection.find_all(class_="oLeft"):
        anchor = episode.a
        # An entry without a usable link is skipped rather than aborting the whole listing.
        href = anchor.get("href") if anchor else None
        if href:
            links.append(BASE_URL + href)
    return links


In [None]:
# Default page from which scrapeEpisodes starts collecting episodes.
START_PAGE = "https://simpsons.fandom.com/wiki/List_of_Episodes"

# When TEST is truthy, scrapeEpisodesPage scrapes and prints one randomly
# chosen episode of the page instead of scraping every episode.
TEST = True # scrape some random episodes
# When set, the __main__ section scrapes and prints only this episode URL.
EPISODE_TEST_URL = None # scrape only this if not None

def scrapeEpisode(url: str):
    """Scrape a single episode page into an attribute dictionary.

    URLs containing ``User:`` or ``Category:`` are not fetched and yield
    ``None``; any other URL is loaded with ``soup`` and converted by
    ``episodeAttrs``, with the URL itself stored under ``"url"``.
    """
    # User and category URLs are skipped without fetching anything.
    if "User:" in url or "Category:" in url:
        return None
    return episodeAttrs(soup(url), url=url)

def scrapeEpisodesPage(url: str):
    """Scrape the episodes linked from an episodes page.

    With ``TEST`` enabled only one randomly chosen episode is scraped, and
    its URL and attributes are printed; otherwise every linked episode is
    scraped.

    Args:
        url: Address of a page containing a list of episodes.

    Returns:
        A list of episode dictionaries. Episodes for which ``scrapeEpisode``
        yields ``None`` are left out in both modes; the list is empty when
        the page links to no episodes.
    """
    episodesPage = soup(url)
    candidateURLs = episodesURLs(episodesPage)

    if TEST:
        # random.choice raises IndexError on an empty sequence.
        if not candidateURLs:
            return []
        episodeURL = random.choice(candidateURLs)
        print("Testing", episodeURL)
        episode = scrapeEpisode(episodeURL)
        pprint(episode, sort_dicts=False)
        # Same filtering as the full run: a skipped episode is not stored as None.
        return [episode] if episode else []

    episodes = []
    for episodeURL in candidateURLs:
        episode = scrapeEpisode(episodeURL)
        if episode:  # don't append if episode is None
            episodes.append(episode)
    return episodes

def scrapeEpisodes(startPage = START_PAGE):
    """Scrape the episodes reachable from *startPage*.

    Args:
        startPage: URL of the episodes page to scrape. A falsy value means
            nothing is scraped.

    Returns:
        A tuple ``(episodes, episodesPages)``: the scraped episode
        dictionaries and the list of episodes-page URLs that were visited.
    """
    episodes = []
    episodesPages = []
    if startPage:
        episodesPages.append(startPage)
        # scrapeEpisodesPage returns only the episode list. Unpacking it into
        # (episodes, next page URL) raised ValueError or mis-assigned the values,
        # and with no next-page link there is nothing further to follow.
        episodes.extend(scrapeEpisodesPage(startPage))
    return episodes, episodesPages

if __name__ == "__main__":
    # A single-episode check takes precedence over the random test run;
    # with neither EPISODE_TEST_URL nor TEST set, nothing is scraped here.
    if EPISODE_TEST_URL:
        print("Testing single episode " + EPISODE_TEST_URL)
        singleEpisode = scrapeEpisode(EPISODE_TEST_URL)
        pprint(singleEpisode, sort_dicts=False)
    elif TEST:
        scrapeEpisodes()