# Setup

## Python version

In [1]:
from platform import python_version
print(python_version())

3.9.13


## Libraries

In [2]:
from bs4 import BeautifulSoup as bs
from warnings import warn

import souphelper 
from souphelper import *

import random
from pprint import pprint

# Episodes Scraping from Fandom 

We want scraping from Fandom site all the simpsons episodes with: 

* Number total [Integer]
* Number in season [Integer]
* Season number [Integer]
* Title [String]
* Air date [Date with no time]
* Production code [string]
* Main characters [list of strings]
* Written by [String]
* Directed by [String]

In [3]:
BASE_URL = "https://simpsons.fandom.com"

In [4]:
STR_SEPARATOR = ","

def title(episodeInfobox:bs):
    if episodeInfobox:
        titleTag = episodeInfobox.find("h2")
        if titleTag:
            return str(titleTag.string).strip()
    return None

def image(episodeInfobox:bs):
    if episodeInfobox:
        imageTag = episodeInfobox.figure
        if imageTag:
            return imageTag.a.img["src"]
    return None

def episode_number(episodeInfobox:bs):
    if episodeInfobox:
        episodeNumberTag = episodeInfobox.find(attrs={"data-source": "Episode Number"})
        if episodeNumberTag:
            return str(episodeNumberTag.div.string).strip()
    return None

def production_code(episodeInfobox:bs):
    if episodeInfobox:
        production_codeTag = episodeInfobox.find(attrs={"data-source": "productionCode"})
        if production_codeTag:
            return str(production_codeTag.div.string).strip()
    return None

def airdate(episodeInfobox:bs):
    if episodeInfobox:
        airdateTag = episodeInfobox.find(attrs={"data-source": "originalAirdate"})
        if airdateTag:
            return str(airdateTag.div.string).strip()
    return None

def maincharacters(episodeInfobox:bs):
    if episodeInfobox:
        mainCharactersTag = episodeInfobox.find(attrs={"data-source": "main_character(s)"})
        if mainCharactersTag:
            mainCharactersContent = mainCharactersTag.div
            handleP(mainCharactersContent)
            handleLinebreaks(mainCharactersContent, STR_SEPARATOR)
            return str(mainCharactersContent.string).strip()
    return None

def writtenby(episodeInfobox:bs):
    if episodeInfobox:
        writtenbyTag = episodeInfobox.find(attrs={"data-source": "Written By"})
        if writtenbyTag:
            writtenbyContent = writtenbyTag.div
            handleP(writtenbyContent)
            handleLinebreaks(writtenbyContent, STR_SEPARATOR)
            return str(writtenbyTag.div.string).strip()
    return None

def directedby(episodeInfobox:bs):
    if episodeInfobox:
        directedbyTag = episodeInfobox.find(attrs={"data-source": "Directed By"})
        if directedbyTag:
            return str(directedbyTag.div.string).strip()
    return None

def episodeAttrs(episodePage:bs, **moreAttributes):
    infobox = episodePage.find(class_="portable-infobox")
    return {
        **moreAttributes,
        "title": episodePage.find(id="firstHeading").string.strip(),
        "image": image(infobox),
        "episode number": episode_number(infobox),
        "production code": production_code(infobox),
        "airdate": airdate(infobox),
        "main character(s)": maincharacters(infobox),
        "written by": writtenby(infobox),
        "directed by": directedby(infobox)
    }

In [5]:
def esepisodesPage(page:bs):
    return page.find(id="firstHeading").string.strip() == "List of Episodes"

def episodesURLs(episodesPage:bs):
    if not esepisodesPage(episodesPage):
        raise ValueError("Soup received is not a episodes page")
    
    episodesSection = episodesPage.find_all(class_="oLeft")[0]
    episodes = episodesSection.find_all(class_="oLeft")

    links = []

    for episode in episodes:
        links.append(BASE_URL + episode.a["href"])
    return links


In [6]:
EPISODE_PAGE = "https://simpsons.fandom.com/wiki/List_of_Episodes"

TEST = True # scrape some random episodes
EPISODE_TEST_URL = "https://simpsons.fandom.com/wiki/Homer%27s_Odyssey" # scrape only this if not None

def scrapeEpisode(url:str):
    episodePage = soup(url)
    episode = episodeAttrs(episodePage, url=url)
    return episode


def scrapeEpisodes(EpisodePage = EPISODE_PAGE):
    episodes = []
    
    episode = scrapeEpisode(EpisodePage)
    pprint(episode, sort_dicts=False)
    episodes.append(episode)

    return episodes

if __name__ == "__main__":
    if EPISODE_TEST_URL:
        print("Testing single episode " + EPISODE_TEST_URL)
        pprint(scrapeEpisode(EPISODE_TEST_URL), sort_dicts=False)
    else:
        if TEST:
            scrapeEpisodes()

Testing single episode https://simpsons.fandom.com/wiki/Homer%27s_Odyssey
{'url': 'https://simpsons.fandom.com/wiki/Homer%27s_Odyssey',
 'title': "Homer's Odyssey",
 'image': 'https://static.wikia.nocookie.net/simpsons/images/e/ed/Homer%27s_Odyssey_%28Mr._Burns_Looking_Out_the_Window%29.png/revision/latest/scale-to-width-down/350?cb=20200915011051',
 'episode number': '3',
 'production code': '7G03',
 'airdate': 'January 21, 1990',
 'main character(s)': 'None',
 'written by': 'None',
 'directed by': 'Wes Archer'}
