# Setup

## Python version

In [1]:
# Print the version of the Python interpreter running this notebook.
from platform import python_version
print(python_version())

3.9.13


## Libraries

In [2]:
from bs4 import BeautifulSoup as bs
from warnings import warn

import souphelper 
from souphelper import *

import random
from pprint import pprint

import time
import csv

# Episodes Scraping from Fandom 

We want to scrape every Simpsons episode from the Fandom site, collecting the following attributes for each one:

* Number total [Integer]
* Number in season [Integer]
* Season number [Integer]
* Title [String]
* Air date [Date with no time]
* Production code [string]
* Main characters [list of strings]
* Written by [String]
* Directed by [String]

In [3]:
# Root URL of the wiki; prepended to the hrefs collected by episodesURLs().
BASE_URL = "https://simpsons.fandom.com"

We define one extractor function for each episode attribute we are interested in.

In [4]:
# Separator handed to handleLinebreaks() for the multi-valued infobox fields
# (main characters, written by, directed by).
STR_SEPARATOR = ","

def title(episodeInfobox:bs):
    """Return the stripped text of the infobox's first <h2>.

    Returns None when the infobox is falsy or contains no <h2>.
    """
    if not episodeInfobox:
        return None

    heading = episodeInfobox.find("h2")
    if not heading:
        return None

    return str(heading.string).strip()

def image(episodeInfobox:bs):
    """Return the `src` of the image nested in the infobox's first <figure>.

    Returns None when the infobox is falsy or has no <figure>.
    """
    if not episodeInfobox:
        return None

    figure = episodeInfobox.figure
    if not figure:
        return None

    # The image is reached through figure -> a -> img.
    picture = figure.a.img
    return picture["src"]

def episode_number(episodeInfobox:bs):
    """Return the stripped text of the "Episode Number" infobox field.

    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "Episode Number"})
    if not field:
        return None

    # The value is read from the field's first <div>.
    return str(field.div.string).strip()

def season(episodePage):
    """Return the integers found in the page's first top category label.

    The element whose ``data-tracking-label`` is ``categories-top-more-0`` is
    looked up and every whitespace-separated token of its text that consists
    only of decimal digits is converted to ``int``.

    Args:
        episodePage: parsed episode page (anything exposing ``find(attrs=...)``).

    Returns:
        A list of ints (empty when the label contains no number), or None when
        the page is falsy, the label element is missing, or the label has no
        single text string.
    """
    if not episodePage:
        return None

    label = episodePage.find(attrs={"data-tracking-label": "categories-top-more-0"})
    # A missing element, or one without a single string, previously raised
    # AttributeError; report "no value" like the other extractors do.
    text = label.string if label is not None else None
    if text is None:
        return None

    # isdecimal() (not isdigit()) so that only tokens int() can parse are kept;
    # isdigit() also accepts characters such as superscripts, which int() rejects.
    return [int(token) for token in text.split() if token.isdecimal()]

def production_code(episodeInfobox:bs):
    """Return the stripped text of the "productionCode" infobox field.

    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "productionCode"})
    if not field:
        return None

    # The value is read from the field's first <div>.
    return str(field.div.string).strip()

def airdate(episodeInfobox:bs):
    """Return the stripped text of the "originalAirdate" infobox field.

    The value is returned as text; no date parsing is performed here.
    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "originalAirdate"})
    if not field:
        return None

    # The value is read from the field's first <div>.
    return str(field.div.string).strip()

def maincharacters(episodeInfobox:bs):
    """Return the "main_character(s)" infobox field as one stripped string.

    The field's first <div> is passed through handleP, handleLinks and
    handleLinebreaks (with STR_SEPARATOR) before its text is read.
    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "main_character(s)"})
    if not field:
        return None

    content = field.div
    # The helpers are applied to the value <div> in this fixed order.
    handleP(content)
    handleLinks(content)
    handleLinebreaks(content, STR_SEPARATOR)
    return str(content.string).strip()

def writtenby(episodeInfobox:bs):
    """Return the "Written By" infobox field as one stripped string.

    The field's first <div> is passed through handleP, handleLinks and
    handleLinebreaks (with STR_SEPARATOR) before the text is read.
    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "Written By"})
    if not field:
        return None

    content = field.div
    handleP(content)
    handleLinks(content)
    handleLinebreaks(content, STR_SEPARATOR)
    # The <div> is looked up again on the field after the helpers have run.
    return str(field.div.string).strip()

def directedby(episodeInfobox:bs):
    """Return the "Directed By" infobox field as one stripped string.

    The field's first <div> is passed through handleP, handleLinks and
    handleLinebreaks (with STR_SEPARATOR) before the text is read.
    Returns None when the infobox is falsy or the field is absent.
    """
    if not episodeInfobox:
        return None

    field = episodeInfobox.find(attrs={"data-source": "Directed By"})
    if not field:
        return None

    content = field.div
    handleP(content)
    handleLinks(content)
    handleLinebreaks(content, STR_SEPARATOR)
    # The <div> is looked up again on the field after the helpers have run.
    return str(field.div.string).strip()

def episodeAttrs(episodePage:bs, **moreAttributes):
    """Collect every scraped attribute of one episode page into a dict.

    Args:
        episodePage: parsed episode page.
        **moreAttributes: extra key/value pairs placed first in the result;
            a scraped attribute with the same key overrides the value.

    Returns:
        dict with the extra attributes followed by "title", "image", "season",
        "episode number", "production code", "airdate", "main character(s)",
        "written by" and "directed by".
    """
    infobox = episodePage.find(class_="portable-infobox")
    heading = episodePage.find(id="firstHeading")

    # Start from the caller-supplied pairs so they keep the leading positions.
    attributes = dict(moreAttributes)
    attributes["title"] = heading.string.strip()
    attributes["image"] = image(infobox)
    attributes["season"] = season(episodePage)
    attributes["episode number"] = episode_number(infobox)
    attributes["production code"] = production_code(infobox)
    attributes["airdate"] = airdate(infobox)
    attributes["main character(s)"] = maincharacters(infobox)
    attributes["written by"] = writtenby(infobox)
    attributes["directed by"] = directedby(infobox)
    return attributes

Other necessary functions

In [5]:
# URL of the wiki's episode index; default argument of scrapeEpisodes().
EPISODE_PAGE = "https://simpsons.fandom.com/wiki/List_of_Episodes"

In [6]:
def esepisodesPage(page:bs):
    """Tell whether the parsed page is the wiki's "List of Episodes" page.

    The check compares the stripped text of the element with id
    "firstHeading" against "List of Episodes".

    Returns:
        True on a match; False otherwise, including when the heading is
        missing or has no single text string (previously an AttributeError).
    """
    heading = page.find(id="firstHeading")
    if heading is None:
        return False

    text = heading.string
    if text is None:
        return False

    return text.strip() == "List of Episodes"

def episodesURLs(episodesPage:bs):
    """Build the list of episode URLs from the parsed "List of Episodes" page.

    Raises:
        ValueError: when esepisodesPage() rejects the soup.
    """
    if not esepisodesPage(episodesPage):
        raise ValueError("Soup received is not a episodes page")
    
    # NOTE(review): only the FIRST <a class="oLeft"> match is kept ([0]), and the
    # loop below iterates over that single tag's children. This raises IndexError
    # when nothing matches, and it looks like the intent was to iterate over all
    # matching elements instead — confirm against the live page markup.
    episodes = episodesPage.find_all("a", class_="oLeft")[0]
    
    links = []
    for episode in episodes:
        # hrefs are presumably site-relative, hence the BASE_URL prefix — TODO confirm
        links.append(BASE_URL + episode.a["href"])
    return links


In [7]:
def scrapeEpisode(url:str):
    """Fetch one episode page and return its attribute dict.

    The page is obtained with soup(url); the URL itself is stored in the
    result under the "url" key.
    """
    return episodeAttrs(soup(url), url=url)

def scrapeEpisodes(EpisodePage = EPISODE_PAGE):
    """Scrape every episode linked from the episode index page.

    Args:
        EpisodePage: URL of the "List of Episodes" page.

    Returns:
        list of attribute dicts, one per episode, in the order the links
        are returned by episodesURLs().

    Raises:
        ValueError: propagated from episodesURLs() when the fetched page is
            not the episode index.
    """
    # episodesURLs() works on a parsed page, not on a URL string, so the
    # index has to be fetched first.
    episodesPage = soup(EpisodePage)

    episodes = []
    for episodeURL in episodesURLs(episodesPage):
        # Scrape the linked episode itself (previously the index URL was
        # scraped again on every iteration).
        episode = scrapeEpisode(episodeURL)
        if episode:
            episodes.append(episode)

    return episodes

Define a function to write the scraped data to a CSV file

In [8]:
# Default output file name, without the ".csv" extension.
CSV_FILE_NAME = "episodes"

def writeCsv(data:list[dict], filename = CSV_FILE_NAME):
    """Write a list of dicts to "<filename>.csv" (UTF-8, ';'-delimited).

    The header row is taken from the keys of the first dict. Nothing is
    written, and no file is created, when `data` is empty or None.
    """
    if not data:
        return

    target = filename + ".csv"
    # newline='' lets the csv module control line endings itself.
    with open(target, "w", encoding="utf-8", newline="") as handle:
        columns = data[0].keys()
        out = csv.DictWriter(
            handle,
            fieldnames=columns,
            delimiter=";",
            quotechar='"',
            escapechar="\\",
        )
        out.writeheader()
        out.writerows(data)

Scrape a single episode (when `EPISODE_TEST_URL` is set to an episode link) or all episodes (when it is `None`). In the second case, the scraping time, the export time and the total time are printed.

In [9]:
# Set to an episode URL to scrape only that page; set to None to scrape all episodes.
EPISODE_TEST_URL = "https://simpsons.fandom.com/wiki/The_Old_Man_and_the_Key" # scrape only this if not None

if __name__ == "__main__":
    if EPISODE_TEST_URL:
        print("Testing single episode " + EPISODE_TEST_URL)
        pprint(scrapeEpisode(EPISODE_TEST_URL), sort_dicts=False)
    else:
        # Scrape
        startScrapingTime = time.time()
        # scrapeEpisodes() returns a single list, so it must not be unpacked
        # as a 2-tuple.
        episodes = scrapeEpisodes()
        scrapingTime = time.time() - startScrapingTime
        print("Scraping completed in " + str(int(scrapingTime/60)) + " min (" + str(scrapingTime) + " sec)")
        
        # Export
        startExportTime = time.time()
        print("Now exporting to " + CSV_FILE_NAME)
        writeCsv(episodes)
        exportTime = time.time() - startExportTime
        print("Export completed in " + str(int(exportTime/60)) + " min (" + str(exportTime) + " sec)")
        
        # Total (measured from the start of scraping, so it covers both phases)
        totalTime = time.time() - startScrapingTime
        print("Number of episodes scraped: " + str(len(episodes)))
        print("Finished in " + str(int(totalTime/60)) + " min (" + str(totalTime) + " sec)")

Testing single episode https://simpsons.fandom.com/wiki/Another_Simpsons_Clip_Show
{'url': 'https://simpsons.fandom.com/wiki/Another_Simpsons_Clip_Show',
 'title': 'Another Simpsons Clip Show',
 'image': 'https://static.wikia.nocookie.net/simpsons/images/6/6c/Al.jpg/revision/latest/scale-to-width-down/350?cb=20191003181647',
 'season': [6],
 'episode number': '106',
 'production code': '2F33',
 'airdate': 'September 25, 1994',
 'main character(s)': 'Simpson family,Jacques,Mindy Simmons,Ralph Wiggum,Laura '
                      'Powers',
 'written by': 'Penny Wise',
 'directed by': 'David Silverman'}
