# Setup

## Python version

In [None]:
from platform import python_version
print(python_version())

## Libraries

In [None]:
from bs4 import BeautifulSoup as bs
from warnings import warn

import souphelper 
from souphelper import *

import episodesEx 
from episodesEx import *

import random
from pprint import pprint

import time
import csv

# Episodes Scraping from Fandom 

We want scraping from Fandom site all the simpsons episodes with: 

* URL episode [string]
* Number total [Integer]
* Season number [Integer]
* Title [String]
* Air date [Date with no time]
* Production code [string]
* Main characters [list of strings]
* Written by [String]
* Directed by [String]

In [None]:
BASE_URL = "https://simpsons.fandom.com"

We define a series of functions for the attributes, to which we are interested, of the episodes

In [None]:
STR_SEPARATOR = ","

def title(episodeInfobox:bs):
    if episodeInfobox:
        titleTag = episodeInfobox.find("h2")
        if titleTag:
            return str(titleTag.string).strip()
    return None

def image(episodeInfobox:bs):
    if episodeInfobox:
        imageTag = episodeInfobox.figure
        if imageTag:
            return imageTag.a.img["src"]
    return None

def episode_number(episodeInfobox:bs):
    if episodeInfobox:
        episodeNumberTag = episodeInfobox.find(attrs={"data-source": "Episode Number"})
        if episodeNumberTag:
            return str(episodeNumberTag.div.string).strip()
    return None

def season(episodePage):
    if episodePage:
        number_season = episodePage.find(attrs={"data-tracking-label": "categories-top-more-0"}).string.strip()
        res = [int(i) for i in number_season.split() if i.isdigit()]
        if len(res) == 1:
            return res[0]
        elif len(res) == 0:
            return None
        else:
            warn("More than one season number forund" + str(res))
            return res[0]
        return res
    return None

def production_code(episodeInfobox:bs):
    if episodeInfobox:
        production_codeTag = episodeInfobox.find(attrs={"data-source": "productionCode"})
        if production_codeTag:
            return str(production_codeTag.div.string).strip()
    return None

def airdate(episodeInfobox:bs):
    if episodeInfobox:
        airdateTag = episodeInfobox.find(attrs={"data-source": "originalAirdate"})
        if airdateTag:
            return str(airdateTag.div.string).strip()
    return None

def maincharacters(episodeInfobox:bs):
    if episodeInfobox:
        mainCharactersTag = episodeInfobox.find(attrs={"data-source": "main_character(s)"})
        if mainCharactersTag:
            mainCharactersContent = mainCharactersTag.div
            handleP(mainCharactersContent)
            handleLinks(mainCharactersContent)
            handleLinebreaks(mainCharactersContent, STR_SEPARATOR)
            return str(mainCharactersContent.string).strip()
    return None

def writtenby(episodeInfobox:bs):
    if episodeInfobox:
        writtenbyTag = episodeInfobox.find(attrs={"data-source": "Written By"})
        if writtenbyTag:
            writtenbyContent = writtenbyTag.div
            handleP(writtenbyContent)
            handleLinks(writtenbyContent)
            handleLinebreaks(writtenbyContent, STR_SEPARATOR)
            return str(writtenbyTag.div.string).strip()
    return None

def directedby(episodeInfobox:bs):
    if episodeInfobox:
        directedbyTag = episodeInfobox.find(attrs={"data-source": "Directed By"})
        if directedbyTag:
            directedbyContent =directedbyTag.div
            handleP(directedbyContent)
            handleLinks(directedbyContent)
            handleLinebreaks(directedbyContent, STR_SEPARATOR)
            return str(directedbyTag.div.string).strip()
    return None

def episodeAttrs(episodePage:bs, **moreAttributes):
    infobox = episodePage.find(class_="portable-infobox")
    return {
        **moreAttributes,
        "title": episodePage.find(id="firstHeading").string.strip(),
        "image_url": image(infobox),
        "season": season(episodePage),
        "episode_number_absolute": episode_number(infobox),
        "production_code": production_code(infobox),
        "airdate": airdate(infobox),
        "main_characters": maincharacters(infobox),
        "written_by": writtenby(infobox),
        "directed_by": directedby(infobox)
    }

Other necessary functions

In [None]:
EPISODE_PAGE = "https://simpsons.fandom.com/wiki/List_of_Episodes"

In [None]:
def esepisodesPage(page:bs):
    return page.find(id="firstHeading").string.strip() == "List of Episodes"

def episodesURLs(episodesPage:bs):
    if not esepisodesPage(episodesPage):
        raise ValueError("Soup received is not a episodes page")
    
    episodes = episodesPage.find_all("td", class_="oLeft")
    
    links = []
    for episode in episodes:
        links.append(BASE_URL + episode.b.a["href"])
    return links


In [None]:
def scrapeEpisode(url:str):
    if url in exceptions:
        return None
    episodePage = soup(url)
    episode = episodeAttrs(episodePage, url=url)
    return episode

def scrapeEpisodesPage(url:str):
    episodes = []
    episodesPage = soup(url)

    for episodeURL in episodesURLs(episodesPage):
        episode = scrapeEpisode(episodeURL)
        if episode:
            episodes.append(episode)
    return episodes

def scrapeEpisodes(EpisodePage = EPISODE_PAGE):
    episodes = []
    episodesPageURL = EpisodePage
    
    pageEpisodes = scrapeEpisodesPage(episodesPageURL)
    episodes.extend(pageEpisodes)

    return episodes

Define function to write csv files

In [None]:
CSV_FILE_NAME = "episodes"
CSV_FILE_EXTENSION = "csv"

def writeCsv(data:list[dict], filename = CSV_FILE_NAME):
    if not data or len(data) <= 0:
        return
        
    with open(filename+"."+CSV_FILE_EXTENSION, 'w', encoding='utf-8' , newline='') as f:
        writer = csv.DictWriter(f,
            fieldnames=data[0].keys(),
            delimiter=';',
            quotechar='"',
            escapechar="\\",
            quoting=csv.QUOTE_NONNUMERIC
        )
        writer.writeheader()
        writer.writerows(data)

Scraping a single episode (if we insert link in EPISODE_TEST_URL variable) or all episodes (otherwise). In the second case we have the time spent for scraping, the time for export and the total time.

In [None]:
EPISODE_TEST_URL = None # scrape only this if not None

if __name__ == "__main__":
    if EPISODE_TEST_URL:
        print("Testing single episode " + EPISODE_TEST_URL)
        pprint(scrapeEpisode(EPISODE_TEST_URL), sort_dicts=False)
    else:
        # Scrape
        startScrapingTime = time.time()
        episodes = scrapeEpisodes()
        scrapingTime = time.time() - startScrapingTime
        print("Scraping completed in " + str(int(scrapingTime/60)) + " min (" + str(scrapingTime) + " sec)")
        
        # Export
        startExportTime = time.time()
        print("Now exporting to " + CSV_FILE_NAME)
        writeCsv(episodes)
        exportTime = time.time() - startExportTime
        print("Export completed in " + str(int(exportTime/60)) + " min (" + str(exportTime) + " sec)")
        
        # Total
        totalTime = time.time() - startScrapingTime
        print("Number of episodes scraped: " + str(len(episodes)))
        print("Finished in " + str(int(totalTime/60)) + " min (" + str(totalTime) + " sec)")

## Data Cleaning

### Libraries
Pandas

In [None]:
import pandas as pd

In [None]:
episodes = pd.read_csv("C:\GitHub\Data-Management\episodes.csv", sep=";")

Deleting the episode without airdate

In [10]:
episodes.drop(episodes.index[episodes["airdate"] == "TBA"], inplace=True)

# Data Quality

Calculating the number of NaN for each column

In [11]:
episodes_nan = pd.DataFrame(episodes.isna().sum(), columns=['Total NaN'])
episodes_nan

Unnamed: 0,Total NaN
url,0
title,0
image_url,3
season,130
episode_number_absolute,0
production_code,0
airdate,0
main_characters,309
written_by,0
directed_by,0


Calculatin the percentage of NaN for each column

In [None]:
episodes_nan['% NaN'] = (episodes_nan['Total NaN']/753)*100
episodes_nan = episodes_nan.round(decimals=2)
episodes_nan

Completeness for Fandom Episodes

In [12]:
tot = 0
for item in episodes.isna().sum():
  tot += item
completezza_tab =1 -(tot /(episodes.shape[0]*episodes.shape[1])) 
print(completezza_tab)

TypeError: unsupported operand type(s) for +=: 'int' and 'str'