In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import lxml

In [2]:
# Number of season pages to scrape and the URL prefix they all share.
max_seasons = 28
season_page_url_base = 'https://simpsons.fandom.com/wiki/Season_'
# Season pages are numbered from 1, so the suffixes run 1..max_seasons;
# list position k therefore holds the URL for season k + 1.
season_page_urls = [
    f"{season_page_url_base}{season_number}"
    for season_number in range(1, max_seasons + 1)
]

In [104]:
# NOTE(review): this counter is not read or updated by any cell visible here;
# kept in case a later cell relies on it — TODO confirm and remove if unused.
total_counter = 0

def get_links_by_season(season_id: int, verbose=False, timeout=30):
    """Collect the episode links listed on one season page of the wiki.

    The season page is downloaded, the episode table is located, and the
    first hyperlink found in the title column of each row is recorded.

    Parameters
    ----------
    season_id : int
        Zero-based index into ``season_page_urls`` (``0`` is season 1).
    verbose : bool, default False
        When true, print the HTTP status code and the response encoding.
    timeout : float, default 30
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection raises instead of blocking the kernel indefinitely.

    Returns
    -------
    dict
        Four parallel lists under the keys ``"title"``, ``"url"``,
        ``"episode"`` and ``"season"``. ``"episode"`` is a running count
        (starting at 1) of the links found on the page, not a number read
        from the page; ``"season"`` is ``season_id + 1``. All lists are
        empty when the page contains no table or no matching links.
    """
    req = requests.get(season_page_urls[season_id], timeout=timeout)
    if verbose:
        print(f"Request terminated with status code {req.status_code}")
        print(f"Response encoded with {req.encoding}")
    season_page = BeautifulSoup(req.text, 'html.parser')

    episodes_links = {"title": [], "url": [], "episode": [], "season": []}

    tables = season_page('table')
    if not tables:
        # Nothing to parse (e.g. an error page): return the empty structure
        # rather than failing on an out-of-range table index.
        return episodes_links

    def _is_wikitable(table):
        # Only the first class token is compared; an absent or empty class
        # list simply does not match.
        classes = table.get('class') or []
        return classes[:1] == ['wikitable']

    # Use the first table whose leading class is 'wikitable'; when none
    # matches, fall back to the first table on the page.
    episode_table = next((t for t in tables if _is_wikitable(t)), tables[0])

    # The title column position depends on the season: zero-based season_id
    # 17-20 use column 4, season_id 21 uses column 3, every other season
    # uses column 2.
    # NOTE(review): presumably reflects differing table layouts on those
    # season pages — confirm against the live pages.
    title_col_id = 4 if (16 < season_id < 21) else (3 if season_id == 21 else 2)

    for row in episode_table.find_all('tr'):
        cols = row.find_all('td')
        if title_col_id >= len(cols):
            # Rows without enough <td> cells carry no title cell.
            continue
        link = cols[title_col_id].find('a', href=True)
        if link is None:
            continue
        episodes_links['title'].append(link.text)
        episodes_links['url'].append('https://simpsons.fandom.com' + link['href'])
        episodes_links['season'].append(season_id + 1)
        episodes_links['episode'].append(len(episodes_links['episode']) + 1)
    return episodes_links

In [107]:
# Scrape every season page; each element is the dict of parallel lists
# returned by get_links_by_season for one season.
links_dict = [get_links_by_season(season_id) for season_id in range(max_seasons)]
links_df = [pd.DataFrame(season_links) for season_links in links_dict]
# Every per-season frame is indexed from 0, so concatenate with
# ignore_index=True to get a single unique 0..N-1 index instead of
# repeated labels; then pin the two counters to integer dtype.
links_per_episode_df = (
    pd.concat(links_df, ignore_index=True)
    .astype({'episode': int, 'season': int})
)
links_per_episode_df.to_csv('simpsons_fandom_wiki_links.csv', index=False)
links_per_episode_df.head(10)

Unnamed: 0,title,url,episode,season
0,Simpsons Roasting on an Open Fire,https://simpsons.fandom.com/wiki/Simpsons_Roas...,1,1
1,Bart the Genius,https://simpsons.fandom.com/wiki/Bart_the_Genius,2,1
2,Homer's Odyssey,https://simpsons.fandom.com/wiki/Homer%27s_Ody...,3,1
3,There's No Disgrace Like Home,https://simpsons.fandom.com/wiki/There%27s_No_...,4,1
4,Bart the General,https://simpsons.fandom.com/wiki/Bart_the_General,5,1
5,Moaning Lisa,https://simpsons.fandom.com/wiki/Moaning_Lisa,6,1
6,The Call of the Simpsons,https://simpsons.fandom.com/wiki/The_Call_of_t...,7,1
7,The Telltale Head,https://simpsons.fandom.com/wiki/The_Telltale_...,8,1
8,Life on the Fast Lane,https://simpsons.fandom.com/wiki/Life_on_the_F...,9,1
9,Homer's Night Out,https://simpsons.fandom.com/wiki/Homer%27s_Nig...,10,1


In [106]:
# Summary statistics for the scraped links table, used as a sanity check on
# the number of rows and the range of episode/season values.
links_per_episode_df.describe()

Unnamed: 0,episode,season
count,617.0,617.0
mean,11.623987,14.482982
std,6.514066,7.979486
min,1.0,1.0
25%,6.0,8.0
50%,12.0,14.0
75%,17.0,21.0
max,25.0,28.0


In [6]:
# Probe the IMDb episode listing for season 1 and report how the server
# answered. The timeout keeps a stalled connection from blocking the kernel.
req = requests.get('https://www.imdb.com/title/tt0096697/episodes?season=1', timeout=30)
print(f"Request terminated with status code {req.status_code}")
print(f"Response encoded with {req.encoding}")

Request terminated with status code 200
Response encoded with UTF-8


In [7]:
# Load the episode metadata CSV. The path is relative, so the notebook must
# be run from the directory that contains the Datasets/ folder.
episodes_df = pd.read_csv('Datasets/data-society-the-simpsons-by-the-data/simpsons_episodes.csv')

In [8]:
# Preview the first ten rows to inspect the columns that were loaded.
episodes_df.head(10)

Unnamed: 0,id,image_url,imdb_rating,imdb_votes,number_in_season,number_in_series,original_air_date,original_air_year,production_code,season,title,us_viewers_in_millions,video_url,views
0,10,http://static-media.fxx.com/img/FX_Networks_-_...,7.4,1511.0,10,10,1990-03-25,1990,7G10,1,Homer's Night Out,30.3,http://www.simpsonsworld.com/video/275197507879,50816.0
1,12,http://static-media.fxx.com/img/FX_Networks_-_...,8.3,1716.0,12,12,1990-04-29,1990,7G12,1,Krusty Gets Busted,30.4,http://www.simpsonsworld.com/video/288019523914,62561.0
2,14,http://static-media.fxx.com/img/FX_Networks_-_...,8.2,1638.0,1,14,1990-10-11,1990,7F03,2,"Bart Gets an ""F""",33.6,http://www.simpsonsworld.com/video/260539459671,59575.0
3,17,http://static-media.fxx.com/img/FX_Networks_-_...,8.1,1457.0,4,17,1990-11-01,1990,7F01,2,Two Cars in Every Garage and Three Eyes on Eve...,26.1,http://www.simpsonsworld.com/video/260537411822,64959.0
4,19,http://static-media.fxx.com/img/FX_Networks_-_...,8.0,1366.0,6,19,1990-11-15,1990,7F08,2,Dead Putting Society,25.4,http://www.simpsonsworld.com/video/260539459670,50691.0
5,21,http://static-media.fxx.com/img/FX_Networks_-_...,8.4,1522.0,8,21,1990-12-06,1990,7F06,2,Bart the Daredevil,26.2,http://www.simpsonsworld.com/video/260539459702,57605.0
6,23,http://static-media.fxx.com/img/FX_Networks_-_...,7.8,1340.0,10,23,1991-01-10,1991,7F10,2,Bart Gets Hit by a Car,24.8,http://www.simpsonsworld.com/video/260550723760,56486.0
7,26,http://static-media.fxx.com/img/FX_Networks_-_...,8.0,1329.0,13,26,1991-02-07,1991,7F13,2,Homer vs. Lisa and the 8th Commandment,26.2,http://www.simpsonsworld.com/video/260820547692,58277.0
8,28,http://static-media.fxx.com/img/FX_Networks_-_...,8.2,1413.0,15,28,1991-02-21,1991,7F16,2,"Oh Brother, Where Art Thou?",26.8,http://www.simpsonsworld.com/video/272046659561,47426.0
9,30,http://static-media.fxx.com/img/FX_Networks_-_...,7.6,1243.0,17,30,1991-03-28,1991,7F17,2,Old Money,21.2,http://www.simpsonsworld.com/video/263504963955,44331.0
