In [131]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [12]:
# Load the table of episode links; the preview further down shows the columns
# title, url, episode and season.
links_df_original = pd.read_csv('simpsons_fandom_wiki_links.csv')

In [19]:
# Put the episodes in broadcast order (season first, then episode number) and
# relabel the rows 1..N so that the index can be used as a running episode id.
links_df = links_df_original.sort_values(by=['season', 'episode'], ascending=True)
links_df.index = np.arange(1, len(links_df) + 1)
links_df.head(10)

Unnamed: 0,title,url,episode,season
1,Simpsons Roasting on an Open Fire,https://simpsons.fandom.com/wiki/Simpsons_Roas...,1,1
2,Bart the Genius,https://simpsons.fandom.com/wiki/Bart_the_Genius,2,1
3,Homer's Odyssey,https://simpsons.fandom.com/wiki/Homer%27s_Ody...,3,1
4,There's No Disgrace Like Home,https://simpsons.fandom.com/wiki/There%27s_No_...,4,1
5,Bart the General,https://simpsons.fandom.com/wiki/Bart_the_General,5,1
6,Moaning Lisa,https://simpsons.fandom.com/wiki/Moaning_Lisa,6,1
7,The Call of the Simpsons,https://simpsons.fandom.com/wiki/The_Call_of_t...,7,1
8,The Telltale Head,https://simpsons.fandom.com/wiki/The_Telltale_...,8,1
9,Life on the Fast Lane,https://simpsons.fandom.com/wiki/Life_on_the_F...,9,1
10,Homer's Night Out,https://simpsons.fandom.com/wiki/Homer%27s_Nig...,10,1


In [105]:
# Fetch a single, known References page to check the URL scheme and the parser
# before scraping every episode.
ref_page_link = 'https://simpsons.fandom.com/wiki/Simpsons_Roasting_on_an_Open_Fire/References'
# Without a timeout requests waits indefinitely on an unresponsive server.
req = requests.get(ref_page_link, timeout=30)
print(f"Request terminated with status code {req.status_code}")
print(f"Response encoded with {req.encoding}")
ref_page = BeautifulSoup(req.text, 'html.parser')

Request terminated with status code 200
Response encoded with UTF-8


In [191]:
# Exact <h2> texts that get_ref_counts recognises as reference sections.
# Matching is by string equality, so spelling and capitalisation variants are
# listed as separate entries on purpose.
ref_types = [
    'Trivia',
    'Cultural references',
    'Continuity',
    'Goofs',
    'Cultural References',
    'References',
    'Appearances in other media',
    'Trivia/Goofs',
    'Goofs/Trivia',
    'Previous Episode References',
    'Production Notes',
    'Citation',
    'Connections to future episodes',
    "Krusty's Birthday Buddies",
    'Movie Moment',
    'Citations',
    'Trivia/Cultural References',
    'Censorship',
    'Cultural Reference',
    'U.S. Syndication Cuts',
    'Beatles references',
    'Notes and references',
    'Notes',
    'Legacy',
    'Running Gags:',
    'Seven Deadly Sins',
    'Previous and Future Episode References',
    'Goofs and Continuity Errors',
    'Pants Goof',
    'Deleted scenes',
    'DVD Release',
    'Censorship and Bans',
    'Reception and Legacy',
    'Reception',
    'Airings',
    'Pranks',
    'Alternate Versions',
    'Call-Backs',
    'External Links',
    'Goofs in the "Italian-American-Mexican Standoff" scene',
    'Premiere',
    'Previous episode references',
    'Broadcast',
    'Aerospace References',
    'Awards',
    'China References',
    'Production',
    'Goofs/errors',
    'Broadcasting Information',
    'Errors',
    'World War II references',
    'Goof',
    'Notes on Known Profiles',
    'Reference links',
    'Songs',
    'Differences from Lady and the Tramp',
    'Confusion',
    'Differences from Snow White and The Seven Dwarfs',
    'Lists',
    'References to other episodes',
    'Music',
    'Factual Errors',
    'International premieres',
    'Cuts',
    'References to Toy Story in Condiments',
    "Goofs'",
    'Continuity Errors',
    'In-show references',
    'Culture References',
    'Characters seen in the Advent Calendar',
    'Continually',
    'Previous Episodes References',
]

In [193]:
def get_ref_counts(episode_id, timeout=30):
    """Count the list items under each reference heading of one episode page.

    Downloads ``<episode url>/References`` for the episode at 1-based position
    ``episode_id`` in ``links_df`` and walks the elements that follow the first
    ``<h2>`` whose text appears in ``ref_types``.  Every ``<ul>`` met on the
    way adds its direct ``<li>`` children to the tally of the most recently
    seen recognised heading.  ``<h3>``, ``<figure>`` and bare text nodes are
    skipped, and any other element ends the walk.

    Parameters
    ----------
    episode_id : int
        1-based row position of the episode in ``links_df``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error instead of blocking forever.

    Returns
    -------
    dict
        Heading text -> number of ``<li>`` items counted for that heading.
        Empty when the page contains no ``<h2>`` at all.
    """
    ref_page_link = links_df['url'].iloc[episode_id - 1] + '/References'
    req = requests.get(ref_page_link, timeout=timeout)
    ref_page = BeautifulSoup(req.text, 'html.parser')

    headers = ref_page.find_all('h2')
    if not headers:
        # Nothing to anchor the walk on; indexing headers[0] would raise.
        return {}

    # Start from the first recognised heading.  If none is recognised the walk
    # starts at the first <h2> on the page, whatever its text is.
    h_pointer = next((h for h in headers if h.text in ref_types), headers[0])

    counter_dict = {h_pointer.text: 0}
    for sibling in h_pointer.next_siblings:
        tag_name = sibling.name
        if tag_name == 'h2':
            # Switch to the new heading only if it is a recognised one; items
            # under an unrecognised heading keep counting for the current one.
            if sibling.text in ref_types:
                h_pointer = sibling
                # setdefault keeps the running total when the same heading
                # text shows up more than once on a page.
                counter_dict.setdefault(h_pointer.text, 0)
        elif tag_name == 'ul':
            # Only direct <li> children are counted, not nested list items.
            counter_dict[h_pointer.text] += sum(
                1 for child in sibling.contents if child.name == 'li'
            )
        elif tag_name in ('h3', 'figure', None):
            # Sub-headings, figures and text nodes between lists are ignored.
            continue
        else:
            # Any other element marks the end of the reference sections.
            break
    return counter_dict

In [194]:
# Scrape every episode in turn; get_ref_counts expects the 1-based position of
# the episode within links_df.
counter_dicts = [
    get_ref_counts(episode_id)
    for episode_id in range(1, len(links_df) + 1)
]

In [195]:
# Build the episode x heading table in one go: one row per scraped episode
# (index 0..N-1 in scrape order), one column per heading seen anywhere.
# Headings missing from an episode come out as NaN.  Constructing the frame
# directly from the list of dicts avoids creating a one-row DataFrame per
# episode and also works when counter_dicts is empty.
joined_df = pd.DataFrame(counter_dicts)

In [200]:
# A heading that does not occur for an episode is NaN in joined_df; store it
# as a count of zero and make every column integer before writing the file.
new_joined_df = joined_df.fillna(0).astype(int)
new_joined_df.to_csv('references.csv', index=False)
new_joined_df.head(10)

Unnamed: 0,Trivia,Cultural references,Continuity,Goofs,Cultural References,References,Appearances in other media,Trivia/Goofs,Goofs/Trivia,Previous Episode References,...,International premieres,Cuts,References to Toy Story in Condiments,Goofs',Continuity Errors,In-show references,Culture References,Characters seen in the Advent Calendar,Continually,Previous Episodes References
0,20,11,16,33,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,6,7,14,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28,0,0,17,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20,0,0,18,7,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,20,0,0,6,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,20,0,0,5,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,31,0,0,8,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,14,0,0,7,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,14,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [197]:
# Show the URLs of the first six episodes (selected by position, not by label).
links_df['url'].iloc[list(range(6))]

1    https://simpsons.fandom.com/wiki/Simpsons_Roas...
2     https://simpsons.fandom.com/wiki/Bart_the_Genius
3    https://simpsons.fandom.com/wiki/Homer%27s_Ody...
4    https://simpsons.fandom.com/wiki/There%27s_No_...
5    https://simpsons.fandom.com/wiki/Bart_the_General
6        https://simpsons.fandom.com/wiki/Moaning_Lisa
Name: url, dtype: object