In [56]:
# !pip install requests beautifulsoup4 pandas
# #Successfully installed beautifulsoup4-4.12.3 certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.10 numpy-2.1.1 pandas-2.2.3 pytz-2024.2 requests-2.32.3 soupsieve-2.6 tzdata-2024.2 urllib3-2.2.3
# Requirement already satisfied: tqdm in /home/meher/miniforge3/envs/reel-talk/lib/python3.12/site-packages (4.66.5)

In [3]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://en.wikipedia.org/wiki/How_I_Met_Your_Mother_episodes"

# Fetch the page content. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page into zero episode tables.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all episode tables. Passing a multi-class string to class_ matches only
# tags whose class attribute is exactly this string, in this order.
episode_tables = soup.find_all('table', class_='wikitable plainrowheaders wikiepisodetable')

In [4]:
episode_list = []

# Captures the text inside the first pair of parentheses in the airdate cell.
date_pattern = r'\(([^)]+)\)'
# Matches bracketed or parenthesised numeric markers such as "[12]" or "(3)"
# so they can be stripped from the viewership figure.
citation_pattern = r'\[\d+\]|\(\d+\)'

# Extract episodes from each season; tables are numbered from 1 in page order.
for season, table in enumerate(episode_tables, 1):
    # The first <tr> of every table is skipped (presumably the header row).
    rows = table.find_all('tr')[1:]
    for row in rows:
        try:
            summary_cell = row.find('td', class_='summary')
            cells = row.find_all('td')

            title = summary_cell.get_text(strip=True)
            link_tag = summary_cell.find('a')
            href = f"https://en.wikipedia.org{link_tag['href']}" if link_tag else None

            airdate = cells[4].get_text(strip=True)
            match = re.search(date_pattern, airdate)
            if match:
                # Keep only the parenthesised part when it is present.
                airdate = match.group(1)

            viewership = re.sub(citation_pattern, '', cells[6].get_text(strip=True))
        except (AttributeError, IndexError, KeyError):
            # Rows without a summary cell, with fewer than seven <td> cells, or
            # with an <a> lacking href are skipped. Anything else (including
            # KeyboardInterrupt) is allowed to surface instead of being
            # swallowed by a bare except.
            continue

        episode_list.append({"Season": season, "Title": title, "Link": href, "Airdate": airdate, "Viewership(Mil)": viewership})


In [5]:
import pandas as pd

# Build one row per scraped episode; the columns come from the dict keys
# (Season, Title, Link, Airdate, Viewership(Mil)).
df = pd.DataFrame(episode_list)
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil)
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94
1,1,"""Purple Giraffe""",,2005-09-26,10.40
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44
3,1,"""Return of the Shirt""",,2005-10-10,9.84
4,1,"""Okay Awesome""",,2005-10-17,10.14
...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78
203,9,"""The End of the Aisle""",,2014-03-24,9.04


In [43]:
import time

In [41]:
def scrape_episode(url: str, timeout: float = 30.0):
    """Download an episode article and return its text up to the "Production" section.

    Args:
        url: Absolute URL of the article. Any falsy value (``None``, ``""``)
            is treated as "no article available".
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        The text of every ``<p>``, ``<h2>`` and ``<h3>`` inside the
        ``mw-parser-output`` div, each followed by a single space, with
        non-breaking spaces replaced by regular spaces. Collection stops at
        the first ``<h2>`` whose text contains "Production". Returns ``""``
        when ``url`` is falsy or the page has no ``mw-parser-output`` div.

    Raises:
        requests.RequestException: If the request fails on the first attempt
            and again on the single retry.
    """
    if not url:
        return ""

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # One retry after a short pause; a second failure propagates.
        time.sleep(5)
        response = requests.get(url, timeout=timeout)

    soup = BeautifulSoup(response.content, 'html.parser')
    main_content = soup.find('div', class_='mw-parser-output')
    if main_content is None:
        # No article body on the page (e.g. an error page): return empty text
        # instead of failing with AttributeError on None.find_all.
        return ""

    pieces = []
    for element in main_content.find_all(['p', 'h2', 'h3']):
        if element.name == 'h2' and 'Production' in element.get_text():
            break
        # The trailing space after each element is kept so the output format
        # is identical to the previous string-concatenation version.
        pieces.append(element.get_text(separator=' ', strip=True) + ' ')

    return ''.join(pieces).replace('\xa0', ' ')
    


In [49]:
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# Fetch the article text for every row's link; scrape_episode returns "" for
# rows whose link is missing.
df["content"] = df["Link"].progress_apply(scrape_episode)

  0%|          | 0/205 [00:00<?, ?it/s]

100%|██████████| 205/205 [00:41<00:00,  4.94it/s]


In [44]:
# Show the frame, including the scraped "content" column.
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil),content
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94,""" Pilot "" is the pilot episode and the first e..."
1,1,"""Purple Giraffe""",,2005-09-26,10.40,
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44,
3,1,"""Return of the Shirt""",,2005-10-10,9.84,
4,1,"""Okay Awesome""",,2005-10-17,10.14,
...,...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11,""" Vesuvius "" is the nineteenth episode of the..."
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70,""" Daisy "" is the twentieth episode of the nin..."
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78,""" Gary Blauman "" is the 21st episode of the n..."
203,9,"""The End of the Aisle""",,2014-03-24,9.04,


In [59]:
# Scratch check of tqdm's label: the second positional argument is shown as
# the prefix of the progress bar. The loop body does no work.
for i in tqdm(range(1000), "This is it"):
    pass

This is it: 100%|██████████| 1000/1000 [00:00<00:00, 1931968.68it/s]


In [1]:
# Fandom wiki episode guide page; passed to get_all_episodes_df below.
url_fandom = 'https://how-i-met-your-mother.fandom.com/wiki/Episode_Guide'

In [2]:
from util.parser_util import get_all_episodes_df
# from util.data_util import save_content_to_path
import pandas as pd

In [4]:
# NOTE: this rebinds `df`, so the Wikipedia frame built earlier in the notebook
# is no longer reachable under this name.
# Presumably returns one row per episode with `title` and `link` columns (as
# the later display suggests) — confirm against util.parser_util.
df = get_all_episodes_df(url_fandom)

In [24]:
import time
import requests
from bs4 import BeautifulSoup

def scrape_episode_fandom(url: str, timeout: float = 30.0):
    """Download a Fandom episode page and return its paragraph, list and heading text.

    Args:
        url: Absolute URL of the episode page. Any falsy value (``None``,
            ``""``) is treated as "no page available".
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        The text of every ``<p>``, ``<li>``, ``<h2>`` and ``<h3>`` inside the
        ``mw-parser-output`` div, each followed by a single space, with
        non-breaking and zero-width spaces replaced by regular spaces.
        ``<h2>`` headings containing "Gallery", "References" or
        "External links" (compared case-insensitively) are left out; the
        content under those headings is still included. Returns ``""`` when
        ``url`` is falsy or the page has no ``mw-parser-output`` div.

    Raises:
        requests.RequestException: If the request fails on the first attempt
            and again on the single retry.
    """
    if not url:
        return ""

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # One retry after a short pause; a second failure propagates.
        time.sleep(5)
        response = requests.get(url, timeout=timeout)

    soup = BeautifulSoup(response.content, 'html.parser')
    main_content = soup.find('div', class_='mw-parser-output')
    if main_content is None:
        # No article body on the page (e.g. an error page): return empty text
        # instead of failing with AttributeError on None.find_all.
        return ""

    # Compared in lower case: the page's table of contents spells the heading
    # "External links", which the former case-sensitive check for
    # 'External Links' never matched.
    skipped_headings = ('gallery', 'references', 'external links')

    # NOTE(review): nested <li> elements are returned by find_all both as part
    # of their parent's text and on their own, so sub-list text appears twice
    # in the result (the repeated table-of-contents entries in the sample
    # output look like this). Left unchanged to keep the output format stable.
    pieces = []
    for element in main_content.find_all(['p', 'li', 'h2', 'h3']):
        if element.name == 'h2':
            heading = element.get_text().lower()
            if any(name in heading for name in skipped_headings):
                # Only the heading itself is dropped, not the section below it.
                continue
        # The trailing space after each element is kept so the output format
        # is identical to the previous string-concatenation version.
        pieces.append(element.get_text(separator=' ', strip=True) + ' ')

    clean_text = ''.join(pieces).replace('\xa0', ' ')
    return clean_text.replace('\u200b', ' ')

In [25]:
# Spot-check the scraper on the link in the first row of the frame.
scrape_episode_fandom(df['link'].iloc[0])

'Images (13) Future Ted begins telling his two kids the story of how he met their mother. He introduces Barney, Lily, Marshall and Robin, telling his kids how Lily and Marshall got engaged and how he met Robin.  Contents 1 Recap 2 Continuity 3 Future References (Contains Spoilers) 4 Gallery 5 Memorable Quotes 6 Notes and Trivia 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 7 Awards 8 Guests 9 Reception 10 References 11 External links Recap [ ] The episode (and series) begins with future Ted announcing to his two teenaged children that he will be telling them the story of how he met their mother . They are unamused. In 2005, law student Marshall decides to propose to kindergarten teacher Lily , his longtime girlfriend. As he rehearses the proposal to their college friend/roommate Ted at the Apartment , Marshall asks Ted what he\'ll be doing that night while they get 

In [26]:
from tqdm import tqdm

# Register progress_apply with a custom bar label, then scrape the page behind
# every row's link into a new "content" column.
tqdm.pandas(desc="Extracting individual episode data from links")
df["content"] = df["link"].progress_apply(scrape_episode_fandom)

Extracting individual episode data from links: 100%|██████████| 238/238 [03:47<00:00,  1.04it/s]


In [7]:
# Display the Fandom episode frame.
df

Unnamed: 0,title,link
0,Pilot,https://how-i-met-your-mother.fandom.com/wiki/...
1,Purple Giraffe,https://how-i-met-your-mother.fandom.com/wiki/...
2,Sweet Taste of Liberty,https://how-i-met-your-mother.fandom.com/wiki/...
3,Return of the Shirt,https://how-i-met-your-mother.fandom.com/wiki/...
4,Okay Awesome,https://how-i-met-your-mother.fandom.com/wiki/...
...,...,...
233,The Jersey Connection,https://how-i-met-your-mother.fandom.com/wiki/...
234,Out Of Sync,https://how-i-met-your-mother.fandom.com/wiki/...
235,Parent Trap,https://how-i-met-your-mother.fandom.com/wiki/...
236,Shady Parker,https://how-i-met-your-mother.fandom.com/wiki/...
