In [56]:
# !pip install requests beautifulsoup4 pandas
# #Successfully installed beautifulsoup4-4.12.3 certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.10 numpy-2.1.1 pandas-2.2.3 pytz-2024.2 requests-2.32.3 soupsieve-2.6 tzdata-2024.2 urllib3-2.2.3
# Requirement already satisfied: tqdm in /home/meher/miniforge3/envs/reel-talk/lib/python3.12/site-packages (4.66.5)

In [3]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://en.wikipedia.org/wiki/How_I_Met_Your_Mother_episodes"

# Fetch the page content.
# - timeout: without one, requests can block forever on a stalled connection.
# - raise_for_status: fail loudly on an HTTP error instead of parsing an
#   error page and silently ending up with zero episode tables.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all episode tables
episode_tables = soup.find_all('table', class_='wikitable plainrowheaders wikiepisodetable')

In [4]:
episode_list = []

# Patterns are loop-invariant, so define them once up front.
date_pattern = r'\(([^)]+)\)'            # text inside the first (...) of the air-date cell
citation_pattern = r'\[\d+\]|\(\d+\)'    # footnote markers such as [12] or (12)

# Extract episodes from each season. The position of a table in
# `episode_tables` (1-based) is used as the season number.
for season, table in enumerate(episode_tables, 1):
    rows = table.find_all('tr')[1:]  # skip the first row (presumably the header)
    for row in rows:
        try:
            summary_cell = row.find('td', class_='summary')
            cells = row.find_all('td')

            title = summary_cell.get_text(strip=True)
            link_tag = summary_cell.find('a')
            href = f"https://en.wikipedia.org{link_tag['href']}" if link_tag else None

            # Prefer the parenthesised part of the air-date cell when present;
            # otherwise keep the cell text unchanged.
            airdate = cells[4].get_text(strip=True)
            match = re.search(date_pattern, airdate)
            if match:
                airdate = match.group(1)

            viewership = re.sub(citation_pattern, '', cells[6].get_text(strip=True))
        except (AttributeError, IndexError, KeyError):
            # Rows that are not regular episode rows are skipped:
            #   AttributeError -> no <td class="summary"> in the row
            #   IndexError     -> fewer <td> cells than expected
            #   KeyError       -> the title <a> tag has no href attribute
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue

        episode_list.append({"Season": season, "Title": title, "Link": href, "Airdate": airdate, "Viewership(Mil)": viewership})


In [5]:
import pandas as pd

# Build a DataFrame from the list of per-episode dicts collected above;
# the dict keys become the column names.
df = pd.DataFrame(episode_list)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil)
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94
1,1,"""Purple Giraffe""",,2005-09-26,10.40
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44
3,1,"""Return of the Shirt""",,2005-10-10,9.84
4,1,"""Okay Awesome""",,2005-10-17,10.14
...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78
203,9,"""The End of the Aisle""",,2014-03-24,9.04


In [43]:
import time

In [41]:
def scrape_episode(url: str, timeout: float = 30.0) -> str:
    """Download an episode article and return its leading text.

    The text of every ``<p>``, ``<h2>`` and ``<h3>`` element inside the
    ``div.mw-parser-output`` container is collected in document order,
    stopping at the first ``<h2>`` whose text contains "Production".

    Args:
        url: Address of the article. Anything that is not a non-empty string
            (``None``, ``""``, NaN from a DataFrame column, ...) is treated
            as "no article".
        timeout: Seconds to wait for the server on each request attempt.

    Returns:
        The collected text with each element followed by a single space and
        non-breaking spaces replaced by regular spaces, or ``""`` when there
        is no URL or the page has no ``div.mw-parser-output`` container.

    Raises:
        Exception: whatever ``requests.get`` raises if the retry also fails.
    """
    # NaN is truthy, so a plain `not url` check would let a missing link
    # through to requests.get; require a real, non-empty string instead.
    if not isinstance(url, str) or not url:
        return ""

    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Back off briefly and retry once; a second failure propagates.
        time.sleep(5)
        response = requests.get(url, timeout=timeout)

    soup = BeautifulSoup(response.content, 'html.parser')
    main_content = soup.find('div', class_='mw-parser-output')
    if main_content is None:
        # find() returns None when the container is absent (e.g. an error
        # page); there is nothing to extract in that case.
        return ""

    pieces = []
    for element in main_content.find_all(['p', 'h2', 'h3']):
        if element.name == 'h2' and 'Production' in element.get_text():
            break
        pieces.append(element.get_text(separator=' ', strip=True))

    # Each piece is followed by one space (including the last one), which
    # matches the previous concatenation-based output exactly.
    article_text = "".join(f"{piece} " for piece in pieces)
    return article_text.replace('\xa0', ' ')
    


In [49]:
from tqdm import tqdm
# Register tqdm's pandas integration; this must run before progress_apply is used.
tqdm.pandas()
# Scrape every episode link (with a progress bar) and store the text in a new "content" column.
df["content"] = df["Link"].progress_apply(scrape_episode)

  0%|          | 0/205 [00:00<?, ?it/s]

100%|██████████| 205/205 [00:41<00:00,  4.94it/s]


In [44]:
# Display the frame again, now including the scraped "content" column.
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil),content
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94,""" Pilot "" is the pilot episode and the first e..."
1,1,"""Purple Giraffe""",,2005-09-26,10.40,
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44,
3,1,"""Return of the Shirt""",,2005-10-10,9.84,
4,1,"""Okay Awesome""",,2005-10-17,10.14,
...,...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11,""" Vesuvius "" is the nineteenth episode of the..."
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70,""" Daisy "" is the twentieth episode of the nin..."
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78,""" Gary Blauman "" is the 21st episode of the n..."
203,9,"""The End of the Aisle""",,2014-03-24,9.04,


In [59]:
# Smoke test: iterate an empty-bodied loop to check that tqdm renders a
# progress bar with the given description.
for i in tqdm(range(1000), desc="This is it"):
    continue

This is it: 100%|██████████| 1000/1000 [00:00<00:00, 1931968.68it/s]


In [1]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Create an embedding client backed by the
# "sentence-transformers/all-mpnet-base-v2" model.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
text = "This is a test document."
# Embed a single sample sentence; the result is shown below as a list of floats.
query_result = embeddings.embed_query(text)
query_result

  from tqdm.autonotebook import tqdm, trange


[-0.04895174130797386,
 -0.03986191004514694,
 -0.02156277932226658,
 0.009908479638397694,
 -0.038103990256786346,
 0.01268437784165144,
 0.04349461942911148,
 0.07183395326137543,
 0.009748543612658978,
 -0.006987019907683134,
 0.06352809816598892,
 -0.030322683975100517,
 0.013839451596140862,
 0.025805924087762833,
 -0.0011362511431798339,
 -0.014563609845936298,
 0.041640277951955795,
 0.03622831776738167,
 -0.02680085599422455,
 0.025120683014392853,
 -0.024978596717119217,
 -0.0045332517474889755,
 -0.026667218655347824,
 0.004100722260773182,
 -0.05204799771308899,
 -0.009930439293384552,
 -0.052065253257751465,
 0.008992079645395279,
 -0.0383005328476429,
 -0.044058412313461304,
 -0.004204399883747101,
 0.07047971338033676,
 0.005133938509970903,
 -0.07161536812782288,
 1.697531615718617e-06,
 -0.0060477349907159805,
 -0.011076342314481735,
 0.017513394355773926,
 -0.022299883887171745,
 0.04095498472452164,
 0.03379017859697342,
 0.05665036290884018,
 -0.07114937156438828,
 0

In [2]:
# Embed a second, slightly different sentence with the same `embeddings`
# object; note that this overwrites `text` and `query_result` from the previous cell.
text = "This is another test document."
query_result = embeddings.embed_query(text)
query_result

[-0.02904650568962097,
 -0.056806765496730804,
 -0.025184525176882744,
 0.011193639598786831,
 -0.04380374401807785,
 0.020693380385637283,
 0.05873769521713257,
 0.09078864753246307,
 -0.002211709273979068,
 0.0026602342259138823,
 0.05585940554738045,
 -0.023299995809793472,
 0.011224741116166115,
 0.029922282323241234,
 0.016054773703217506,
 -0.029463985934853554,
 0.03384746238589287,
 0.027453282848000526,
 -0.03623248264193535,
 0.008865992538630962,
 -0.018837008625268936,
 -0.0003995356382802129,
 -0.028773291036486626,
 0.0029593759682029486,
 -0.04093953222036362,
 -0.004181044176220894,
 -0.05309860408306122,
 0.01912405900657177,
 -0.042742785066366196,
 -0.03839045763015747,
 0.012305662035942078,
 0.05483442172408104,
 0.007023625075817108,
 -0.055449262261390686,
 1.7402693401891156e-06,
 -0.02985570952296257,
 0.010298192501068115,
 0.017197260633111,
 -0.027191143482923508,
 0.03268635272979736,
 0.03124351054430008,
 0.06132078543305397,
 -0.06366582214832306,
 0.032