In [56]:
# !pip install requests beautifulsoup4 pandas
# #Successfully installed beautifulsoup4-4.12.3 certifi-2024.8.30 charset-normalizer-3.3.2 idna-3.10 numpy-2.1.1 pandas-2.2.3 pytz-2024.2 requests-2.32.3 soupsieve-2.6 tzdata-2024.2 urllib3-2.2.3
# Requirement already satisfied: tqdm in /home/meher/miniforge3/envs/reel-talk/lib/python3.12/site-packages (4.66.5)

In [3]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://en.wikipedia.org/wiki/How_I_Met_Your_Mother_episodes"

# Fetch the page content
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all episode tables
episode_tables = soup.find_all('table', class_='wikitable plainrowheaders wikiepisodetable')

In [4]:
episode_list = []

# Extract episodes from each season
for season, table in enumerate(episode_tables, 1):
    rows = table.find_all('tr')[1:]
    for row in rows:
        try:
            title = row.find('td', class_='summary').get_text(strip=True)
            link_tag = row.find('td', class_='summary').find('a')
            href = f"https://en.wikipedia.org{link_tag['href']}" if link_tag else None

            airdate = row.find_all('td')[4].get_text(strip=True)
            date_pattern = r'\(([^)]+)\)'
            match = re.search(date_pattern, airdate)
            if match:
                extracted_text = match.group(1)
                airdate = extracted_text
            viewership = row.find_all('td')[6].get_text(strip=True)
            citation_pattern = r'\[\d+\]|\(\d+\)'
            viewership = re.sub(citation_pattern, '', viewership)
            episode_list.append({"Season": season, "Title": title, "Link": href, "Airdate": airdate, "Viewership(Mil)": viewership})
        except:
            continue


In [5]:
import pandas as pd

df = pd.DataFrame(episode_list)
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil)
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94
1,1,"""Purple Giraffe""",,2005-09-26,10.40
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44
3,1,"""Return of the Shirt""",,2005-10-10,9.84
4,1,"""Okay Awesome""",,2005-10-17,10.14
...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78
203,9,"""The End of the Aisle""",,2014-03-24,9.04


In [43]:
import time

In [41]:
def scrape_episode(url:str):
    if not url:
        return ""
    try:
        response = requests.get(url)
    except Exception:
        time.sleep(5)
        response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')
    main_content = soup.find('div', class_='mw-parser-output')

    article_text = ""
    for element in main_content.find_all(['p', 'h2', 'h3']):
        if element.name == 'h2' and 'Production' in element.get_text():
            break
        article_text += element.get_text(separator=' ', strip=True) + ' '

    clean_text = article_text.replace('\xa0', ' ')
    return clean_text
    


In [49]:
from tqdm import tqdm
tqdm.pandas()
df["content"] = df["Link"].progress_apply(scrape_episode)

  0%|          | 0/205 [00:00<?, ?it/s]

100%|██████████| 205/205 [00:41<00:00,  4.94it/s]


In [44]:
df

Unnamed: 0,Season,Title,Link,Airdate,Viewership(Mil),content
0,1,"""Pilot""",https://en.wikipedia.org/wiki/Pilot_(How_I_Met...,2005-09-19,10.94,""" Pilot "" is the pilot episode and the first e..."
1,1,"""Purple Giraffe""",,2005-09-26,10.40,
2,1,"""Sweet Taste of Liberty""",,2005-10-03,10.44,
3,1,"""Return of the Shirt""",,2005-10-10,9.84,
4,1,"""Okay Awesome""",,2005-10-17,10.14,
...,...,...,...,...,...,...
200,9,"""Vesuvius""",https://en.wikipedia.org/wiki/Vesuvius_(How_I_...,2014-03-03,9.11,""" Vesuvius "" is the nineteenth episode of the..."
201,9,"""Daisy""",https://en.wikipedia.org/wiki/Daisy_(How_I_Met...,2014-03-10,7.70,""" Daisy "" is the twentieth episode of the nin..."
202,9,"""Gary Blauman""",https://en.wikipedia.org/wiki/Gary_Blauman,2014-03-17,7.78,""" Gary Blauman "" is the 21st episode of the n..."
203,9,"""The End of the Aisle""",,2014-03-24,9.04,


In [59]:
for i in tqdm(range(1000), "This is it"):
    pass

This is it: 100%|██████████| 1000/1000 [00:00<00:00, 1931968.68it/s]


In [1]:
url_fandom = 'https://how-i-met-your-mother.fandom.com/wiki/Episode_Guide'

In [2]:
from util.parser_util import get_all_episodes_df
# from util.data_util import save_content_to_path
import pandas as pd

In [4]:
df = get_all_episodes_df(url_fandom)

In [24]:
import time
import requests
from bs4 import BeautifulSoup

def scrape_episode_fandom(url:str):
    if not url:
        return ""
    try:
        response = requests.get(url)
    except Exception:
        time.sleep(5)
        response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')
    main_content = soup.find('div', class_='mw-parser-output')

    article_text = ""
    for element in main_content.find_all(['p', 'li', 'h2', 'h3']):
        if element.name == 'h2' and ('Gallery' in element.get_text() or 'References' in element.get_text() or 'External Links' in element.get_text()):
            continue
        article_text += element.get_text(separator=' ', strip=True) + ' '

    clean_text = article_text.replace('\xa0', ' ')
    clean_text = clean_text.replace('\u200b', ' ')
    return clean_text

In [25]:
scrape_episode_fandom(df['link'].iloc[0])

'Images (13) Future Ted begins telling his two kids the story of how he met their mother. He introduces Barney, Lily, Marshall and Robin, telling his kids how Lily and Marshall got engaged and how he met Robin.  Contents 1 Recap 2 Continuity 3 Future References (Contains Spoilers) 4 Gallery 5 Memorable Quotes 6 Notes and Trivia 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 7 Awards 8 Guests 9 Reception 10 References 11 External links Recap [ ] The episode (and series) begins with future Ted announcing to his two teenaged children that he will be telling them the story of how he met their mother . They are unamused. In 2005, law student Marshall decides to propose to kindergarten teacher Lily , his longtime girlfriend. As he rehearses the proposal to their college friend/roommate Ted at the Apartment , Marshall asks Ted what he\'ll be doing that night while they get 

In [26]:
from tqdm import tqdm

tqdm.pandas(desc="Extracting individual episode data from links")
df["content"] = df["link"].progress_apply(scrape_episode_fandom)

Extracting individual episode data from links: 100%|██████████| 238/238 [03:47<00:00,  1.04it/s]


In [7]:
df

Unnamed: 0,title,link
0,Pilot,https://how-i-met-your-mother.fandom.com/wiki/...
1,Purple Giraffe,https://how-i-met-your-mother.fandom.com/wiki/...
2,Sweet Taste of Liberty,https://how-i-met-your-mother.fandom.com/wiki/...
3,Return of the Shirt,https://how-i-met-your-mother.fandom.com/wiki/...
4,Okay Awesome,https://how-i-met-your-mother.fandom.com/wiki/...
...,...,...
233,The Jersey Connection,https://how-i-met-your-mother.fandom.com/wiki/...
234,Out Of Sync,https://how-i-met-your-mother.fandom.com/wiki/...
235,Parent Trap,https://how-i-met-your-mother.fandom.com/wiki/...
236,Shady Parker,https://how-i-met-your-mother.fandom.com/wiki/...


In [1]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result

  from tqdm.autonotebook import tqdm, trange


[-0.04895174130797386,
 -0.03986191004514694,
 -0.02156277932226658,
 0.009908479638397694,
 -0.038103990256786346,
 0.01268437784165144,
 0.04349461942911148,
 0.07183395326137543,
 0.009748543612658978,
 -0.006987019907683134,
 0.06352809816598892,
 -0.030322683975100517,
 0.013839451596140862,
 0.025805924087762833,
 -0.0011362511431798339,
 -0.014563609845936298,
 0.041640277951955795,
 0.03622831776738167,
 -0.02680085599422455,
 0.025120683014392853,
 -0.024978596717119217,
 -0.0045332517474889755,
 -0.026667218655347824,
 0.004100722260773182,
 -0.05204799771308899,
 -0.009930439293384552,
 -0.052065253257751465,
 0.008992079645395279,
 -0.0383005328476429,
 -0.044058412313461304,
 -0.004204399883747101,
 0.07047971338033676,
 0.005133938509970903,
 -0.07161536812782288,
 1.697531615718617e-06,
 -0.0060477349907159805,
 -0.011076342314481735,
 0.017513394355773926,
 -0.022299883887171745,
 0.04095498472452164,
 0.03379017859697342,
 0.05665036290884018,
 -0.07114937156438828,
 0

In [2]:
text = "This is another test document."
query_result = embeddings.embed_query(text)
query_result

[-0.02904650568962097,
 -0.056806765496730804,
 -0.025184525176882744,
 0.011193639598786831,
 -0.04380374401807785,
 0.020693380385637283,
 0.05873769521713257,
 0.09078864753246307,
 -0.002211709273979068,
 0.0026602342259138823,
 0.05585940554738045,
 -0.023299995809793472,
 0.011224741116166115,
 0.029922282323241234,
 0.016054773703217506,
 -0.029463985934853554,
 0.03384746238589287,
 0.027453282848000526,
 -0.03623248264193535,
 0.008865992538630962,
 -0.018837008625268936,
 -0.0003995356382802129,
 -0.028773291036486626,
 0.0029593759682029486,
 -0.04093953222036362,
 -0.004181044176220894,
 -0.05309860408306122,
 0.01912405900657177,
 -0.042742785066366196,
 -0.03839045763015747,
 0.012305662035942078,
 0.05483442172408104,
 0.007023625075817108,
 -0.055449262261390686,
 1.7402693401891156e-06,
 -0.02985570952296257,
 0.010298192501068115,
 0.017197260633111,
 -0.027191143482923508,
 0.03268635272979736,
 0.03124351054430008,
 0.06132078543305397,
 -0.06366582214832306,
 0.032

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [2]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="episodes_df",
    embedding_function=embeddings,
    persist_directory="../data/chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [3]:
import pandas as pd

df = pd.read_csv("../data/episode_df.csv")

In [4]:
df.head()

Unnamed: 0,title,content,episode_num
0,Pilot,Images (13) Future Ted begins telling his two ...,1
1,Purple Giraffe,Images (12) Ted continues to try and impress R...,2
2,Sweet Taste of Liberty,Images (10) Barney convinces Ted to pick up g...,3
3,Return of the Shirt,"Images (8) After reuniting with an old shirt, ...",4
4,Okay Awesome,Images (6) Ted and Barney join Robin at a nigh...,5


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter # Importing text splitter from Langchain

In [6]:
content_epi = df.iloc[0]

In [7]:
content_epi.index

Index(['title', 'content', 'episode_num'], dtype='object')

In [9]:
from langchain_core.documents import Document

document = [Document(
    page_content=content_epi["content"],
    metadata={"episode": "1", "title": "Pilot"},
)]


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048, # Size of each chunk in characters
    chunk_overlap=128, # Overlap between consecutive chunks
    length_function=len, # Function to compute the length of the text
    add_start_index=True, # Flag to add start index to each chunk
)

# Split documents into smaller chunks using text splitter
chunks = text_splitter.split_documents(document)
print(f"Split {len(document)} documents into {len(chunks)} chunks.")

# Print example of page content and metadata for a chunk
document = chunks[0]
print(document.page_content)
print(document.metadata)

Split 1 documents into 8 chunks.
Images (13) Future Ted begins telling his two kids the story of how he met their mother. He introduces Barney, Lily, Marshall and Robin, telling his kids how Lily and Marshall got engaged and how he met Robin.  Contents 1 Recap 2 Continuity 3 Future References (Contains Spoilers) 4 Gallery 5 Memorable Quotes 6 Notes and Trivia 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 7 Awards 8 Guests 9 Reception 10 References 11 External links Recap [ ] The episode (and series) begins with future Ted announcing to his two teenaged children that he will be telling them the story of how he met their mother . They are unamused. In 2005, law student Marshall decides to propose to kindergarten teacher Lily , his longtime girlfriend. As he rehearses the proposal to their college friend/roommate Ted at the Apartment , Marshall asks Ted what he'll be d

In [13]:
from langchain_core.documents import Document

def create_document(row):
    document = Document(
        page_content=row["content"],
        metadata={"episode": row["episode_num"], "title": row["title"]},
    )
    return document


def add_to_vector_store(df):
    df["document"] = df.apply(create_document, axis=1)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2048, # Size of each chunk in characters
        chunk_overlap=128, # Overlap between consecutive chunks
        length_function=len, # Function to compute the length of the text
        add_start_index=True, # Flag to add start index to each chunk
    )

    # Split documents into smaller chunks using text splitter
    chunks = text_splitter.split_documents(df["document"])
    print(f"Split {len(df["document"])} documents into {len(chunks)} chunks.")

    batch_size = 100
    for i in tqdm(range(0, len(chunks), batch_size), desc="Adding to vector store"):
        vector_store.add_documents(chunks[i:i+batch_size])


In [16]:
add_to_vector_store(df)

Split 238 documents into 1343 chunks.


NameError: name 'tqdm' is not defined

In [12]:
vector_store.add_documents(chunks)

['fa038ace-1995-464c-9fed-251f46791567',
 'd734c428-3082-4c9e-838f-70ee1daf6cc8',
 '71a49358-607f-4a2b-96df-0382f49ab962',
 'c084de41-d420-4b47-a243-de7f44ad43de',
 'df6479e5-210a-445d-bad9-16b0dc4345f8',
 '3aa430a9-9726-4dbc-ae55-810a79617d4f',
 'd39855c6-f96f-4872-b9d9-0d985630cc06',
 'b1c0826a-a1f3-4cc6-94db-c6790fa97e5a',
 '75422dc9-a26a-46e7-999c-81329c9f4df1',
 'e702b8f7-8c09-4410-9703-e84f6ac78cc9',
 '009bb28b-2edf-4009-8ac6-d1c59016667f',
 '2006d8c0-aa04-4df4-aee0-0407d14d4913',
 'de193f19-9771-4168-bf3f-284f856ee846',
 '26b7ec6d-cc12-4ecc-9b08-2f437b3d6690',
 'be23713e-ee2d-4f1d-8569-8cb623c1a83d',
 'a271fc25-9ee9-4cb1-ab00-c7c17b2ced15',
 'fb0a6a76-a2c3-4ded-9b93-87bdb2f50b39',
 '87872505-ab07-44ab-b156-2c0e11add9ec',
 'f5c21e82-5fb7-4a1c-8310-651f0d856315',
 '3e66b506-df9e-4c64-9185-692841decdf5',
 '041c60f1-fbdb-4ee5-a474-e1e7901717f7',
 'b543296a-9501-4115-9fba-73d57e4297f3',
 '9a64cc60-3acf-4f76-b63b-a4034db3bb2c',
 'c1be404b-c003-4750-acc0-7b58ebb24d27',
 '153a3f89-1c74-

In [19]:
import hashlib

def generate_doc_id(document):
    id = hashlib.md5(document.page_content.encode()).hexdigest()
    document.metadata["id"] = id 
    return document
chunks_new = list(map(generate_doc_id, chunks))
chunks_new 

[Document(metadata={'episode': 1, 'title': 'Pilot', 'start_index': 0, 'id': '36961e0d72a0a704b3ea1d521b0e5542'}, page_content='Images (13) Future Ted begins telling his two kids the story of how he met their mother. He introduces Barney, Lily, Marshall and Robin, telling his kids how Lily and Marshall got engaged and how he met Robin.  Contents 1 Recap 2 Continuity 3 Future References (Contains Spoilers) 4 Gallery 5 Memorable Quotes 6 Notes and Trivia 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 7 Awards 8 Guests 9 Reception 10 References 11 External links Recap [ ] The episode (and series) begins with future Ted announcing to his two teenaged children that he will be telling them the story of how he met their mother . They are unamused. In 2005, law student Marshall decides to propose to kindergarten teacher Lily , his longtime girlfriend. As he rehearses the prop

In [20]:
chunks

[Document(metadata={'episode': 1, 'title': 'Pilot', 'start_index': 0, 'id': '36961e0d72a0a704b3ea1d521b0e5542'}, page_content='Images (13) Future Ted begins telling his two kids the story of how he met their mother. He introduces Barney, Lily, Marshall and Robin, telling his kids how Lily and Marshall got engaged and how he met Robin.  Contents 1 Recap 2 Continuity 3 Future References (Contains Spoilers) 4 Gallery 5 Memorable Quotes 6 Notes and Trivia 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 6.1 Goofs and Errors 6.2 Allusions and Outside References 6.3 Music 6.4 Other Notes 7 Awards 8 Guests 9 Reception 10 References 11 External links Recap [ ] The episode (and series) begins with future Ted announcing to his two teenaged children that he will be telling them the story of how he met their mother . They are unamused. In 2005, law student Marshall decides to propose to kindergarten teacher Lily , his longtime girlfriend. As he rehearses the prop

In [None]:
from vector_store.chromadb import ChromaDB
import pandas as pd

In [None]:
chromadb_store = ChromaDB(
    collection_name="how_i_met_your_mother", 
    embedding_function_name="sentence-transformers/all-MiniLM-L6-v2", 
    persist_directory="../data/chroma_langchain_db"
)

  from tqdm.autonotebook import tqdm, trange


In [None]:
episode_df = pd.read_csv("../data/how_i_met_your_mother.csv")
episode_df.head()

Unnamed: 0,title,content,episode_num
0,Pilot,Images (13) Future Ted begins telling his two ...,1
1,Purple Giraffe,Images (12) Ted continues to try and impress R...,2
2,Sweet Taste of Liberty,Images (10) Barney convinces Ted to pick up g...,3
3,Return of the Shirt,"Images (8) After reuniting with an old shirt, ...",4
4,Okay Awesome,Images (6) Ted and Barney join Robin at a nigh...,5


In [None]:
chromadb_store.add_episode_df_to_vector_store(episode_df)

Split 238 documents into 1343 chunks.


Adding to vector store: 100%|██████████| 14/14 [00:24<00:00,  1.73s/it]


In [None]:
len(chromadb_store.vector_store.get()['documents'])

1343

In [None]:
chromadb_store.vector_store.get()["ids"]

['36961e0d72a0a704b3ea1d521b0e5542',
 'c9bdb0afa76fd6c891dff9f492818316',
 '9bbeefb80494877f631e7387936eba47',
 'bafa6539edfbebf0e8ae63a22de71263',
 'fb985170c9e49c31ea1e734547eec5fb',
 '7629aa1a0814b28b3954a46f09a724fc',
 '3b69310565c26421f8f24e5ca0f0859c',
 'a4965c6f3b7b3e5b0fc1603a58cff5bc',
 'f47d0a8e74611195c7bc2b905cf4915f',
 'f5194c0b31b3006ad9dbba2facb2f90a',
 '8d49d5820bfb1562dc037490cdab2a72',
 '88f99a8b52dd1d65eb912e07827d7ba9',
 '72e95f0ca9027bcf4f7a190369fcba76',
 '9c92c1e86706bcc526908a2cb6308c5a',
 '3dae60fadb5ca109c04aefbf8d9d6ee4',
 '12f1896a2eb036f84215b26b5d495ec0',
 '5fec77872f222be1d484ed7225a59bb1',
 'cc34f7f1fe7b2ac87889f881ef446c98',
 'fae4d5ebe1ac0aca67c90627e61ea21e',
 '82bec753484cd009a29b9fe4f163a25c',
 '2dbcb6d0920f0cc895ef3b12f89ce62a',
 'd5a96d5b3bb766aea9b8cd30cf1edffe',
 '6530dca86d92f1c0c78f081c0d8771f8',
 '25859afcd3babff7333160af8252f317',
 'e442493024cee50a99bb22b1a30c48e5',
 'ee8b98f9cdce45382ac70d94a565cdf5',
 'a5a040b03845f52dafc707b51147ff21',
 

In [None]:
chromadb_store.vector_store.get()["metadatas"]

[{'episode': 1, 'start_index': 0, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 1920, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 3841, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 5765, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 7694, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 9615, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 11533, 'title': 'Pilot'},
 {'episode': 1, 'start_index': 13453, 'title': 'Pilot'},
 {'episode': 2, 'start_index': 0, 'title': 'Purple Giraffe'},
 {'episode': 2, 'start_index': 1921, 'title': 'Purple Giraffe'},
 {'episode': 2, 'start_index': 3840, 'title': 'Purple Giraffe'},
 {'episode': 2, 'start_index': 5753, 'title': 'Purple Giraffe'},
 {'episode': 2, 'start_index': 7673, 'title': 'Purple Giraffe'},
 {'episode': 3, 'start_index': 1, 'title': 'Sweet Taste of Liberty'},
 {'episode': 3, 'start_index': 1916, 'title': 'Sweet Taste of Liberty'},
 {'episode': 3, 'start_index': 3835, 'title': 'Sweet Taste of Liberty'},
 {'episode': 3,

In [1]:
!pip install langchain_groq

Collecting langchain_groq
  Downloading langchain_groq-0.2.0-py3-none-any.whl.metadata (2.9 kB)
Collecting groq<1,>=0.4.1 (from langchain_groq)
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Downloading langchain_groq-0.2.0-py3-none-any.whl (14 kB)
Downloading groq-0.11.0-py3-none-any.whl (106 kB)
Installing collected packages: groq, langchain_groq
Successfully installed groq-0.11.0 langchain_groq-0.2.0


In [5]:
from llm import get_llm

model = get_llm()

GroqError: The api_key client option must be set either by passing api_key to the client or by setting the GROQ_API_KEY environment variable

In [4]:
model

NameError: name 'model' is not defined