# Introduction

**Table of Contents**
1. Collect webpage URLs from the specified domain
2. Extract webpages and parse as LangChain's Documents
    - 2.1 Clean up the webpage content 
3. Index document in a vector store using LangChain

**References**:
- [DeepLarning.AI | Functions, Tools and Agents with LangChain](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/5/tagging-and-extraction)


In [1]:
#!pip install python-dotenv \
#             langchain==0.2.5 \
#             langchain_core==0.2.9 \
#             langchain_groq==0.1.6
#             langchain_openai==0.1.16 \
#             wikipedia==1.4.0

In [2]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined

import os


ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])
os.chdir(ROOT_DIR)
os.getcwd()

'd:\\Projects\\agentic-rag-chatbot'

In [3]:
from dotenv import load_dotenv

conf_dir = os.path.join("conf")
conf_file = ".env"

conf_path = os.path.join(conf_dir, conf_file)
_ = load_dotenv(conf_path)

# 1. Collect webpage URLs from the specified domain

In [4]:
# Perform web scraping to collect webpage URLs within a specific domain

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl_site_to_fetch_urls(
        start_url, 
        min_length_text_body=1000, 
        max_number_visited_urls=None, 
        max_number_qualified_urls=None,
        visited_urls=None
    ):
    """
    Crawl the given website starting from start_url and collect all unique page URLs within the same domain
    that have a body text content longer than the specified min_length_text_body.
    
    Parameters:
    - start_url (str): The starting URL to begin crawling from.
    - min_length_text_body (int): Minimum length of the text content in the body to include the URL in the result.
    - max_number_visited_urls (int, optional): Maximum number of URLs to visit.
    - max_number_qualified_urls (int, optional): Maximum number of URLs to return.
    - visited_urls (set, optional): Set of URLs that have been visited.

    Returns:
    - list: URLs meeting the text content length requirement.
    """
    if visited_urls is None:
        visited_urls = set()

    urls_to_visit = [start_url]
    base_domain = urlparse(start_url).netloc
    qualifying_urls = []

    while urls_to_visit:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)
        print(f"Visiting: ({len(visited_urls)}) {current_url}", end="")

        try:
            response = requests.get(current_url)
            response.raise_for_status()  # Ensure the request was successful
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract text from the <body> tag, excluding script and style elements
            text = soup.body.get_text(separator=' ', strip=True) if soup.body else ""
            text = ' '.join(text.split())  # Normalize whitespace

            if len(text) >= min_length_text_body:
                qualifying_urls.append(current_url)
                print('')
            else:
                print(' (DISGARDED)')

            # Find and process all hyperlinks within the same domain
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                if urlparse(full_url).netloc == base_domain:
                    urls_to_visit.append(full_url)

        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")

        if max_number_visited_urls and len(visited_urls) >= max_number_visited_urls:
            break

        if max_number_qualified_urls and len(qualifying_urls) >= max_number_qualified_urls:
            break

    return qualifying_urls, visited_urls


In [5]:
# Starting point of the website to crawl and max number of visited webpages
start_page = 'https://benuri.org/'
min_length_text_body = 1000
max_number_visited_urls = 2000
max_number_qualified_urls = 1000

qualified_urls, visited_urls = crawl_site_to_fetch_urls(
    start_url=start_page, 
    min_length_text_body = min_length_text_body,
    max_number_visited_urls=max_number_visited_urls,
    max_number_qualified_urls = max_number_qualified_urls
)

Visiting: (1) https://benuri.org/
Visiting: (2) https://benuri.org/#main_content
Visiting: (3) https://benuri.org/about/
Visiting: (4) https://benuri.org/whats-on/
Visiting: (5) https://benuri.org/visit-us/
Visiting: (6) https://benuri.org/exhibition/
Visiting: (7) https://benuri.org/collections/
Visiting: (8) https://benuri.org/researchunit/
Visiting: (9) https://benuri.org/diaspora/
Visiting: (10) https://benuri.org/loans/
Visiting: (11) https://benuri.org/essays-catalogues/
Visiting: (12) https://benuri.org/butv-videos/
Visiting: (13) https://benuri.org/audioadventures/
Visiting: (14) https://benuri.org/publications/
Visiting: (15) https://benuri.org/schoolsandfamilies/
Visiting: (16) https://benuri.org/artsandhealth/
Visiting: (17) https://benuri.org/bloomberg-connects/
Visiting: (18) https://benuri.org/support/
Visiting: (19) https://benuri.org/contact/
Visiting: (20) https://benuri.org/faq/
Visiting: (21) https://benuri.org/contact/form/ (DISGARDED)
Visiting: (22) https://benuri.

In [6]:
print(f"Number of visited URLs found: {len(visited_urls)}")
print(f"Number of qualified URLs found: {len(qualified_urls)}")

Number of visited URLs found: 1023
Number of qualified URLs found: 1000


In [7]:
import pandas as pd

from datetime import date, timedelta

# Get today's date and format the date as YYYY-MM-DD
today = date.today()
yesterday = today - timedelta(days=1)
last_ingestion_date_str = yesterday.strftime("%Y-%m-%d")
first_ingestion_date_str = "2024-07-16"

urls = list(visited_urls)
is_qualified = [True if url in qualified_urls else False for url in visited_urls]
first_ingestion_date = [first_ingestion_date_str for url in urls]
last_ingestion_date = [last_ingestion_date_str for url in urls]
last_content_update = [None for url in urls]
is_to_ingest = [True for url in urls]

data = {
    'url': urls,
    'is_qualified': is_qualified,
    'first_ingestion_date': first_ingestion_date,
    'last_ingestion_date': last_ingestion_date,
    'last_content_update': last_content_update,
    'is_to_ingest': is_to_ingest
}

webpage_ingestion_control = pd.DataFrame(data)

In [8]:
file_path = 'data/webpage_ingestion_control.csv'
webpage_ingestion_control.to_csv(file_path, index=False)

In [9]:
webpage_ingestion_control = pd.read_csv(file_path)

webpage_ingestion_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023 entries, 0 to 1022
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   url                   1023 non-null   object 
 1   is_qualified          1023 non-null   bool   
 2   first_ingestion_date  1023 non-null   object 
 3   last_ingestion_date   1023 non-null   object 
 4   last_content_update   0 non-null      float64
 5   is_to_ingest          1023 non-null   bool   
dtypes: bool(2), float64(1), object(3)
memory usage: 34.1+ KB


In [10]:
webpage_ingestion_control

Unnamed: 0,url,is_qualified,first_ingestion_date,last_ingestion_date,last_content_update,is_to_ingest
0,https://benuri.org/news/6-bu-elected-to-the-wo...,True,2024-07-16,2024-07-20,,True
1,https://benuri.org/artists/173-mario-dubsky/ov...,True,2024-07-16,2024-07-20,,True
2,https://benuri.org/content/feature/731/image5249/,True,2024-07-16,2024-07-20,,True
3,https://benuri.org/content/feature/727/image5152/,True,2024-07-16,2024-07-20,,True
4,https://benuri.org/news/11-what-is-the-secret-...,True,2024-07-16,2024-07-20,,True
...,...,...,...,...,...,...
1018,https://benuri.org/events/31/,True,2024-07-16,2024-07-20,,True
1019,https://benuri.org/exhibitions/43/works/artwor...,True,2024-07-16,2024-07-20,,True
1020,https://benuri.org/exhibitions/37-david-bomber...,True,2024-07-16,2024-07-20,,True
1021,https://benuri.org/exhibitions/64-towards-abst...,True,2024-07-16,2024-07-20,,True


# 2. Extract webpages and parse as LangChain's Documents

In [11]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(qualified_urls)
loader.requests_per_second = 50
loader.continue_on_failure = True

documents = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [12]:
type(documents)

list

In [13]:
type(documents[0])

langchain_core.documents.base.Document

In [14]:
documents[0]

Document(metadata={'source': 'https://benuri.org/', 'title': 'Ben Uri Gallery and Museum', 'description': 'Jewish Art', 'language': 'en'}, page_content="\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat's on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloomberg Connects\nSupport Us\nContact Us\nFAQ\n\n\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCart\n\n\n0 items\n\n£\n\n\n\n\n\n\nCheckout\n\n\n\nItem added to cart\n\nView cart & checkout\nContinue shoppi

In [15]:
documents[0].metadata

{'source': 'https://benuri.org/',
 'title': 'Ben Uri Gallery and Museum',
 'description': 'Jewish Art',
 'language': 'en'}

In [16]:
documents[0].page_content

"\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat's on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloomberg Connects\nSupport Us\nContact Us\nFAQ\n\n\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCart\n\n\n0 items\n\n£\n\n\n\n\n\n\nCheckout\n\n\n\nItem added to cart\n\nView cart & checkout\nContinue shopping\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens i

## 2.1 Clean up the webpage content 
To remove unnecessary whitespaces, newlines, and non-printable characters like \xa0.

In [17]:
import re

def clean_webpage_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove non-printable characters like \xa0
    text = re.sub(r'\xa0', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

In [18]:
for doc in documents:
    raw_text = doc.page_content
    clean_text = clean_webpage_text(raw_text)
    doc.page_content = clean_text

In [19]:
documents[0].page_content

"Ben Uri Gallery and Museum Ben Uri Gallery and Museum Skip to main content Menu Homepage About Ben Uri What's on Visit Us Exhibitions Collections Research Unit Diaspora Artists Museum Partnerships Essays / Catalogues BU TV Podcasts Bookshop Kids Programme Arts and Mental Health Bloomberg Connects Support Us Contact Us FAQ Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Cart 0 items £ Checkout Item added to cart View cart & checkout Continue shopping Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Menu Home Marie Louise von Motesiczky Marie Louise von Motesiczky Clare Winsten Clare Winsten Josef Herman Josef Herman Jacob Epstein Jacob Epstein Image of Lydia by Jacob Epstein Arthur Segal Arthur Segal Eva Frankfurther Eva Frankfurther Marc Chagall Marc Chagall Image of Ap

# 3. Index document in a vector store using LangChain

Just follow my own code from the respository [resume-worth](https://github.com/luisrodriguesphd/resume-worth/tree/main).