# Introduction

**Table of Contents**
1. Collect webpage URLs from the specified domain
2. Extract webpages and parse as LangChain's Documents
    - 2.1 Clean up the webpage content 
3. Index document in a vector store using LangChain

**References**:
- [DeepLarning.AI | Functions, Tools and Agents with LangChain](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/5/tagging-and-extraction)


In [None]:
#!pip install python-dotenv \
#             langchain==0.2.5 \
#             langchain_core==0.2.9 \
#             langchain_groq==0.1.6
#             langchain_openai==0.1.16 \
#             wikipedia==1.4.0 \
#             pymongo==4.8.0

In [1]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined

import os


ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])
os.chdir(ROOT_DIR)
os.getcwd()

'd:\\Projects\\agentic-rag-chatbot'

In [2]:
from dotenv import load_dotenv

conf_dir = os.path.join("conf")
conf_file = ".env"

conf_path = os.path.join(conf_dir, conf_file)
_ = load_dotenv(conf_path)

# 1. Collect webpage URLs from the specified domain

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


def crawl_site_to_fetch_urls(
        start_url, 
        min_length_text_body=1000, 
        max_number_visited_urls=None, 
        max_number_qualified_urls=None,
        visited_urls=[],
        urls_to_visit=[]
    ):
    """
    Crawl the given website starting from start_url and collect all unique page URLs within the same domain
    that have a body text content longer than the specified min_length_text_body.
    
    Parameters:
    - start_url (str): The starting URL to begin crawling from.
    - min_length_text_body (int): Minimum length of the text content in the body to include the URL in the result.
    - max_number_visited_urls (int, optional): Maximum number of URLs to visit.
    - max_number_qualified_urls (int, optional): Maximum number of URLs to return.
    - visited_urls (set, optional): Set of URLs that have been visited.
    - urls_to_visit (list, optional): Set of URLs that have been visited. If none, it's set as start_url, otherwise, start_url is ignored.

    Returns:
    - list: URLs meeting the text content length requirement.
    """
    n_visited_urls_previously = len(visited_urls)
    visited_urls = set(visited_urls)
    
    if not urls_to_visit:
        urls_to_visit = [start_url]
    urls_to_visit = urls_to_visit.copy()

    base_domain = urlparse(start_url).netloc
    qualifying_urls = set()

    while urls_to_visit:
        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)
        print(f"Visiting: ({len(visited_urls)}) {current_url}", end="")

        try:
            response = requests.get(current_url)
            response.raise_for_status()  # Ensure the request was successful
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract text from the <body> tag, excluding script and style elements
            text = soup.body.get_text(separator=' ', strip=True) if soup.body else ""
            text = ' '.join(text.split())  # Normalize whitespace

            if len(text) >= min_length_text_body:
                qualifying_urls.add(current_url)
                print('')
            else:
                print(' (DISGARDED)')

            # Find and process all hyperlinks within the same domain
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                if urlparse(full_url).netloc == base_domain \
                    and full_url[-1] == '/' \
                    and full_url not in visited_urls \
                    and full_url not in urls_to_visit:
                    urls_to_visit.append(full_url)

        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")

        if max_number_visited_urls and len(visited_urls)-n_visited_urls_previously >= max_number_visited_urls:
            break

        if max_number_qualified_urls and len(qualifying_urls) >= max_number_qualified_urls:
            break

    return list(qualifying_urls), list(visited_urls), urls_to_visit

In [4]:
# Starting point of the website to crawl and max number of visited webpages
start_page = 'https://benuri.org/'
min_length_text_body = 1000
max_number_visited_urls = 15
max_number_qualified_urls = 100
visited_urls_previously=[]
urls_to_visit_previously=[]


qualified_urls, visited_urls, urls_to_visit = crawl_site_to_fetch_urls(
    start_url=start_page, 
    min_length_text_body = min_length_text_body,
    max_number_visited_urls=max_number_visited_urls,
    max_number_qualified_urls = max_number_qualified_urls,
    visited_urls = visited_urls_previously,
    urls_to_visit = urls_to_visit_previously,
)

Visiting: (1) https://benuri.org/
Visiting: (2) https://benuri.org/about/
Visiting: (3) https://benuri.org/whats-on/
Visiting: (4) https://benuri.org/visit-us/
Visiting: (5) https://benuri.org/exhibition/
Visiting: (6) https://benuri.org/collections/
Visiting: (7) https://benuri.org/researchunit/
Visiting: (8) https://benuri.org/diaspora/
Visiting: (9) https://benuri.org/loans/
Visiting: (10) https://benuri.org/essays-catalogues/
Visiting: (11) https://benuri.org/butv-videos/
Visiting: (12) https://benuri.org/audioadventures/
Visiting: (13) https://benuri.org/publications/
Visiting: (14) https://benuri.org/schoolsandfamilies/
Visiting: (15) https://benuri.org/artsandhealth/


In [5]:
def print_stats_collected_web_data(qualified_urls, visited_urls, urls_to_visit, visited_urls_previously, urls_to_visit_previously):
    print(f"Total number of visited URLs: {len(visited_urls)}")
    print(f"Number of visited URLs today: {len(visited_urls)-len(visited_urls_previously)}")
    print(f"Number of qualified URLs found: {len(qualified_urls)}")
    print(f"Number of URLs to still visit: {len(urls_to_visit)}")
    print(f"Number of URLs to visit new: {len(urls_to_visit)-len(urls_to_visit_previously)-(len(visited_urls)-len(visited_urls_previously))}")

In [6]:
print_stats_collected_web_data(qualified_urls, visited_urls, urls_to_visit, visited_urls_previously, urls_to_visit_previously)

Total number of visited URLs: 15
Number of visited URLs today: 15
Number of qualified URLs found: 15
Number of URLs to still visit: 343
Number of URLs to visit new: 328


In [7]:
from datetime import date

def parse_collected_web_data(qualified_urls, visited_urls, urls_to_visit, visited_urls_previously=[], urls_to_visit_previously=[]):
    # Get today's date and format the date as YYYY-MM-DD
    today = date.today()
    today_str = today.strftime("%Y-%m-%d")

    old_urls_visited = [url for url in visited_urls if url in urls_to_visit_previously]
    new_urls_visited = [url for url in visited_urls if url not in visited_urls_previously+urls_to_visit_previously]
    new_urls_to_visit = [url for url in urls_to_visit if url not in urls_to_visit_previously]
    urls = old_urls_visited + new_urls_visited + new_urls_to_visit

    is_new = [True if url not in old_urls_visited else False for url in urls]
    is_visited = [True if url in visited_urls else False for url in urls]
    is_qualified = [True if url in qualified_urls else False for url in urls]
    visited_date = [today_str if url in visited_urls else None for url in urls]
    is_to_ingest = is_qualified

    web_data_to_update = []
    web_data_to_add = []
    for url, new, iv, vd, iq, ii in  zip(urls, is_new, is_visited, visited_date, is_qualified, is_to_ingest):
        data = {
            'url': url,
            'is_visited': iv,
            'visited_date': vd,
            'is_qualified': iq,
            'is_to_ingest': ii,
        }
        if new:
            web_data_to_add.append(data)
        else:
            web_data_to_update.append(data)

    return web_data_to_update, web_data_to_add

In [8]:
web_data_to_update, web_data_to_add = parse_collected_web_data(qualified_urls, visited_urls, urls_to_visit)

print(len(web_data_to_update)+len(web_data_to_add))

358


In [9]:
web_data_to_update[:2]

[]

In [10]:
web_data_to_add[:2]

[{'url': 'https://benuri.org/loans/',
  'is_visited': True,
  'visited_date': '2024-08-02',
  'is_qualified': True,
  'is_to_ingest': True},
 {'url': 'https://benuri.org/researchunit/',
  'is_visited': True,
  'visited_date': '2024-08-02',
  'is_qualified': True,
  'is_to_ingest': True}]

In [None]:
from pymongo import MongoClient

MONGODB_ATLAS_CLUSTER_URI = os.environ.get('MONGODB_ATLAS_CLUSTER_URI')
MONGODB_ATLAS_DB_NAME = os.environ.get('MONGODB_ATLAS_DB_NAME')
MONGODB_ATLAS_INGESTION_COLLECTION_NAME = os.environ.get('MONGODB_ATLAS_INGESTION_COLLECTION_NAME')

# Connect to the MongoDB client
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

# Select the database
db = client['mydatabase']

# Select the collection
collection = client[MONGODB_ATLAS_DB_NAME][MONGODB_ATLAS_INGESTION_COLLECTION_NAME]

In [62]:
# Function to insert a list of documents
def insert_documents(docs: list[dict]):
    """Function to insert a list of documents in MongoDB"""

    # Inserting the data into the collection
    result = collection.insert_many(docs)

    # Output the inserted documents' IDs
    inserted_ids = [str(object_id) for object_id in result.inserted_ids]

    return inserted_ids

In [11]:
# Inserting the data into the collection
inserted_ids = insert_documents(web_data_to_add)

# Output the inserted documents' IDs
print("Inserted document ids:", inserted_ids)

Inserted document ids: ['66acd1880cc0215c6a0f3858', '66acd1880cc0215c6a0f3859', '66acd1880cc0215c6a0f385a', '66acd1880cc0215c6a0f385b', '66acd1880cc0215c6a0f385c', '66acd1880cc0215c6a0f385d', '66acd1880cc0215c6a0f385e', '66acd1880cc0215c6a0f385f', '66acd1880cc0215c6a0f3860', '66acd1880cc0215c6a0f3861', '66acd1880cc0215c6a0f3862', '66acd1880cc0215c6a0f3863', '66acd1880cc0215c6a0f3864', '66acd1880cc0215c6a0f3865', '66acd1880cc0215c6a0f3866', '66acd1880cc0215c6a0f3867', '66acd1880cc0215c6a0f3868', '66acd1880cc0215c6a0f3869', '66acd1880cc0215c6a0f386a', '66acd1880cc0215c6a0f386b', '66acd1880cc0215c6a0f386c', '66acd1880cc0215c6a0f386d', '66acd1880cc0215c6a0f386e', '66acd1880cc0215c6a0f386f', '66acd1880cc0215c6a0f3870', '66acd1880cc0215c6a0f3871', '66acd1880cc0215c6a0f3872', '66acd1880cc0215c6a0f3873', '66acd1880cc0215c6a0f3874', '66acd1880cc0215c6a0f3875', '66acd1880cc0215c6a0f3876', '66acd1880cc0215c6a0f3877', '66acd1880cc0215c6a0f3878', '66acd1880cc0215c6a0f3879', '66acd1880cc0215c6a0f387

In [22]:
visited_urls_previously = [data['url'] for data in collection.find({"is_visited": True})]

len(visited_urls_previously)

15

In [23]:
urls_to_visit_previously = [doc['url'] for doc in collection.find({"is_visited": False})]

len(urls_to_visit_previously)

343

In [26]:
max_number_visited_urls = int(max_number_visited_urls/2)

qualified_urls, visited_urls, urls_to_visit = crawl_site_to_fetch_urls(
    start_url=start_page, 
    min_length_text_body = min_length_text_body,
    max_number_visited_urls=max_number_visited_urls,
    max_number_qualified_urls = max_number_qualified_urls,
    visited_urls = visited_urls_previously,
    urls_to_visit = urls_to_visit_previously,
)

Visiting: (16) https://benuri.org/bloomberg-connects/
Visiting: (17) https://benuri.org/support/
Visiting: (18) https://benuri.org/contact/
Visiting: (19) https://benuri.org/faq/
Visiting: (20) https://benuri.org/contact/form/ (DISGARDED)
Visiting: (21) https://benuri.org/store/basket/
Visiting: (22) https://benuri.org/press/


In [27]:
print_stats_collected_web_data(qualified_urls, visited_urls, urls_to_visit, visited_urls_previously, urls_to_visit_previously)

Total number of visited URLs: 22
Number of visited URLs today: 7
Number of qualified URLs found: 6
Number of URLs to still visit: 359
Number of URLs to visit new: 9


In [76]:
web_data_to_update, web_data_to_add = parse_collected_web_data(qualified_urls, visited_urls, urls_to_visit, visited_urls_previously, urls_to_visit_previously)


In [77]:
print(len(web_data_to_update)+len(web_data_to_add))

30


In [78]:
web_data_to_update[:2]

[{'url': 'https://benuri.org/faq/',
  'is_visited': True,
  'visited_date': '2024-08-02',
  'is_qualified': True,
  'is_to_ingest': True},
 {'url': 'https://benuri.org/bloomberg-connects/',
  'is_visited': True,
  'visited_date': '2024-08-02',
  'is_qualified': True,
  'is_to_ingest': True}]

In [79]:
# Function to update a list of documents
def update_documents(docs: list[dict], filter_field: str = 'url'):
    """Function to update a list of documents in MongoDB"""

    num_updated_ids = []
    for doc in docs:
        # Define the filter to find the document
        filter = {filter_field:  doc[filter_field]}
        
        # Define the new values to update
        del doc[filter_field]
        new_values = {"$set": doc}
        
        # Update the first document that matches the filter
        result = collection.update_one(filter, new_values)

        # Output the number of updated documents' IDs
        num_updated_ids.append(result.modified_count)

    return num_updated_ids

In [80]:
# Update the data into the collection
num_updated_ids = update_documents(web_data_to_update)

# Output the number of updated documents' IDs
print("Number of updated document ids:", num_updated_ids)

Number of updated document ids: [0, 0, 0, 0, 0, 0, 0]


In [32]:
web_data_to_add[:2]

[{'url': 'https://benuri.org/video/149-digital-future/',
  'is_visited': False,
  'visited_date': None,
  'is_qualified': False,
  'is_to_ingest': False},
 {'url': 'https://benuri.org/video/150-mental-health-for-the-70-community/',
  'is_visited': False,
  'visited_date': None,
  'is_qualified': False,
  'is_to_ingest': False}]

In [63]:
# Inserting the data into the collection
inserted_ids = insert_documents(web_data_to_add)

# Output the inserted documents' IDs
print("Inserted document ids:", inserted_ids)

Inserted document ids: ['66acda0f0cc0215c6a0f39be', '66acda0f0cc0215c6a0f39bf', '66acda0f0cc0215c6a0f39c0', '66acda0f0cc0215c6a0f39c1', '66acda0f0cc0215c6a0f39c2', '66acda0f0cc0215c6a0f39c3', '66acda0f0cc0215c6a0f39c4', '66acda0f0cc0215c6a0f39c5', '66acda0f0cc0215c6a0f39c6', '66acda0f0cc0215c6a0f39c7', '66acda0f0cc0215c6a0f39c8', '66acda0f0cc0215c6a0f39c9', '66acda0f0cc0215c6a0f39ca', '66acda0f0cc0215c6a0f39cb', '66acda0f0cc0215c6a0f39cc', '66acda0f0cc0215c6a0f39cd', '66acda0f0cc0215c6a0f39ce', '66acda0f0cc0215c6a0f39cf', '66acda0f0cc0215c6a0f39d0', '66acda0f0cc0215c6a0f39d1', '66acda0f0cc0215c6a0f39d2', '66acda0f0cc0215c6a0f39d3', '66acda0f0cc0215c6a0f39d4']


# 2. Extract webpages and parse as LangChain's Documents

In [84]:
qualified_urls = [data['url'] for data in collection.find({"is_to_ingest": True})]

len(qualified_urls)

9

In [85]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(qualified_urls)
loader.requests_per_second = 50
loader.continue_on_failure = True

documents = loader.load()

In [86]:
type(documents)

list

In [87]:
type(documents[0])

langchain_core.documents.base.Document

In [88]:
documents[0]

Document(metadata={'source': 'https://benuri.org/butv-videos/', 'title': 'BU TV | Ben Uri Gallery and Museum', 'description': 'Watch over 100 short films about artists, exhibitions and Ben Uri’s history and ethos Below are films and video content accompanying Ben Uri exhibitions, curator talks and related events, followed by important interviews reflecting personal experiences about periods and artworks under scrutiny. If you are interested in upcoming videos and...', 'language': 'en'}, page_content='\n\n\n\n\n\nBU TV | Ben Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat\'s on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloom

In [89]:
documents[0].metadata

{'source': 'https://benuri.org/butv-videos/',
 'title': 'BU TV | Ben Uri Gallery and Museum',
 'description': 'Watch over 100 short films about artists, exhibitions and Ben Uri’s history and ethos Below are films and video content accompanying Ben Uri exhibitions, curator talks and related events, followed by important interviews reflecting personal experiences about periods and artworks under scrutiny. If you are interested in upcoming videos and...',
 'language': 'en'}

In [90]:
documents[0].page_content

'\n\n\n\n\n\nBU TV | Ben Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat\'s on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloomberg Connects\nSupport Us\nContact Us\nFAQ\n\n\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCart\n\n\n0 items\n\n£\n\n\n\n\n\n\nCheckout\n\n\n\nItem added to cart\n\nView cart & checkout\nContinue shopping\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn

## 2.1 Clean up the webpage content 
To remove unnecessary whitespaces, newlines, and non-printable characters like \xa0.

In [91]:
import re

def clean_webpage_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove non-printable characters like \xa0
    text = re.sub(r'\xa0', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

In [92]:
for doc in documents:
    raw_text = doc.page_content
    clean_text = clean_webpage_text(raw_text)
    doc.page_content = clean_text

In [93]:
documents[0].page_content

'BU TV | Ben Uri Gallery and Museum Ben Uri Gallery and Museum Skip to main content Menu Homepage About Ben Uri What\'s on Visit Us Exhibitions Collections Research Unit Diaspora Artists Museum Partnerships Essays / Catalogues BU TV Podcasts Bookshop Kids Programme Arts and Mental Health Bloomberg Connects Support Us Contact Us FAQ Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Cart 0 items £ Checkout Item added to cart View cart & checkout Continue shopping Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Menu BU TV BU TV Watch over 100 short films about artists, exhibitions and Ben Uri’s history and ethos Below are films and video content accompanying Ben Uri exhibitions, curator talks and related events, followed by important interviews reflecting personal experience

# 3. Index document in a vector store using LangChain

Just follow my own code from the respository [resume-worth](https://github.com/luisrodriguesphd/resume-worth/tree/main).