# Introduction

**Table of Contents**
1. Collect webpage URLs from the specified domain
2. Extract webpages and parse as LangChain's Documents
    - 2.1 Clean up the webpage content 
3. Index document in a vector store using LangChain

**References**:
- [DeepLarning.AI | Functions, Tools and Agents with LangChain](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/5/tagging-and-extraction)


In [50]:
#!pip install python-dotenv \
#             langchain==0.2.5 \
#             langchain_core==0.2.9 \
#             langchain_groq==0.1.6
#             langchain_openai==0.1.16 \
#             wikipedia==1.4.0

In [1]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined

import os


ROOT_DIR = os.path.join(*os.path.split(os.getcwd())[:-1])
os.chdir(ROOT_DIR)
os.getcwd()

'd:\\Projects\\agentic-rag-chatbot'

In [2]:
from dotenv import load_dotenv

conf_dir = os.path.join("conf")
conf_file = ".env"

conf_path = os.path.join(conf_dir, conf_file)
_ = load_dotenv(conf_path)

# 1. Collect webpage URLs from the specified domain

In [3]:
# Perform web scraping to collect webpage URLs within a specific domain

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl_site_to_fetch_urls(start_url, max_number_visited_urls:int=None, visited_urls:set=set()):
    """
    Crawl the given website starting from start_url and collect all unique page URLs within the same domain.
    """

    urls_to_visit = [start_url]
    base_domain = urlparse(start_url).netloc

    while urls_to_visit:

        current_url = urls_to_visit.pop(0)

        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)
        print(f"Visiting: ({len(visited_urls)}) {current_url}")

        try:
            response = requests.get(current_url)
            response.raise_for_status()  # Check if the request was successful
            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])
                if urlparse(full_url).netloc == base_domain:
                    urls_to_visit.append(full_url)

        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")

        if max_number_visited_urls:
            if len(visited_urls) > max_number_visited_urls:
                break

    return list(visited_urls)

In [4]:
# Starting point of the website to crawl and max number of visited webpages
start_page = 'https://benuri.org/'
max_number_visited_urls = 2000

webpage_urls = crawl_site_to_fetch_urls(start_page, max_number_visited_urls)

Visiting: (1) https://benuri.org/
Visiting: (2) https://benuri.org/#main_content
Visiting: (3) https://benuri.org/about/
Visiting: (4) https://benuri.org/whats-on/
Visiting: (5) https://benuri.org/visit-us/
Visiting: (6) https://benuri.org/exhibition/
Visiting: (7) https://benuri.org/collections/
Visiting: (8) https://benuri.org/researchunit/
Visiting: (9) https://benuri.org/diaspora/
Visiting: (10) https://benuri.org/loans/
Visiting: (11) https://benuri.org/essays-catalogues/
Visiting: (12) https://benuri.org/butv-videos/
Visiting: (13) https://benuri.org/audioadventures/
Visiting: (14) https://benuri.org/publications/
Visiting: (15) https://benuri.org/schoolsandfamilies/
Visiting: (16) https://benuri.org/artsandhealth/
Visiting: (17) https://benuri.org/bloomberg-connects/
Visiting: (18) https://benuri.org/support/
Visiting: (19) https://benuri.org/contact/
Visiting: (20) https://benuri.org/faq/
Visiting: (21) https://benuri.org/contact/form/
Visiting: (22) https://benuri.org/store/ba

In [5]:
print(f"Total unique URLs found: {len(webpage_urls)}")
for url in sorted(webpage_urls):
    print(url)

Total unique URLs found: 2001
http://benuri.org/
http://benuri.org/#main_content
http://benuri.org/3d-exhibitions/
http://benuri.org/about/
http://benuri.org/accessibility-policy/
http://benuri.org/artists/230-marc-chagall/
http://benuri.org/artists/33-jankel-adler/
http://benuri.org/artsandhealth/
http://benuri.org/artworks/1470-jacob-epstein-bust-of-jacob-kramer-1921/
http://benuri.org/artworks/2119-vitaly-komar-and-alexander-melamid-yalta-1945-1/
http://benuri.org/artworks/2230-zory-shahrokhi-revolution-street-2/
http://benuri.org/artworks/2240-jankel-adler-mother-and-child-ii-1941/
http://benuri.org/audioadventures/
http://benuri.org/bloomberg-connects/
http://benuri.org/butv-videos/
http://benuri.org/collections/
http://benuri.org/contact/
http://benuri.org/contact/form/
http://benuri.org/cookie-policy/
http://benuri.org/diaspora/
http://benuri.org/essays-catalogues/
http://benuri.org/exhibition/
http://benuri.org/exhibitions/31-art-exit-1939-a-very-different-europe-adler-chagall-

In [6]:
import pandas as pd

first_ingestion_date = [None for url in webpage_urls]
last_ingestion_date = [None for url in webpage_urls]
last_content_update = [None for url in webpage_urls]
is_to_ingest = [True for url in webpage_urls]

data = {
    'url': webpage_urls,
    'first_ingestion_date': first_ingestion_date,
    'last_ingestion_date': last_ingestion_date,
    'last_content_update': last_content_update,
    'is_to_ingest': is_to_ingest
}

webpage_ingestion_control = pd.DataFrame(data)

file_path = 'data/webpage_ingestion_control.csv'
webpage_ingestion_control.to_csv(file_path, index=False)

webpage_ingestion_control

Unnamed: 0,url,first_ingestion_date,last_ingestion_date,last_content_update,is_to_ingest
0,https://benuri.org/artists/187-anatoli-kaplan/...,,,,True
1,https://benuri.org/content/feature/484/image40...,,,,True
2,https://benuri.org/video/90-a-family-portrait-...,,,,True
3,https://benuri.org/support/,,,,True
4,https://benuri.org/exhibitions/25/works/image_...,,,,True
...,...,...,...,...,...
1996,https://benuri.org/content/feature/484/image41...,,,,True
1997,https://benuri.org/press/16-national-gallery-a...,,,,True
1998,https://benuri.org/exhibitions/70-homeless-hid...,,,,True
1999,https://benuri.org/buru/weeks-features/,,,,True


In [7]:
webpage_ingestion_control = pd.read_csv(file_path)

webpage_ingestion_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   url                   2001 non-null   object 
 1   first_ingestion_date  0 non-null      float64
 2   last_ingestion_date   0 non-null      float64
 3   last_content_update   0 non-null      float64
 4   is_to_ingest          2001 non-null   bool   
dtypes: bool(1), float64(3), object(1)
memory usage: 64.6+ KB


# 2. Extract webpages and parse as LangChain's Documents

In [22]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(webpage_urls)

documents = loader.load()

In [31]:
type(documents)

list

In [33]:
type(documents[0])

langchain_core.documents.base.Document

In [23]:
documents[0]

Document(metadata={'source': 'https://benuri.org/about/#main_content', 'title': 'About | Ben Uri Gallery and Museum', 'description': '', 'language': 'en'}, page_content="\n\n\n\n\n\nAbout | Ben Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat's on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloomberg Connects\nSupport Us\nContact Us\nFAQ\n\n\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCart\n\n\n0 items\n\n£\n\n\n\n\n\n\nCheckout\n\n\n\nItem added to cart\n\nView cart & 

In [24]:
documents[0].metadata

{'source': 'https://benuri.org/about/#main_content',
 'title': 'About | Ben Uri Gallery and Museum',
 'description': '',
 'language': 'en'}

In [25]:
documents[0].page_content

"\n\n\n\n\n\nAbout | Ben Uri Gallery and Museum\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nBen Uri Gallery and Museum\n\n\nSkip to main content\n\n\n\n\n\n\nMenu\n\nHomepage\nAbout Ben Uri\nWhat's on\nVisit Us\nExhibitions\nCollections\nResearch Unit\nDiaspora Artists\nMuseum Partnerships\nEssays / Catalogues\nBU TV\nPodcasts\nBookshop\nKids Programme\nArts and Mental Health\nBloomberg Connects\nSupport Us\nContact Us\nFAQ\n\n\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCart\n\n\n0 items\n\n£\n\n\n\n\n\n\nCheckout\n\n\n\nItem added to cart\n\nView cart & checkout\nContinue shopping\n\n\n\n\n\n\n\n\nFacebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn,

## 2.1 Clean up the webpage content 
To remove unnecessary whitespaces, newlines, and non-printable characters like \xa0.

In [28]:
import re

def clean_webpage_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove non-printable characters like \xa0
    text = re.sub(r'\xa0', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

In [29]:
for doc in documents:
    raw_text = doc.page_content
    clean_text = clean_webpage_text(raw_text)
    doc.page_content = clean_text

In [30]:
documents[0].page_content

"About | Ben Uri Gallery and Museum Ben Uri Gallery and Museum Skip to main content Menu Homepage About Ben Uri What's on Visit Us Exhibitions Collections Research Unit Diaspora Artists Museum Partnerships Essays / Catalogues BU TV Podcasts Bookshop Kids Programme Arts and Mental Health Bloomberg Connects Support Us Contact Us FAQ Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Cart 0 items £ Checkout Item added to cart View cart & checkout Continue shopping Facebook, opens in a new tab.Youtube, opens in a new tab.Instagram, opens in a new tab.Twitter, opens in a new tab.LinkedIn, opens in a new tab.Send an email Menu About About Ben Uri Gallery & Museum A New Era Staff Trustees Strategic objectives Annual Reports Opportunities Policies Legal notice our core programming revolves around our research unit, our collection, and ARTS AND MENTAL HEALTH Research Unit (BURU) BURU incor

# 3. Index document in a vector store using LangChain

Just follow my own code from the respository [resume-worth](https://github.com/luisrodriguesphd/resume-worth/tree/main).