In [1]:
!pip install beautifulsoup4 requests



In [4]:
## web scraping for the FAQs and Articles links

import requests
from bs4 import BeautifulSoup

def extract_articles_and_FAQs(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        faqs_and_articles_links = []
        faqs_and_articles = soup.select('div.card.kb-category.mb-4')
        for section in faqs_and_articles:
            links = section.select('a[href]')
            for link in links:
                href = link.get('href')
                faqs_and_articles_links.append("https://clients.hostsailor.com" + href)

        return faqs_and_articles_links

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return {'faq_links': [], 'article_links': []}

# Example usage:
if __name__ == "__main__":
    knowledgebase_url = "https://clients.hostsailor.com/knowledgebase"
    articles_and_FAQs_links = extract_articles_and_FAQs(knowledgebase_url)
    print(articles_and_FAQs_links)

['https://clients.hostsailor.com/index.php?rp=/knowledgebase/3/FAQ', 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/1/Tutorials']


In [5]:
## get the links of FAQs and Articles

import requests
from bs4 import BeautifulSoup

def extract_nested_links(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Navigate the nested structure
        target_section = soup.select_one('section#main-body div.container div.row div.col-lg-8.col-xl-9.primary-content')

        links = []
        if target_section:
            # Within this section, find all <a> tags
            anchor_tags = target_section.select('div.card div.list-group.list-group-flush a[href]')
            for tag in anchor_tags:
                href = tag['href']
                # Make sure the link is complete
                if href.startswith('/'):
                    href = 'https://clients.hostsailor.com' + href
                links.append(href)
        else:
            print("Target section not found.")

        return links

    except Exception as e:
        print(f"Error while scraping {url}: {e}")
        return []

# Example usage:
if __name__ == "__main__":
    all_links = []
    for link in articles_and_FAQs_links:
        nested_links = extract_nested_links(link)
        all_links.extend(nested_links)

    print("Found links:")
    for link in all_links:
        print(link)
    print(f"{len(all_links)}")

Found links:
https://clients.hostsailor.com/index.php?rp=/knowledgebase/130/Adobe-flash-Internet-explorer-security-error.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/129/Disk-space-upgrade.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/6/Does-your-VPS-support-IPv6.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/122/HostSailorandsharp039s-private-DNS.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/7/How-is-VSwap-Calculated.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/4/List-Of-Operating-System-Templates.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/44/SolusVM-Error-You-Have-Been-Blacklisted---Contact-Support..html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/5/Where-is-my-VPS-information-email.html
https://clients.hostsailor.com/index.php?rp=/knowledgebase/8/You-canandsharp039t-use-special-characters-for-the-for-resetting-password-in-SolusVM.html
https://clients.hostsai

In [6]:
## get the article and its title from the knowledge base

import requests
from bs4 import BeautifulSoup

def extract_title_and_article(url):
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title (from <h1>)
        h1_tag = soup.select_one('h1')
        if h1_tag:
            for a_tag in h1_tag.find_all('a'):
                a_tag.decompose()  # Delete <a> tags from the tree to remove the word [Print]
            title = h1_tag.get_text(strip=True)

        # Extract article (from <article>)
        article = soup.select_one('.card-body article').text.strip()

        # Print results
        print(f"Title: {title}")
        print(f"Article:\n{article}")

    except Exception as e:
        print(f"Error: {e}")
    return title, article

#### That's are some calls to test the scrapping function `extract_title_and_article`

---



In [7]:
url = "https://clients.hostsailor.com/index.php?rp=/knowledgebase/6/Does-your-VPS-support-IPv6.html"

extract_title_and_article(url)

Title: Does your VPS support IPv6?
Article:
Yes, our vps support IPv6 just send us a ticket to request one or more ipv6.


('Does your VPS support IPv6?',
 'Yes, our vps support IPv6 just send us a ticket to request one or more ipv6.')

In [8]:
url = "https://clients.hostsailor.com/index.php?rp=/knowledgebase/129/Disk-space-upgrade.html"

extract_title_and_article(url)

Title: Disk space upgrade
Article:
Once you upgrade your VPS the RAM and CPU are easily upgradable, with the disk it's a different matter:With OpenVZ your disk space will be upgraded using the same partition.With XEN Linux we can also upgrade it using the same partition.
With KVM Linux, we have to add a new partition using the extra disk space.With KVM Windows, once the upgrade is completed, we can resize it via the disk management feature under the windows tools.


('Disk space upgrade',
 "Once you upgrade your VPS the RAM and CPU are easily upgradable, with the disk it's a different matter:With OpenVZ your disk space will be upgraded using the same partition.With XEN Linux we can also upgrade it using the same partition.\nWith KVM Linux, we have to add a new partition using the extra disk space.With KVM Windows, once the upgrade is completed, we can resize it via the disk management feature under the windows tools.")

In [None]:
url = "https://clients.hostsailor.com/index.php?rp=/knowledgebase/386/Enabling-cPhulk-Brute-Force-Protection-in-WHM.html"

extract_title_and_article(url)

Title: Enabling cPhulk Brute Force Protection in WHM
Article:
The cPHulk Brute Force Protection interface can be enabled from WHM >> Home >> Security Center >> cPHulk Brute Force Protection which allows you to configure cPHulk, a default service that provides protection for your server against brute force attacks. A brute force attack uses an automated system to guess the password of your web server or services. You can watch the video tutorial from this link. Here I am providing the steps about how to enable and configure it on your WHM panel:

Login to WHM as root and open the 'cPHulk Brute Force Protection' option from the 'Security Center'. 
Make sure that its status is at the top of the page. You can see a few configuration options which manage the way in which the cPhulk blocks an IP like how many attempts, time, etc. 
You can see the Whitelist and Blacklist Management options in which you can add your own IP to the whitelist and also block some IPs manually when required.
Using 

('Enabling cPhulk Brute Force Protection in WHM',
 "The cPHulk Brute Force Protection interface can be enabled from WHM >> Home >> Security Center >> cPHulk Brute Force Protection which allows you to configure cPHulk, a default service that provides protection for your server against brute force attacks. A brute force attack uses an automated system to guess the password of your web server or services. You can watch the video tutorial from this link. Here I am providing the steps about how to enable and configure it on your WHM panel:\n\nLogin to WHM as root and open the 'cPHulk Brute Force Protection' option from the 'Security Center'.\xa0\nMake sure that its status is at the top of the page. You can see a few configuration options which manage the way in which the cPhulk blocks an IP like how many attempts, time, etc.\xa0\nYou can see the Whitelist and Blacklist Management options in which you can add your own IP to the whitelist and also block some IPs manually when required.\nUsing

In [36]:
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document

def extract_title_and_article_metadata(url):
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        h1_tag = soup.select_one('h1')
        if h1_tag:
            for a_tag in h1_tag.find_all('a'):
                a_tag.decompose()
            title = h1_tag.get_text(strip=True)
        else:
            title = "Untitled"

        # Extract article content
        article_element = soup.select_one('.card-body article')
        article = article_element.get_text(strip=True) if article_element else ""

        # Wrap into LangChain Document with metadata
        doc = Document(
            page_content=article,
            metadata={"title": title, "source": url}
        )

        return doc

    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None

In [39]:
import time
import random


documents = []
for url in all_links:
    doc = extract_title_and_article_metadata(url)
    if doc:
        print(doc)
        documents.append(doc)
    # Introduce a random delay between 2 and 5 seconds
    time.sleep(random.uniform(2, 5))

page_content='Many users including ourselves, are having problems installing adobe flash with firefox, it keeps displaying the security errors from Internet explorer although you're installing it for firefox, correct? well the simple and quickiest solution is to use this full installer instead on your windows VPS or dedicated server:http://download.macromedia.com/pub/flashplayer/latest/help/install_flash_player.exe' metadata={'title': 'Adobe flash Internet explorer security error', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/130/Adobe-flash-Internet-explorer-security-error.html'}
page_content='Once you upgrade your VPS the RAM and CPU are easily upgradable, with the disk it's a different matter:With OpenVZ your disk space will be upgraded using the same partition.With XEN Linux we can also upgrade it using the same partition.With KVM Linux, we have to add a new partition using the extra disk space.With KVM Windows, once the upgrade is completed, we can resize 

In [41]:
documents[:3]

[Document(metadata={'title': 'Adobe flash Internet explorer security error', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/130/Adobe-flash-Internet-explorer-security-error.html'}, page_content="Many users including ourselves, are having problems installing adobe flash with firefox, it keeps displaying the security errors from Internet explorer although you're installing it for firefox, correct? well the simple and quickiest solution is to use this full installer instead on your windows VPS or dedicated server:http://download.macromedia.com/pub/flashplayer/latest/help/install_flash_player.exe"),
 Document(metadata={'title': 'Disk space upgrade', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/129/Disk-space-upgrade.html'}, page_content="Once you upgrade your VPS the RAM and CPU are easily upgradable, with the disk it's a different matter:With OpenVZ your disk space will be upgraded using the same partition.With XEN Linux we can also upgrade 

In [42]:
len(documents)

296

In [34]:
len(all_links)   # (287 Article + 9 FAQs)

296

In [44]:
type(documents)

list

In [45]:
type(documents[0])

#### Save the `documents` list as a json file for later usage without need to scrap each time.

In [46]:
import json
from langchain.docstore.document import Document

print(f"Initial documents to save: {len(documents)}")

# --- Saving the documents to JSONL ---
file_path_jsonl = "knowledge_base_articles.jsonl" # A descriptive filename
with open(file_path_jsonl, 'w', encoding='utf-8') as f:
    for doc in documents:
        # LangChain Document objects have .page_content and .metadata attributes
        doc_dict = {
            "page_content": doc.page_content,
            "metadata": doc.metadata
        }
        # Use ensure_ascii=False to correctly handle non-ASCII characters (e.g., Arabic)
        f.write(json.dumps(doc_dict, ensure_ascii=False) + '\n')
print(f"Documents successfully saved to {file_path_jsonl}")

Initial documents to save: 296
Documents successfully saved to knowledge_base_articles.jsonl


In [50]:
from google.colab import drive

drive.mount('/content/MyDrive')

Mounted at /content/MyDrive


In [53]:
!cp /content/knowledge_base_articles.jsonl "/content/MyDrive/MyDrive/Customer Service Chatbot Assistance"

#### Try load the `knowledge_base_articles.jsonl` file from the drive to make sure it's well saved

In [54]:
# --- Loading the documents from JSONL (when you're ready to work on RAG) ---

file_path_jsonl = "/content/MyDrive/MyDrive/Customer Service Chatbot Assistance/knowledge_base_articles.jsonl"
loaded_documents = []
with open(file_path_jsonl, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line.strip()) # .strip() removes leading/trailing whitespace, including the newline
        # Reconstruct the LangChain Document object
        doc = Document(
            page_content=data["page_content"],
            metadata=data["metadata"]
        )
        loaded_documents.append(doc)

print(f"\nLoaded {len(loaded_documents)} documents from {file_path_jsonl}")
print("First loaded document example:")
print(loaded_documents[0])
print(f"Type of loaded document: {type(loaded_documents[0])}")

# You can verify that the content and metadata are correctly loaded
# print(loaded_documents[0].page_content)
# print(loaded_documents[0].metadata['title'])


Loaded 296 documents from /content/MyDrive/MyDrive/Customer Service Chatbot Assistance/knowledge_base_articles.jsonl
First loaded document example:
page_content='Many users including ourselves, are having problems installing adobe flash with firefox, it keeps displaying the security errors from Internet explorer although you're installing it for firefox, correct? well the simple and quickiest solution is to use this full installer instead on your windows VPS or dedicated server:http://download.macromedia.com/pub/flashplayer/latest/help/install_flash_player.exe' metadata={'title': 'Adobe flash Internet explorer security error', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/130/Adobe-flash-Internet-explorer-security-error.html'}
Type of loaded document: <class 'langchain_core.documents.base.Document'>


In [56]:
len(loaded_documents)

296

In [57]:
documents == loaded_documents

True

In [58]:
loaded_documents

[Document(metadata={'title': 'Adobe flash Internet explorer security error', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/130/Adobe-flash-Internet-explorer-security-error.html'}, page_content="Many users including ourselves, are having problems installing adobe flash with firefox, it keeps displaying the security errors from Internet explorer although you're installing it for firefox, correct? well the simple and quickiest solution is to use this full installer instead on your windows VPS or dedicated server:http://download.macromedia.com/pub/flashplayer/latest/help/install_flash_player.exe"),
 Document(metadata={'title': 'Disk space upgrade', 'source': 'https://clients.hostsailor.com/index.php?rp=/knowledgebase/129/Disk-space-upgrade.html'}, page_content="Once you upgrade your VPS the RAM and CPU are easily upgradable, with the disk it's a different matter:With OpenVZ your disk space will be upgraded using the same partition.With XEN Linux we can also upgrade 