# Parser

In [7]:
# backend/sitemap_parser.py
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, urlunparse

# Request headers (a browser-like User-Agent) passed to requests.get() by fetch_sitemap().
HEADERS = {"User-Agent": "Mozilla/5.0"}

def normalize_url(url: str) -> str:
    """Reduce a URL or bare domain to its canonical site root.

    The input is trimmed and lower-cased, ``https://`` is assumed when no
    http(s) scheme is given, a ``www.`` prefix is added to the host when it is
    missing, and everything after the host (path, params, query, fragment) is
    dropped.

    Args:
        url: A full URL or a bare domain such as ``"example.com"``.

    Returns:
        ``"<scheme>://www.<host>"`` with no trailing slash.
    """
    url = url.strip().lower()

    # Only a real http(s) scheme counts; a bare host that merely begins with
    # "http" (e.g. "httpbin.org") still needs a scheme prepended.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    parsed = urlparse(url)

    # Add "www." on the parsed host rather than by string surgery on the whole
    # URL, so that "http://..." inputs are handled the same way as "https://...".
    # NOTE(review): this also prefixes hosts that already carry a subdomain
    # (blog.example.com -> www.blog.example.com), as the previous code did.
    netloc = parsed.netloc
    if not netloc.startswith("www."):
        netloc = "www." + netloc

    # Keep scheme + host only.
    return urlunparse((parsed.scheme, netloc, '', '', '', ''))

def fetch_sitemap(url):
    """Fetch a single XML sitemap and return the URLs in its ``loc`` elements.

    Only this one document is read; nested sitemap files are returned as plain
    URLs and are not followed here.

    Args:
        url: Address of the sitemap XML document.

    Returns:
        A list of URL strings with surrounding whitespace removed and empty
        entries dropped. Any failure (network error, HTTP error status, parse
        error) is reported on stdout and yields an empty list.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml-xml')
        # Pretty-printed sitemaps put newlines/indentation inside the loc
        # element; strip them so suffix checks on the URL work downstream.
        stripped = (loc.text.strip() for loc in soup.find_all('loc'))
        return [loc for loc in stripped if loc]
    except Exception as e:
        # Deliberately best-effort: one unreachable sitemap must not abort
        # the whole crawl.
        print(f"Error fetching sitemap {url}: {e}")
        return []
def parse_sitemap(url, _ancestors=()):
    """Recursively expand a sitemap into a nested structure of the URLs it lists.

    Each entry ending in ``.xml`` is treated as a nested sitemap and parsed in
    turn with ``fetch_sitemap``; every other entry is treated as a page URL.

    Args:
        url: Address of the sitemap to expand.
        _ancestors: Internal. Sitemap URLs on the current recursion path, used
            to stop a sitemap that lists itself (or one of its parents) from
            recursing forever. Callers should not pass it.

    Returns:
        One of the following shapes:

        * ``{url: []}`` when nothing could be fetched from ``url``;
        * a list of page URLs when the sitemap lists only pages;
        * a dict mapping each nested sitemap URL to its parsed result, plus a
          ``"_final_urls"`` key holding the page URLs, when it lists both;
        * ``{url: {nested_url: parsed_result, ...}}`` when it lists only
          nested sitemaps.
    """
    locs = fetch_sitemap(url)
    if not locs:
        return {url: []}

    chain = _ancestors + (url,)
    tree = {}
    urls = []

    for loc in locs:
        # Tolerate whitespace around the entry so the ".xml" test is reliable.
        loc = loc.strip()
        if not loc:
            continue
        if loc.endswith('.xml'):
            if loc in chain:
                # Cyclic reference back to a sitemap already being expanded.
                continue
            tree[loc] = parse_sitemap(loc, chain)
        else:
            urls.append(loc)

    if tree and urls:
        tree["_final_urls"] = urls
        return tree
    elif urls:
        return urls
    else:
        return {url: tree}

def extract_final_urls(url):
    """List every page URL reachable from a site's ``/sitemap.xml``.

    Args:
        url: Site URL or bare domain; it is passed through ``normalize_url``.

    Returns:
        A ``(final_urls, tree)`` tuple. ``final_urls`` starts with the
        normalized site root followed by the page URLs found in the sitemap
        tree, in traversal order. ``tree`` is the raw ``parse_sitemap`` result.
    """
    site_root = normalize_url(url)
    if site_root.endswith('/sitemap.xml'):
        sitemap_url = site_root
    else:
        sitemap_url = site_root + '/sitemap.xml'
    tree = parse_sitemap(sitemap_url)

    def _iter_pages(node):
        # Yield page URLs depth-first, in the order the tree stores them.
        if isinstance(node, list):
            for entry in node:
                if not entry.endswith('.xml'):
                    yield entry
        elif isinstance(node, dict):
            for key, value in node.items():
                if key == "_final_urls" and isinstance(value, list):
                    # Already page-only; taken as-is.
                    yield from value
                else:
                    yield from _iter_pages(value)

    final_urls = [site_root]
    final_urls.extend(_iter_pages(tree))
    return final_urls, tree


# Scraper

In [8]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time


# Request headers (a browser-like User-Agent) passed to requests.get() by scrape().
HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape(url):
    """Return the text content of a web page, or ``""`` if none can be read.

    A plain HTTP GET parsed with BeautifulSoup is tried first. If that raises
    or produces no text, the page is loaded in headless Chrome via Selenium
    and the text of all paragraph (``p``) elements is joined with spaces.

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted text; an empty string when both strategies fail.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        if text:
            return text
    except Exception:
        # Best effort: fall through to the browser-based attempt below.
        # (Exception, not a bare except, so Ctrl-C still interrupts a crawl.)
        pass

    driver = None
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # Fixed pause before reading the DOM, to let the page finish rendering.
        time.sleep(3)
        elems = driver.find_elements(By.TAG_NAME, "p")
        return ' '.join(elem.text for elem in elems).strip()
    except Exception:
        return ""
    finally:
        # Always shut the browser down, even when loading or reading failed;
        # otherwise every failed page leaves a Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass

# The output here is an input for the RAG model.

# RAG model (using LangChain)
## Database loader (run every day to refresh the database with the latest information)

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model
from langchain_community.vectorstores import SupabaseVectorStore
from supabase import create_client, Client
from tqdm import tqdm
import json
import getpass
import os
import dotenv
from langchain_core.documents import Document
# Read variables from the local .env file into the process environment.
dotenv.load_dotenv()

# Fall back to an interactive prompt when no OpenAI key is configured.
if os.environ.get("OPENAI_API_KEY"):
    print("Openai API key successfully imported from .env file.")
else:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

# Supabase client built from the SUPABASE_URL / SUPABASE_SERVICE_KEY variables.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model and chat model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

# Vector store over the Supabase table.
vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=embeddings,
    # Table name; can be changed.
    table_name="nlai_content",
    # Query function; must be created in Supabase SQL.
    query_name="nlai_match_documents",
)

def RAG_scraper_loader(company_name, website):
    """Rebuild the ``nlai_content`` table from the pages of a website.

    Empties the table, collects the site's URLs with ``extract_final_urls``,
    scrapes each one, splits the text into overlapping chunks, embeds the
    chunks and inserts them through the ``insert_webcontent_nlai`` RPC.

    Args:
        company_name: Label shown in the progress bar; not stored.
        website: Site URL or domain whose sitemap is crawled. Stored in each
            row's metadata under ``"website"``.

    Returns:
        None.
    """
    # Clean the table before loading new data (the filter is "id IS NOT NULL").
    # NOTE(review): the delete happens before any scraping, so a failure
    # mid-run leaves the table only partially repopulated - confirm this is
    # acceptable for the daily refresh.
    supabase.table("nlai_content").delete().not_.is_("id", None).execute()

    # Only the flat URL list is needed here; the sitemap tree is discarded.
    url_list, _ = extract_final_urls(website)

    # The splitter holds configuration only, so build it once, not per page.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=500)

    for link in tqdm(url_list, desc=f"Scraping & indexing ({company_name})", unit="link"):
        text = scrape(link)
        if not text or not text.strip():
            continue  # skip empty pages

        metadata = {
            "source": str(link),
            "website": str(website),
        }
        document = Document(page_content=text, metadata=metadata)

        chunks = text_splitter.split_documents([document])
        if not chunks:
            continue

        # Embed all chunks of the page in one call rather than one request
        # per chunk.
        vectors = embeddings.embed_documents([chunk.page_content for chunk in chunks])

        for chunk, vector in zip(chunks, vectors):
            supabase.rpc("insert_webcontent_nlai", {
                "content": chunk.page_content,
                "metadata": metadata,
                "embedding": vector,  # list of floats
            }).execute()

# Run the loader for the Northlight AI website.
company_name = "Northlight AI"
website = "https://northlightai.com"
RAG_scraper_loader(company_name=company_name, website=website)

Openai API key successfully imported from .env file.


Scraping & indexing (Northlight AI): 100%|██████████| 34/34 [00:27<00:00,  1.24link/s]


## Step 2) Retrieval and Generation

### Prompts

In [10]:
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph

## Prompt - custom
# Template with two placeholders: {context} and {question}. generate() fills
# them with the joined retrieved document text and the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use five sentences maximum and keep the answer as concise as possible.
Always start the answer with a sentence like "Thanks for asking question about North Light AI!"; but be innovative and each time use a similar welcoming message.

{context}

Question: {question}

Helpful Answer:"""
prompt = PromptTemplate.from_template(template)

# Render the prompt once with dummy values. example_messages is not read
# anywhere else in the visible code.
example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

## State and Nodes
class State(TypedDict):
    """State dictionary passed between the graph nodes."""
    # The user's question; read by retrieve() and generate().
    question: str
    # Documents produced by retrieve() and consumed by generate().
    context: List[Document]
    # Text of the model's reply, produced by generate().
    answer: str

def retrieve(state: State):
    """Search the vector store for documents similar to the question.

    Returns a partial state update with the matches under ``"context"``.
    """
    question = state["question"]
    matches = vector_store.similarity_search(question)
    return {"context": matches}

def generate(state: State):
    """Answer the question with the chat model, using the retrieved context.

    Returns a partial state update with the reply text under ``"answer"``.
    """
    # Retrieved documents are concatenated, separated by blank lines.
    context_text = "\n\n".join([doc.page_content for doc in state["context"]])
    prompt_value = prompt.invoke({"question": state["question"], "context": context_text})
    return {"answer": llm.invoke(prompt_value).content}

## Compile the graph
# Pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


### Usage

In [11]:
def chatbot():
    """Prompt for one question on stdin, run the graph and print the answer.

    Returns None.
    """
    user_question = input("What would you like to know? ")
    result = graph.invoke({"question": user_question})
    print(result["answer"])


In [12]:
# Run one interactive question/answer round.
chatbot()

Thanks for asking a question about North Light AI! Andrew Mitchell is the Co-Founder and President of North Light AI, where he is focused on building a collaborative Innovation Lab for practical AI adoption in industry and academia. He has held leadership roles, including Associate Director at the University of New Hampshire's Center for Business Analytics. His expertise lies in leveraging AI tools to address real-world challenges and enhance business growth, especially for small to medium-sized organizations. Andrew's strategic vision is aimed at creating tangible impacts in today's digital landscape.
