In [11]:
import re, time, hashlib, requests, tldextract
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

In [12]:
#Crawler

In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
import time

HEADERS = {"User-Agent": "MySimpleCrawler/1.0"}
SKIP_EXT = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".zip", ".mp4", ".mp3")

def normalize_url(url):
    url = urldefrag(url)[0]
    return url.strip()

def get_links(base_url, html):
    soup = BeautifulSoup(html, "html.parser")
    return [urljoin(base_url, a["href"]) for a in soup.find_all("a", href=True)]

def clean_text(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(" ", strip=True)

def crawl_site(seed_url, max_pages=3, delay=1):
    seed_url = normalize_url(seed_url)
    visited = set()
    to_visit = [seed_url]
    pages = []

    while to_visit and len(pages) < max_pages:
        url = to_visit.pop(0)
        if url in visited or url.lower().endswith(SKIP_EXT):
            continue
        try:
            print(f"Fetching: {url}")
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if "text/html" not in resp.headers.get("Content-Type", ""):
                continue
            text = clean_text(resp.text)
            pages.append({"url": url, "text": text})
            visited.add(url)
            to_visit.extend([normalize_url(link) for link in get_links(url, resp.text) if link not in visited])
            time.sleep(delay)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            continue

    return pages


In [14]:
#chunker

def chunk_text(text, max_words=220, overlap=40):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunks.append(" ".join(words[i:i+max_words]))
        if i + max_words >= len(words):
            break
        i += max_words - overlap
    return chunks

def pages_to_chunks(pages, max_words=220, overlap=40):
    docs = []
    for p in pages:
        chs = chunk_text(p["text"], max_words=max_words, overlap=overlap)
        for idx, ch in enumerate(chs):
            docs.append({
                "url": p["url"],
                "content": ch,
                "chunk_id": f'{p["url"]}#chunk={idx}'
            })
    return docs


In [15]:
#FAISS VectorStore (save as my_vectorstore.py)

In [16]:
#4. retriver
from my_vectorstore import SimpleVectorStore

class Retriever:
    def __init__(self, index_dir: str):
        self.store = SimpleVectorStore(index_dir)
        self.store.load()

    def get_context(self, question: str, top_k=5):
        return self.store.search(question, top_k=top_k)


In [17]:
#gpt 4 all q and a

from gpt4all import GPT4All

# Load local GPT4All model (small free version)
#model = GPT4All("ggml-gpt4all-j-v1.3-groovy")  # runs on CPU in Colab

#from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
# with model.chat_session():
#     print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))


SYSTEM_PROMPT = "Answer the question only using the provided context. Cite sources. If not found, say 'Not found on this site.'"


def format_context(chunks):
    formatted = []
    for i, c in enumerate(chunks, 1):
        formatted.append(f"[{i}] {c['content']}\n(Source: {c['url']})")
    return "\n\n".join(formatted)

def answer_with_gpt4all(question, chunks):
    context = format_context(chunks)
    prompt = f"{SYSTEM_PROMPT}\n\nContext:\n{context}\n\nQuestion: {question}"
    response = model.generate(prompt, max_tokens=400)

    return response


In [18]:
# 1️⃣ Crawl website
pages = crawl_site("https://en.wikipedia.org/wiki/Python_(programming_language", max_pages=1)

# 2️⃣ Chunk pages
chunks = pages_to_chunks(pages, max_words=100, overlap=20)

# 3️⃣ Build FAISS vector store
store = SimpleVectorStore("vector_index")
store.build(chunks)
store.save()

# 4️⃣ Use Retriever
retriever = Retriever("vector_index")
relevant_chunks = retriever.get_context("what is python", top_k=3)

# 5️⃣ Ask GPT4All locally
answer = answer_with_gpt4all("what is python", relevant_chunks)
print("Answer:\n", answer)


Fetching: https://en.wikipedia.org/wiki/Python_(programming_language
Answer:
  programming language?
Answer:
According to the provided context from Wikipedia [1], Python (programming language) is described as:

"Python is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more."

Source: https://en.wikipedia.org/wiki/Python_(programming_language)

Note: The provided context does not contain any information about the page being deleted or created. [2] and [3] are irrelevant to this question.

References:
[1] Wikipedia (2020). Python (programming language). Retrieved from <https://en.wikipedia.org/wiki/Python_(programming_language)>.
Not found on this site: The provided context does not contain any information about the page being deleted or created. [2] and [3] are irrelevant to this question. Therefore, there is no reference for this part of the answer.
