In [23]:
# offline_indexer.py

import re
import time
import requests
from urllib.parse import urljoin, urldefrag, urlparse
from bs4 import BeautifulSoup
from langdetect import detect_langs
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import deque, defaultdict
import pickle

# Ensure stopwords are available: probe the English stopword list and, if
# NLTK reports the corpus as missing (LookupError), download it.
import nltk
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

In [24]:
class OfflineCrawler:
    """Manually triggered breadth-first crawler that collects page texts.

    Starting from the seed URLs, each page is fetched with ``requests``,
    reduced to its visible text with BeautifulSoup and kept only when
    ``langdetect`` reports English with probability >= 0.9.  Links are
    followed only from kept pages, up to ``max_depth`` hops from a seed.

    Args:
        seeds: Iterable of absolute URLs the crawl starts from (depth 0).
        max_depth: Largest link distance from a seed that is still fetched.
        delay: Seconds to wait between two consecutive HTTP requests.
        preview_chars: How many leading characters of each page's text to
            print for debugging; ``0`` disables the preview.
    """

    def __init__(self, seeds, max_depth=2, delay=0.5, preview_chars=1000):
        self.seeds = seeds
        self.max_depth = max_depth
        self.delay = delay
        self.preview_chars = preview_chars

    @staticmethod
    def _is_english(text):
        """Return True when langdetect rates *text* as English (prob >= 0.9)."""
        try:
            guesses = detect_langs(text)
        except Exception:
            # Detection fails on text it cannot analyse (e.g. empty pages);
            # treat that as "not English".  ``Exception`` instead of a bare
            # ``except`` lets KeyboardInterrupt/SystemExit stop the crawl.
            return False
        return any(g.lang == "en" and g.prob >= 0.9 for g in guesses)

    @staticmethod
    def _normalize_link(base_url, href):
        """Resolve *href* against *base_url*; return None unless it is http(s)."""
        try:
            absolute, _ = urldefrag(urljoin(base_url, href))
            scheme = urlparse(absolute).scheme
        except ValueError:
            # urllib rejects some malformed URLs (e.g. unbalanced IPv6
            # brackets); one bad link must not abort the whole crawl.
            return None
        return absolute if scheme in ("http", "https") else None

    def run(self):
        """Crawl from the seeds and return the accepted pages.

        Returns:
            List of ``(url, raw_text)`` tuples, in fetch order, for every page
            that was fetched successfully and passed the English filter.
        """
        # ``seen`` holds every URL ever queued, so a link that appears on many
        # pages occupies the frontier only once.
        seen = set()
        frontier = deque()
        for seed in self.seeds:
            if seed not in seen:
                seen.add(seed)
                frontier.append((seed, 0))

        pages = []  # list of (url, raw_text)
        first_request = True

        while frontier:
            url, depth = frontier.popleft()
            if depth > self.max_depth:
                continue

            # Throttle before every request except the first, so failed
            # fetches and skipped pages are rate-limited as well.
            if first_request:
                first_request = False
            else:
                time.sleep(self.delay)

            try:
                resp = requests.get(url, timeout=5)
                resp.raise_for_status()
                html = resp.text
            except Exception as e:
                # Best effort: report the failure and move on to the next URL.
                print(f"[ERROR] fetching {url}: {e}")
                continue

            # Extract visible text
            soup = BeautifulSoup(html, "html.parser")
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator=" ")
            if self.preview_chars > 0:
                # Debug preview of the start of the extracted text.
                print(text[:self.preview_chars], "...")

            # Language filter
            if not self._is_english(text):
                print(f"[SKIP] non-English: {url}")
                continue

            pages.append((url, text))

            # enqueue links if depth allows
            if depth < self.max_depth:
                for anchor in soup.find_all("a", href=True):
                    link = self._normalize_link(url, anchor["href"])
                    if link is not None and link not in seen:
                        seen.add(link)
                        frontier.append((link, depth + 1))

        return pages

def postings_dict():
    """Return a fresh postings map in which unseen keys count as 0.

    Used as the factory for the inner level of the inverted index.
    """
    postings = defaultdict(int)
    return postings

class StemIndex:
    """Builds an in-memory inverted index of stemmed tokens (no stopwords).

    Attributes are plain data so the index can be pickled:
      * ``inverted`` maps stem -> {doc_id -> occurrence count}.
      * ``doc_meta`` maps doc_id -> {"url": url}.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words("english"))
        self.token_re = re.compile(r"[a-z0-9]+")
        # The factory is the named function ``postings_dict`` rather than a
        # lambda, because pickle cannot serialize a lambda default factory.
        self.inverted = defaultdict(postings_dict)
        self.doc_meta = {}

    def add_page(self, doc_id, url, raw_text):
        """Tokenize *raw_text* and add its stem counts under *doc_id*.

        The text is lowercased, split into ``[a-z0-9]+`` runs, stripped of
        stopwords and stemmed.  Counts are added to any existing counts for
        *doc_id*, so adding the same document twice doubles its postings.

        Args:
            doc_id: Key identifying the document in the postings.
            url: Source URL, stored in ``doc_meta[doc_id]``.
            raw_text: Visible text of the page.
        """
        # Count surface tokens first so each distinct token is stemmed once,
        # not once per occurrence.
        token_counts = defaultdict(int)
        for tok in self.token_re.findall(raw_text.lower()):
            if tok not in self.stopwords:
                token_counts[tok] += 1

        for tok, count in token_counts.items():
            self.inverted[self.stemmer.stem(tok)][doc_id] += count

        self.doc_meta[doc_id] = {"url": url}

    def save(self, path):
        """Pickle ``(inverted, doc_meta)`` to *path*."""
        with open(path, "wb") as f:
            pickle.dump((self.inverted, self.doc_meta), f)

    def load(self, path):
        """Replace ``inverted`` and ``doc_meta`` with the pickle at *path*.

        SECURITY: unpickling can execute arbitrary code; only load index
        files that this program wrote itself or that come from a trusted
        source.
        """
        with open(path, "rb") as f:
            self.inverted, self.doc_meta = pickle.load(f)


In [25]:
# Driver cell: crawl from the seed list, index every returned page, and
# persist the index to disk.
seeds = ["https://example.com"]
# max_depth=1: fetch the seeds plus the pages they link to directly.
crawler = OfflineCrawler(seeds, max_depth=1)
pages = crawler.run()

index = StemIndex()
# The position of a page in the crawl result doubles as its document id.
for i, (url, text) in enumerate(pages):
    print(f"Idx: {i} – {url}")
    index.add_page(i, url, text)

index.save("stem_index.pkl")
print("Index saved to stem_index.pkl")


 
 
 Example Domain 
 
 
 
 
 
 
 
 Example Domain 
 This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission. 
 More information... 
 
 
 
 ...

 
 
 
 Example Domains 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Domains 
 Protocols 
 Numbers 
 About 
 
 
 
 
 
 
 
 
 Example Domains 
 As described in  RFC 2606  and  RFC 6761 , a
number of domains such as example.com and example.org are maintained
for documentation purposes. These domains may be used as illustrative
examples in documents without prior coordination with us. They are not
available for registration or transfer. 
 We provide a web service on the example domain hosts to provide basic
information on the purpose of the domain. These web services are
provided as best effort, but are not designed to support production
applications. While incidental traffic for incorrectly configured
applications is expected, please do not design applicatio