<a href="https://colab.research.google.com/github/omar-atwa16/Information-Retrieval-Final-Project/blob/main/WebCrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import time
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from collections import deque, defaultdict
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
# URLs that crawl() has already taken off its frontier and attempted.
# Module-level, so it is shared by every crawl() call in this process.
# NOTE(review): the name is a misspelling of "visited"; it is left as-is
# because crawl() refers to it by this exact spelling.
visted = set()
# Per-URL crawl record: url -> {"status": <HTTP status or None>, "outlinks": [...]}.
pages = {}
# Inverted index: term -> set of URLs whose extracted text contained the term.
invertedIdx = defaultdict(set)

# Host (URL netloc) -> time.time() timestamp of the latest request to it.
lastAccess = {}
# Minimum gap, in seconds, enforced between two requests to the same host.
politenessDelay = 1.5

# NLTK English stop-word list; tokenize() discards these words.
stopWords = set(stopwords.words('english'))

In [57]:
def wait_for_politeness(url: str) -> None:
  """Sleep as needed so requests to one host stay ``politenessDelay`` apart.

  The host is the netloc of *url*. After any required sleep, the current
  time is stored in ``lastAccess`` as that host's latest access time.
  """
  host = urlparse(url).netloc
  # A host never seen before defaults to timestamp 0, so no wait is needed.
  elapsed = time.time() - lastAccess.get(host, 0)
  remaining = politenessDelay - elapsed
  if remaining > 0:
    time.sleep(remaining)
  # Stamp after sleeping so the next gap is measured from the real request time.
  lastAccess[host] = time.time()

In [58]:
def fetch(url: str, timeout: float = 10.0):
  """Download *url* after honouring the per-host politeness delay.

  Args:
    url: Absolute URL to request.
    timeout: Seconds to wait for the server before giving up. Passed
      straight to ``requests.get``.

  Returns:
    A ``(status, html)`` tuple. ``status`` is the HTTP status code, or
    ``None`` when the request failed (connection error, timeout, invalid
    URL, ...). ``html`` is the response body when the Content-Type header
    mentions ``text/html``, otherwise an empty string.
  """
  try:
    wait_for_politeness(url)
    # requests applies no timeout by default, so one unresponsive server
    # would otherwise block the whole crawl indefinitely.
    resp = requests.get(url, timeout=timeout)
    status = resp.status_code
    contentType = resp.headers.get("Content-Type", "")
    # Media type names are case-insensitive, so compare in lower case.
    if "text/html" in contentType.lower():
      return status, resp.text
    return status, ""
  except requests.exceptions.RequestException:
    # Timeouts are RequestException subclasses and land here as well.
    return None, ""

In [59]:
def extract_links_and_text(base_url: str, html: str):
  """Parse *html* and return ``(outlinks, text)``.

  ``outlinks`` is a de-duplicated list of absolute http/https URLs taken
  from ``<a href=...>`` tags, resolved against *base_url* and with the
  fragment removed. ``text`` is the document text with a single space
  used as the separator between pieces.
  """
  document = BeautifulSoup(html, 'html.parser')

  # Resolve every href against the page URL, then split it into components.
  resolved = (
      urlparse(urljoin(base_url, anchor["href"]))
      for anchor in document.find_all("a", href=True)
  )
  # Keep web links only, and drop the fragment so "#section" variants of the
  # same page collapse into one entry.
  links = {
      parts._replace(fragment="").geturl()
      for parts in resolved
      if parts.scheme in ("http", "https")
  }

  page_text = document.get_text(separator=" ")
  return list(links), page_text

In [60]:
def tokenize(text: str):
  """Return the lower-cased word tokens of *text* that are not stop words.

  Tokens are runs of ``\\w`` characters; order and duplicates are kept.
  """
  kept = []
  for word in re.findall(r"\b\w+\b", text.lower()):
    if word in stopWords:
      continue
    kept.append(word)
  return kept

In [61]:
def index_page(url: str, text: str):
  """Record *url* in ``invertedIdx`` under every token found in *text*."""
  # dict.fromkeys drops repeated tokens while keeping first-seen order, so
  # each posting set is touched once and index keys are created in the same
  # order as a plain pass over the token list.
  for term in dict.fromkeys(tokenize(text)):
    invertedIdx[term].add(url)

In [62]:
def crawl(start_url: str, maxDepth: int, maxPages: int =50):
  """Breadth-first crawl starting at *start_url*.

  Args:
    start_url: URL placed on the frontier at depth 0.
    maxDepth: Deepest link level that will be fetched.
    maxPages: Stop once this many pages have been fetched successfully.

  Returns:
    The URLs that produced an HTTP response, in the order they were
    fetched. Every attempted URL is also recorded in ``pages`` (failed
    requests with status ``None``), and HTML pages are added to the
    inverted index.
  """
  queue = deque([(start_url, 0)])
  fetched = []

  while queue and len(fetched) < maxPages:
    current, level = queue.popleft()

    # Skip URLs already attempted (in this or an earlier crawl) or too deep.
    if current in visted or level > maxDepth:
      continue
    visted.add(current)

    status, html = fetch(current)
    if status is None:
      # Request failed: record it, but it does not count towards maxPages.
      pages[current] = {"status": None, "outlinks": []}
      continue

    if html:
      links, text = extract_links_and_text(current, html)
    else:
      # Non-HTML (or empty) response: nothing to follow or index.
      links, text = [], None

    pages[current] = {"status": status, "outlinks": links}
    if text is not None:
      index_page(current, text)

    fetched.append(current)

    # Only expand pages that sit above the depth limit.
    if level < maxDepth:
      queue.extend(
          (link, level + 1) for link in links if link not in visted
      )
  return fetched

In [63]:
# Flask application object; the @app.get route decorators register on it.
app = Flask(__name__)

In [64]:
@app.get("/crawl")
def crawl_endpoint():
  """Run a crawl from the ``url`` query parameter and report what was fetched.

  Query parameters:
    url: Start URL (required).
    depth: Maximum link depth; falls back to 1 when missing or not an integer.
    maxPages: Page budget; falls back to 50 when missing or not an integer.

  Returns:
    A JSON summary of the crawl, or a JSON error with HTTP status 400 when
    ``url`` is missing or empty.
  """
  url = request.args.get("url")

  if not url:
    # A missing parameter is a client error: answer 400 instead of the
    # default 200, the same way the /index endpoint does.
    return jsonify({"error": "url parameter is required"}), 400

  # Malformed numbers deliberately fall back to the defaults rather than fail.
  try:
    depth = int(request.args.get("depth", 1))
  except ValueError:
    depth = 1

  try:
    maxPages = int(request.args.get("maxPages", 50))
  except ValueError:
    maxPages = 50

  crawledUrls = crawl(url, maxDepth=depth, maxPages=maxPages)

  return jsonify({
        "start_url": url,
        "depth": depth,
        "max_pages": maxPages,
        "crawled_count": len(crawledUrls),
        "crawled_urls": crawledUrls,
    })

In [65]:
@app.get("/index")
def index_endpoint():
  """Look up the ``term`` query parameter in the inverted index.

  The term is lower-cased and stripped before lookup. Responds with the
  sorted list of matching URLs, or HTTP 400 when the term is empty.
  """
  raw_term = request.args.get("term", "")
  term = raw_term.lower().strip()
  if term == "":
    return jsonify({"error": "term parameter is required"}), 400

  # .get() avoids creating an empty posting set for unknown terms.
  matches = list(invertedIdx.get(term, []))
  matches.sort()
  payload = {
      "term": term,
      "count": len(matches),
      "urls": matches,
  }
  return jsonify(payload)

In [66]:
@app.get("/pages")
def pages_endpoint():
  """Return every recorded page with its status and outlinks as JSON."""
  listing = [
      {
          "url": page_url,
          "status": record["status"],
          "outlinks": record["outlinks"],
      }
      for page_url, record in pages.items()
  ]
  return jsonify({"pages": listing, "count": len(listing)})

In [67]:
# Start Flask's built-in server only when this code is run directly.
# use_reloader=False turns off the auto-reloader (presumably because
# restarting the process does not work inside a notebook - confirm).
# NOTE(review): debug=True turns on Flask debug mode, which is meant for
# local development only; confirm this server is never exposed publicly.
if __name__ == "__main__":
  app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit


In [68]:
'''
http://127.0.0.1:5000/crawl?url=https://pubmed.ncbi.nlm.nih.gov/&depth=1&maxPages=5

'''

'\nhttp://127.0.0.1:5000/crawl?url=https://pubmed.ncbi.nlm.nih.gov/&depth=1&maxPages=5\n\n'