In [1]:
from collections import defaultdict
import time
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import requests
import json
import scipy as sp
import pickle

### Web crawler

In [2]:
class WebCrawler:
    """Crawl wiki pages of one domain and build a term-by-document count matrix.

    State is kept in three files in the working directory, loaded on
    construction and rewritten by ``crawl``:

    * ``documents.json``   - list of crawled URLs (one per matrix column)
    * ``terms.json``       - mapping ``stemmed term -> row index``
    * ``terms_by_doc.pkl`` - pickled sparse matrix of term counts

    Stop words are read from ``stop_words.txt`` (one word per line).
    """

    # Seconds to wait for a server response before treating the page as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self, start_urls, domain, skip_urls=None):
        """Set up the crawl queue and load any previously saved index.

        Args:
            start_urls: A single URL string or a list of URL strings to start from.
            domain: Scheme and host (e.g. ``"https://simple.wikipedia.org"``) that
                is prefixed to relative links.
            skip_urls: Optional iterable of URLs that must never be queued.
        """
        # Copy the list so crawl() does not consume the caller's object.
        self.urls = list(start_urls) if isinstance(start_urls, list) else [start_urls]
        self.domain = domain
        # Absolute URLs that are queued, crawled or skipped; kept as a set for
        # fast lookup during crawling.  Everything is stored in absolute form
        # because page links are relative while start/skip/document URLs are not.
        self.found_urls = {self._absolute_url(url) for url in self.urls}
        self.found_urls.update(self._absolute_url(url) for url in (skip_urls or ()))
        self.stemmer = PorterStemmer()
        try:
            with open("documents.json", "r") as doc_file:
                self.documents = json.load(doc_file)
                # crawl() stores documents after making them absolute.
                self.found_urls.update(self.documents)
        except (json.JSONDecodeError, FileNotFoundError):
            print("Failed to load documents.json")
            self.documents = []
        try:
            # SECURITY: pickle.load can execute arbitrary code; only load an
            # index file that this crawler wrote itself.
            with open("terms_by_doc.pkl", "rb") as pkl_file:
                self.terms_by_doc = pickle.load(pkl_file)
        except (EOFError, FileNotFoundError):
            print("Failed to load terms_by_doc.pkl")
            self.terms_by_doc = sp.sparse.lil_matrix((0, 0))
        try:
            with open("terms.json", 'r') as t_file:
                self.terms = json.load(t_file)
        except (json.JSONDecodeError, FileNotFoundError):
            print("Failed to load terms.json")
            self.terms = {}
        try:
            with open("stop_words.txt", 'r') as sw_file:
                self.stop_words = set(sw_file.read().splitlines())
        except FileNotFoundError:
            print("Failed to load stop_words.txt")
            self.stop_words = set()

    def _absolute_url(self, url):
        """Return ``url`` prefixed with the domain unless it already starts with it."""
        return url if url.startswith(self.domain) else self.domain + url

    def _select_new_links(self, hrefs):
        """Filter raw ``href`` values down to not-yet-seen article/category URLs.

        Args:
            hrefs: Iterable of ``href`` attribute values (entries may be ``None``).

        Returns:
            List of absolute URLs in first-seen order, without duplicates and
            without anything already present in ``self.found_urls``.
        """
        fresh = []
        seen_here = set()
        for href in hrefs:
            if href is None:
                continue
            # The fragment is never sent to the server, so "/wiki/A#x" is "/wiki/A".
            href = str(href).split("#", 1)[0]
            if not href.startswith("/wiki/") or "Special:" in href:
                continue
            # A colon (plain or percent-encoded) marks a namespaced page;
            # of those only categories are followed.
            namespaced = ":" in href or "%3A" in href
            if namespaced and "Category:" not in href:
                continue
            absolute = self._absolute_url(href)
            if absolute in self.found_urls or absolute in seen_here:
                continue
            seen_here.add(absolute)
            fresh.append(absolute)
        return fresh

    def _crawl_page(self, url):
        """Fetch ``url``, queue its new links and return its ``<p>`` elements.

        Returns an empty list when the request fails, the status is not 200,
        or the URL is a category page (categories are only used for their links).
        """
        try:
            page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to retrieve: {url}\n{exc}")
            return []
        if page.status_code != 200:
            print(f"Failed to retrieve: {url}\nStatus Code: {page.status_code}")
            return []

        soup = BeautifulSoup(page.content, 'html.parser')
        hrefs = [anchor.get('href') for anchor in soup.find_all("a")]
        new_links = self._select_new_links(hrefs)
        self.urls.extend(new_links)
        self.found_urls.update(new_links)
        if 'Category:' in url:
            return []
        return soup.find_all("p")

    def _create_index(self, paragraphs):
        """Count stemmed, non-stop-word alphabetic tokens in ``paragraphs``.

        Unknown terms are appended to ``self.terms`` with the next free index.

        Returns:
            ``defaultdict(int)`` mapping term index -> occurrence count.
        """
        bag_of_words = defaultdict(int)  # bag_of_words = {idx0 : cnt0, idx1 : cnt1, ...}
        for p in paragraphs:
            words = word_tokenize(p.text.lower())
            # Stop words are matched before stemming, on the lower-cased token.
            filtered_words = [self.stemmer.stem(word) for word in words
                              if word not in self.stop_words and word.isalpha()]
            for word in filtered_words:
                if word not in self.terms:
                    self.terms[word] = len(self.terms)
                bag_of_words[self.terms[word]] += 1
        return bag_of_words

    def _save_index(self, bag_of_words, url):
        """Append one column for ``url`` to the matrix and record the URL."""
        # Grow rows to cover any newly added terms and add one column.
        self.terms_by_doc.resize((len(self.terms), self.terms_by_doc.shape[1] + 1))
        column = self.terms_by_doc.shape[1] - 1
        for idx, count in bag_of_words.items():
            self.terms_by_doc[idx, column] = count
        self.documents.append(url)

    def _persist(self):
        """Write terms, documents and the count matrix to the working directory."""
        with open('terms.json', 'w') as t_file:
            json.dump(self.terms, t_file)
        with open('documents.json', 'w') as d_file:
            json.dump(self.documents, d_file)
        with open('terms_by_doc.pkl', 'wb') as m_file:
            pickle.dump(self.terms_by_doc, m_file)

    def crawl(self, max_crawls=10, delay=1):
        """Crawl queued URLs in FIFO order and save the index files.

        Args:
            max_crawls: Maximum number of pages to fetch in this call.
            delay: Seconds to sleep after each page.

        The index files are written even when the loop is interrupted
        (e.g. ``KeyboardInterrupt``); the exception still propagates.
        """
        crawled = 0
        try:
            while self.urls and crawled < max_crawls:
                url = self._absolute_url(self.urls.pop(0))
                paragraphs = self._crawl_page(url)
                # NOTE(review): category pages and failed fetches yield no
                # paragraphs but still receive an all-zero column here.
                bag_of_words = self._create_index(paragraphs)
                self._save_index(bag_of_words, url)
                crawled += 1
                time.sleep(delay)
        finally:
            self._persist()
        

In [3]:
# Start from the wiki's "Contents" category and pass the
# "Noindexed pages" category as a URL to skip.
wc = WebCrawler(
    start_urls="https://simple.wikipedia.org/wiki/Category:Contents",
    domain="https://simple.wikipedia.org",
    skip_urls=["https://simple.wikipedia.org/wiki/Category:Noindexed_pages"],
)
wc.crawl(max_crawls=10000)
# Unused article-level start URLs:
#["https://simple.wikipedia.org/wiki/War_crime", "https://simple.wikipedia.org/wiki/Computer_science", ]

Failed to load documents.json
Failed to load terms_by_doc.pkl
Failed to load terms.json


KeyboardInterrupt: 