In [None]:
# Imports
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
import urllib3
import requests
from requests.adapters import HTTPAdapter
import warnings
import threading
import hashlib
import selenium
import urllib
from urllib.parse import urlparse, urlunparse, urldefrag
from bs4 import BeautifulSoup
import threading
from queue import Queue
import urllib.robotparser
import datetime
import socket
import errno
import api_calls as ac
print(selenium.__version__)


# Options and parameters
# Path of the geckodriver executable handed to Selenium's Firefox Service.
WEB_DRIVER_LOCATION = "/app/geckodriver"
# NOTE(review): TIMEOUT is not referenced by the code visible here (the
# per-host wait in Crawler.request_timeout hard-codes 5) — confirm before removing.
TIMEOUT = 5
# Suppress urllib3's InsecureRequestWarning messages.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Session with per-scheme retry limits (1 for http://, 2 for https://).
# NOTE(review): the visible code calls requests.get/requests.head directly and
# does not use this session — confirm whether it is used elsewhere.
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=1))
session.mount('https://', HTTPAdapter(max_retries=2))

# Seed URLs inserted into the frontier by CrawlerManager.initialize_seed.
# With every entry commented out, the crawl starts from whatever
# ac.get_frontier() returns.
INITIAL_SEED = [
    #'https://www.gov.si',
    #'https://spot.gov.si',
    #'https://e-uprava.gov.si',
    #'https://www.e-prostor.gov.si'
]

# Default worker count; the driver cell at the bottom rebinds this name.
NUMBER_OF_WORKERS = 1

# User-agent name passed to RobotFileParser.can_fetch in Crawler.robots_allow.
GROUP_NAME='wier2024'

In [79]:
class Site:
    """Plain record for a site: its domain, robots content and sitemap content."""

    def __init__(self, domain, robots_content, sitemap_content):
        (
            self.domain,
            self.robots_content,
            self.sitemap_content,
        ) = (domain, robots_content, sitemap_content)
        
class Page:
    """Mutable record describing one page handled by the crawler.

    Only ``url`` and ``domain`` are required; every other field defaults to
    ``None``. ``sitemap_content`` is not a constructor argument and always
    starts as an empty string.
    """

    def __init__(
        self,
        url,
        domain,
        page_type_code=None,
        content_hash=None,
        http_status_code=None,
        accessed_time=None,
        data_type_code=None,
        html_content=None,
        robots_content=None,
        from_page=None,
    ):
        # Identity of the page.
        self.url = url
        self.domain = domain
        self.from_page = from_page

        # Classification of the page and of its payload.
        self.page_type_code = page_type_code
        self.data_type_code = data_type_code

        # Outcome of fetching the page.
        self.http_status_code = http_status_code
        self.accessed_time = accessed_time
        self.html_content = html_content
        self.content_hash = content_hash

        # Site-level information carried along with the page.
        self.robots_content = robots_content
        self.sitemap_content = ""
        
class Image:
    """Plain record for one image: filename, content type, data and access time."""

    def __init__(self, filename, content_type, data, accessed_time):
        self.filename, self.content_type = filename, content_type
        self.data, self.accessed_time = data, accessed_time

In [80]:
class CrawlerManager:
    """Seeds the frontier and hands frontier pages to Crawler threads in batches.

    The in-memory ``Queue`` is a snapshot of what ``ac.get_frontier()``
    returns; it is rebuilt after every batch of worker threads has finished.
    """

    def __init__(self, initial_seed, num_workers):
        self.initial_seed = initial_seed
        self.num_workers = num_workers
        self.frontier = Queue()

    def get_frontier(self):
        """Replace the in-memory frontier with the entries from ``ac.get_frontier()``."""
        # The old queue is discarded before the API is asked for new entries.
        self.frontier = pending = Queue()
        for row in ac.get_frontier():
            # assumes row[1] is the page URL — TODO confirm against the ac API
            page_url = row[1]
            pending.put(Page(page_url, self.get_domain(page_url)))

        print('Frontier size: ', self.frontier.qsize())

    def get_domain(self, url):
        """Return the netloc of *url* with every "www." replaced by "."."""
        return urlparse(url).netloc.replace("www.", ".")

    def set_frontier(self):
        """Placeholder: only announces itself; no frontier logic is implemented."""
        print('Setting frontier')

    def initialize_seed(self):
        """Insert the seed URLs (if any) into the frontier, then load the frontier.

        When no seed URLs were supplied, the frontier is simply loaded from
        whatever ``ac.get_frontier()`` already returns.
        """
        seed_count = len(self.initial_seed)
        if seed_count:
            for seed_url in self.initial_seed:
                ac.insert_page_into_frontier(
                    {
                        "domain": self.get_domain(seed_url),
                        "url": seed_url,
                        "from_page": None,
                    }
                )

        self.get_frontier()

    def run(self):
        """Crawl until the frontier is empty, at most ``num_workers`` pages at a time.

        Each page gets its own ``Crawler`` and its own thread (named after its
        position in the batch). The whole batch is joined before the frontier
        is reloaded.
        """
        while not self.frontier.empty():
            batch = []
            while len(batch) < self.num_workers and not self.frontier.empty():
                page = self.frontier.get()
                crawler = Crawler()

                worker = threading.Thread(
                    name=str(len(batch)),
                    target=crawler.crawl_webpage,
                    args=(page,),
                )
                worker.start()
                batch.append(worker)

            for worker in batch:
                worker.join()

            # Reload from the database so newly discovered pages are picked up.
            self.get_frontier()

        print('Crawler finished')

In [81]:
# Module-level map shared by all Crawler instances: key passed to
# Crawler.update_last_request_time (an IP string, or None when the lookup in
# crawl_webpage failed) -> time.time() of the most recent request.
last_request_times = {}

class Crawler:
    def __init__(self):#, initial_seed, num_workers):
        #self.initial_seed = initial_seed
        #self.frontier = Queue()
        #self.num_workers = num_workers
        self.web_driver = self.initialize_web_driver()
        self.thread = None
        self.disallowed_urls = []
    
    def initialize_web_driver(self):
        """Start a headless Firefox driver configured for crawling.

        Returns:
            A ``webdriver.Firefox`` instance using the geckodriver at
            ``WEB_DRIVER_LOCATION``, with a 10 second page-load timeout.
        """
        options = FirefoxOptions()
        options.add_argument("--headless")
        # Firefox does not understand a "user-agent=..." command-line argument
        # (that is a Chrome switch); the user agent is overridden through the
        # general.useragent.override preference instead.
        options.set_preference("general.useragent.override", "fri-wier-oskapha")
        service = Service(executable_path=WEB_DRIVER_LOCATION)
        driver = webdriver.Firefox(service=service, options=options)
        driver.set_page_load_timeout(10)
        return driver
        
    def canonicalize_url(self, url):
        """
        Canonicalizes a URL by removing redundant parts and ensuring consistency.
        """
        parsed_url = urlparse(url)
        parsed_url = parsed_url._replace(fragment='')
        parsed_url = parsed_url._replace(scheme=parsed_url.scheme.lower(), netloc=parsed_url.netloc.lower())
        parsed_url = parsed_url._replace(query='')
        
        if parsed_url.path.endswith('/'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-1])
        if parsed_url.path.endswith('/index.html'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-11])
        if parsed_url.path.endswith('/index.php'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-10])
        
        if parsed_url.scheme == 'http' or parsed_url.scheme == 'https':
            pass
        else:
            return None
        
        parsed_url = parsed_url._replace(path=urlparse(parsed_url.geturl()).path)
        
        canonicalized_url = urlunparse(parsed_url)
        
        return canonicalized_url
    
    def parse_robots(self, url):
        """Download robots.txt for the host of *url* and extract crawl hints.

        The file is fetched once (with a timeout) and the same text is both
        parsed and returned, instead of being downloaded a second time by
        ``RobotFileParser.read()``, which has no timeout.

        Args:
            url: Any URL on the host whose robots.txt should be read.

        Returns:
            A ``(crawl_delay, site_maps, robots_content)`` tuple. All three
            are ``None`` when robots.txt is not served with status 200;
            ``crawl_delay`` and ``site_maps`` are also ``None`` when the file
            does not declare them.

        Raises:
            requests.RequestException: if the download itself fails.
        """
        domain = self.get_domain(url)
        # get_domain turns a "www." prefix into a bare leading dot; drop it so
        # the host name is usable in a URL.
        if domain.startswith('.'):
            domain = domain[1:]
        robots_url = f'https://{domain}/robots.txt'

        response = requests.get(robots_url, timeout=10)
        if response.status_code != 200:
            return None, None, None

        robots_content = response.text
        robot_parser = urllib.robotparser.RobotFileParser()
        robot_parser.set_url(robots_url)
        robot_parser.parse(robots_content.splitlines())

        # '' as the user agent, as before — presumably so that only the
        # wildcard ("*") entry's delay applies; confirm against robotparser.
        crawl_delay = robot_parser.crawl_delay('')
        site_maps = robot_parser.site_maps()
        return crawl_delay, site_maps, robots_content
            
    
    def robots_allow(self, url):
        """Return whether the host's robots.txt lets ``GROUP_NAME`` fetch *url*."""
        host = self.get_domain(url)
        # get_domain may leave a bare leading dot where "www." used to be.
        host = host[1:] if host.startswith('.') else host

        rules = urllib.robotparser.RobotFileParser(f'https://{host}/robots.txt')
        rules.read()
        return rules.can_fetch(GROUP_NAME, url)
    
    def check_duplicates(self, page):
        """Return True when the hash of the page's HTML is already stored.

        The stored hashes come from ``ac.get_hashed_content()``.
        """
        digest = self.hash_content(page.html_content)
        known_digests = ac.get_hashed_content()
        return digest in known_digests
        
    def request_timeout(self, ip):
        last_request_time = last_request_times.get(ip)
        if last_request_time is None:
            return
        current_time = time.time()
        timeout = current_time - last_request_time
        if timeout < 5:
            time.sleep(5 - timeout)
    
    def update_last_request_time(self, ip):
        last_request_times[ip] = time.time()
    
    def get_domain(self, url):
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        if "www." in domain:
            domain = domain.replace("www.", ".")
            
        return domain
    
    def hash_content(self, content):
        hashed_content = hashlib.md5(content.encode('utf-8')).hexdigest()
        #print(hashed_content)
        return hashed_content
    
    def fetch_content(self, url):
        """GET *url* and return the raw response body, or ``None`` unless status is 200."""
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print('Fetching content failed')
            return None
        return response.content
        
    def get_images(self):
        image_objects = []
        try:
            images = self.web_driver.find_elements(by=By.TAG_NAME, value="img")

            for image in images:
                filename = image.get_attribute('src')
                if filename is not None:
                    content_type = filename.split(".")[-1]
                    if len(content_type) > 3:
                        content_type = 'UNKNOWN'
                    if 'base64' in content_type and len(content_type) > 3:
                        metadata, base64_data = content_type.split(",")
                        content_type = metadata.split(";")[0].split(":")[1].split("/")[1]
                        content = base64_data
                    elif len(content_type) < 4:
                        # There is no need to populate content in database
                        #content = self.fetch_content(filename)
                        content = 'something'
                        
                    accessed_time = datetime.datetime.now()
                    image_object = Image(filename, content_type, content, accessed_time)
                    image_objects.append(image_object)
        except Exception as e:
            print(e)
        return image_objects
        
    def validate_links(self, links):
        already_added_links = set()
        validated_links = []
        for link in links:
            if link is not None and len(link) > 0 and 'javascript:' not in link.lower() and 'mailto:' not in link.lower():
                domain = self.get_domain(link)
                #print('Domain: ', domain)
                if "gov.si" in domain: # Add robots check before appending
                    
                    canonicalized_url = self.canonicalize_url(link)
                    #print('Canonicalized: ', canonicalized_url)
                    
                    # Check for duplicates in the array
                    if (canonicalized_url not in already_added_links and canonicalized_url not in INITIAL_SEED):
                        #print('Valid')
                        validated_links.append(canonicalized_url)
                        already_added_links.add(canonicalized_url)
            
        return validated_links
        
    # Get all urls from links on the page
    def get_links(self, page):
        """Collect candidate URLs for *page* and return the validated ones.

        Sources, in order: sitemap URLs declared in the host's robots.txt,
        ``href`` values of anchors on the page currently loaded in the web
        driver, and navigation targets assigned to ``document.location`` /
        ``location.href`` inside ``onclick`` handlers.

        Args:
            page: Page object; only ``page.url`` is read.

        Returns:
            The list produced by ``validate_links`` for the collected links.
        """
        import re  # local import: re is not in the file's import block

        links = []

        _, sitemap_links, _ = self.parse_robots(page.url)
        if sitemap_links is not None:
            links.extend(sitemap_links)

        # find_elements returns an empty list when nothing matches, so no
        # NoSuchElementException handling is needed around it.
        for anchor in self.web_driver.find_elements(by=By.XPATH, value="//a[@href]"):
            try:
                links.append(anchor.get_attribute("href"))
            except StaleElementReferenceException:
                continue

        # Matches e.g.  location.href='https://x.gov.si/a?b=c';  and captures
        # the quoted target. Splitting on "=" kept the quotes, cut the URL at
        # its first "=" and raised IndexError for handlers without "=".
        target_pattern = re.compile(
            r"""(?:document\.location|location\.href)\s*=\s*(['"])(.*?)\1"""
        )
        for element in self.web_driver.find_elements(by=By.XPATH, value="//*[@onclick]"):
            try:
                handler = element.get_attribute('onclick')
            except StaleElementReferenceException:
                continue
            if not handler:
                continue
            match = target_pattern.search(handler)
            if match:
                # Handlers often use relative targets; resolve them against the page.
                links.append(urllib.parse.urljoin(page.url, match.group(2)))

        return self.validate_links(links)
            
    def crawl_webpage(self, page):
        """Crawl one frontier page and store the outcome through ``ac``.

        Steps: consult robots.txt, wait for the crawl delay, probe the URL
        with HEAD (falling back to GET), then either render the page in the
        browser and harvest links and images (HTML) or classify it as a
        binary document. Finally the page entry is updated. Any exception on
        the way is recorded by updating the page with
        ``page_type_code = "ERROR"``.

        Args:
            page: Page object; ``page.url`` is read and the result attributes
                are filled in on the same object.
        """
        self.thread = threading.current_thread()
        print(f'Thread: {self.thread.name} now crawling: {page.url}')

        # MIME type -> data type code for recognised binary documents.
        binary_data_types = {
            "application/pdf": "PDF",
            "application/msword": "DOC",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
            "application/vnd.ms-powerpoint": "PPT",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
        }

        try:
            # NOTE(review): a disallowed page returns without any database
            # update. If ac.get_frontier() keeps returning it,
            # CrawlerManager.run will pick it up again every round — confirm
            # how such pages should be recorded.
            if not self.robots_allow(page.url):
                return

            crawl_delay, sitemap_links, robots_content = self.parse_robots(page.url)
            page.robots_content = robots_content

            if sitemap_links is not None:
                page.sitemap_content += "".join(f"{link}\n" for link in sitemap_links)
            else:
                page.sitemap_content = ""

            # Resolve the real host name from the URL. page.domain has "www."
            # replaced by "." (e.g. ".gov.si"), which never resolves and left
            # every www host without an IP.
            try:
                ip = socket.gethostbyname(urlparse(page.url).hostname)
            except (OSError, TypeError, ValueError):
                ip = None

            if crawl_delay is None:
                self.request_timeout(ip)
            else:
                time.sleep(crawl_delay)

            # Probe with HEAD first; fall back to a full GET when HEAD fails
            # or its response carries no Content-Type.
            try:
                response = requests.head(page.url, allow_redirects=True, timeout=10)
                head_usable = 'Content-Type' in response.headers
            except Exception:
                head_usable = False
            if not head_usable:
                response = requests.get(page.url, allow_redirects=True, timeout=10)

            page.http_status_code = response.status_code
            page.accessed_time = datetime.datetime.now()
            # A response without Content-Type is treated as an unknown binary
            # instead of raising KeyError.
            page.content_type = response.headers.get('Content-Type', '')
            # Drop parameters such as "; charset=utf-8" before comparing types.
            mime_type = page.content_type.split(';')[0].strip().lower()

            if "text/html" in page.content_type.lower():
                page.page_type_code = "HTML"
                self.web_driver.get(page.url)
                page.html_content = self.web_driver.page_source
                page.content_hash = self.hash_content(page.html_content)
                page.data_type = None

                links = self.get_links(page)

                # Images are best effort: a failure here must not fail the page.
                try:
                    for image in self.get_images():
                        ac.insert_image({
                            "url": page.url,
                            "filename": image.filename,
                            "content_type": image.content_type,
                            "accessed_time": image.accessed_time.isoformat()
                        })
                except Exception as e:
                    print('Error on images: ', e)

                for link in links:
                    ac.insert_page_into_frontier({
                        "domain": self.get_domain(link),
                        "url": link,
                        "from_page": page.url
                    })
            else:
                page.html_content = None
                page.page_type_code = "BINARY"
                page.data_type = binary_data_types.get(mime_type, "UNDEFINED")

            self.update_last_request_time(ip)

            data = self._page_record(page, page.accessed_time)
            print(ac.update_page_data(data))

        except Exception as e:
            # Record the failure on the page entry.
            page.page_type_code = "ERROR"
            page.data_type = "UNDEFINED"
            data = self._page_record(page, datetime.datetime.now())
            print('Error crawling', page.url, e)
            print('Update on error: ', ac.update_page_data(data))

            return

    def _page_record(self, page, accessed_time):
        """Build the payload passed to ``ac.update_page_data`` for *page*."""
        return {
            "url": page.url,
            "page_type_code": page.page_type_code,
            "html_content": page.html_content,
            "http_status_code": page.http_status_code,
            "accessed_time": accessed_time.isoformat(),
            "robots_content": page.robots_content,
            "sitemap_content": page.sitemap_content,
            "data_type_code": page.data_type,
            "hashed_content": page.content_hash
        }
        
    def __del__(self):
        if self.web_driver is not None:
            self.web_driver.quit()

In [85]:
# Driver cell: rebinds NUMBER_OF_WORKERS (1 in the options cell) to 10 for this
# run, so each batch in CrawlerManager.run starts up to 10 Crawler threads.
NUMBER_OF_WORKERS = 10
crawler_manager = CrawlerManager(initial_seed=INITIAL_SEED, num_workers=NUMBER_OF_WORKERS)

# Insert the seed URLs (if any) into the frontier and load the frontier queue.
crawler_manager.initialize_seed()
# Crawl batch after batch until the frontier is empty.
crawler_manager.run()

KeyError: 'data'