In [29]:
# Connect to Flask API
# NOTE(review): nothing in this cell talks to a Flask API directly; presumably
# the local `database` module does so internally — confirm.

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

# Check the selenium version installed
import selenium
print(selenium.__version__)

# Driver path and timeout value. Neither name is referenced by the other cells
# visible in this notebook (the crawler below builds a Chrome driver instead).
WEB_DRIVER_LOCATION = "/app/geckodriver"
TIMEOUT = 5

import requests
from requests.adapters import HTTPAdapter

# Shared requests session with per-scheme retry adapters (max_retries=1 for
# http://, max_retries=2 for https://).
# NOTE(review): Crawler.crawl_webpage calls `requests.head` directly, so this
# session is not used by the crawler code visible here.
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=1))
session.mount('https://', HTTPAdapter(max_retries=2))

# Project-local module that provides the database helpers used below.
import database

# Module-level database connection handle.
conn = database.connect_to_database()

# Show the frontier pages returned by the database helper for this connection.
print(database.get_frontier_pages(conn))

4.19.0
Connected to the database successfully.
[(1, 'neki.vom')]


In [30]:
class Site:
    """Plain data holder describing one crawled site.

    Attributes:
        domain: Domain value passed by the caller.
        robots_content: robots.txt content passed by the caller.
        sitemap_content: Sitemap content passed by the caller.
        crawl_delay: Crawl delay passed by the caller.
    """

    def __init__(self, domain, robots_content, sitemap_content, crawl_delay):
        self.domain = domain
        self.robots_content = robots_content
        self.sitemap_content = sitemap_content
        # Fixed: this attribute used to be stored under the misspelled name
        # ``crawl_dielay``, so ``site.crawl_delay`` raised AttributeError.
        self.crawl_delay = crawl_delay

    @property
    def crawl_dielay(self):
        """Deprecated alias for ``crawl_delay`` (kept for the old misspelling)."""
        return self.crawl_delay

    @crawl_dielay.setter
    def crawl_dielay(self, value):
        self.crawl_delay = value
        
class Page:
    """Plain data holder for one page handled by the crawler.

    Only ``url`` and ``domain`` are required; every other field defaults to
    ``None`` and is expected to be filled in later.
    """

    def __init__(self, url, domain, page_type_code=None, content_hash=None, http_status_code=None, accessed_time=None, data_type_code=None, html_content=None):
        # Identity of the page.
        self.url = url
        self.domain = domain

        # Classification codes.
        self.page_type_code = page_type_code
        self.data_type_code = data_type_code

        # Fetch result.
        self.http_status_code = http_status_code
        self.accessed_time = accessed_time
        self.html_content = html_content
        self.content_hash = content_hash
        
class Image:
    """Plain data holder for one image: its filename, content type, data and access time."""

    def __init__(self, filename, content_type, data, accessed_time):
        # Store the constructor arguments unchanged under the same names.
        self.filename, self.content_type = filename, content_type
        self.data, self.accessed_time = data, accessed_time

In [31]:
import urllib
from urllib.parse import urlparse, urlunparse, urldefrag
from bs4 import BeautifulSoup
import threading
from queue import Queue
import urllib.robotparser
import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

class Crawler:
    def __init__(self, initial_seed, num_workers):
        self.initial_seed = initial_seed
        self.frontier = Queue()
        # self.visited = set()
        self.num_workers = num_workers
        self.web_driver = self.initialize_web_driver()
        # self.lock = threading.Lock()
    
    def initialize_web_driver(self):
        """Build and return a headless Chrome WebDriver.

        The browser is started with the ``fri-wier-oskapha`` user agent and a
        page-load timeout of 10.

        Returns:
            The configured ``webdriver.Chrome`` instance.
        """
        chrome_options = webdriver.ChromeOptions()
        for argument in ("--headless", "user-agent=fri-wier-oskapha"):
            chrome_options.add_argument(argument)

        driver_service = Service()
        browser = webdriver.Chrome(service=driver_service, options=chrome_options)
        browser.set_page_load_timeout(10)
        return browser
        
    def canonicalize_url(self, url):
        """
        Canonicalizes a URL by removing redundant parts and ensuring consistency.
        """
        parsed_url = urlparse(url)
        parsed_url = parsed_url._replace(fragment='')
        parsed_url = parsed_url._replace(scheme=parsed_url.scheme.lower(), netloc=parsed_url.netloc.lower())
        parsed_url = parsed_url._replace(query='')
        
        if parsed_url.path.endswith('/'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-1])
        if parsed_url.path.endswith('/index.html'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-11])
        if parsed_url.path.endswith('/index.php'):
            parsed_url = parsed_url._replace(path=parsed_url.path[:-10])
        
        if parsed_url.scheme == 'http' or parsed_url.scheme == 'https':
            pass
        else:
            return None
        
        parsed_url = parsed_url._replace(path=urlparse(parsed_url.geturl()).path)
        
        canonicalized_url = urlunparse(parsed_url)
        
        return canonicalized_url
    
    def robots_allow(self, url):
        """Return whether robots.txt permits fetching ``url`` for ``GROUP_NAME``.

        robots.txt is downloaded from the same scheme and host as ``url``.
        Previously it was requested over https from the host with "www."
        stripped, which can be a different origin than the one being crawled.

        Args:
            url: Absolute URL to check.

        Returns:
            The result of ``RobotFileParser.can_fetch`` for the module-level
            ``GROUP_NAME`` user agent.

        Note:
            The file is downloaded on every call; errors raised while reading
            it propagate to the caller.
        """
        parsed_url = urlparse(url)
        # Fall back to https when the URL carries no scheme of its own.
        robots_url = urlunparse(
            (parsed_url.scheme or 'https', parsed_url.netloc, '/robots.txt', '', '', '')
        )

        robot_parser = urllib.robotparser.RobotFileParser()
        robot_parser.set_url(robots_url)
        robot_parser.read()
        return robot_parser.can_fetch(GROUP_NAME, url)
    
    def check_duplicates(self, page):
        # Check if the page is already in the database by checking the content hash
        content_hashes = []
        # Get function to get the content hash of all pages in the database
        
        for content_hash in content_hashes:
            if content_hash == page.content_hash:
                return True
            else:
                return False
    
    def get_domain(self, url):
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        if "www." in domain:
            domain = domain.replace("www.", "")
            
        return domain
    
    def get_frontier(self):
        # Call database to get all the frontier entries from Page table
        # For each push into frontier.Queue()
        
        print('Frontier size: %s', self.frontier.qsize())
        
    def set_frontier(self):
        print('Setting frontier')

        # Search the database
        # Pages with PAGE_TYPE = FRONTIER
        
        # self.frontier.put(page)
                
        print('Frontier size: %s', self.frontier.qsize())
        
    def initialize_seed(self):
        print('Initializing seed')
        # Add URLs to frontier passed from the argument
        # In case there are no URLs passed from the argument, search for the seed URLs in the database
        if len(self.initial_seed) != 0:
            for url in self.initial_seed:
                #  Insert page into the frontier to DB
                page = Page(url, self.get_domain(url))
                
                # Insert each page into frontier
                
                self.frontier.put(page)
                
        print('Frontier size: %s', self.frontier.qsize())
        
    def validate_links(self, links):
        validated_links = []
        for link in links:
            #print('Orginial Link: ', link)
            if link is not None and len(link) > 0 and 'javascript:' not in link.lower() and 'mailto:' not in link.lower():
                domain = self.get_domain(link)
                #print('Domain: ', domain)
                if "gov.si" in domain: # Add robots check before appending
                    # Removes /? 
                    #url = urldefrag(link)[0]
                    
                    canonicalized_url = self.canonicalize_url(link)
                    #print('Canonicalized: ', canonicalized_url)
                    
                    # Check for duplicates in the array
                    if (canonicalized_url not in validated_links or canonicalized_url not in self.initial_seed):
                        #print('Valid')
                        validated_links.append(canonicalized_url)
            
        return validated_links
        
    # Get all urls from links on the page
    def get_links(self, page):
        """Collect and validate the hyperlinks of the currently loaded page.

        The ``href`` values are read from the page that ``self.web_driver``
        currently has loaded. ``page.html_content`` is parsed separately, but
        only to print the elements carrying an ``onclick`` attribute; those
        are not turned into links.

        Args:
            page: Object whose ``html_content`` holds the page's HTML.

        Returns:
            The result of ``validate_links`` for the collected ``href`` values.
        """
        print('Getting links')
        raw_links = []

        # Anchors with an href attribute, read through the live browser DOM.
        try:
            for anchor in self.web_driver.find_elements(by=By.XPATH, value="//a[@href]"):
                try:
                    raw_links.append(anchor.get_attribute("href"))
                except StaleElementReferenceException:
                    # Selenium reported the element as stale; skip it.
                    continue
        except NoSuchElementException:
            print('No elements')

        # Parse the stored HTML and drop script/style nodes.
        markup = BeautifulSoup(page.html_content, 'html.parser')
        for tag in markup(["script", "style"]):
            tag.extract()

        # Elements with click handlers are only printed for now.
        for clickable in markup.select("[onClick], [onclick]"):
            print('Element: ', clickable)

        return self.validate_links(raw_links)
                
    def crawl_webpage(self, page):
        """Fetch ``page`` and fill in its status, type and content fields.

        A HEAD request (redirects followed, timeout of 10) determines the
        status code and content type. HTML pages are then loaded in the web
        driver, their source is stored and their links are extracted;
        anything else is marked as BINARY and, for known document types,
        tagged with a data-type code.

        Fixes over the previous version:
          * the link loop no longer rebinds ``page``;
          * the Content-Type is reduced to its media type before it is
            compared, so headers carrying parameters are still recognised;
          * the data type is written to ``data_type_code`` (the attribute
            ``Page`` defines) and mirrored to ``data_type`` for compatibility;
          * a missing Content-Type header no longer raises ``KeyError``; such
            a page is classified as BINARY without a data type;
          * the error message is actually formatted.

        Args:
            page: ``Page``-like object with a ``url``; it is updated in place.

        Returns:
            None. Any exception is reported on stdout and swallowed.
        """
        print('Started crawling', page.url)

        try:
            # TODO: honour robots.txt before requesting the page.
            response = requests.head(page.url, allow_redirects=True, timeout=10)
            page.http_status_code = response.status_code
            page.accessed_time = datetime.datetime.now()
            page.content_type = response.headers.get('Content-Type', '')

            # "text/html; charset=utf-8" -> "text/html"
            media_type = page.content_type.split(';', 1)[0].strip().lower()

            if media_type == "text/html":
                page.page_type_code = "HTML"
                self.web_driver.get(page.url)
                page.html_content = self.web_driver.page_source

                # TODO: check duplicates.

                for link in self.get_links(page):
                    # A separate name keeps `page` pointing at the crawled page.
                    linked_page = Page(link, self.get_domain(link))
                    # TODO: insert the page into the frontier in the database:
                    # database.insert_page_into_frontier(conn, linked_page.domain, linked_page.url)

                # TODO: check for images and insert image data into the database.
            else:
                print("It's a binary file!")

                page.html_content = None
                page.page_type_code = "BINARY"

                data_type = self._binary_data_type(media_type)
                if data_type is not None:
                    page.data_type_code = data_type
                    # Older attribute name, kept for existing readers.
                    page.data_type = data_type

                # TODO: insert into the Page Data table and update the Page entry.
        except Exception as e:
            # TODO: update the database page entry with PAGE_TYPE = TIMEOUT.
            print(f'Error crawling {page.url}: {e}')
            return

    def _binary_data_type(self, media_type):
        """Map a lower-cased media type to its data-type code, or ``None`` if unknown."""
        known_types = {
            "application/pdf": "PDF",
            "application/msword": "DOC",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "DOCX",
            "application/vnd.ms-powerpoint": "PPT",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": "PPTX",
        }
        return known_types.get(media_type)

In [32]:
# Start of crawler

import urllib3
import requests
from requests.adapters import HTTPAdapter
import warnings

# Seed URLs handed to the Crawler. Only the last entry is active; the other
# three are commented out.
INITIAL_SEED = [
    #'https://www.gov.si',
    #'https://spot.gov.si',
    #'https://e-uprava.gov.si',
    'https://www.e-prostor.gov.si'
]

# Passed to Crawler as num_workers; the Crawler code in this notebook stores
# the value but does not start any worker threads.
NUMBER_OF_WORKERS = 1

# User-agent name that Crawler.robots_allow passes to RobotFileParser.can_fetch.
# NOTE(review): the browser itself announces "fri-wier-oskapha" as its user
# agent (see Crawler.initialize_web_driver) — confirm which name is intended.
GROUP_NAME='wier2024'

In [33]:
# Build the crawler (this also starts the browser), seed the frontier and
# crawl until the in-memory frontier is empty.
crawler = Crawler(initial_seed=INITIAL_SEED, num_workers=NUMBER_OF_WORKERS)

try:
    crawler.initialize_seed()

    while not crawler.frontier.empty():
        # Dequeue the next page and process it.
        element = crawler.frontier.get()
        crawler.crawl_webpage(page=element)
        crawler.frontier.task_done()

        # TODO: refill the frontier from the database until it is empty.

    print('Crawler finished')
finally:
    # Always shut the browser down, even if crawling raised or the cell was
    # interrupted; otherwise the headless Chrome process is left running.
    crawler.web_driver.quit()

Initializing seed
Frontier size: %s 1
Started crawling https://www.e-prostor.gov.si
Getting links
Element:  <button aria-label="Išči" id="search-button" onclick="document.getElementById('form_kesearch_searchfield').submit();" type="submit"><span><i class="fal fa-search"></i></span></button>
Crawler finished
