# In [None]:
import urllib 
import concurrent.futures
import threading
import psycopg2
import time
import hashlib
import socket
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from bs4 import BeautifulSoup

# Path to the geckodriver executable handed to Selenium's Firefox driver.
WEB_DRIVER_LOCATION = "./geckodriver"
# Seconds a worker sleeps after loading a page (see crawl_page).
TIMEOUT = 5

def calculate_hash(data):
    """Return the SHA-256 digest of ``data`` as a hexadecimal string.

    Args:
        data: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 64-character lowercase hex digest.
    """
    encoded = data.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()


def canonicalize(url):
    """Return a canonical form of ``url`` used to de-duplicate crawled links.

    The canonical form:
      * maps the ``https`` scheme to ``http``,
      * keeps only the lower-cased host name (the port is dropped),
      * uses ``/`` when the path is empty, so ``http://host`` and
        ``http://host/`` compare equal,
      * lower-cases and sorts the ``;`` path parameters and the ``&`` query
        arguments,
      * drops the fragment.

    NOTE(review): lower-casing the query is lossy for case-sensitive query
    values; it is kept because existing frontier entries rely on it.

    Args:
        url: The URL string to canonicalize.

    Returns:
        The canonical URL, or ``"http://www.gov.si/"`` when ``url`` cannot be
        parsed or has no host.
    """
    try:
        parts = urlparse(url)
        scheme = parts.scheme.lower()
        if scheme == 'https':
            scheme = 'http'
        # .hostname rather than .netloc, because .netloc also carries the port.
        host = parts.hostname.lower()
        # An empty path is equivalent to "/" for http URLs.
        path = parts.path or "/"
        params = parts.params.lower()
        query = parts.query.lower()
        if params:
            params = "".join(";" + p for p in sorted(params.split(';')))
        if query:
            query = "?" + "&".join(sorted(query.split('&')))
        return f"{scheme}://{host}{path}{params}{query}"
    except (AttributeError, TypeError, ValueError):
        # Missing host (hostname is None), non-string input or a malformed
        # netloc: fall back to the seed page as before.
        return "http://www.gov.si/"

# Seed URLs. Workers pop URLs from this list and append newly found links.
frontier = ["http://www.gov.si/", "http://evem.gov.si/", "http://e-uprava.gov.si/", "http://e-prostor.gov.si/"]
# URLs that have already been handed to a worker.
link_history = []
# IP addresses of the servers that some worker is crawling right now.
current_hosts = set()
# Image URLs collected from the crawled pages.
images = []
# Options shared by every Firefox driver the workers start.
firefox_options = FirefoxOptions()
# Firefox has no "user-agent=..." command-line switch (that is a Chrome
# argument); the user agent is overridden through this preference instead.
firefox_options.set_preference("general.useragent.override", "fri-ieps-TEST")
firefox_options.add_argument("--headless")

def _claim_next_url():
    """Claim the first frontier URL whose server is not being crawled.

    Must be called with ``lock`` held. On success the URL is removed from
    ``frontier``, appended to ``link_history`` and its IP address is added to
    ``current_hosts``.

    Returns:
        A ``(url, hostname, host_ip)`` tuple, or ``None`` when no URL in the
        frontier can be crawled right now.
    """
    for index, candidate in enumerate(frontier):
        if candidate in link_history:
            continue
        try:
            hostname = urlparse(candidate).hostname
            host = socket.gethostbyname(hostname)
        except (OSError, TypeError, ValueError):
            # Malformed URL, missing host name or failed DNS lookup.
            print("Wrong URL format")
            continue
        if host in current_hosts:
            print("Server on " + candidate + " busy (" + host + ")")
            continue
        current_hosts.add(host)
        frontier.pop(index)
        link_history.append(candidate)
        return candidate, hostname, host
    return None


def _store_links_and_images(soup, url, hostname):
    """Add the links and images found in ``soup`` to the shared lists.

    Must be called with ``lock`` held. ``url`` is the address of the parsed
    page and ``hostname`` its host name.
    """
    from urllib.parse import urljoin

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if href.startswith('/'):  # page inside the current domain
            link = canonicalize("http://" + hostname + href)
        elif href.startswith('http'):  # page outside the current domain
            link = canonicalize(href)
            if 'gov.si' not in link:
                continue
        else:
            continue
        # Skip links that are already queued or were already handed out.
        if link not in frontier and link not in link_history:
            frontier.append(link)

    for image in soup.find_all('img'):
        src = image.get('src')
        if src is None:
            continue
        # Resolve relative and absolute ``src`` values against the page URL
        # instead of blindly concatenating the two strings.
        image_url = urljoin(url, src)
        if image_url not in images:
            images.append(image_url)


def crawl_page(n, thread):
    """Crawl up to ``n`` pages from the shared frontier.

    Each attempt claims a URL whose server (by IP address) is not being
    crawled by another worker, loads it in a headless Firefox, waits
    ``TIMEOUT`` seconds and stores the links and images found on the page.
    An attempt that finds nothing to crawl waits ``TIMEOUT`` seconds and still
    counts towards ``n``.

    The page is loaded outside ``lock`` so workers can fetch in parallel; the
    lock only protects the shared lists. The browser is always shut down and
    the host claim always released, even when loading the page fails.

    Args:
        n: Number of crawl attempts this worker makes.
        thread: Worker index, used in log output; worker 0 prints a summary
            of the shared state when it finishes.

    Raises:
        Any exception raised by Selenium while starting the browser or
        loading the page is propagated to the caller.
    """
    for _ in range(n):
        with lock:
            frontier_empty = not frontier
            claimed = None if frontier_empty else _claim_next_url()

        if claimed is None:
            if frontier_empty:
                print(f'----------------empty frontier---------------- thread {thread}')
            else:
                print(f"No crawlable URL available - thread {thread}")
            # Give the other workers time to discover links or free a server.
            time.sleep(TIMEOUT)
            continue

        url, hostname, host = claimed
        print(f"Retrieving web page URL '{url}' ({host}) - thread {thread}")
        try:
            driver = webdriver.Firefox(executable_path=WEB_DRIVER_LOCATION, options=firefox_options)
            try:
                driver.get(url)
                time.sleep(TIMEOUT)
                html = driver.page_source
            finally:
                # quit() ends the whole browser session, not just the window.
                driver.quit()
        finally:
            with lock:
                current_hosts.discard(host)

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        with lock:
            _store_links_and_images(soup, url, hostname)

    if thread == 0:
        print("\nLink history: ")
        for l in link_history:
            print(l)
        print("\nFrontier: ")
        for l in frontier:
            print(l)
        print("\nImages: ")
        for img in images:
            print(img)
        
# Guards the crawler state shared by the worker threads (frontier,
# link_history, images).
lock = threading.Lock()
# Port of the PostgreSQL server that get_db connects to.
database_port = 5431
def get_db():
    """Open a connection to the crawler database and run a test query.

    Connects to PostgreSQL on ``localhost:database_port``, executes
    ``SELECT * FROM crawldb.site`` and closes the cursor and the connection
    again, also when the query fails. Nothing is fetched or returned.

    NOTE(review): the credentials are hard-coded; consider reading them from
    the environment.

    Raises:
        psycopg2.Error: If connecting or executing the query fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cur.execute("SELECT * FROM crawldb.site")
        finally:
            cur.close()
    finally:
        conn.close()

def para(num_workers, pages_per_thread):
    """Run ``crawl_page`` on ``num_workers`` threads and wait for them.

    Every worker is started as ``crawl_page(pages_per_thread, index)``. A
    worker that raises no longer fails silently: its exception is printed,
    and the remaining workers keep running.

    Args:
        num_workers: Number of worker threads to start.
        pages_per_thread: Number of crawl attempts per worker.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        print("\n ... executing workers ...\n")
        futures = {
            executor.submit(crawl_page, pages_per_thread, i): i
            for i in range(num_workers)
        }
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                print(f"Worker {futures[future]} failed: {error!r}")

# Total page budget and the number of worker threads.
pages = 25
workers = 5
# Integer division: a remainder of pages / workers is not crawled.
pages_per_worker = pages // workers

print(f"Crawling {pages_per_worker * workers} pages with {workers} workers")
para(workers, pages_per_worker)
# Single-threaded alternative: crawl_page(10, 0)

