# In [None]:
import requests
import urllib 
import urllib.robotparser
import concurrent.futures
import threading
import psycopg2
import time
import hashlib
import socket
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from bs4 import BeautifulSoup

# Path to the geckodriver executable handed to Selenium's Firefox driver in crawl_page.
WEB_DRIVER_LOCATION = "./geckodriver"
# Number of seconds a worker sleeps after requesting a page (passed to time.sleep in crawl_page).
TIMEOUT = 5
# page_type_code values that insert_page accepts; anything else is ignored there.
page_types = ('HTML', 'BINARY', 'DUPLICATE', 'FRONTIER')
# Document type codes; not referenced by the visible code.
# NOTE(review): presumably the allowed data_type_code values for insert_page_data - confirm.
data_types = ('PDF', 'DOC', 'DOCX', 'PPT', 'PPTX')

def calculate_hash(data):
    """Return the hexadecimal SHA-256 digest of the string *data*.

    The text is encoded as UTF-8 before hashing.
    """
    return hashlib.sha256(data.encode('utf-8')).hexdigest()


def canonicalize(url):
    """Return a canonical form of *url* used to detect already-known links.

    Canonicalisation steps:
      * the scheme is lower-cased and ``https`` is mapped to ``http``;
      * only the lower-cased hostname is kept (port and credentials are dropped);
      * the path is kept unchanged;
      * path parameters (``;a;b``) and query arguments are lower-cased and sorted;
      * the fragment is dropped.

    Args:
        url: Absolute URL as a string.

    Returns:
        The canonical URL string, or ``"http://www.gov.si/"`` when *url*
        cannot be parsed or has no hostname (e.g. a relative link).
    """
    try:
        parsed = urlparse(url)
        scheme = parsed.scheme.lower()
        if scheme == 'https':
            scheme = 'http'
        # .hostname (unlike .netloc) excludes the port; it is None for
        # URLs without a network location, which raises AttributeError here.
        host = parsed.hostname.lower()
        path = parsed.path
        params = parsed.params.lower()
        query = parsed.query.lower()
        if params != "":
            params = "".join(";" + p for p in sorted(params.split(';')))
        if query != "":
            query = "?" + "&".join(sorted(query.split('&')))
        return f"{scheme}://{host}{path}{params}{query}"
    except (AttributeError, TypeError, ValueError):
        # Only parsing problems fall back to the default seed; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        return "http://www.gov.si/"

# Seed URLs; crawl_page pops URLs from this list and appends newly found links.
frontier = ["http://www.gov.si/", "http://evem.gov.si/", "http://e-uprava.gov.si/", "http://e-prostor.gov.si/"]
# URLs that have already been handed to a worker.
link_history = []
# IP addresses of servers that a worker is currently crawling.
current_hosts = set()
# Image URLs collected from crawled pages.
images = []
firefox_options = FirefoxOptions()
# Firefox has no "user-agent=" command-line switch (that is a Chrome
# convention); the user agent is overridden through this preference.
firefox_options.set_preference("general.useragent.override", "fri-ieps-TEST")
firefox_options.add_argument("--headless")


def get_robots_content(hostname, timeout=5):
    """Download the robots.txt file of *hostname*.

    Args:
        hostname: Host name without scheme, e.g. ``"www.gov.si"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The robots.txt body as text, or ``None`` when the request fails or
        the server does not answer with HTTP 200.
    """
    robots_url = f"http://{hostname}/robots.txt"
    try:
        robots_r = requests.get(robots_url, timeout=timeout)
    except requests.RequestException:
        # An unreachable host simply has no robots content to store.
        return None
    if robots_r.status_code != 200:
        return None
    return robots_r.text

def get_sitemap_content(hostname, timeout=5):
    """Download the sitemap of *hostname*.

    Args:
        hostname: Host name without scheme, e.g. ``"www.gov.si"``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or ``None`` when the request fails or
        the server does not answer with HTTP 200.
    """
    # NOTE(review): sitemaps are commonly served at /sitemap.xml or at the
    # location announced in robots.txt - confirm that /sitemap is intended.
    try:
        sitemap = requests.get(f"http://{hostname}/sitemap", timeout=timeout)
    except requests.RequestException:
        return None
    if sitemap.status_code != 200:
        return None
    return sitemap.text


def _claim_next_url():
    """Take the next crawlable URL out of the frontier.

    Must be called with ``lock`` held. A URL is crawlable when it has not
    been visited yet and no other worker is currently using its server IP.
    The claimed URL is removed from ``frontier``, recorded in
    ``link_history`` and its IP is added to ``current_hosts``.

    Returns:
        ``(url, hostname, host_ip)`` for the claimed URL, or ``None`` when
        no frontier entry can be crawled right now.
    """
    for index, candidate in enumerate(frontier):
        if candidate in link_history:
            continue
        try:
            hostname = urlparse(candidate).hostname
            host = socket.gethostbyname(hostname)
        except (OSError, TypeError, ValueError):
            # OSError covers DNS failures (socket.gaierror); TypeError is
            # raised when the URL has no hostname at all.
            print("Wrong URL format")
            continue
        if host in current_hosts:
            print("Server on " + candidate + " busy (" + host + ")")
            continue
        current_hosts.add(host)
        frontier.pop(index)
        link_history.append(candidate)
        return candidate, hostname, host
    return None


def crawl_page(n, thread):
    """Worker loop: crawl up to *n* pages taken from the shared frontier.

    For every claimed URL the page is rendered with headless Firefox, the
    site is registered in the database if it is not known yet, and the
    links and image URLs found on the page are added to the shared
    ``frontier`` / ``images`` lists.

    Args:
        n: Number of loop iterations (an iteration without a crawlable URL
            still counts).
        thread: Worker index, used in log output; worker 0 prints a summary
            of the shared state when it finishes.

    Raises:
        Whatever Selenium or the database helpers raise; the browser is
        shut down and the server IP released before the error propagates.
    """
    from urllib.parse import urljoin  # local import: not imported at file level

    for _ in range(n):
        with lock:
            frontier_empty = not frontier
            claimed = None if frontier_empty else _claim_next_url()
        if claimed is None:
            if frontier_empty:
                print(f'----------------empty frontier---------------- thread {thread}')
            else:
                print(f"No crawlable URL available right now - thread {thread}")
            # Wait before retrying so an idle worker does not burn through
            # its whole page budget instantly.
            time.sleep(TIMEOUT)
            continue
        url, hostname, host = claimed

        # The browser is started only once there is something to crawl and
        # the page is loaded outside the lock so workers run in parallel.
        driver = None
        try:
            driver = webdriver.Firefox(executable_path=WEB_DRIVER_LOCATION, options=firefox_options)
            print(f"Retrieving web page URL '{url}' ({host}) - thread {thread}")
            driver.get(url)
            time.sleep(TIMEOUT)
            html = driver.page_source
        finally:
            if driver is not None:
                # quit() ends the whole browser session; close() only
                # closes the current window.
                driver.quit()
            with lock:
                current_hosts.discard(host)

        # The database helpers take ``lock`` themselves (it is not
        # re-entrant), so they must be called without holding it.
        if get_site_id(hostname) is None:
            robots_content = get_robots_content(hostname)
            sitemap_content = get_sitemap_content(hostname)
            insert_new_site(hostname, robots_content, sitemap_content)

        soup = BeautifulSoup(html, "html.parser")
        with lock:
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if href is None:
                    continue
                # Only site-relative ("/...", "//host/...") and absolute
                # http(s) links are followed.
                if not (href.startswith('/') or href.startswith('http')):
                    continue
                link = canonicalize(urljoin(url, href))
                if 'gov.si' not in link:
                    continue
                if link not in frontier and link not in link_history:
                    frontier.append(link)
            for image_tag in soup.find_all('img'):
                src = image_tag.get('src')
                if src is None:
                    continue
                # urljoin resolves relative, root-relative and absolute src
                # values against the page URL.
                image_url = urljoin(url, src)
                if image_url not in images:
                    images.append(image_url)
    if thread == 0:
        # Snapshot under the lock; other workers may still be appending.
        with lock:
            history_snapshot = list(link_history)
            frontier_snapshot = list(frontier)
            images_snapshot = list(images)
        print("\nLink history: ")
        for visited in history_snapshot:
            print(visited)
        print("\nFrontier: ")
        for pending in frontier_snapshot:
            print(pending)
        print("\nImages: ")
        for image_url in images_snapshot:
            print(image_url)
        
# Shared, non-re-entrant lock: guards the crawler's shared lists in crawl_page
# and wraps the INSERT statements of the database helpers below.
lock = threading.Lock()
# TCP port passed to psycopg2.connect by every database helper.
database_port = 5431
def insert_new_site(domain, robots_content, sitemap_content):
    """Insert a row into ``crawldb.site``.

    Acquires the module-level ``lock`` for the INSERT, so it must not be
    called while that lock is already held.

    Args:
        domain: Value for the ``domain`` column.
        robots_content: Value for the ``robots_content`` column (may be None).
        sitemap_content: Value for the ``sitemap_content`` column (may be None).

    Raises:
        psycopg2.Error: If connecting or inserting fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with lock, conn.cursor() as cur:
            cur.execute("INSERT INTO crawldb.site (domain, robots_content, sitemap_content) VALUES(%s, %s, %s)",
                        (domain, robots_content, sitemap_content))
    finally:
        # Close the connection even when the INSERT fails.
        conn.close()

def get_site_id(domain):
    """Look up the id of the ``crawldb.site`` row for *domain*.

    Args:
        domain: Domain name to search for.

    Returns:
        The ``id`` of the first matching row, or ``None`` when the domain
        is not stored yet.

    Raises:
        psycopg2.Error: If connecting or querying fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            # Query parameters must be a sequence; a bare string would be
            # treated as a sequence of single characters.
            cur.execute("SELECT id FROM crawldb.site WHERE domain = %s", (domain,))
            row = cur.fetchone()
    finally:
        conn.close()
    # fetchone() returns None when no row matched.
    return row[0] if row is not None else None

def insert_page(site_id, page_type_code, url, html_content, https_status_code, accessed_time):
    """Insert a row into ``crawldb.page``.

    If a page is of type HTML, its content should be stored in
    ``html_content``; otherwise (a binary file such as .doc) ``html_content``
    is set to NULL and a record in the page_data table is created. A
    duplicate page should not have ``html_content`` set and should be linked
    to the duplicate version of the page.

    Acquires the module-level ``lock`` for the INSERT, so it must not be
    called while that lock is already held.

    Args:
        site_id: Id of the owning ``crawldb.site`` row.
        page_type_code: One of ``page_types``; any other value makes the
            call a no-op.
        url: Page URL.
        html_content: Page HTML, or None.
        https_status_code: HTTP status code of the response.
        accessed_time: Time the page was accessed.

    Raises:
        psycopg2.Error: If connecting or inserting fails.
    """
    # Validate before connecting so an invalid type does not leave an open
    # connection behind.
    if page_type_code not in page_types:
        return
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with lock, conn.cursor() as cur:
            # NOTE(review): the column is spelled "https_status_code" here;
            # confirm it matches the crawldb.page schema.
            cur.execute("INSERT INTO crawldb.page (site_id, page_type_code, url, html_content, https_status_code, accessed_time) VALUES(%s, %s, %s, %s, %s, %s)",
                        (site_id, page_type_code, url, html_content, https_status_code, accessed_time))
    finally:
        conn.close()

def insert_image(page_id, filename, content_type, data, accessed_time):
    """Insert a row into ``crawldb.image``.

    There is no need to populate the ``data`` field: when *data* is None the
    column is left out of the INSERT.

    Acquires the module-level ``lock`` for the INSERT, so it must not be
    called while that lock is already held.

    Args:
        page_id: Id of the page the image was found on.
        filename: Image file name.
        content_type: Content type of the image.
        data: Image bytes, or None to omit the column.
        accessed_time: Time the image was accessed.

    Raises:
        psycopg2.Error: If connecting or inserting fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with lock, conn.cursor() as cur:
            if data is not None:
                cur.execute("INSERT INTO crawldb.image (page_id, filename, content_type, data, accessed_time) VALUES(%s, %s, %s, %s, %s)",
                            (page_id, filename, content_type, data, accessed_time))
            else:
                cur.execute("INSERT INTO crawldb.image (page_id, filename, content_type, accessed_time) VALUES(%s, %s, %s, %s)",
                            (page_id, filename, content_type, accessed_time))
    finally:
        # Close the connection even when the INSERT fails.
        conn.close()

def insert_link(from_page, to_page):
    """Insert a row into ``crawldb.link``.

    Acquires the module-level ``lock`` for the INSERT, so it must not be
    called while that lock is already held.

    Args:
        from_page: Id of the page containing the link.
        to_page: Id of the page the link points to.

    Raises:
        psycopg2.Error: If connecting or inserting fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with lock, conn.cursor() as cur:
            # The column list is closed by a single ")"; the stray second
            # one made the statement a syntax error.
            cur.execute("INSERT INTO crawldb.link (from_page, to_page) VALUES(%s, %s)",
                        (from_page, to_page))
    finally:
        conn.close()

def insert_page_data(page_id, data_type_code, data):
    """Insert a row into ``crawldb.page_data``.

    Used to list non-HTML content (.pdf, .doc, .docx, .ppt and .pptx). There
    is no need to populate the ``data`` field: when *data* is None the
    column is left out of the INSERT.

    Acquires the module-level ``lock`` for the INSERT, so it must not be
    called while that lock is already held.

    Args:
        page_id: Id of the page the document belongs to.
        data_type_code: Document type code.
        data: Document bytes, or None to omit the column.

    Raises:
        psycopg2.Error: If connecting or inserting fails.
    """
    conn = psycopg2.connect(host="localhost", port=database_port, user="user", password="SecretPassword")
    try:
        conn.autocommit = True
        with lock, conn.cursor() as cur:
            # Both statements previously had a stray ")" after the column
            # list, which made them SQL syntax errors.
            if data is not None:
                cur.execute("INSERT INTO crawldb.page_data (page_id, data_type_code, data) VALUES(%s, %s, %s)",
                            (page_id, data_type_code, data))
            else:
                cur.execute("INSERT INTO crawldb.page_data (page_id, data_type_code) VALUES(%s, %s)",
                            (page_id, data_type_code))
    finally:
        conn.close()

def para(num_workers, pages_per_thread):
    """Run ``crawl_page`` concurrently on *num_workers* threads.

    Blocks until every worker has finished. A worker that raised is
    reported on stdout instead of failing silently.

    Args:
        num_workers: Number of worker threads; workers are numbered from 0.
        pages_per_thread: Page budget handed to each worker.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        print("\n ... executing workers ...\n")
        futures = [
            executor.submit(crawl_page, pages_per_thread, worker_id)
            for worker_id in range(num_workers)
        ]
        for worker_id, future in enumerate(futures):
            # exception() waits for the worker and returns what it raised,
            # which would otherwise be dropped with the unused future.
            error = future.exception()
            if error is not None:
                print(f"Worker {worker_id} failed: {error!r}")

# Crawl configuration: total page budget, split evenly between the workers.
pages = 25
workers = 5
# Integer division: any remainder of pages / workers is not crawled.
pages_per_worker = pages // workers

print(f"Crawling {pages_per_worker * workers} pages with {workers} workers")
para(workers, pages_per_worker)
# Single-threaded alternative for debugging: crawl_page(10, 0)

