In [7]:
import requests
from requests.exceptions import HTTPError

from urllib.parse import urlparse, urljoin, unquote
import pandas as pd
import html


import os
from dotenv import load_dotenv

In [8]:
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``, failing loudly if unset.

    Without this check a missing variable only shows up later as an obscure
    ``TypeError`` (e.g. ``int(None)`` or ``os.path.join(None, ...)``).

    Raises:
        RuntimeError: if ``name`` is not defined in the environment / .env file.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Required setting {name!r} is missing; define it in the environment or in .env"
        )
    return value


# All four settings are mandatory for the crawler cells below.
BASE_PATH = _require_env("BASE_PATH")            # root directory create_file writes under
BASE_HTML_PATH = _require_env("BASE_HTML_PATH")  # sub-path (below BASE_PATH) for downloaded pages
DOMAIN = _require_env("DOMAIN")                  # host suffix a URL must end with to be crawled
MAX_PAGE = int(_require_env("MAX_PAGE"))         # upper bound on the number of pages saved

In [9]:
# Identification headers that get_page attaches to each HTTP request.
headers = {
    "User-Agent": "Not Meow's Bot",
    "From": "test@domain.com",
}

# URL the frontier queue is initialised with.
seed_url = "http://www.ku.ac.th/th/"
# seed_url = "https://crawler-test.com/"

In [10]:
def get_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 2 second
    timeout. Any failure (HTTP error status or other exception raised while
    requesting) is reported on stdout and results in an empty string, so
    callers can treat ``''`` as "nothing fetched".

    The success message shows the module-level ``counter`` plus one.
    """
    try:
        response = requests.get(url, headers=headers, timeout=2)
        # Turns 4xx/5xx status codes into an HTTPError.
        response.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')  # Python 3.6
        return ''
    except Exception as err:
        print(f'Other error occurred: {err}')  # Python 3.6
        return ''

    # Only reached when the request succeeded.
    print(f'Success!: {(counter+1):5}, {url}')
    return response.text

def get_base_url(url):
    """Split ``url`` and return its ``(scheme, netloc)`` pair."""
    # The first two fields of the parse result are scheme and netloc.
    scheme, netloc = urlparse(url)[:2]
    return scheme, netloc

def link_parser(raw_html):
    """Extract the targets of ``<a href="...">`` tags from ``raw_html``.

    Only anchors written exactly as ``<a href="`` (href as the first
    attribute, double-quoted) are recognised.

    Args:
        raw_html: HTML source as a string; may be empty.

    Returns:
        List of non-empty href values, de-duplicated, in order of first
        appearance.
    """
    pattern_start = '<a href="'
    pattern_end = '"'
    urls = []
    seen = set()  # O(1) duplicate check; ``urls`` keeps the ordering
    index = 0
    while True:
        start = raw_html.find(pattern_start, index)
        # str.find signals "not found" with -1; a match at offset 0 is valid.
        if start == -1:
            break
        start += len(pattern_start)
        end = raw_html.find(pattern_end, start)
        if end == -1:
            # Unterminated attribute: there is no complete link left to read.
            break
        link = raw_html[start:end]
        if link and link not in seen:
            seen.add(link)
            urls.append(link)
        # Resume scanning from the closing quote of this attribute.
        index = end
    return urls

def enqueue(links):
    """Append each of ``links`` to the frontier unless it is already known.

    A link counts as known when it is present in the module-level
    ``frontier_q`` or ``visited_q``.
    """
    global frontier_q, visited_q
    for candidate in links:
        already_known = candidate in frontier_q or candidate in visited_q
        if already_known:
            continue
        frontier_q.append(candidate)

def dequeue():
    """Remove and return the first URL of the module-level ``frontier_q``.

    ``frontier_q`` is rebound to a new list holding the remaining URLs.
    Raises ``IndexError`` when the frontier is empty.
    """
    global frontier_q
    # The right-hand side is evaluated before either name is rebound.
    head, frontier_q = frontier_q[0], frontier_q[1:]
    return head

In [11]:
def create_file(data, url):
    """Write ``data`` to the file ``BASE_PATH/url``, creating parent directories.

    Args:
        data: text to write.
        url: path relative to the module-level ``BASE_PATH``. Because
            ``os.path.join`` is used, an absolute ``url`` replaces
            ``BASE_PATH`` entirely.

    The file is always written as UTF-8 so that pages containing non-ASCII
    text do not fail on platforms whose default encoding cannot represent it.
    """
    path = os.path.join(BASE_PATH, url)
    directory = os.path.dirname(path)
    # os.makedirs('') raises, so only create a directory when there is one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(data)

def get_dothtml_from_url(url):
    """Classify ``url`` by the last segment of its path.

    Returns:
        * the last segment itself when it ends in ``.html`` or ``.htm``;
        * ``None`` when the segment contains a dot but is not an HTML file;
        * the string ``"dummy"`` when the segment has no dot at all
          (including an empty segment).
    """
    tail = urlparse(url).path.rsplit('/', 1)[-1]
    if tail.endswith((".html", ".htm")):
        return tail
    return None if "." in tail else "dummy"
    
def decode_html_url(url):
    """Decode ``url``: first HTML entities, then percent-escapes."""
    without_entities = html.unescape(url)
    return unquote(without_entities)

def create_abs_url(current_url, link):
    """Resolve ``link`` against ``current_url`` after decoding it.

    ``link`` is first passed through ``decode_html_url`` and the result is
    joined onto ``current_url`` with ``urljoin``.
    """
    decoded_link = decode_html_url(link)
    return urljoin(current_url, decoded_link)

def is_in_domain(base_url):
    """Return True when host ``base_url`` ends with the configured ``DOMAIN``.

    Args:
        base_url: network location of a URL (e.g. the second value returned
            by ``get_base_url``).

    Note:
        The comparison uses the module-level ``DOMAIN`` variable; comparing
        against the literal text ``"DOMAIN"`` would reject every real host.
    """
    return base_url.endswith(DOMAIN)

In [None]:
# Crawl state. enqueue/dequeue/get_page read these module-level names.
frontier_q = [seed_url]    # URLs waiting to be fetched, in FIFO order
visited_q = list()         # URLs already fetched (pages and robots.txt files)
counter = 0                # number of pages saved so far
has_robotstxt = list()     # hosts whose robots.txt contained "User-agent:"
has_sitemap = list()       # hosts whose robots.txt contained "Sitemap:"
disallow = list()

while frontier_q and counter < MAX_PAGE:
    current_url = dequeue()
    if current_url in visited_q:
        continue

    # Stay inside the configured domain.
    scheme, base_url = get_base_url(current_url)
    if not base_url.endswith(DOMAIN):
        continue

    # Fetch each host's robots.txt once and record what it advertises.
    url = urljoin(scheme + "://" + base_url, "robots.txt")
    if url not in visited_q:
        visited_q.append(url)
        page = get_page(url)
        if "User-agent:" in page:
            create_file(page, os.path.join(BASE_HTML_PATH, base_url, "robots.txt"))
            has_robotstxt.append(base_url)

        if "Sitemap:" in page:
            has_sitemap.append(base_url)

    visited_q.append(current_url)
    raw_html = get_page(current_url)
    if raw_html != "":
        filename = get_dothtml_from_url(current_url)
        # None means the URL points at a non-HTML file; it is not saved.
        if filename is not None:
            # Mirror the URL (minus its scheme) as the on-disk path.
            relative_path = "".join(current_url.split("://")[1:])
            if filename == "dummy":
                # No file name in the URL: store the page as <url path>/dummy.
                relative_path = os.path.join(relative_path, "dummy")
            counter += 1
            create_file(raw_html, os.path.join(BASE_HTML_PATH, relative_path))

    extracted_links = link_parser(raw_html)
    for link in extracted_links:
        link = create_abs_url(current_url, link)
        # Skip links to non-HTML files.
        if get_dothtml_from_url(link) is None:
            continue
        # Skip links whose last path segment carries a '#'.
        if "#" in link.split("/")[-1]:
            continue
        enqueue([link])

# create_file already prepends BASE_PATH, so pass paths relative to it;
# joining BASE_PATH here as well would nest the output under
# BASE_PATH/BASE_PATH whenever BASE_PATH is a relative path.
create_file("\n".join(has_robotstxt), "list_robots.txt")
create_file("\n".join(has_sitemap), "list_sitemap.txt")