In [20]:
import requests
from requests.exceptions import HTTPError

from urllib.parse import urlparse, urljoin
import pandas as pd


import os
from dotenv import load_dotenv

In [21]:
# Load settings through python-dotenv before reading the environment below.
load_dotenv()

# Upper bound compared against `counter` by the crawl loop.
MAX_PAGE = 20

# Root directory under which fetched pages are stored.
# This is None when the BASE_PATH environment variable is not set.
BASE_PATH = os.environ.get("BASE_PATH")

In [22]:
# HTTP request headers passed to requests.get() by get_page().
headers = {
    "User-Agent": "Meow's Bot",
    "From": "test@domain.com",
}

# Starting point of the crawl.
# Alternative seed kept for reference: http://www.ku.ac.th/th/
seed_url = 'https://crawler-test.com/'

In [23]:
def get_page(url):
    """Download ``url`` and return its body as lower-cased text.

    The request is sent with the module-level ``headers`` and a 2 second
    timeout. Any failure (HTTP error status or any other exception raised
    while requesting) is reported with ``print`` and results in an empty
    string being returned instead of raising.

    Side effect: the module-level ``counter`` is incremented by one on
    every call that reaches the end of the function, whether or not the
    download succeeded.

    Args:
        url: Address to fetch.

    Returns:
        The response text converted to lower case, or ``''`` on failure.
    """
    global headers, counter

    body = ''
    try:
        reply = requests.get(url, headers=headers, timeout=2)
        # Turns 4xx/5xx status codes into an HTTPError.
        reply.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        # Only reached when neither the request nor the status check raised.
        print(f'Success!: {url}')
        body = reply.text

    counter = counter + 1
    return body.lower()

def get_base_url(url):
    """Split ``url`` and return its ``(scheme, netloc)`` pair.

    Args:
        url: URL string to analyse.

    Returns:
        A 2-tuple ``(scheme, netloc)`` as produced by ``urlparse``; either
        element is an empty string when the URL lacks that component.
    """
    # urlparse yields (scheme, netloc, path, params, query, fragment).
    scheme, netloc, *_ = urlparse(url)
    return scheme, netloc

def link_parser(raw_html):
    """Extract the distinct ``href`` targets of ``<a href="...">`` tags.

    The scan is a plain substring search for the exact opening text
    ``<a href="`` followed by the next double quote; it does not parse
    HTML, so anchors whose first attribute is not ``href`` or that use
    single quotes are not detected.

    Args:
        raw_html: Page text to scan.

    Returns:
        List of non-empty link strings in order of first appearance,
        without duplicates.
    """
    pattern_start = '<a href="'
    pattern_end = '"'

    urls = []
    seen = set()  # O(1) duplicate check; `urls` keeps first-seen order
    index = 0
    while True:
        start = raw_html.find(pattern_start, index)
        # str.find returns -1 when absent; 0 is a valid match position.
        if start == -1:
            break
        start += len(pattern_start)
        end = raw_html.find(pattern_end, start)
        # No closing quote: the attribute is truncated, so there is no
        # well-formed link left to extract.
        if end == -1:
            break
        link = raw_html[start:end]
        if link and link not in seen:
            seen.add(link)
            urls.append(link)
        index = end
    return urls

def enqueue(links):
    """Append unseen links to the crawl frontier.

    A link is added to the module-level ``frontier_q`` (in place) only
    when it is present in neither ``frontier_q`` nor ``visited_q``.

    Args:
        links: Iterable of link strings to consider, processed in order.
    """
    global frontier_q, visited_q
    for candidate in links:
        already_known = candidate in frontier_q or candidate in visited_q
        if not already_known:
            frontier_q.append(candidate)

def dequeue():
    """Remove and return the first entry of the crawl frontier.

    The module-level ``frontier_q`` is rebound to a new sequence holding
    the remaining entries; the previous object is not modified.

    Returns:
        The element that was at position 0 of ``frontier_q``.

    Raises:
        IndexError: If ``frontier_q`` is an empty list.
    """
    global frontier_q
    head, frontier_q = frontier_q[0], frontier_q[1:]
    return head

In [24]:
def create_file(page, url):
    """Write ``page`` to the file ``BASE_PATH/url``.

    Missing parent directories are created. An existing file at the
    target location is overwritten.

    Args:
        page: Text content to store.
        url: Path of the target file relative to the module-level
            ``BASE_PATH``.
    """
    # NOTE(review): the visible caller derives `url` from crawled links.
    # os.path.join drops BASE_PATH if `url` is absolute, and ".." segments
    # are not resolved here, so a hostile link could write outside
    # BASE_PATH -- consider validating before use.
    path = os.path.join(BASE_PATH, url)

    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create a parent
    # when the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent arbitrary page text and would raise UnicodeEncodeError.
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(page)

def get_dothtml_from_url(url):
    """Return the HTML file name at the end of ``url``'s path, if any.

    Args:
        url: URL string to inspect; only its path component is used.

    Returns:
        The last ``/``-separated path segment when it ends with ``.html``
        or ``.htm``; otherwise the placeholder string ``"dummy"``.
    """
    last_segment = urlparse(url).path.rpartition('/')[2]
    if last_segment.endswith((".html", ".htm")):
        return last_segment
    return "dummy"

In [None]:
# Crawl state shared with the helper functions through module globals:
#   frontier_q - URLs waiting to be fetched (read/rebound by enqueue/dequeue)
#   visited_q  - holds BOTH host names already probed for robots.txt and
#                full URLs already fetched
#   counter    - incremented by get_page on every fetch attempt, so
#                robots.txt requests count towards MAX_PAGE as well
frontier_q = [seed_url]
visited_q = []
counter = 0

# Breadth-first crawl: take the oldest URL, store its page, queue its links.
while frontier_q and counter < MAX_PAGE:
    current_url = dequeue()
    scheme, base_url = get_base_url(current_url)

    # First time this host is seen: try to fetch and store its robots.txt.
    if base_url not in visited_q:
        visited_q.append(base_url)
        url = urljoin(scheme + "://" + base_url, "robots.txt")
        page = get_page(url)
        # get_page lower-cases the body, hence the lower-case marker.
        if "user-agent:" in page:
            create_file(page, os.path.join(base_url, "robots.txt"))
            # TODO: record web that has robots.txt in it

        # TODO: extract sitemap
        # TODO: record web that has sitemap

    visited_q.append(current_url)
    raw_html = get_page(current_url)
    if raw_html != "":
        filename = get_dothtml_from_url(current_url)
        if filename == "dummy":
            # NOTE(review): urljoin replaces the last path segment, so
            # sibling pages without an .html/.htm name end up in the same
            # "dummy" file and overwrite each other -- confirm intended.
            filename = urljoin("".join(current_url.split("://")[1:]), "dummy")
        else:
            filename = "".join(current_url.split("://")[1:])
        create_file(raw_html, filename)

    # Resolve links against the current page and keep only fetchable ones.
    extracted_links = link_parser(raw_html)
    next_links = []
    for link in extracted_links:
        try:
            absolute_link = urljoin(current_url, link)
            link_scheme = urlparse(absolute_link).scheme
        except ValueError:
            # urllib rejects some malformed hrefs (e.g. an unbalanced
            # IPv6 bracket); skip them instead of aborting the crawl.
            continue
        # mailto:, javascript:, tel: ... cannot be fetched over HTTP and
        # would only use up the MAX_PAGE budget on failing requests.
        if link_scheme in ("http", "https"):
            next_links.append(absolute_link)
    enqueue(next_links)