In [23]:
import requests
from requests.exceptions import HTTPError

from urllib.parse import urlparse, urljoin, unquote, parse_qs, urlencode
import html

import re
import os
from dotenv import load_dotenv
import urllib.robotparser
from bs4 import BeautifulSoup

In [24]:
# Crawler configuration, read from environment variables. load_dotenv() runs
# first so that values kept in a local .env file are visible to os.environ.
load_dotenv()

# Both path settings stay None when their variable is not set.
BASE_PATH = os.environ.get("BASE_PATH")
BASE_HTML_PATH = os.environ.get("BASE_HTML_PATH")

# The numeric settings are mandatory: int(None) raises TypeError when missing.
MAX_PAGE = int(os.environ.get("MAX_PAGE"))
POOL_SIZE = int(os.environ.get("POOL_SIZE"))

In [None]:
# The 47 prefecture names. In this notebook they are referenced only by a
# commented-out base_urls example in the crawl cell.
prefectures = [
    "Hokkaido", "Aomori", "Iwate", "Miyagi", "Akita",
    "Yamagata", "Fukushima", "Ibaraki", "Tochigi", "Gunma",
    "Saitama", "Chiba", "Tokyo", "Kanagawa", "Niigata",
    "Toyama", "Ishikawa", "Fukui", "Yamanashi", "Nagano",
    "Gifu", "Shizuoka", "Aichi", "Mie", "Shiga",
    "Kyoto", "Osaka", "Hyogo", "Nara", "Wakayama",
    "Tottori", "Shimane", "Okayama", "Hiroshima", "Yamaguchi",
    "Tokushima", "Kagawa", "Ehime", "Kochi", "Fukuoka",
    "Saga", "Nagasaki", "Kumamoto", "Oita", "Miyazaki",
    "Kagoshima", "Okinawa",
]

In [1]:
# Request headers sent with every fetch: a descriptive agent name plus a
# contact address in the From header.
headers = {
    "User-Agent": "WEBIR_2 KU Project's Bot",
    "From": "phakpoom.a@ku.th",
}

# The crawl seed is typed in when this cell runs; it becomes the only entry
# of seed_urls. The commented-out lists below are alternative seed sets.
seed_urls = []
seed_urls.append(input())
# seed_urls = ["https://www.japan.travel/en/us/", "https://japantravel.navitime.com/en/", "https://www.japan-guide.com/", "https://en.japantravel.com/", "https://www.ana.co.jp/en/us/japan-travel-planner/"]
# seed_urls = ["https://www.aichi-now.jp/en/", "https://enjoy.pref.fukui.lg.jp/en/", "https://visitgifu.com/", "https://www.ishikawatravel.jp/en/", "https://www.go-nagano.net/en/", "http://en.nagano-cvb.or.jp/", "https://nagoya.travel/", "https://enjoyniigata.com/en/", "https://www.nvcb.or.jp/multilingual/", "https://www.visitsado.com/en/", "https://www.visit-shizuoka.com/en/", "https://exploreshizuoka.jp/en/", "https://www.toyamashi-kankoukyoukai.jp/en/", "https://visit-toyama-japan.com/en", "https://www.yamanashi-kankou.jp/english/index.html", "https://dive-hiroshima.com/en/", "https://www.okayama-japan.jp/en", "https://www.kankou-shimane.com/en/", "https://www.tottori-tour.jp/en/", "https://yamaguchi-city.jp/w/en/", "https://www.visit-jy.com/en/", "https://visit.sapporo.travel/", "https://www.visit-hokkaido.jp/en/index.html", "https://www.visitchiba.jp", "https://www.visit-gunma.jp/en/", "https://visit.ibarakiguide.jp/en/", "https://trip.pref.kanagawa.jp", "https://www.stib.jp/saitamacity-visitorsguide/", "https://saitama-supportdesk.com/", "https://www.visit-tochigi.com", "https://www.gotokyo.org/en/index.html", "https://www.hyogo-tourism.jp/world/", "https://www.kyototourism.org/en/", "https://www.kyoto.travel/en/", "https://www.visitnara.jp/", "https://en.osaka-info.jp/", "https://en.biwako-visitors.jp/", "https://visitwakayama.jp/en/index.html", "https://visitmie-japan.travel/en/index.html", "https://www.travel.pref.mie.lg.jp/en/index.shtm", "https://www.crossroadfukuoka.jp/en", "https://gofukuoka.jp", "https://www.kagoshima-yokanavi.jp/en", "https://www.kagoshima-kankou.com/for", "https://kumamoto.guide/en/", "https://kumamoto-guide.jp/en/", "https://www.miyazaki-city.tourism.or.jp/en", "https://www.kanko-miyazaki.jp/en", "https://www.discover-nagasaki.com/en", "https://en.at-nagasaki.jp", "https://www.discover-oita.com", "https://visitokinawajapan.com", "https://www.asobo-saga.jp/en/", "https://visitehimejapan.com/en/", 
"https://cycling-ehime.com/en/?_ga=2.264379849.717529468.1708478125-1172468562.1680245483", "https://www.my-kagawa.jp/en", "https://visitkochijapan.com/en/", "https://www.pref.tokushima.lg.jp/en/japanese/tourism", "https://shikoku-tourism.com/en/", "https://visitakita.com/en/", "https://aomori-tourism.com/en/", "https://fukushima.travel", "https://iwatetabi.jp/en/", "https://visitmiyagi.com", "https://yamagatakanko.com/en/", "https://www.tohokukanko.jp/en/"]
# Network location (host[:port]) of every seed URL, in seed order.
seed_domain = [urlparse(seed).netloc for seed in seed_urls]

In [None]:


def _declares_english(language_tags):
    """Return True if any comma-separated language tag starts with "en".

    Each tag is trimmed and compared case-insensitively, so "EN-us" and
    " en " count as English while "ca-ES-valencia" does not.
    """
    return any(
        tag.strip().lower().startswith("en")
        for tag in language_tags.split(",")
    )


def is_english_page(response):
    """Decide whether a fetched page declares itself as English.

    Args:
        response: object exposing ``headers`` (with ``.get``) and ``text``,
            e.g. the response returned by ``requests.get``.

    Returns:
        False when the ``Content-Language`` header is present and lists no
        English tag, when the document has no ``<html>`` element, when the
        ``<html lang>`` attribute is present and not English, or when
        inspecting the response raises an ordinary exception.
        True otherwise; a page that declares no language at all is accepted.
    """
    try:
        content_language = response.headers.get("Content-Language")
        # Match per tag rather than by substring, so that a tag which merely
        # contains "en" somewhere is not mistaken for English.
        if content_language and not _declares_english(content_language):
            return False

        soup = BeautifulSoup(response.text, "html.parser")
        if soup.html is None:
            # No <html> element to inspect: treated as "not an English page".
            return False

        html_lang = soup.html.get("lang")
        if html_lang and not _declares_english(html_lang):
            return False

        return True

    except Exception:
        # Best effort: any parsing problem means "do not keep the page".
        # KeyboardInterrupt/SystemExit are deliberately NOT caught, so a
        # running crawl can still be interrupted.
        return False

In [26]:
def get_page(url, timeout=2):
    """Fetch ``url`` and return the response only when it should be kept.

    The request is sent with the module-level ``headers``. A page is kept
    when the request succeeds and either the URL ends with ``robots.txt`` or
    ``is_english_page`` accepts the response.

    Args:
        url: address to download.
        timeout: seconds passed to ``requests.get`` (default 2).

    Returns:
        The response object, or None when the request failed, the server
        answered with an error status, or the page is not English.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        # Do not hand a 4xx/5xx body back to the caller: it would otherwise
        # be saved and counted as a crawled page.
        return None
    except Exception as err:
        print(f'Other error occurred: {err}')
        return None

    # robots.txt is plain text, so the language check is skipped for it.
    if not (url.endswith("robots.txt") or is_english_page(response)):
        return None

    # counter is the module-level count of saved pages; it is only read here.
    print(f'Success!: {(counter+1):10}, {url}')
    return response

def get_base_url(url):
    """Return the ``(scheme, netloc)`` pair of ``url``."""
    # The first two fields of the parse result are scheme and netloc.
    scheme, netloc = urlparse(url)[:2]
    return scheme, netloc

# Matches the href attribute of an <a ...> start tag. The attribute may sit
# anywhere inside the tag and use single or double quotes; the lookbehind
# keeps attributes such as data-href from matching. Group 2 is the raw value.
_ANCHOR_HREF_RE = re.compile(
    r'''<a\s[^>]*?(?<![\w-])href\s*=\s*(["'])(.*?)\1''',
    re.IGNORECASE,
)


def link_parser(raw_html):
    """Extract the distinct anchor targets found in ``raw_html``.

    Args:
        raw_html: HTML source as a string.

    Returns:
        List of non-empty ``href`` values in order of first appearance, each
        exactly as written in the markup (no unescaping, no resolution
        against a base URL).

    An attribute that has no closing quote on the same line is ignored
    rather than turned into a truncated link.
    """
    # dict.fromkeys keeps first-seen order and de-duplicates in linear time.
    unique_links = dict.fromkeys(
        match.group(2)
        for match in _ANCHOR_HREF_RE.finditer(raw_html)
        if match.group(2)
    )
    return list(unique_links)

def normalize_url(url):
    """Return a canonical form of ``url`` used for frontier bookkeeping.

    Two rewrites are applied:

    * Query string: only the first value of each parameter is kept and the
      query is re-encoded with ``urlencode``; parameters with blank values
      are dropped by ``parse_qs``.
    * Path: repeated segments are removed, keeping the first occurrence.
      Because the empty segment counts too, a trailing slash disappears
      (``/en/`` becomes ``/en``) and ``/`` becomes an empty path.

    NOTE(review): the path rule is lossy - ``/a/b/a`` becomes ``/a/b``, which
    may be a different resource. Presumably this guards against crawler
    traps; confirm before relying on it.
    """
    first_values = {}
    for name, values in parse_qs(urlparse(url).query).items():
        first_values[name] = values[0]
    query_fixed = urlparse(url)._replace(
        query=urlencode(first_values, doseq=True)
    ).geturl()

    reparsed = urlparse(query_fixed)
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    distinct_segments = dict.fromkeys(reparsed.path.split('/'))
    return reparsed._replace(path='/'.join(distinct_segments)).geturl()

def enqueue(links, base_urls):
    """Push new, in-scope links onto the front of the global frontier.

    Args:
        links: iterable of absolute URLs.
        base_urls: iterable of strings; a link is in scope when at least one
            of them occurs inside it.

    A link is skipped when either its raw form or its normalized form is
    already in ``frontier_q`` or ``visited_q``. Checking the normalized form
    matters because that is what gets stored: without it, every spelling of
    the same URL (for example with and without a trailing slash) would be
    queued again.

    ``frontier_q`` is rebound to a new list instead of being mutated in
    place, so any other name bound to the old list is left unchanged.
    """
    global frontier_q
    for link in links:
        if link in frontier_q or link in visited_q:
            continue
        if not any(base_url in link for base_url in base_urls):
            continue
        normalized = normalize_url(link)
        if normalized in frontier_q or normalized in visited_q:
            continue
        frontier_q = [normalized] + frontier_q

def dequeue():
    """Remove and return the first URL of the global frontier.

    The frontier is rebound to a sliced copy rather than popped in place, so
    a list that shares identity with the old frontier is not modified.
    Raises IndexError when the frontier is empty.
    """
    global frontier_q
    head, frontier_q = frontier_q[0], frontier_q[1:]
    return head

In [27]:
def create_file(data, url):
    """Write ``data`` to ``BASE_PATH/url``, creating parent directories.

    Args:
        data: text to store.
        url: path relative to the module-level ``BASE_PATH``. If it is
            absolute, ``os.path.join`` uses it as-is.

    Returns:
        True when the file was written, False when the path could not be
        created or written (the error is printed).
    """
    try:
        path = os.path.join(BASE_PATH, url)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the characters found in crawled pages.
        with open(path, "w", encoding="utf-8") as fp:
            fp.write(data)
    except (OSError, ValueError) as err:
        # ValueError covers paths with an embedded NUL byte and encoding
        # failures; one bad URL should not abort the whole crawl.
        print(err)
        return False
    return True

def get_dothtml_from_url(url):
    """Classify ``url`` by the last segment of its path.

    Returns:
        * the last segment itself when it ends with ``.html``, ``.htm`` or
          ``.shtm``;
        * None when it contains a dot but has none of those endings (the
          callers skip such URLs);
        * the placeholder ``"dummy"`` when it contains no dot at all, which
          includes paths that end with a slash.
    """
    leaf = urlparse(url).path.rsplit('/', 1)[-1]
    if leaf.endswith((".html", ".htm", ".shtm")):
        return leaf
    return None if "." in leaf else "dummy"
    
def decode_html_url(url):
    """Undo HTML entity escaping, then percent-encoding, in ``url``."""
    return unquote(html.unescape(url))

def create_abs_url(current_url, link):
    """Resolve ``link`` against ``current_url`` after decoding it.

    The link is first passed through ``decode_html_url`` and the result is
    joined onto ``current_url`` with ``urljoin``.
    """
    decoded_link = decode_html_url(link)
    return urljoin(current_url, decoded_link)

def remove_header(content):
    """Return ``content`` with every ``<header ...>...</header>`` span removed.

    Matching is non-greedy and spans line breaks, so each header element is
    removed separately; text without such a span is returned unchanged.
    """
    header_span = re.compile(r'(?s)<header.*?>.*?</header>')
    return header_span.sub('', content)

def remove_footer(content):
    """Return ``content`` with every ``<footer ...>...</footer>`` span removed.

    Matching is non-greedy and spans line breaks, so each footer element is
    removed separately; text without such a span is returned unchanged.
    """
    footer_span = re.compile(r'(?s)<footer.*?>.*?</footer>')
    return footer_span.sub('', content)

In [None]:
# ---------------------------------------------------------------------------
# Crawl driver: read robots.txt for the first seed's host, then fetch pages
# from the frontier until it is empty or MAX_PAGE pages have been saved.
# ---------------------------------------------------------------------------

# Copy the seeds so the frontier never shares identity with seed_urls, whose
# first entry is reported when the crawl finishes.
frontier_q = list(seed_urls)
visited_q = set()
counter = 0               # pages written to disk so far
has_robotstxt = list()    # hosts whose robots.txt was found and saved
has_sitemap = set()       # not used in this cell
disallow = set()          # not used in this cell

user_agent = headers["User-Agent"]
rp = urllib.robotparser.RobotFileParser()

scheme, base_url = get_base_url(frontier_q[0])

# flag stays True while no usable robots.txt is known; in that state every
# URL is treated as allowed.
flag = True
url = urljoin(scheme + "://" + base_url, "robots.txt")
if (url not in visited_q):
    visited_q.add(url)
    rp.set_url(url)
    page = get_page(url)
    if (page is not None):
        page = page.text
        # Compare case-insensitively: robots.txt files commonly spell the
        # field "User-Agent:" as well as "User-agent:".
        if ("user-agent:" in page.lower()):
            create_file(page, os.path.join(BASE_HTML_PATH, base_url, "robots.txt"))
            has_robotstxt.append(base_url)
            # Parse the text that was already downloaded instead of calling
            # rp.read(), which would fetch the file a second time without a
            # timeout and without this crawler's request headers.
            rp.parse(page.splitlines())
            flag = False

# Only links containing one of these strings are enqueued. By default the
# crawl stays on the seed hosts; uncomment one of the lines below to narrow
# it to specific sections of a site.
base_urls = [domain for domain in seed_domain if domain]
# base_urls = ["https://en.japantravel.com/" + prefecture.lower() for prefecture in prefectures]
# base_urls = [f"https://www.japan-guide.com/{path}" for path in ["blog", "bus", "chottozeitaku", "event", "forum", "list"]]
# base_urls = ["https://japantravel.navitime.com/en/area/jp/destinations", "https://japantravel.navitime.com/en/area/jp/guide", "https://japantravel.navitime.com/en/area/jp/spot"]

while ((len(frontier_q) != 0) and counter < MAX_PAGE):
    current_url = dequeue()
    if (current_url in visited_q):
        continue

    # Once a robots.txt has been parsed, URLs it disallows are dropped.
    if not (flag or rp.can_fetch(user_agent, current_url)):
        continue

    visited_q.add(current_url)
    response = get_page(current_url)
    if (response is None):
        continue
    raw_html = response.text

    filename = get_dothtml_from_url(current_url)
    if (filename is not None):
        # The on-disk path mirrors the URL without its scheme. URLs whose
        # last segment has no extension are stored as a file named "dummy"
        # inside a directory named after the URL.
        if (filename == "dummy"):
            filename = os.path.join("".join(current_url.split("://")[1:]), "dummy")
        else:
            filename = "".join(current_url.split("://")[1:])
        success = create_file(remove_header(remove_footer(raw_html)), os.path.join(BASE_HTML_PATH, filename))
        if (success):
            counter += 1

    # Links are followed even when the page itself was not saved.
    extracted_links = link_parser(raw_html)
    for link in extracted_links:
        link = create_abs_url(current_url, link)
        ext = get_dothtml_from_url(link)
        if (ext is None):
            continue
        # Skip same-page anchors such as "page.html#section".
        if ("#" in link.split("/")[-1]):
            continue
        enqueue([link], base_urls)

# NOTE(review): create_file joins BASE_PATH onto its argument again, so this
# lands in BASE_PATH/list_robots.txt only when BASE_PATH is absolute - confirm.
if (create_file("\n".join(has_robotstxt), os.path.join(BASE_PATH, "list_robots.txt"))):
    print("Done!", seed_urls[0], counter)