In [10]:

import sys
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import robotexclusionrulesparser
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


# Parsed robots.txt rules keyed by robots.txt URL, so every origin is fetched
# once per process instead of once per checked page. A value of None records
# that the origin answered with a non-200 status (no usable rules).
_ROBOTS_CACHE = {}


def is_allowed(url, user_agent='*'):
    """Return whether the site's robots.txt permits ``user_agent`` to fetch ``url``.

    The check is fail-open: if robots.txt cannot be downloaded or parsed, or
    the URL has no http(s) origin to ask, the URL is reported as allowed.

    Args:
        url: Absolute URL that is about to be fetched.
        user_agent: User-agent token matched against the robots.txt groups.

    Returns:
        True when fetching is permitted (or no rules could be obtained),
        False when the parsed rules disallow ``url`` for ``user_agent``.
    """
    # Function-scope import: the file only imports ``urljoin`` from urllib.parse.
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): there is no origin
        # to consult, so fall back to the fail-open answer.
        return True

    if parts.scheme not in ('http', 'https') or not parts.netloc:
        # mailto:, javascript:, empty strings, ... have no robots.txt; skip
        # the request that could only fail.
        return True

    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
    try:
        if robots_url not in _ROBOTS_CACHE:
            response = requests.get(robots_url, timeout=5)
            if response.status_code == 200:
                rules = robotexclusionrulesparser.RobotExclusionRulesParser()
                rules.parse(response.text)
                _ROBOTS_CACHE[robots_url] = rules
            else:
                # Do not parse an error page as if it were rules.
                # NOTE(review): RFC 9309 recommends treating 5xx as a full
                # disallow; this keeps the function's existing fail-open
                # policy -- confirm that is the desired behavior.
                _ROBOTS_CACHE[robots_url] = None

        rules = _ROBOTS_CACHE[robots_url]
        if rules is None:
            return True
        return rules.is_allowed(user_agent, url)
    except Exception as e:
        # Transport/parsing failures are not cached, so a later call retries.
        print(f"Could not fetch robots.txt for {url}: {e}")
        return True


def fetch_dynamic_content(url):
    """Return the HTML of ``url`` as seen by a headless Chrome session.

    A new Chrome driver is started for every call and is always shut down
    again, even when loading the page raises.

    Args:
        url: Address to load in the browser.

    Returns:
        The value of ``driver.page_source`` after ``driver.get(url)`` returns.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    # Pass options by keyword only. Recent Selenium releases (4.10+) dropped
    # the positional executable path, so Chrome('chromedriver', options=...)
    # raises TypeError there. Older releases default the driver path to
    # "chromedriver", so this call is equivalent on them.
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always release the browser process, including on navigation errors.
        driver.quit()


def crawler_with_delay(start_url, delay=2, max_pages=5, use_selenium=False):
    """Crawl breadth-first from ``start_url`` and record each page's title.

    Every URL is checked with ``is_allowed`` before it is fetched. Links are
    resolved against the page URL, stripped of their ``#fragment`` and queued
    only if they are http(s) URLs that have not been queued before.

    Args:
        start_url: URL the crawl starts from.
        delay: Seconds to sleep after every fetch attempt, successful or not.
        max_pages: Maximum number of fetch attempts. URLs rejected by
            ``is_allowed`` do not count towards this limit.
        use_selenium: When true, pages are loaded with
            ``fetch_dynamic_content`` instead of ``requests.get``.

    Returns:
        Dict mapping each successfully fetched URL to its ``<title>`` text,
        or ``'No Title'`` when the page has no usable title.
    """
    # Function-scope imports: these names are not imported at file level.
    from collections import deque
    from urllib.parse import urldefrag, urlsplit

    visited = set()            # URLs for which a fetch was attempted
    index = {}
    queue = deque([start_url])  # FIFO frontier with O(1) pops from the left
    seen = {start_url}          # everything ever queued; prevents re-queuing

    while queue and len(visited) < max_pages:
        url = queue.popleft()

        if not is_allowed(url):
            # Still recorded in ``seen``, so it is never checked again.
            print(f"Crawling disallowed for: {url}")
            continue

        try:
            if use_selenium:
                html = fetch_dynamic_content(url)
            else:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    html = response.text
                else:
                    print(f"Failed to fetch {url} with status code {response.status_code}")
                    # No early ``continue``: the URL must still be marked
                    # visited and followed by the politeness delay below.
                    html = None

            if html is not None:
                soup = BeautifulSoup(html, 'html.parser')

                # ``.string`` is None for an empty or nested <title>. Store a
                # plain str; bs4's docs note that its string objects keep a
                # reference to the parse tree.
                title_tag = soup.title
                if title_tag is not None and title_tag.string:
                    title = str(title_tag.string)
                else:
                    title = 'No Title'
                index[url] = title
                print(f"Crawled: {url} -> {title}")

                for anchor in soup.find_all('a', href=True):
                    try:
                        link = urldefrag(urljoin(url, anchor['href'])).url
                        scheme = urlsplit(link).scheme
                    except ValueError:
                        # One malformed href must not discard the other links.
                        continue
                    if scheme in ('http', 'https') and link not in seen:
                        seen.add(link)
                        queue.append(link)

        except Exception as e:
            print(f"Failed to crawl {url}: {e}")

        visited.add(url)
        time.sleep(delay)

    return index


if __name__ == "__main__":
    # Demo: report the robots.txt verdict for the seed URL, crawl a few
    # pages from it with plain HTTP requests, then list what was indexed.
    start_url = "https://www.youtube.com/"
    print("Allowed to crawl:", is_allowed(start_url))

    indexed_pages = crawler_with_delay(start_url, 2, 3, False)

    print("\nIndexed Pages:")
    for url, title in indexed_pages.items():
        print(f"{url} : {title}")


Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting robotexclusionrulesparser
  Downloading robotexclusionrulesparser-1.7.1.tar.gz (31 kB)
  Preparing metadata (setup.py) ... done
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m115.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 