# 0. Importing Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.parse import urljoin
import re
import os
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph

# 1. Web crawler for AIT database

In [2]:
# Function to crawl a webpage and extract links
def crawl_webpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        return links
    else:
        return []

In [3]:
def clean_html(html_content):
    # Parse HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove script tags and their contents
    for script in soup(['script', 'style']):
        script.extract()
    
    # Get text content
    text_content = soup.get_text(separator=' ')
    
    # Remove extra whitespace and newlines
    text_content = ' '.join(text_content.split())
    
    return text_content

In [4]:
def contains_foreign_languages(text):
    # Define a regex pattern to match non-ASCII characters
    non_ascii_pattern = re.compile(r'[^\x00-\x7F]')  # Matches any character not in the ASCII range
    
    # Search for non-ASCII characters in the text
    match = non_ascii_pattern.search(text)
    
    
    return bool(match)

In [5]:
# To check if crawled link is an AIT website
ait_expr = r'ait'
skip_expr = r'Just a moment... Enable JavaScript and cookies to continue|�|Rights Reserved. - Designed by Outsourcify|cookies|Log In|YouTube'

# Start from AIT home page
start_url = 'https://ait.ac.th'

# Database
# Create the 'AIT_database' folder if it doesn't exist
database_folder = 'AIT_database'
os.makedirs(database_folder, exist_ok=True)

# Crawl the webpage
crawled_data = []
crawled_text = []
links_to_crawl = [start_url]
i = 0 # Counter
while links_to_crawl and i < 100: # Limit to 100 links
    current_url = links_to_crawl.pop(0)
    try:
        links = crawl_webpage(current_url)
        for link in links:   
            if link not in crawled_data and re.search(ait_expr, link):
                text = requests.get(link).text
                cleaned_text = clean_html(text)
                if re.search(skip_expr, cleaned_text) is None and cleaned_text.strip() != '' and contains_foreign_languages(cleaned_text) and cleaned_text not in crawled_text:
                    crawled_text.append(cleaned_text)
                    crawled_data.append(f'Link: {link} \n Document: {cleaned_text}')
                    i += 1
                    links_to_crawl.append(link)
                    
                    # Download PDF attachments
                    attachments = re.findall(r'href="(.*?\.pdf)"', text)
                    for attachment_url in attachments:
                        attachment_response = requests.get(attachment_url)
                        if attachment_response.status_code == 200:
                            attachment_filename = os.path.basename(attachment_url)
                            with open(os.path.join(database_folder, attachment_filename), 'wb') as f:
                                f.write(attachment_response.content)
    except Exception as e:
        print(f"Error crawling {current_url}: {e}")

Error crawling http://giving.ait.ac.th/: No connection adapters were found for 'mailto:oaaa@ait.ac.th'
Error crawling http://giving.ait.ac.th/2023/03/01/upgrading-of-the-ait-international-school-canteen/: Invalid URL '/2023/03/01/upgrading-of-the-ait-international-school-canteen/#respond': No scheme supplied. Perhaps you meant https:///2023/03/01/upgrading-of-the-ait-international-school-canteen/#respond?


In [None]:
# Function to save list of documents as PDF files
def save_documents_as_pdf(documents, folder):
    os.makedirs(folder, exist_ok=True)
    for index, document in enumerate(documents, start=1):
        filename = os.path.join(folder, f'Document_{index}.pdf')
        doc = SimpleDocTemplate(filename, pagesize=letter)
        story = [Paragraph(document, style=None)]
        doc.build(story)

# Example usage:
# crawled_data = [...]  # List of documents
save_documents_as_pdf(crawled_data, database_folder)