In [10]:
import requests
from typing import List
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
import json


In [15]:
class WebScrapper:
    
    def __init__(self, urls: List[str]) -> None:
        self.urls = urls
        self.corpus = []
        self.doc_id = 1

    def start_scraping(self) -> None:
        for url in self.urls:
            text, title, description = self.scrap_webpage(url)
            if text and title:
                cleaned_text = self.preprocess_text([text])[0]
                doc = {
                    'doc_id': str(self.doc_id),
                    'title': title,
                    'description': description,
                    'url': url,
                    'lang': 'en',  
                    'text': " ".join(cleaned_text)
                }
                self.corpus.append(doc)
                self.doc_id += 1
        self.save_corpus_to_jsonl()

    def scrap_webpage(self, url: str) -> (str, str, str):
        response = requests.get(url)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            title = soup.title.string if soup.title else "No Title"

            #extract meta description
            meta_desc = soup.find("meta", attrs={"name": "description"})
            description = meta_desc['content'] if meta_desc else "No Description"
            # Get raw text from the webpage
            text = soup.get_text(separator="\n", strip=True)
            return text, title, description
        else:
            print("Error: ", response.status_code)
            return "", ""

    # Preprocessing method
    def preprocess_text(self, documents: List[str]) -> List[List[str]]:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

        def clean_text(text):
            # Remove headers
            text = re.sub(r'From:.*|Subject:.*|Date:.*|Lines:.*|Reply-To:.*', '', text)

            # Lowercase the text
            text = text.lower()

            # Remove URLs
            text = re.sub(r'http\S+|www\S+', '', text)

            # Remove special characters and numbers
            text = re.sub(r'[^a-z\s]', '', text)

            # Tokenize
            tokens = text.split()

            # Remove stopwords and short tokens
            tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

            return tokens

        # Process the document and return the cleaned document
        return [clean_text(doc) for doc in documents]

    # Save the corpus to a JSONL file
    def save_corpus_to_jsonl(self) -> None:
        with open('web_corpus.jsonl', 'w', encoding='utf-8') as f:
            for doc in self.corpus:
                json.dump(doc, f)
                f.write('\n')

if __name__ == "__main__":
    urls = [
        "https://computer.ing.unipi.it/home",
        "https://computer.ing.unipi.it/ce-lm",
        "https://computer.ing.unipi.it/ce-lm/how-to-enroll",
        "https://computer.ing.unipi.it/ce-lm/study-plan-a-y-202324",
        "https://computer.ing.unipi.it/ce-lm/study-plan-a-y-202223",
        "https://computer.ing.unipi.it/ce-lm/study-plan-a-y-202122",
        "https://computer.ing.unipi.it/ce-lm/study-plan-20-21",
        "https://computer.ing.unipi.it/aide-lm",
        "https://computer.ing.unipi.it/aide-lm/how-to-enroll",
        "https://computer.ing.unipi.it/aide-lm/study-plan-a-y-202324",
        "https://computer.ing.unipi.it/aide-lm/study-plan-a-y-202223",
        "https://computer.ing.unipi.it/aide-lm/study-plan-a-y-202122",
        "https://computer.ing.unipi.it/aide-lm/study-plan-20-21",
        "https://computer.ing.unipi.it/news"
    ]
    webscrapper = WebScrapper(urls)
    webscrapper.start_scraping()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package st