In [26]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import os
import csv

In [27]:
class Crawler:
    """
    A web crawler class to fetch and process web pages, extract content,
    and save the data in a single CSV file.

    Attributes:
        baseUrl (str): The base URL for the crawler to start crawling.
        keywords (list): A list of keywords to filter relevant content.
        header (dict): HTTP headers to mimic a browser request.
        _exclude (list): URL patterns to exclude during crawling.
        _visited (set): A set to keep track of visited URLs.
        queue (list): A queue of URLs to be crawled.
        _data_loc (str): Directory to save the extracted data.
    """
    def __init__(self, baseUrl: str = None, keywords: list = None):
        """
        Initializes the Crawler class with optional base URL and keywords.

        Args:
            baseUrl (str): The base URL for crawling.
            keywords (list): Keywords to filter relevant web pages.
        """
        self.baseUrl = baseUrl
        self.header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "connection": "keep-alive",
            "cache-control": "no-cache",
        }
        self._exclude = [
            "section", "for-authors",
            "editorial-board", "about", "issues",
            "posts", "toggle", "view=desktop", "search"
        ]
        self._keywords = keywords or [
            "healthcare", "health", "hospital", "clinic", "emergency", "primary care",
            "disease", "infection", "symptoms", "diagnosis", "treatment", "therapy",
            "chronic", "acute", "mental health", "cancer", "diabetes", "cardiology",
            "clinical trial", "meta-analysis", "systematic review", "case study",
            "epidemiology", "randomized controlled trial", "biostatistics",
            "nutrition", "exercise", "stress", "prevention", "self-care", "fitness",
            "vaccination", "immunization", "screening",
            "health policy", "public health", "health reform", "universal healthcare",
            "insurance", "health equity", "pandemic", "quarantine", "outbreak"
        ]
        self._visited = set()  # Tracks visited URLs
        self.queue = []  # Queue for URLs to crawl
        self._data_loc = "./data/"  # Directory to save output data
        os.makedirs(self._data_loc, exist_ok=True)

    def _crawl(self, currUrl: str = None):
        """
        Fetch a webpage and return its BeautifulSoup object.

        Args:
            currUrl (str): The URL of the webpage to crawl.

        Returns:
            BeautifulSoup: A parsed HTML page, or None if an error occurs.
        """
        try:
            page = requests.get(currUrl, headers=self.header)
            page.raise_for_status()  # Raise an HTTP error for bad responses
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {currUrl}: {e}")
            return None
        soup = BeautifulSoup(page.text, 'html.parser')

        # Add links from the current page to the queue for further crawling
        for link in soup.find_all('a', href=True):
            fullUrl = urljoin(currUrl, link['href'])
            if fullUrl not in self._visited and fullUrl.startswith("http") and not any(keyword in fullUrl for keyword in self._exclude):
                self._visited.add(fullUrl)
                self.queue.append(fullUrl)

        return soup

    def crawl_n(self, n: int, output_file: str):
        """
        Crawl 'n' documents and save all data to a single CSV file.

        Args:
            n (int): Number of documents to crawl and save.
            output_file (str): Path to the output CSV file.
        """
        count = 0
        data = []  # Store all crawled data

        while count < n and self.queue:
            currUrl = self.queue.pop(0)
            print(f"Crawling: {currUrl}")

            soup = self._crawl(currUrl)
            if not soup:
                continue

            # Extract the title of the webpage
            title = soup.title.string.strip() if soup.title else "No Title"

            # Extract and clean the page content
            content = soup.get_text(separator=' ')
            content = content.replace("\n", " ").replace("\r", " ").strip()

            # Check for relevant keywords in the content
            if any(keyword in content.lower() for keyword in self._keywords):
                data.append({
                    "URL": currUrl,
                    "Title": title,
                    "Content": content
                })
                print(f"Document {count + 1} crawled: {title}")
                count += 1

        # Save all crawled data to a single CSV file
        with open(output_file, mode='w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=["URL", "Title", "Content"])
            writer.writeheader()
            writer.writerows(data)

        print(f"All {count} documents saved to {output_file}")

In [28]:

# List of starting URLs for the crawler
starting_urls = [
    "https://healthcare-digital.com/top10/top-10-healthcare-websites",
    "https://pubmed.ncbi.nlm.nih.gov/",
    "https://www.cdc.gov/"
]

In [30]:
# Initialize the crawler
crawler = Crawler()
crawler.queue.extend(starting_urls)  # Add starting URLs to the queue
crawler.crawl_n(300, output_file="./data/combined_crawled_data.csv") # Crawl 500 documents

Crawling: https://healthcare-digital.com/top10/top-10-healthcare-websites
Document 1 crawled: Top 10 Healthcare Websites | Healthcare Digital
Crawling: https://pubmed.ncbi.nlm.nih.gov/
Document 2 crawled: PubMed
Crawling: https://www.cdc.gov/
Document 3 crawled: Centers for Disease Control and Prevention | CDC
Crawling: https://healthcare-digital.com/
Document 4 crawled: Home of Healthcare News | Healthcare Digital
Crawling: http://www.nih.gov
Document 5 crawled: National Institutes of Health (NIH) | Turning Discovery Into Health
Crawling: http://kidshealth.org
Document 6 crawled: Nemours KidsHealth - the Web's most visited site about children's health
Crawling: http://www.webmd.com
Document 7 crawled: WebMD - Better information. Better health.
Crawling: http://www.boots.webmd.com
Error fetching http://www.boots.webmd.com: HTTPSConnectionPool(host='www.webmd.boots.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection objec

In [6]:
import os
import csv
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk


In [10]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/priyanshusharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyanshusharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyanshusharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/priyanshusharma/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [23]:
class TextPreprocessor:
    def __init__(self, input_dir="./data/", output_dir="./processed_data/"):
        self.input_dir = input_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Remove HTML tags, special characters, numbers, and extra spaces."""
        text = re.sub(r'<[^>]*>', '', text)  # Remove HTML tags
        text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
        text = re.sub(r'\d+', '', text)      # Remove numbers
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        return text

    def remove_stopwords(self, words):
        """Remove stopwords from a list of words."""
        return [word for word in words if word.lower() not in self.stop_words]

    def stem_words(self, words):
        """Apply stemming to a list of words."""
        return [self.stemmer.stem(word) for word in words]

    def lemmatize_words(self, words):
        """Apply lemmatization to a list of words."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def process_file(self, input_file, output_file):
        """Process a single file: clean, tokenize, and apply stemming/lemmatization."""
        with open(input_file, 'r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            processed_data = []

            for row in reader:
                content = row['Content']
                # Clean the text
                cleaned_text = self.clean_text(content)

                # Tokenize into sentences and words
                sentences = sent_tokenize(cleaned_text)
                word_tokens = [word_tokenize(sentence) for sentence in sentences]

                # Remove stopwords and apply stemming/lemmatization
                processed_sentences = []
                for tokens in word_tokens:
                    filtered_tokens = self.remove_stopwords(tokens)
                    stemmed_tokens = self.stem_words(filtered_tokens)  # You can switch to lemmatization
                    processed_sentences.append(" ".join(stemmed_tokens))

                # Join processed sentences
                processed_content = " ".join(processed_sentences)
                processed_data.append({
                    "URL": row['URL'],
                    "Title": row['Title'],
                    "Processed_Content": processed_content
                })

        # Save the processed data to a new CSV file
        with open(output_file, 'w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=["URL", "Title", "Processed_Content"])
            writer.writeheader()
            writer.writerows(processed_data)
    def process_all_files(self):
        """Process all CSV files in the input directory and save a combined CSV."""
        combined_data = []  # List to store all rows across files

        for file_name in os.listdir(self.input_dir):
            if file_name.endswith(".csv"):
                input_file = os.path.join(self.input_dir, file_name)
                print(f"Processing {input_file}...")

                # Read and process each file
                with open(input_file, 'r', encoding='utf-8') as file:
                    reader = csv.DictReader(file)

                    for row in reader:
                        content = row['Content']

                        # Clean the text
                        cleaned_text = self.clean_text(content)

                        # Tokenize into sentences and words
                        sentences = sent_tokenize(cleaned_text)
                        word_tokens = [word_tokenize(sentence) for sentence in sentences]

                        # Remove stopwords and apply stemming/lemmatization
                        processed_sentences = []
                        for tokens in word_tokens:
                            filtered_tokens = self.remove_stopwords(tokens)
                            stemmed_tokens = self.stem_words(filtered_tokens)  # You can switch to lemmatization
                            processed_sentences.append(" ".join(stemmed_tokens))

                        # Join processed sentences
                        processed_content = " ".join(processed_sentences)
                        combined_data.append({
                            "URL": row['URL'],
                            "Title": row['Title'],
                            "Processed_Content": processed_content
                        })
        # Save the combined data to a single CSV file
        combined_file_path = os.path.join(self.output_dir, 'combined_processed_data.csv')
        with open(combined_file_path, 'w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=["URL", "Title", "Processed_Content"])
            writer.writeheader()
            writer.writerows(combined_data)

        print(f"All processed data saved to {combined_file_path}")

In [24]:
import csv
import sys

# Increase the CSV field size limit
csv.field_size_limit(sys.maxsize)

9223372036854775807

In [None]:
# Initialize and run the text preprocessor
preprocessor = TextPreprocessor()
preprocessor.process_all_files()

Processing ./data/data488.csv...
Processing ./data/data339.csv...
Processing ./data/data463.csv...
Processing ./data/data305.csv...
Processing ./data/data311.csv...
Processing ./data/data477.csv...
Processing ./data/data107.csv...
Processing ./data/data113.csv...
Processing ./data/data35.csv...
Processing ./data/data21.csv...
Processing ./data/data259.csv...
Processing ./data/data271.csv...
Processing ./data/data265.csv...
Processing ./data/data264.csv...
Processing ./data/data270.csv...
Processing ./data/data258.csv...
Processing ./data/data20.csv...
Processing ./data/data34.csv...
Processing ./data/data112.csv...
Processing ./data/data106.csv...
Processing ./data/data310.csv...
Processing ./data/data476.csv...
Processing ./data/data462.csv...
Processing ./data/data304.csv...
Processing ./data/data338.csv...
Processing ./data/data489.csv...
Processing ./data/data448.csv...
Processing ./data/data474.csv...
Processing ./data/data312.csv...
Processing ./data/data306.csv...
Processing ./d