In [1]:
import csv
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import os

In [4]:

class Crawler:
    def __init__(self, baseUrl: str = None, keywords: list = None):
        self.baseUrl = baseUrl
        self.header = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "connection": "keep-alive",
            "cache-control": "no-cache",
        }
        self._exclude = [
            "section", "for-authors",
            "editorial-board", "about", "issues",
            "posts", "toggle", "view=desktop", "search"
        ]
        self._keywords = [
            "healthcare", "health", "hospital", "clinic", "emergency", "primary care",
            "disease", "infection", "symptoms", "diagnosis", "treatment", "therapy",
            "chronic", "acute", "mental health", "cancer", "diabetes", "cardiology",
            "clinical trial", "meta-analysis", "systematic review", "case study",
            "epidemiology", "randomized controlled trial", "biostatistics",
            "nutrition", "exercise", "stress", "prevention", "self-care", "fitness",
            "vaccination", "immunization", "screening",
            "health policy", "public health", "health reform", "universal healthcare",
            "insurance", "health equity", "pandemic", "quarantine", "outbreak"
        ]
        self._visited = set()
        self.queue = list()
        self._data_loc = "./data/"
        if not os.path.exists(self._data_loc):
            os.makedirs(self._data_loc)

    def _crawl(self, currUrl: str = None):
        if currUrl is None:
            if self.baseUrl:
                currUrl = self.baseUrl
            else:
                currUrl = self.queue.pop(0)
        try:
            page = requests.get(currUrl, headers=self.header)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {currUrl}: {e}")
            return None
        soup = BeautifulSoup(page.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            fullUrl = urljoin(currUrl, link['href'])
            if fullUrl not in self._visited and fullUrl.startswith("http") and not any(keyword in fullUrl for keyword in self._exclude):
                self._visited.add(fullUrl)
                self.queue.append(fullUrl)
        return soup

    def crawl_n(self, n: int):
        i = 0
        data = []
        while i < n:
            if not self.queue:
                print("No more URLs in the queue.")
                break
            currUrl = self.queue.pop(0)
            print(f"Crawling from: {currUrl}\n")
            soup = self._crawl(currUrl)
            if soup:
                curr_text = soup.get_text(separator=' ')
                curr_text = curr_text.replace('\n', ' ').strip()  # Remove newlines and extra spaces
            else:
                continue
            if curr_text and any(keyword in curr_text.lower() for keyword in self._keywords):
                data.append({"URL": currUrl, "Content": curr_text})
                i += 1

        # Save to CSV
        csv_file = f"{self._data_loc}scraped_data.csv"
        with open(csv_file, mode='w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=["URL", "Content"])
            writer.writeheader()
            writer.writerows(data)

        print(f"Crawled {i} pages successfully. Data saved to {csv_file}.\n")

In [None]:
crawler = Crawler()
crawler.queue.append("https://pmc.ncbi.nlm.nih.gov/articles/PMC8285156/")
crawler.crawl_n(300)

Crawling from: https://pmc.ncbi.nlm.nih.gov/articles/PMC8285156/

Crawling from: https://pmc.ncbi.nlm.nih.gov/articles/PMC8285156/#main-content

Crawling from: https://pmc.ncbi.nlm.nih.gov/

Crawling from: https://www.ncbi.nlm.nih.gov/myncbi/

Crawling from: https://www.ncbi.nlm.nih.gov/myncbi/collections/bibliography/

Crawling from: https://www.ncbi.nlm.nih.gov/account/settings/

Crawling from: https://www.ncbi.nlm.nih.gov/pmc/advanced/

Crawling from: https://pmc.ncbi.nlm.nih.gov/journals/

Crawling from: https://doi.org/10.7861/fhj.2021-0095

Crawling from: https://pmc.ncbi.nlm.nih.gov/articles/PMC8285156/pdf/futurehealth-8-2-e188.pdf

Crawling from: https://www.ncbi.nlm.nih.gov/pmc/?term=%22Future%20Healthc%20J%22%5Bjour%5D

Crawling from: https://pubmed.ncbi.nlm.nih.gov/?term=%22Future%20Healthc%20J%22%5Bjour%5D

Crawling from: https://www.ncbi.nlm.nih.gov/nlmcatalog?term=%22Future%20Healthc%20J%22%5BTitle%20Abbreviation%5D

Crawling from: https://pmc.ncbi.nlm.nih.gov/articles/PM