# Scrape the websites

The first step is to spider crawl the top level page and grab all pages and put them into a markdown file.

In [3]:
import os
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

DATA_FOLDER = '../data/raw'

def ensure_directory(folder_name):
    full_path = os.path.join(DATA_FOLDER, folder_name)
    if not os.path.exists(full_path):
        os.makedirs(full_path)
    return full_path

def get_internal_links(site):
    root_url = site["root_url"]
    folder_name = site.get("folder_name") or urlparse(root_url).netloc
    folder_path = ensure_directory(folder_name)

    internal_links = set()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        response = requests.get(root_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]
            joined_url = urljoin(root_url, href)
            parsed_url = urlparse(joined_url)

            if parsed_url.netloc == urlparse(root_url).netloc:
                internal_links.add(joined_url)

    except requests.RequestException as e:
        print(f"An error occurred: {e}")

    with open(os.path.join(folder_path, "internal_links.txt"), "w") as file:
        for link in internal_links:
            file.write(f"{link}\n")

    return list(internal_links)

def scrape_to_markdown(site, urls):
    folder_name = site.get("folder_name") or urlparse(site["root_url"]).netloc
    folder_path = ensure_directory(folder_name)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            title = soup.find("title").text.strip()
            paragraphs = soup.find_all("p")
            content = "\n".join(p.get_text().strip() for p in paragraphs)

            markdown_content = f"# {title}\n\n{content}"
            output_file_name = url.rstrip('/').split('/')[-1] + ".md"
            output_file = os.path.join(folder_path, output_file_name)

            with open(output_file, "w") as file:
                file.write(markdown_content)
            print(f"Markdown file '{output_file}' created successfully.")
        except Exception as e:
            print(f"An error occurred for URL '{url}': {e}")

def main(sites=None):
    if sites is None:
        sites = [
            {"root_url": "https://pcare.com/"},
            {"root_url": "https://www.hci-tv.com/"},
            {"root_url": "https://www.sonifi.com/"},
            {"root_url": "https://www.evideon.com/"},
            # Add more sites as needed
        ]
    for site in sites:
        internal_links = get_internal_links(site)
        scrape_to_markdown(site, internal_links)

if __name__ == "__main__":
    main()


Markdown file '../data/raw/pcare.com/room-of-the-future.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare.com.md' created successfully.
Markdown file '../data/raw/pcare.com/sitemap.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare.com.md' created successfully.
Markdown file '../data/raw/pcare.com/ips.md' created successfully.
Markdown file '../data/raw/pcare.com/room-connect.md' created successfully.
Markdown file '../data/raw/pcare.com/digital-signage-and-wayfinding.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare-interactive-patient-care-system-ips-demo-video-series.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare-cloud.md' created successfully.
Markdown file '../data/raw/pcare.com/partners.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare-mobile.md' created successfully.
Markdown file '../data/raw/pcare.com/pcare-achieves-hitrust-implemented-1-year-i1-certification-to-manage-data-protect