In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin

In [None]:
def fetch_page(url):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to request.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend,
        or ``None`` when the request raises ``requests.RequestException``
        (which includes the HTTP error raised by ``raise_for_status``).
        Failures are printed rather than raised.
    """
    try:
        # Give up after 10 seconds instead of hanging on an unresponsive host.
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into an exception so they take the failure path.
        reply.raise_for_status()
    except requests.RequestException as err:
        print(f"Failed to fetch: {url}\nError: {err}\n")
        return None
    else:
        return BeautifulSoup(reply.text, "html.parser")

def extract_links(soup, base_url, max_links=32):
    """Collect the distinct page URLs linked from a parsed HTML document.

    An anchor is kept when its ``href`` (with surrounding whitespace removed)
    starts with ``http``, starts with ``/``, or ends with ``.html``. Any other
    href (for example ``#fragment`` or ``mailto:`` links) is ignored. Hrefs
    starting with ``http`` are kept as written; the others are resolved
    against ``base_url`` with ``urljoin``.

    Args:
        soup: Parsed document exposing ``find_all("a", href=True)``.
        base_url: URL that relative hrefs are resolved against.
        max_links: Maximum number of URLs to return; ``None`` disables the
            cap. Defaults to 32.

    Returns:
        list[str]: The resolved URLs in document order, each appearing once,
        truncated to ``max_links`` entries.
    """
    seen = set()
    page_links = []

    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"].strip()

        if href.startswith("http"):
            full_url = href
        elif href.startswith("/") or href.endswith(".html"):
            full_url = urljoin(base_url, href)
        else:
            continue

        # A page may be linked more than once in the same document. Returning
        # it once keeps the caller from downloading it again and storing its
        # code chunks twice.
        if full_url in seen:
            continue
        seen.add(full_url)
        page_links.append(full_url)

    # De-duplicate first, then cap, so repeated links do not use up the limit.
    return page_links[:max_links]

def extract_code_chunks(url, soup):
    """Gather the text of every ``div.sourceCode`` block in a parsed page.

    The ``h1``, ``h2`` and ``div`` elements are visited in the order
    ``soup.find_all`` returns them. The most recent ``h1`` text is recorded as
    the section and the most recent ``h2`` text as the subsection; a new
    ``h1`` resets the subsection to ``None``.

    Args:
        url: Page address stored with every chunk.
        soup: Parsed document exposing ``find_all``.

    Returns:
        list[dict]: One dict per non-empty code block with the keys ``url``,
        ``section``, ``subsection`` and ``code``.
    """
    section = None
    subsection = None
    collected = []

    for node in soup.find_all(["h1", "h2", "div"]):
        tag = node.name

        if tag == "h1":
            # A new top-level heading starts a fresh section.
            section, subsection = node.get_text(), None
            continue

        if tag == "h2":
            subsection = node.get_text()
            continue

        # Only divs carrying the "sourceCode" class hold code listings.
        if tag != "div" or "sourceCode" not in node.get("class", []):
            continue

        text = node.get_text()
        if not text:
            continue

        collected.append({
            "url": url,
            "section": section,
            "subsection": subsection,
            "code": text,
        })

    return collected

def save_code_chunks_to_file(code_chunks, filename="bioinformatics_workshop_gitbook.json"):
    """Write ``code_chunks`` to ``filename`` as JSON indented by four spaces.

    The file is opened for writing with UTF-8 encoding, replacing any existing
    content. Any exception raised while opening, serialising or reporting is
    caught and printed, so this function never raises to its caller.

    Args:
        code_chunks: JSON-serialisable collection of extracted chunks.
        filename: Destination path.

    Returns:
        None.
    """
    try:
        with open(filename, mode="w", encoding="utf-8") as out_file:
            json.dump(code_chunks, out_file, indent=4)
        saved = len(code_chunks)
        print(f"Successfully saved {saved} code chunks to {filename}.")
    except Exception as err:
        # Best-effort save: report the problem instead of aborting the run.
        print(f"Failed to save data to file. Error: {err}")

In [5]:
def main():
    """Scrape code chunks from the pages linked on the start page and save them.

    Steps:
        1. Fetch the start page; return early if it cannot be loaded.
        2. Collect the links found on it via ``extract_links``.
        3. Fetch each link, pausing five seconds after every attempt, and
           accumulate the chunks returned by ``extract_code_chunks``.
        4. List the links that failed, save all chunks with
           ``save_code_chunks_to_file`` and print a summary.
    """
    start_url = "https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-started-with-r.html"
    # Drop the last path segment and restore the trailing slash, giving the
    # base that relative links are resolved against.
    base_url = start_url.rsplit("/", 1)[0] + "/"

    print(f"Fetching main page: {start_url}")
    landing_page = fetch_page(start_url)
    if not landing_page:
        return

    page_links = extract_links(landing_page, base_url)
    code_chunks, failed_links = [], []

    for link in page_links:
        print(f"Fetching page: {link}")
        page = fetch_page(link)
        if not page:
            failed_links.append(link)
        else:
            code_chunks.extend(extract_code_chunks(link, page))
            print(f"Extracted {len(code_chunks)} total code chunks so far.")

        # Pause between requests, whether or not the fetch succeeded.
        time.sleep(5)

    if failed_links:
        print("\nThe following links failed to load:")
        print(*failed_links, sep="\n")

    save_code_chunks_to_file(code_chunks)

    pages_loaded = len(page_links) - len(failed_links)
    print(f"\nFinished! Extracted {len(code_chunks)} code chunks from {pages_loaded} pages.")

# Entry point: run the scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Fetching main page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-started-with-r.html
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-started-with-r.html
Extracted 14 total code chunks so far.
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/using-r-installing-packages-and-importingexporting-data.html
Extracted 57 total code chunks so far.
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/basic-data-structures-in-r.html
Extracted 154 total code chunks so far.
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/text-editing-and-data-transformations.html
Extracted 226 total code chunks so far.
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-biological-data-from-public-repositories.html
Extracted 258 total code chunks so far.
Fetching page: https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/basic-statistics-in-r.html
Extracted 325