# Step 1: Get Page Links

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin

In [2]:
url = "https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-started-with-r.html"

base_url = url.rsplit("/", 1)[0] + "/"  

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

page_links = []
for a_tag in soup.find_all("a", href=True):  
    href = a_tag["href"].strip() 
    
    if href.startswith("http"): 
        page_links.append(href)
    elif href.startswith("/"): 
        full_url = urljoin(url, href)  
        page_links.append(full_url)
    elif href.endswith(".html"): 
        full_url = urljoin(base_url, href)
        page_links.append(full_url)

page_links = page_links[:32]

# Step 2: Get Page Data
# Step 3: Gather url, section, subsection, and code chunks

In [3]:
import requests
from bs4 import BeautifulSoup
import json

code_chunks = []

for link in page_links:
    url = link
    
    # Fetch the webpage
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    current_section = None
    current_subsection = None

    for element in soup.find_all(["h1", "h2", "div"]):
        if element.name == "h1":
            current_section = element.get_text()
            current_subsection = None
        if element.name == "h2":
            current_subsection = element.get_text()
        elif element.name == "div" and "sourceCode" in element.get("class", []):
            code_chunk = element.get_text()
            if code_chunk: 
                code_chunks.append({
                    "url": url,
                    "section": current_section,
                    "subsection": current_subsection,
                    "code": code_chunk
                })
    
    print(f"Extracting data from url: ", {link}, "\n")
    print(f"Number of Code Chunks: ", {len(code_chunks)}, "\n")
    
    time.sleep(5)

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-started-with-r.html'} 

Number of Code Chunks:  {14} 

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/using-r-installing-packages-and-importingexporting-data.html'} 

Number of Code Chunks:  {57} 

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/basic-data-structures-in-r.html'} 

Number of Code Chunks:  {154} 

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/text-editing-and-data-transformations.html'} 

Number of Code Chunks:  {226} 

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/getting-biological-data-from-public-repositories.html'} 

Number of Code Chunks:  {258} 

Extracting data from url:  {'https://corytophanes.github.io/BIO_BIT_Bioinformatics_209/basic-statistics-in-r.html'} 

Number of Code Chunks:  {325} 

Extracting data fro

# Step 4: Save JSON file

In [5]:
with open("bioinformatics_workshop_gitbook.json", "w", encoding="utf-8") as f:
    json.dump(code_chunks, f, indent=4)