# In [13]:
import json
import os
import requests
import re
from bs4 import BeautifulSoup
import html

def parse_content(content, slug=None):
    """Split rendered article HTML into one record per ``<h2>`` section.

    A section consists of an ``<h2>`` heading plus every following sibling
    element up to (but not including) the next ``<h2>``. Tables are dropped
    and the remaining markup is reduced to plain text.

    Args:
        content: HTML string of the article body.
        slug: Article slug used to build each section's URL. When omitted,
            the module-level ``slug`` variable is used so that existing
            ``parse_content(content)`` calls keep working.

    Returns:
        A list of dicts, one per ``<h2>``, with the keys ``"title"``
        (heading text), ``"text"`` (plain-text section body) and
        ``"metadataJson"`` (a JSON string holding ``section_url`` in the
        form ``/articles/<slug>/#<anchor>``).

    Raises:
        NameError: If ``slug`` is omitted and no module-level ``slug`` exists.
    """
    if slug is None:
        # Legacy call style: the caller assigns a module-level `slug` before
        # calling. Keep that working, and keep the original exception type
        # when the global is missing.
        try:
            slug = globals()["slug"]
        except KeyError:
            raise NameError(
                "parse_content() needs a slug: pass it as an argument or "
                "define a module-level 'slug'"
            ) from None

    soup = BeautifulSoup(content, 'html.parser')
    parsed_sections = []
    for heading in soup.find_all('h2'):
        fragments = []
        for sibling in heading.find_next_siblings():
            if sibling.name == 'h2':
                break  # The next section starts here
            if sibling.name == 'table':
                # find_all() only searches descendants, so a table that is
                # itself a direct sibling has to be skipped explicitly.
                continue
            for table in sibling.find_all('table'):
                table.decompose()
            fragments.append(str(sibling))

        # get_text() already yields entity-decoded text; running
        # html.unescape() on top of it would decode a second time and
        # corrupt literal entities (the text "&lt;" would become "<").
        section_title = heading.get_text()
        section_content = BeautifulSoup(''.join(fragments), 'html.parser').get_text()

        # Anchor: spaces become hyphens, dots are removed, then lower-cased
        formatted_title = section_title.replace(' ', '-').replace('.', '').lower()
        parsed_sections.append({
            "title": section_title,
            "text": section_content,
            "metadataJson": json.dumps({
                "section_url": f"/articles/{slug}/#{formatted_title}"
            })
        })
    return parsed_sections


# Export every article from the WordPress REST endpoint below into
# jsonExports/<sanitised title>.json, one file per article. Pages are fetched
# in order until the API returns a non-200 status or an empty list.

# Set up WordPress API connection
base_url = "https://dev.stump.works/cltudo/wp-json/wp/v2/post-type/articles/"
# Ensure jsonExports directory exists
os.makedirs('jsonExports', exist_ok=True)

page = 1
while True:
    # requests never times out by default; without a timeout an unresponsive
    # server would hang the export indefinitely.
    response = requests.get(base_url, params={"page": page}, timeout=30)
    if response.status_code != 200:
        # Requesting past the last page is expected to end up here, but so
        # does any real error, so report the status instead of stopping silently.
        print(f"Stopping at page {page}: HTTP {response.status_code}")
        break
    data = response.json()  # Parse response body as JSON
    if not data:
        break  # Exit loop if no more data

    # Loop through each article in the response data
    for article in data:
        # Extract relevant data from article
        title = html.unescape(article['title']['rendered'])  # Raw API string, so entities need decoding
        # NOTE: `slug` must remain a module-level name; parse_content() reads it
        # when called without an explicit slug.
        slug = article['slug']  # Assuming the 'slug' field exists
        content = article['content']['rendered']

        # Parse content into sections
        sections = parse_content(content)

        # Format document data
        document_data = {
            "title": title,
            "metadataJson": json.dumps({"base_url": slug}),
            "section": sections
        }

        # Create a valid filename from the title.
        # NOTE(review): titles that differ only in non-alphanumeric characters
        # map to the same filename and overwrite each other.
        filename = re.sub(r'[^a-zA-Z0-9]', '_', title) + '.json'

        # Write document data to JSON file in jsonExports folder
        with open(os.path.join('jsonExports', filename), 'w', encoding='utf-8') as outfile:
            json.dump(document_data, outfile)

    page += 1  # Increment page number for next iteration
