<a href="https://colab.research.google.com/github/louisdennington-design/decision-tree-dissertation/blob/main/html_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive
# The save paths used later in this notebook live under /content/drive.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Import packages

import os
import json
import re
from bs4 import BeautifulSoup
import requests
from datetime import datetime

In [3]:
# Global parameters

# Scrape URLs
# Every NICE recommendations page shares one URL shape; only the guideline
# code (e.g. cg185) differs, so each constant is built from a single template.
_NICE_RECOMMENDATIONS_URL = 'https://www.nice.org.uk/guidance/{}/chapter/Recommendations'

URL_bipolar = _NICE_RECOMMENDATIONS_URL.format('cg185')
URL_abortion = _NICE_RECOMMENDATIONS_URL.format('ng140')
URL_pneumonia = _NICE_RECOMMENDATIONS_URL.format('ng250')
URL_schizophrenia = _NICE_RECOMMENDATIONS_URL.format('cg178')
URL_borderline_pd = _NICE_RECOMMENDATIONS_URL.format('cg78')
URL_alcohol_use_disorders = _NICE_RECOMMENDATIONS_URL.format('cg115')
URL_depression_in_adults = _NICE_RECOMMENDATIONS_URL.format('ng222')
URL_self_harm = _NICE_RECOMMENDATIONS_URL.format('ng225')
URL_social_anxiety = _NICE_RECOMMENDATIONS_URL.format('cg159')
URL_ocd = _NICE_RECOMMENDATIONS_URL.format('cg31')
URL_eating_disorders = _NICE_RECOMMENDATIONS_URL.format('ng69')
URL_ptsd = _NICE_RECOMMENDATIONS_URL.format('ng116')
URL_gad_and_panic = _NICE_RECOMMENDATIONS_URL.format('cg113')

# Folder and file name for the JSON of raw recommendations cleaned from html
SAVE_PATH = '/content/drive/My Drive/Colab Notebooks/Dissertation/Scrapes'
SAVE_FILE = os.path.join(SAVE_PATH, "guideline_raw.json")

# Create the output folder up front; exist_ok makes this safe to re-run
os.makedirs(SAVE_PATH, exist_ok=True)

In [4]:
# Scrape the text

# Timestamp is taken once so the metadata record and its file name agree
scrape_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

url = URL_bipolar
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors (404, 5xx, ...) instead of silently archiving
# and parsing an error page as if it were the guideline.
response.raise_for_status()
html_scrape = response.text
soup = BeautifulSoup(html_scrape, "html.parser")

# Save scrape with metadata as separate file
# NOTE: 'recommendation_text' holds the full raw HTML of the page, not extracted text.
metadata_compiled = {
    'meta_data': {'url': url, 'scrape_datetime': scrape_datetime},
    'recommendation_text': html_scrape,
}

# Kept inside the main scrape folder so both locations derive from SAVE_PATH
metadata_folder_path = os.path.join(SAVE_PATH, 'Metadata')
os.makedirs(metadata_folder_path, exist_ok=True)
metadata_file_path = os.path.join(metadata_folder_path, f"scrape_metadata_{scrape_datetime}.json")

with open(metadata_file_path, "w", encoding="utf-8") as f:
    json.dump(metadata_compiled, f, ensure_ascii=False, indent=2)

In [5]:
# Processing function
# Reference material 1 https://github.com/awsdataarchitect/medium2dev/blob/main/medium2dev.py
# Reference material 2 https://cozy.computer/scraping-speculative-fiction-short-stories-using-python-and-beautiful-soup-4

def process_html(soup):
    """Flatten a parsed recommendations page into a list of labelled text lines.

    Starting at the first <h3> whose text begins with "1.1", every following
    element is visited in document order and one "LABEL: text" string is
    emitted for each heading, recommendation number, recommendation body,
    <strong> and <li> element. The walk stops at the <h3 class="title">
    whose text is "Terms used in this guideline".

    Args:
        soup: Parsed page (a BeautifulSoup object, or any object exposing the
            same find / next_elements / get / get_text interface).

    Returns:
        list[str]: Lines prefixed with HEADING, SUB_HEADING_1, SUB_HEADING_2,
        REC_NUM, REC_TEXT, PANEL or BULLET. Empty when no "1.1" heading is found.
    """
    raw_recommendations = []

    # Find start point where recommendation headings begin
    start = soup.find("h3", string=lambda s: s and s.strip().startswith("1.1"))
    if start is None:
        return raw_recommendations

    # Date tags such as "[2014]" or "[2014, amended 2021]", for any year, so the
    # same cleaning works for guidelines published or amended in other years.
    date_tag = re.compile(r"\[\d{4}(?:,\s*amended\s+\d{4})?\]")

    def clean(s):
        """Strip date tags and known artefacts, then collapse all whitespace."""
        s = date_tag.sub(" ", s.replace("\xa0", " "))
        # These two literals are kept for backward compatibility. Note that the
        # "PANEL: " prefix and the ", and" joiner are only added after clean()
        # has run, so they will rarely (if ever) match raw page text here.
        s = s.replace("PANEL: or", " ").replace(":, and", ": ")
        # split()/join() also trims both ends, so no extra strip() is needed
        return " ".join(s.split())

    # Capture first h3 before iterating
    raw_recommendations.append(f"HEADING: {clean(start.get_text(' ', strip=True))}")

    # Output label for each heading level that carries the "title" class
    title_labels = {"h3": "HEADING", "h4": "SUB_HEADING_1", "h5": "SUB_HEADING_2"}

    # Iterate through elements by name and class
    for el in start.next_elements:
        name = getattr(el, "name", None)
        if name is None:
            # Nodes without a tag name (e.g. bare text) carry no structure
            continue

        classes = el.get("class", [])
        text = clean(el.get_text(" ", strip=True))
        if not text:
            continue

        if name in title_labels and "title" in classes:
            if name == "h3" and text == "Terms used in this guideline":
                break
            raw_recommendations.append(f"{title_labels[name]}: {text}")

        elif name in ("h4", "h5", "h6") and "recommendation__number" in classes:
            raw_recommendations.append(f"REC_NUM: {text}")

        elif name == "div" and "recommendation__body" in classes:
            # Only the first <p> of the body is captured here; nested <li> and
            # <strong> elements are picked up later in the same walk.
            p = el.find("p")
            if p:
                raw_recommendations.append(f"REC_TEXT: {clean(p.get_text(' ', strip=True))}")

        elif name == "strong":
            raw_recommendations.append(f"PANEL: {text}")

        elif name == "li":
            raw_recommendations.append(f"BULLET: {text}")

    return raw_recommendations

In [6]:
# Transform flat html list object into JSON hierarchical structure

def _append_fragment(rec, fragment, joiner):
    """Append fragment to rec's recommendation text, using joiner only if text already exists."""
    existing = rec["original_recommendation_text"]
    rec["original_recommendation_text"] = f"{existing}{joiner}{fragment}" if existing else fragment


def raw_recommendations_to_json(raw_recommendations):
    """Group flat "LABEL: text" lines into one dict per recommendation.

    HEADING / SUB_HEADING_1 / SUB_HEADING_2 lines update the current heading
    context (a higher-level heading resets the levels below it). Each REC_NUM
    line opens a new record stamped with that context. REC_TEXT, PANEL and
    BULLET lines are appended to the text of the most recent record; any that
    appear before the first REC_NUM after a heading are dropped.

    Args:
        raw_recommendations: Iterable of strings shaped "LABEL: text". Lines
            without a colon are skipped.

    Returns:
        list[dict]: Records with keys heading_1, sub_heading_1, sub_heading_2,
        original_recommendation_number and original_recommendation_text.
    """
    recommendation_data = []

    current_h3 = None
    current_h4 = None
    current_h5 = None
    current_rec = None

    for line in raw_recommendations:
        if ":" not in line:
            continue

        # Split on the first colon only, so colons inside the text survive
        label, text = (part.strip() for part in line.split(":", 1))

        if label == "HEADING":
            current_h3, current_h4, current_h5 = text, None, None
            current_rec = None

        elif label == "SUB_HEADING_1":
            current_h4, current_h5 = text, None
            current_rec = None

        elif label == "SUB_HEADING_2":
            current_h5 = text
            current_rec = None

        elif label == "REC_NUM":
            current_rec = {
                "heading_1": current_h3,
                "sub_heading_1": current_h4,
                "sub_heading_2": current_h5,
                "original_recommendation_number": text,
                "original_recommendation_text": ""
            }
            recommendation_data.append(current_rec)

        elif current_rec is None:
            # Body text with no open recommendation has nowhere to go
            continue

        elif label in ("REC_TEXT", "PANEL"):
            _append_fragment(current_rec, text, " ")

        elif label == "BULLET":
            # The first bullet after a lead-in ending in ":" follows it directly,
            # which avoids the "about:, and diagnosis" artefact.
            follows_colon = current_rec["original_recommendation_text"].endswith(":")
            _append_fragment(current_rec, text, " " if follows_colon else ", and ")

    return recommendation_data

In [7]:
# Run the pipeline: parsed HTML -> flat labelled lines -> hierarchical records
raw_scraped_recommendations = process_html(soup)

recommendations_as_json = raw_recommendations_to_json(raw_scraped_recommendations)

# Show a count and a short preview instead of dumping every record into the output
print(f"Parsed {len(recommendations_as_json)} recommendations")
print(json.dumps(recommendations_as_json[:3], ensure_ascii=False, indent=2))

# Lightweight format validation: warn (without stopping the run) when nothing
# was extracted or when a recommendation number has no accompanying text.
if not recommendations_as_json:
    print("WARNING: no recommendations were extracted - has the page structure changed?")
numbers_without_text = [
    rec["original_recommendation_number"]
    for rec in recommendations_as_json
    if not rec["original_recommendation_text"]
]
if numbers_without_text:
    print(f"WARNING: {len(numbers_without_text)} recommendations have no text: {numbers_without_text}")

# SAVE_FILE already ends in ".json", so it is used as-is
with open(SAVE_FILE, "w", encoding="utf-8") as f:
    json.dump(recommendations_as_json, f, ensure_ascii=False, indent=2)



In [None]:
# Note - scrape checked for recommendation numbers on 27/01/2026 and appears to be capturing all recommendation numbers

# TODO:
# - Decide how to capture the box at 1.5 and how to number it.
# - Stop appending "and" after a colon, e.g., in 1.6: "Give carers written and verbal information in an accessible format about:, and diagnosis and management of bipolar disorder, and positive outcomes and recovery, and types of support for carers, and role of teams and services, and getting help in a crisis"
# - Fix occurrences of "or or".
# - Remove the __NO_SUB_HEADING_1__ placeholder.
# - Long bullet-point lists concatenated into the preceding recommendation make for rather wordy entries (e.g., 1.3.2) -- too verbose?