<a href="https://colab.research.google.com/github/louisdennington-design/decision-tree-dissertation/blob/main/html_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive so later cells can write under /content/drive

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'

# force_remount=True re-establishes the mount even when one is already active
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [2]:
# Import packages

import os
import json
import re
from bs4 import BeautifulSoup
import requests

In [3]:
# Global parameters

# Every guideline lives at the same address apart from its guidance code
NICE_RECOMMENDATIONS_URL = 'https://www.nice.org.uk/guidance/{code}/chapter/Recommendations'

URL_bipolar = NICE_RECOMMENDATIONS_URL.format(code='cg185')
URL_abortion = NICE_RECOMMENDATIONS_URL.format(code='ng140')
URL_pneumonia = NICE_RECOMMENDATIONS_URL.format(code='ng250')

# Output folder on the mounted Google Drive and the JSON file written inside it
SAVE_PATH = '/content/drive/My Drive/Colab Notebooks/Dissertation/Scrapes'
SAVE_FILE = os.path.join(SAVE_PATH, "guideline_raw.json")

# Create the folder up front; exist_ok=True makes re-running this cell harmless
os.makedirs(SAVE_PATH, exist_ok=True)

In [4]:
# Needs incorporation into functions below or deletion?

from dataclasses import dataclass

@dataclass
class recommendation_entity:
    """
    Record format intended for one recommendation scraped from the guideline HTML.

    Not referenced by the other cells visible in this notebook: process_html
    currently returns plain dicts rather than instances of this class.
    """
    # Identifier of the recommendation.
    # NOTE(review): process_html keeps recommendation numbers as strings such as "1.1.1" - confirm int is intended
    recommendation_id: int
    # Heading the recommendation sits under
    parent_heading: str
    # Text of the recommendation itself
    recommendation_text: str
    recommendation_url: str # Sometimes a bullet point incorporates a hyperlink to refer to earlier guidance

# If needing to force a finite list in one of the class fields

from enum import Enum

class types_of_thing(Enum):
    """
    Template enumeration for restricting a class field to a fixed set of values.

    To use it:
    1. Adjust the name
    2. Add it as a field, e.g., types_of_thing: types_of_thing
    """
    # Placeholder members - replace with the real permitted values
    THIS = "this"
    THAT = "that"
    OR_THIS = "or_this"

In [5]:
# Scrape the text

url = URL_bipolar

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were the guideline
response = requests.get(url, timeout=30)
response.raise_for_status()
html_scrape = response.text

soup = BeautifulSoup(html_scrape, "html.parser")

#print(soup.prettify())

# Save scrape with metadata?

In [6]:
# Processing function

# FIXES NEEDED
## Box on Valproate (<div class="panel panel--primary">) should be included
## Check how / whether headings of boxes ("strong") and some sub-sections (e.g., "Starting antipsychotic medication") are being ingested properly

"""
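process_html walks the scraped page from the first h3 heading that starts with "1.1"
and returns a list with one dict per recommendation it finds.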
"""

# Bracketed year tags: "[2014]" and variants like "[new 2014]" or "[2006, amended 2014]"
_DATE_TAG_PATTERN = re.compile(r"\[(?:new\s+)?\d{4}(?:\s*,\s*(?:amended\s+)?\d{4})*\]")

# Text that starts with a dotted recommendation number such as "1.1" or "1.10.3"
_RECOMMENDATION_NUMBER_PATTERN = re.compile(r"^\d+(\.\d+)+")


def _clean_text(text):
    """Replace non-breaking spaces, remove bracketed year tags and collapse whitespace."""
    without_tags = _DATE_TAG_PATTERN.sub("", text.replace("\xa0", " "))
    return " ".join(without_tags.split())


def _join_recommendation_text(lead_in, bullet_points):
    """Combine a lead-in sentence with its bullet points; either part may be empty."""
    bullets = ", and ".join(bullet_points)
    return f"{lead_in} {bullets}".strip()


def _collect_unwrapped_recommendation(number_tag):
    """Gather the text that follows a recommendation number sitting outside an <article>."""
    bullet_points = []
    paragraphs = []

    for following in number_tag.next_elements:
        if getattr(following, "name", None) is None:
            continue

        if following.name in ("h3", "h4", "h5", "h6"):
            # Stop at anything the main loop would treat as a new heading or a new
            # recommendation, so text is never attributed to two records
            following_text = _clean_text(following.get_text(strip=True))
            if ("title" in following.get("class", [])
                    or _RECOMMENDATION_NUMBER_PATTERN.match(following_text)):
                break

        if following.name == "li":
            bullet_points.append(_clean_text(following.get_text(" ", strip=True)))
        elif following.name == "p" and following.find_parent("li") is None:
            # A <p> nested in an <li> is already captured through the <li> itself
            paragraphs.append(_clean_text(following.get_text(" ", strip=True)))

    lead_in = " ".join(paragraphs).strip()
    return _join_recommendation_text(lead_in, bullet_points)


def process_html(soup):
    """
    Extract the recommendations from a parsed guideline page.

    Walks every element after the first <h3> whose text starts with "1.1",
    tracking the current headings and emitting one record per recommendation.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the recommendations page.

    Returns
    -------
    list of dict
        One dict per recommendation, each with the keys "heading_1",
        "sub_heading_1", "sub_heading_2", "original_recommendation_number" and
        "original_recommendation_text". Heading values are None until a
        matching heading has been seen. An empty list is returned (and a
        message printed) when no starting <h3> is found.
    """
    raw_guideline_data = []
    sub_heading_1 = None
    sub_heading_2 = None

    start = soup.find("h3", string=lambda s: s and s.strip().startswith("1.1"))

    if start is None:
        print("Start point specified in function does not match page formatting.")
        return raw_guideline_data

    current_heading = start.get_text(strip=True)

    for element in start.next_elements:
        # Skip anything without a tag name (e.g. bare text between tags)
        if getattr(element, "name", None) is None:
            continue

        classes = element.get("class", [])

        # Identifying elements with standard formatting

        if element.name == "h3" and "title" in classes:
            # A new top-level heading resets both sub-heading levels
            current_heading = element.get_text(strip=True)
            sub_heading_1 = None
            sub_heading_2 = None

        elif element.name == "h4" and "title" in classes:
            sub_heading_1 = element.get_text(strip=True)
            sub_heading_2 = None

        elif element.name == "h5" and "title" in classes:
            sub_heading_2 = element.get_text(strip=True)

        elif element.name == "strong":
            # Bold text inside a recommendation body is emphasis, not a heading;
            # letting it through would mislabel every following recommendation
            if element.find_parent("article", class_="recommendation") is None:
                sub_heading_2 = element.get_text(strip=True)

        elif element.name == "article" and "recommendation" in classes:
            number_tag = element.find(["h4", "h5", "h6"], class_="recommendation__number")
            body_tag = element.find("div", class_="recommendation__body")

            if number_tag is None or body_tag is None:
                continue

            recommendation_number = number_tag.get_text(strip=True)

            bullet_points = [
                _clean_text(li.get_text(" ", strip=True))
                for li in body_tag.select("ul.itemizedlist li")]

            if bullet_points:
                lead_p = body_tag.find("p")
                lead_in = _clean_text(lead_p.get_text(" ", strip=True)) if lead_p is not None else ""
                recommendation_text = _join_recommendation_text(lead_in, bullet_points)
            else:
                recommendation_text = _clean_text(body_tag.get_text(" ", strip=True))

            raw_guideline_data.append({
                "heading_1": current_heading,
                "sub_heading_1": sub_heading_1,
                "sub_heading_2": sub_heading_2,
                "original_recommendation_number": recommendation_number,
                "original_recommendation_text": recommendation_text
            })

        # Secondary elif for unusually formatted headings

        elif (element.name in ("h4", "h5", "h6")
              and _RECOMMENDATION_NUMBER_PATTERN.match(_clean_text(element.get_text(strip=True)))
              and element.find_parent("article", class_="recommendation") is None):

            # Same keys as the standard branch so every record shares one schema
            raw_guideline_data.append({
                "heading_1": current_heading,
                "sub_heading_1": sub_heading_1,
                "sub_heading_2": sub_heading_2,
                "original_recommendation_number": element.get_text(strip=True),
                "original_recommendation_text": _collect_unwrapped_recommendation(element)
            })

    return raw_guideline_data

In [7]:
raw_guideline_data = process_html(soup)

print(raw_guideline_data)

# SAVE_FILE already ends in ".json"; the replace only has an effect if it is ever given a ".txt" name
output_path = SAVE_FILE.replace(".txt", ".json")

# The with-block closes the file handle even if json.dump raises part-way through
with open(output_path, "w", encoding="utf-8") as output_file:
    json.dump(raw_guideline_data, output_file, ensure_ascii=False, indent=2)

# Save guideline data with metadata ?

# Include format validation?



In [None]:
# Validation of JSON structure

## Use "Json Cleaner"?

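# Planned checks (not yet implemented):
# - every key is ONLY one of the expected list, e.g. ['heading_1', etc]
# - every field is either a string or None (to confirm), and never an empty string
# - every recommendation-number field contains a number in recommendation format (1.1 etc)
# - every recommendation-text field contains a string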
