<a href="https://colab.research.google.com/github/louisdennington-design/decision-tree-dissertation/blob/main/html_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import re

In [19]:
# Global parameters

URL_abortion = 'https://www.nice.org.uk/guidance/ng140/chapter/Recommendations'
URL_pneumonia = 'https://www.nice.org.uk/guidance/ng250/chapter/Recommendations'
URL_bipolar = 'https://www.nice.org.uk/guidance/cg185/chapter/Recommendations'

# regex indicating start of relevant text
# regex indicating end of relevant text

In [20]:
from dataclasses import dataclass

@dataclass
class recommendation_entity:
    """
    Defines the format that text is stored in when scraped from the html
    """
    recommendation_id: int
    parent_heading: str
    recommendation_text: str
    recommendation_url: str # Sometimes a bullet point incorporates a hyperlink to refer to earlier guidance

# If needing to force a finite list in one of the class fields

from enum import Enum

class types_of_thing(Enum):
    """
    1. Adjust the name
    2. Add it as a field, e.g., types_of_thing: types_of_thing
    """
    THIS = "this"
    THAT = "that"
    OR_THIS = "or_this"

In [25]:
# Scrape the text

from bs4 import BeautifulSoup
import requests

url = URL_abortion

html_scrape = requests.get(url).text

soup = BeautifulSoup(html_scrape, "html.parser")

# print(soup.prettify())

In [37]:
# Processing function

"""
- begin at <div class="section" title="1.1
- end before "Terms used in this guideline"
- store every <div class="recommendation__body">
- store the parent heading <h3 class="title" id="">, sub-heading (if exists) <h4 class="title" id="">, and sub-sub-heading <h5 class="title" id=""
- collapse bullet points into recommendation:

        <ul class="itemizedlist indented">
            <li class="listitem">

- exclude <div class="panel">
- save <div class="panel panel--primary"> as separate text to be accessed later if needed
- if link occurs within <div class="section">, as <a class="link" href="", then extract the URL

- if doing the Pneumonia guideline, will also need to think about how to digest <div class="informaltable">
"""

def process_html(soup):

    raw_guideline_data = []

    current_heading = None
    sub_heading_1 = None
    sub_heading_2 = None

    start = soup.find("h3", string = lambda s: s and s.strip().startswith("1.1"))
    if start is None:
        print("Start point specified in function does not match page formatting.")
        return raw_guideline_data

    for element in start.next_elements:

        if element.name == "h3" and "title" in element.get("class", []):
            current_heading = element.get_text(strip=True)
            sub_heading_1 = None
            sub_heading_2 = None

        elif element.name == "h4" and "title" in element.get("class", []):
            sub_heading_1 = element.get_text(strip=True)
            sub_heading_2 = None

        elif element.name == "h5" and "title" in element.get("class", []):
            sub_heading_2 = element.get_text(strip=True)


        elif element.name == "div" and "recommendation_body" in element.get("class", []):
            recommendation_text = element.get_text(" ", strip = True)

            bullet_points = []
            bullet_list = element.find("ul", class_="itemizedlist")
            if bullet_list:
                for li in bullet_list.find_all("li"):
                    bullet_points.append(li.get_text(" ", strip = True))

            raw_guideline_data.append({
                "heading_1": current_heading,
                "sub_heading_1": sub_heading_1,
                "sub_heading_2": sub_heading_2,
                "text": recommendation_text,
                "bullet_points": bullet_points})

    return raw_guideline_data

raw_guideline_data = process_html(soup)

print(raw_guideline_data)

[]
