In [1]:
!pip3 install pdfplumber --quiet
!pip3 install pymupdf --quiet

In [2]:
import pdfplumber
from pdfplumber.table import TableFinder

import re
import os
import pandas as pd
import logging
import csv

import ipywidgets as widgets
from IPython.display import display

import fitz

In [None]:
def extract_nice_recommendations_to_csv(pdf_path, output_csv="nice_guidelines_final.csv"):
    """Extract numbered recommendations and tables from a guideline PDF into a CSV.

    The PDF is read page by page with pdfplumber. A line that starts with a
    dotted section number of at least three components (e.g. ``1.2.3``) opens
    a new record; the lines that follow are appended to it until the next
    section, appendix, research heading or skip heading. Tables found on a
    page are attached to the section that is flushed next and are written as
    separate rows whose text starts with ``[TABLE n]``.

    Rows are appended to ``output_csv`` if it already exists (running the
    function twice on the same PDF therefore duplicates its rows).

    Args:
        pdf_path: Path to the PDF. The guideline id is the part of the file
            name before the first ``-``, upper-cased.
        output_csv: CSV file to create or extend. Columns are
            ``nice_guideline_id``, ``section_id`` and ``nice_guideline_text``.

    Returns:
        ``output_csv``, unchanged.
    """
    logger = logging.getLogger(__name__)
    output_columns = ["nice_guideline_id", "section_id", "nice_guideline_text"]

    filename = os.path.basename(pdf_path)
    basename = os.path.splitext(filename)[0]
    parts = basename.split("-", 1)
    guidance_id = parts[0].upper()

    # "1.2.3 Text" style headings (three or more numeric components).
    section_pattern = re.compile(r"^((?:\d+\.){2,}\d+)\s+(.*)")
    appendix_pattern = re.compile(r"^(Appendix\s+[A-Z])\b", re.I)
    # A line consisting only of a two-component number such as "1.2".
    skip_pattern = re.compile(r"^\d+\.\d+\s*$")

    # --- parser state -----------------------------------------------------
    results = []
    current_section_id = None
    current_text_lines = []
    in_research_section = False
    in_skip_section = False
    table_counter = 1
    pending_table_text = None
    in_appendix = False
    in_appendix_intro = False

    seen_section_ids = set()

    last_numbered_section_id = None
    carry_forward_tables = False

    # Sections whose heading mentions "table N": only their table is kept.
    table_only_sections = set()

    # Characters treated as bullets / list dashes by clean_text(). The
    # trailing characters of bullet_chars look like mojibake of a bullet and
    # are kept as-is. NOTE(review): confirm they are still needed.
    bullet_chars = r"\u2022\u2023\u25E6\u2043\u2219Ä¢â€¢"
    dash_chars = "－"

    # Text from the first occurrence of any of these onwards is cut from a
    # recommendation (see trim_meta).
    meta_markers = [
        "for a short explanation", "rationale and impact",
        "committee discussion", "evidence review",
        "update information", "accreditation",
    ]

    # A line containing one of these switches the parser into "research"
    # mode, in which nothing is collected until the next section heading.
    research_markers = [
        "recommendations for research", "research recommendations",
        "other recommendations for research",
        "the guideline committee has made the following recommendations for research",
    ]

    # A line containing one of these switches the parser into "skip" mode.
    # NOTE(review): matching is by substring anywhere in the line, so short
    # markers such as "context" or "introduction" can also fire inside
    # ordinary recommendation text — confirm this is intended.
    skip_markers = [
        "terms used in this guideline", "terms and definitions",
        "notes on the scope of the guidance", "scope of this guidance",
        "scope of the guideline", "what this guideline covers",
        "how this guideline was developed",
        "what this guidance covers", "other versions of this guideline",
        "related nice guidance", "updating the guideline",
        "about this guideline", "glossary", "introduction",
        "woman-centred care", "key priorities for implementation",
        "this guidance is an update", "contents", "your responsibility",
        "overview", "who is it for?", "finding more information", "context",
        "for recommendations on",
    ]

    # Tables whose text contains one of these are dropped.
    table_skip_markers = [
        "how this guideline was developed",
    ]

    # Cross-references such as "see recommendation 1.2.3"; such lines must
    # not be mistaken for a new section heading.
    recommendation_ref_cue = re.compile(
        r"\b(see|refer(?:\s+to)?|as per|for more information(?:\s+see)?)\s+recommendation(s)?\b",
        re.IGNORECASE
    )

    def dedouble_word(word):
        """Collapse doubled characters in words that look fully doubled.

        A word longer than three characters in which at least half of the
        positions repeat the previous character (e.g. "TTaabbllee") has each
        doubled pair reduced to a single character; other words are
        returned unchanged.
        """
        doubles = sum(1 for i in range(1, len(word)) if word[i] == word[i - 1])
        if len(word) > 3 and doubles >= len(word) // 2:
            return re.sub(r"(.)\1", r"\1", word)
        return word

    def clean_text(text):
        """Normalise a recommendation: strip dated tags, bullets, commas, spacing."""
        # Remove bracketed tags that contain a four-digit number, e.g. "[2008]".
        text = re.sub(r"\[.*?\d{4}.*?\]", "", text)
        # A dash/bullet directly after ':' or ';' is dropped; elsewhere a dash
        # becomes ", " and a bullet becomes "; ".
        text = re.sub(r"([:;])\s*[" + dash_chars + r"]+", r"\1", text)
        text = re.sub(r"\s*[" + dash_chars + r"]+\s*", ", ", text)
        text = re.sub(r"([:;])\s*[" + bullet_chars + r"]+", r"\1", text)
        text = re.sub(r"\s*[" + bullet_chars + r"]+\s*", "; ", text)
        # All commas (including those just inserted for dashes) become ';'.
        text = text.replace("\t", " ").replace(",", ";")
        return " ".join(dedouble_word(w) for w in text.split()).strip()

    def trim_meta(text):
        """Cut ``text`` at the first meta marker (in list order) that occurs in it."""
        low = text.lower()
        for marker in meta_markers:
            idx = low.find(marker)
            if idx != -1:
                return text[:idx].strip()
        return text

    def extract_tables_from_page(page, page_number=None):
        """Return the page's tables as "cell | cell" rows, or None if there are none.

        Extraction is best-effort: a failure is logged and whatever tables
        were converted before the failure are still returned.
        """
        table_texts = []
        try:
            tables = page.extract_tables()
            for table in tables:
                rows = []
                for row in table:
                    row_cleaned = [
                        " ".join(dedouble_word(w) for w in (cell or "").split())
                        for cell in row
                    ]
                    rows.append(" | ".join(row_cleaned))
                table_texts.append("\n".join(rows))
        except Exception as exc:
            # Previously swallowed silently; keep going but leave a trace.
            logger.warning(
                "Table extraction failed on page %s of %s: %s",
                page_number, filename, exc,
            )
        return "\n\n".join(table_texts) if table_texts else None

    def _is_skip_table(txt: str) -> bool:
        """True if the table text contains one of ``table_skip_markers``."""
        low = txt.lower()
        return any(marker in low for marker in table_skip_markers)

    def flush_section():
        """Emit the current section (text row, then table row) and reset it.

        Nothing is emitted while in research or skip mode; in that case the
        pending table text is left in place.
        NOTE(review): a table pending during a skipped section is therefore
        attached to the next section that is emitted — confirm intended.
        """
        nonlocal current_section_id, current_text_lines, pending_table_text
        nonlocal table_counter, in_appendix, in_appendix_intro

        if current_section_id and not in_research_section and not in_skip_section:
            # Table-only sections emit no text row when a table is pending.
            if current_text_lines and not (current_section_id in table_only_sections and pending_table_text):
                full_text = clean_text(" ".join(current_text_lines))
                full_text = trim_meta(full_text)
                results.append(
                    {
                        "nice_guideline_id": guidance_id,
                        "section_id": current_section_id,
                        "nice_guideline_text": full_text,
                    }
                )
            if pending_table_text and not _is_skip_table(pending_table_text):
                results.append(
                    {
                        "nice_guideline_id": guidance_id,
                        "section_id": current_section_id,
                        "nice_guideline_text": f"[TABLE {table_counter}]\n{pending_table_text.strip()}",
                    }
                )
                table_counter += 1
            pending_table_text = None

        current_section_id = None
        current_text_lines = []
        in_appendix = False
        in_appendix_intro = False

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Tables are collected before the page's text lines are parsed.
            page_tables = extract_tables_from_page(page, page_num)
            if page_tables:
                # A "table N" section was flushed before its table appeared:
                # re-open it so the table is attributed to that section.
                if current_section_id is None and carry_forward_tables and last_numbered_section_id:
                    current_section_id = last_numbered_section_id
                    current_text_lines = []
                    in_research_section = False
                    in_skip_section = False
                    in_appendix = False
                    in_appendix_intro = False
                if pending_table_text:
                    pending_table_text += "\n" + page_tables
                else:
                    pending_table_text = page_tables
                carry_forward_tables = False

            text = page.extract_text()
            if not text:
                continue

            lines = text.split("\n")
            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Page furniture: footers, copyright/rights notices, bare
                # "1.2" numbers and ISBN lines.
                if (
                    re.search(r"Page \d+ of \d+", line)
                    or re.search(r"^© NICE", line)
                    or "Subject to" in line
                    or "conditions#notice-of-rights" in line
                    or skip_pattern.match(line)
                    or line.lower().startswith("isbn")
                ):
                    continue

                if any(marker in line.lower() for marker in research_markers):
                    if current_section_id:
                        flush_section()
                    in_research_section = True
                    in_skip_section = False
                    continue

                if any(marker in line.lower() for marker in skip_markers):
                    if current_section_id:
                        flush_section()
                    in_skip_section = True
                    in_research_section = False
                    continue

                # Cross-reference lines are body text, never headings.
                if recommendation_ref_cue.search(line):
                    if current_section_id and not in_research_section and not in_skip_section and not in_appendix:
                        if current_section_id not in table_only_sections:
                            current_text_lines.append(line)
                    continue

                match = section_pattern.match(line)
                if match:
                    sec_id = match.group(1)
                    sec_title = match.group(2)

                    # A section id seen before is treated as body text of
                    # the current section, not as a new heading.
                    if sec_id in seen_section_ids:
                        if current_section_id and not in_research_section and not in_skip_section and not in_appendix:
                            if current_section_id not in table_only_sections:
                                current_text_lines.append(line)
                        continue

                    flush_section()
                    current_section_id = sec_id

                    if re.search(r"\btable\s+\d+\b", sec_title.lower()):
                        current_text_lines = []
                        table_only_sections.add(sec_id)
                        carry_forward_tables = True
                    else:
                        current_text_lines = [sec_title]
                        carry_forward_tables = False

                    in_research_section = False
                    in_skip_section = False
                    in_appendix = False
                    in_appendix_intro = False
                    seen_section_ids.add(sec_id)

                    last_numbered_section_id = sec_id
                    continue

                appendix_match = appendix_pattern.match(line)
                if appendix_match:
                    flush_section()
                    current_section_id = appendix_match.group(1)
                    current_text_lines = []
                    in_research_section = False
                    in_skip_section = False
                    in_appendix = True
                    in_appendix_intro = True
                    pending_table_text = None
                    continue

                # Appendix body lines are never collected; only tables that
                # follow the appendix heading are kept.
                if in_appendix and in_appendix_intro:
                    continue

                if current_section_id and not in_research_section and not in_skip_section:
                    if current_section_id in table_only_sections:
                        continue
                    current_text_lines.append(line)

        # Emit whatever section is still open at end of document. (The former
        # follow-up "appendix only" flush could never run because this call
        # always clears in_appendix, so it has been removed.)
        flush_section()

    # Fixed columns guarantee a header row even when nothing was extracted, so
    # the file stays readable by the next call.
    df = pd.DataFrame(results, columns=output_columns)
    if os.path.exists(output_csv):
        try:
            df_existing = pd.read_csv(output_csv)
        except pd.errors.EmptyDataError:
            # Blank / header-less file (e.g. left by an earlier empty run).
            df_existing = pd.DataFrame(columns=output_columns)
        frames = [frame for frame in (df_existing, df) if not frame.empty]
        if frames:
            df = pd.concat(frames, ignore_index=True)

    df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL, escapechar="\\")
    print(f"Extracted {len(results)} recommendations from {guidance_id}")
    print(f"Saved to: {output_csv}")
    return output_csv

In [None]:
# Multi-file picker restricted to PDFs; process_uploaded_files() reads the
# selected files from this widget's .value.
uploader = widgets.FileUpload(multiple=True, accept='.pdf')
display(uploader)

def process_uploaded_files(upload_widget=None, dest_dir=None):
    """Save every uploaded PDF to disk and run the extractor on it.

    Args:
        upload_widget: Object whose ``.value`` holds the uploads. Defaults to
            the notebook-level ``uploader`` widget. Both the sequence-of-dicts
            form (``{'name': ..., 'content': ...}`` per file) and the older
            ``{name: {'content': ...}}`` mapping form are accepted.
        dest_dir: Directory the PDFs are written to; created if missing.
            Defaults to ``~/Documents/nhs-trends-ucl/nice_guidelines/``.
    """
    if upload_widget is None:
        upload_widget = uploader
    if dest_dir is None:
        dest_dir = os.path.expanduser('~/Documents/nhs-trends-ucl/nice_guidelines/')
    # The target folder may not exist on a fresh machine.
    os.makedirs(dest_dir, exist_ok=True)

    uploaded = upload_widget.value
    if isinstance(uploaded, dict):
        # Older mapping form: the file name is the key.
        uploaded = [
            {'name': key, 'content': entry['content']}
            for key, entry in uploaded.items()
        ]

    for file_info in uploaded:
        # The name comes from the browser: keep only its final component so it
        # cannot point outside dest_dir.
        name = os.path.basename(str(file_info['name']).replace("\\", "/"))
        if not name:
            continue
        content = file_info['content']

        pdf_path = os.path.join(dest_dir, name)
        with open(pdf_path, 'wb') as f:
            f.write(content)

        print(f"> Processing: {name}")
        extract_nice_recommendations_to_csv(pdf_path)

FileUpload(value=(), accept='.pdf', description='Upload', multiple=True)

In [5]:
# Only let ERROR-and-above records through from the "pdfminer" logger so the
# processing log below stays readable, then process the uploaded files.
pdfminer_log = logging.getLogger("pdfminer")
pdfminer_log.setLevel(logging.ERROR)
process_uploaded_files()

> Processing: cg62-antenatal-care-for-uncomplicated-pregnancies.pdf
Extracted 143 recommendations from CG62
Saved to: nice_guidelines_final.csv
> Processing: cg70-induction-of-labour.pdf
Extracted 80 recommendations from CG70
Saved to: nice_guidelines_final.csv
> Processing: cg107-hypertension-in-pregnancy.pdf
Extracted 166 recommendations from CG107
Saved to: nice_guidelines_final.csv
> Processing: cg190-intrapartum-care-for-healthy-women-and-babies.pdf
Extracted 319 recommendations from CG190
Saved to: nice_guidelines_final.csv
> Processing: ng3-diabetes-in-pregnancy-management-from-preconception-to-the-postnatal-period.pdf
Extracted 138 recommendations from NG3
Saved to: nice_guidelines_final.csv
> Processing: ng25-preterm-labour-and-birth.pdf
Extracted 69 recommendations from NG25
Saved to: nice_guidelines_final.csv
> Processing: ng51-suspected-sepsis-recognition-diagnosis-and-early-management.pdf
Extracted 171 recommendations from NG51
Saved to: nice_guidelines_final.csv
> Process