In [9]:
import requests
import bs4
import pandas as pd
from copy import deepcopy
import os

In [10]:
# Use the same certificate path as your Claude setup
CERT_PATH = '/opt/homebrew/etc/openssl@3/cert.pem'

# Verify the certificate file exists, fallback to system default if not
if os.path.exists(CERT_PATH):
    verify_setting = CERT_PATH
    print(f"Using MITRE certificates at: {CERT_PATH}")
else:
    verify_setting = True
    print("MITRE cert file not found, using system certificates")

Using MITRE certificates at: /opt/homebrew/etc/openssl@3/cert.pem


In [11]:
xlf = pd.ExcelFile("hl7.fhir.uv.subscriptions_1.1.0_requirements.xlsx")
df = pd.read_excel(xlf, "Requirements")


  warn(msg)


In [12]:
df.to_csv("hl7.fhir.uv.subscriptions_1.1.0_requirements_with_text.csv")

In [13]:
section_dict_with_negatives = {}
for i, x in enumerate(df["URL*"]):
    url = x.split("#")
    if len(url) == 1:
        url.append('root')
    print("\n", url, f"{i+1} of {df.shape[0]};", end=" ")
    
    if not (url[1] in section_dict_with_negatives.get(url[0], {})):
        url_dict = section_dict_with_negatives.get(url[0], {})
        
        try:
            # Use the MITRE certificate configuration
            response = requests.get(url[0], verify=verify_setting, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            
            if url[1] == "root":
                element = soup.find("h2", id=url[1])
                sum_text = element.text if element else ""
            elif url[1] == "summary":
                element = soup.find("h2", id="root")
                sum_text = element.text if element else ""
            else:
                i = 0
                s = soup.find(["h3", "h4", "h5"], id=url[1])
                
                while (i < 99) and (s is not None) and (hasattr(s, 'name')) and (s.name != "script"):
                    print(i+1, end=", ")
                    if i > 0:
                        url[1] = s.get("id", url[1])
                    done = False
                    sum_text = ""
                    
                    current = s.next_sibling
                    while not done and current is not None:
                        if hasattr(current, 'text'):
                            sum_text += current.text
                        if hasattr(current, 'name') and current.name in ["h3", "h4", "h5", "script"]:
                            done = True
                        current = current.next_sibling
                        
                        # Safety check to prevent infinite loops
                        if not current:
                            done = True
                            
                    url_dict[url[1]] = sum_text
                    i += 1
            
            if url[1] not in url_dict:
                url_dict[url[1]] = sum_text if 'sum_text' in locals() else ""
            section_dict_with_negatives[url[0]] = url_dict
            
        except requests.exceptions.SSLError as e:
            print(f"SSL Error for {url[0]}: {e}")
            print("Try running the MITRE certificate installation script first")
            # Continue with empty text
            url_dict[url[1]] = ""
            section_dict_with_negatives[url[0]] = url_dict
        except requests.exceptions.RequestException as e:
            print(f"Request Error for {url[0]}: {e}")
            # Continue with empty text
            url_dict[url[1]] = ""
            section_dict_with_negatives[url[0]] = url_dict
        except Exception as e:
            print(f"General Error for {url[0]}: {e}")
            # Continue with empty text
            url_dict[url[1]] = ""
            section_dict_with_negatives[url[0]] = url_dict
    else:
        print("cached", end=" ")


 ['https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html', 'subscription-topics-in-r4'] 1 of 138; 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 
 ['https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html', 'subscription-topics-in-r4'] 2 of 138; cached 
 ['https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html', 'subscription-topics-in-r4'] 3 of 138; cached 
 ['https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html', 'subscriptions'] 4 of 138; 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 

In [14]:
negatives = {}
for key, val in section_dict_with_negatives.items():
    negatives[key] = list(val.keys())
for x in df["URL*"]:
    url = x.split("#")
    if len(url) == 1:
        url.append('root')
    if url[1] in negatives[url[0]]:
        negatives[url[0]].pop(negatives[url[0]].index(url[1]))
    


In [15]:
for k,v in section_dict.items():
    print(k, v.keys())

In [16]:
negatives

{'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/workflow.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/channels.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/notifications.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/payloads.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/conformance.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/errors.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/StructureDefinition-backport-subscription-definitions.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/StructureDefinition-backport-subscription-status-r4-definitions.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/OperationDefinition-backport-subscription-status.html': [],
 'https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/CapabilityStatement-backport-subscri

In [17]:
l = ['a','b','c']
l.pop(l.index('b'))
print(l)

['a', 'c']


In [18]:

for x in df["URL*"]:
    print(x)
    break

https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/components.html#subscription-topics-in-r4


In [19]:
r = requests.get("https://hl7.org/fhir/uv/subscriptions-backport/STU1.1/channels.html")
soup = bs4.BeautifulSoup(r.text)
col12 = soup.find_all("div", class_="col-12")
col = col12[0]
clist = list(col.children)

SSLError: HTTPSConnectionPool(host='hl7.org', port=443): Max retries exceeded with url: /fhir/uv/subscriptions-backport/STU1.1/channels.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))

In [20]:
col.find_all("")

NameError: name 'col' is not defined

In [21]:
for child in clist[:21]:
    print(type(child))
    print(child)
    print(f"text: {child.text.strip()}")
    print("_"*88)

NameError: name 'clist' is not defined

In [None]:
current_h = False
sec_to_text = {}
for child in clist:
    if type(child) == bs4.element.Tag:
        print(child)
        print(child.name)

<p id="publish-box">This page is part of the Subscriptions R5 Backport (v1.1.0: <a href="https://confluence.hl7.org/display/HL7/HL7+Balloting" title="Standard for Trial-Use">STU</a> 1.1) based on <a href="http://hl7.org/fhir/4.3.0">FHIR v4.3.0</a>. This is the current published version in its permanent home (it will always be available at this URL).  For a full list of available versions, see the <a href="http://hl7.org/fhir/uv/subscriptions-backport/history.html">Directory of published versions <img src="external.png" style="text-align: baseline"/></a></p>
p
<h2>Channels</h2>
h2
<p><!-- white space is critical inside of capture --></p>
p
<div>
<!-- do not remove - needed to prevent Jekyll from adding a p tag to any non block level element in the markdown.-->
</div>
div
<p>In FHIR R5, there are four channel types which were common enough to be defined in the specification, along with the ability to define additional channel types externally.  In this Implementation Guide, we define tho

In [None]:
df.columns

Index(['URL*', 'ID*', 'Requirement*', 'Conformance*', 'Actor*',
       'Sub-Requirement(s)', 'Conditionality', 'Conditionality Description',
       'Verifiable?', 'Verifiability Details', 'Planning To Test?',
       'Planning To Test Details', 'Page', 'Section', 'Same As…', 'Section #',
       'Grouping', 'Scope', 'Scope Description', 'Priority', 'Test Plan',
       'Simulation Approach', 'client testing simulation implementation group',
       'client simulation status', 'Test location', 'Test name', 'Notes',
       'Questions', 'reference_text'],
      dtype='object')

In [22]:
text_to_reqs = []

for text in df.reference_text.unique():
    entry = {'refernce_text': text, 'generated_requirements':[]}
    subdf = df[df.reference_text == text]
    for i, row in subdf.iterrows():
        req = row.dropna().to_dict()
        req.pop("URL*")
        req.pop("ID*")
        if "Page" in req:
            req.pop("Page")
        if "Section" in req:
            req.pop("Section")
        if "Section #" in req:
            req.pop("Section #")
        if "Same As…" in req:
            req.pop("Same As…")
        if "client simulation status" in req:
            req.pop("client simulation status")
        req.pop('reference_text')
        entry['generated_requirements'].append(req)
    text_to_reqs.append(entry)

AttributeError: 'DataFrame' object has no attribute 'reference_text'

In [23]:
import json
with open("uv.subscriptions_text_to_reqs.json", 'w+') as f:
    json.dump(text_to_reqs, f, indent=2)

In [24]:
l = {"reference_text": text, 'generated_requirements':[]}
for i, row in subdf.iterrows():
    req = row.dropna().to_dict()
    req.pop("URL*")
    req.pop("ID*")
    req.pop("Page")
    req.pop("Section")
    if "Same As…" in req:
        req.pop("Same As…")
    if "client simulation status" in req:
        req.pop("client simulation status")
    req.pop('reference_text')
    l['generated_requirements'].append(req)
print(l)

NameError: name 'text' is not defined

In [None]:
df[df['Section #'] == '5.2.1.1.2']['ref']

Unnamed: 0,URL*,ID*,Requirement*,Conformance*,Actor*,Sub-Requirement(s),Conditionality,Conditionality Description,Verifiable?,Verifiability Details,...,Priority,Test Plan,Simulation Approach,client testing simulation implementation group,client simulation status,Test location,Test name,Notes,Questions,reference_text
112,https://hl7.org/fhir/uv/subscriptions-backport...,114,[Subscription Resource] Conformance Expectatio...,SHALL,Server,,,,Yes,,...,,Verify that the Subscription resource is prese...,SIMULATED: Inferno will publish a static capab...,CapabilityStatement,,,,,,SubscriptionConformance Expectation: SHALLConf...
118,https://hl7.org/fhir/uv/subscriptions-backport...,120,\n[servers] SHOULD support [profile]: Backport...,SHOULD,Server,,,,Yes,,...,,Verify that the Subscription profile is indica...,SIMULATED: Inferno will publish a static capab...,CapabilityStatement,,,,,,CapabilityStatement: R4 Topic-Based Subscripti...
119,https://hl7.org/fhir/uv/subscriptions-backport...,121,SHALL support the $status operation,SHALL,Server,,,,Yes,,...,,Verify that the $status operation is listed un...,SIMULATED: Inferno will publish a static capab...,Status API,,,,actual status operation tested elsewhere,,CapabilityStatement: R4 Topic-Based Subscripti...
120,https://hl7.org/fhir/uv/subscriptions-backport...,122,MAY support the $events operation,MAY,Server,,,,Yes,,...,,Verify that the $events operation is listed un...,SIMULATED: Inferno will publish a static capab...,Events API,,,,actual events operation tested elsewhere,,CapabilityStatement: R4 Topic-Based Subscripti...
121,https://hl7.org/fhir/uv/subscriptions-backport...,123,MAY support the $get-ws-binding-token operation,MAY,Server,,,,Yes,,...,,,NOT SIMULATED: Inferno will not include this o...,,,,,,,CapabilityStatement: R4 Topic-Based Subscripti...
122,https://hl7.org/fhir/uv/subscriptions-backport...,124,A Server SHALL be capable of returning a Subsc...,SHALL,Server,,,,Yes,,...,,Verify that the read interaction is listed und...,SIMULATED: Inferno will publish a static capab...,Subscription API,,,,actual read tested in 134,,CapabilityStatement: R4 Topic-Based Subscripti...
123,https://hl7.org/fhir/uv/subscriptions-backport...,125,A Server SHOULD be capable of creating a Subsc...,SHOULD,Server,,,,Yes,,...,,Verify that the Create interaction is listed u...,SIMULATED: Inferno will publish a static capab...,CapabilityStatement,,,,,IG AUTHORS: conflicts with requirement 133. Wh...,CapabilityStatement: R4 Topic-Based Subscripti...
124,https://hl7.org/fhir/uv/subscriptions-backport...,126,A Server SHOULD be capable of modifying a Subs...,SHOULD,Server,,,,Yes,,...,,Verify that the Update interaction is listed u...,SIMULATED: Inferno will publish a static capab...,Subscription API,,,,,,CapabilityStatement: R4 Topic-Based Subscripti...
125,https://hl7.org/fhir/uv/subscriptions-backport...,127,A Server SHOULD be capable of deleting a Subsc...,SHOULD,Server,,,,Yes,,...,,Verify that the Delete interaction is listed u...,SIMULATED: Inferno will publish a static capab...,Subscription API,,,,,,CapabilityStatement: R4 Topic-Based Subscripti...
126,https://hl7.org/fhir/uv/subscriptions-backport...,128,A Server [MAY] be capable of searching for Sub...,MAY,Server,,,,Yes,,...,,,NOT SIMULATED: Inferno will publish a static c...,,,,,MAY is interpreted over SHOULD for the followi...,,CapabilityStatement: R4 Topic-Based Subscripti...
