In [1]:
from datasets import load_dataset, load_from_disk
import os

# Root directory of the project data. The original location is kept as the
# default; set the CHECKLIST_BASE_DIR environment variable to run the notebook
# on a machine where the data lives somewhere else.
BASE_DIR = os.environ.get("CHECKLIST_BASE_DIR", "/home/data/hlibt/checklist2_WIP/")

# On-disk locations of the saved datasets, both under BASE_DIR/HF_cache.
HF_cases_path = os.path.join(BASE_DIR, 'HF_cache', 'cases')
HF_KBs_path = os.path.join(BASE_DIR, 'HF_cache', 'KBs')

def load_local_HF_dataset(path):
    """Return the dataset stored on local disk at ``path``.

    Thin wrapper around ``load_from_disk``: the argument is forwarded as-is
    and the loaded object is handed back unchanged.
    """
    return load_from_disk(path)

# Explore the cases

In [2]:
# Load the saved cases from HF_cases_path.
cases = load_local_HF_dataset(HF_cases_path)

# Bare expression: the notebook shows the object's repr as the cell output.
cases

DatasetDict({
    AI_ACT: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 3000
    })
    GDPR: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 3137
    })
    HIPAA: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 214
    })
})

In [3]:
# Summary of the HIPAA split, followed by a preview of its first rows.
hipaa_cases = cases["HIPAA"]
print(hipaa_cases)

# zip() stops as soon as range(10) runs out, so at most ten rows are printed.
for row_number, hipaa_case in zip(range(10), hipaa_cases):
    print(hipaa_case)

Dataset({
    features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
    num_rows: 214
})
{'norm_type': 'prohibit', 'sender': ['Public Entity/Administrator'], 'sender_role': ['Public Entity/Administrator'], 'recipient': ['Citizen/Requester of Information'], 'recipient_role': ['Citizen/Requester of Information'], 'subject': ['Subjects of the health insurance plans'], 'subject_role': ['Subjects of the health insurance plans'], 'information_type': ['Health insurance plan details, including coverage scope and claims experience'], 'consent_form': '', 'purpose': 'To fulfill a public records request under OPRA', 'followed_articles': [], 'violated_articles': ['164.502(a)', '164.502'], 'case_content': 'In the City of Plainfield, William H. Michelson, a concerned citizen, submitted a detailed request for access to government records. His re

## Explore the first case in the HIPAA dataset

In [4]:
# Print the first HIPAA case as a whole record, then each of its fields on a
# separate line, in the order listed below.
first_case = cases["HIPAA"][0]
print(first_case)

for field_name in (
    "norm_type",
    "sender",
    "sender_role",
    "recipient",
    "recipient_role",
    "subject",
    "subject_role",
    "information_type",
    "consent_form",
    "purpose",
    "followed_articles",
    "violated_articles",
    "case_content",
):
    print(first_case[field_name])

{'norm_type': 'prohibit', 'sender': ['Public Entity/Administrator'], 'sender_role': ['Public Entity/Administrator'], 'recipient': ['Citizen/Requester of Information'], 'recipient_role': ['Citizen/Requester of Information'], 'subject': ['Subjects of the health insurance plans'], 'subject_role': ['Subjects of the health insurance plans'], 'information_type': ['Health insurance plan details, including coverage scope and claims experience'], 'consent_form': '', 'purpose': 'To fulfill a public records request under OPRA', 'followed_articles': [], 'violated_articles': ['164.502(a)', '164.502'], 'case_content': 'In the City of Plainfield, William H. Michelson, a concerned citizen, submitted a detailed request for access to government records. His request targeted the health insurance benefits available to city employees, officials, and their dependents over recent years. Michelson sought comprehensive details, including descriptions of health plans, costs, participant names, and claims experi

# Explore the KB

In [5]:
# Load the saved KBs from HF_KBs_path.
kb = load_local_HF_dataset(HF_KBs_path)

# Bare expression: the notebook shows the object's repr as the cell output.
kb

DatasetDict({
    AI_ACT: Dataset({
        features: ['reference', 'norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content'],
        num_rows: 842
    })
    GDPR: Dataset({
        features: ['reference', 'norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'node_index'],
        num_rows: 679
    })
    HIPAA: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'reference'],
        num_rows: 591
    })
})

In [6]:
# Summary of the HIPAA KB split, followed by a preview of its first rows.
hipaa_kb = kb["HIPAA"]
print(hipaa_kb)

# zip() stops as soon as range(10) runs out, so at most ten rows are printed.
for row_number, kb_entry in zip(range(10), hipaa_kb):
    print(kb_entry)

Dataset({
    features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'reference'],
    num_rows: 591
})
{'norm_type': '"Permit"', 'sender': ['cover entity'], 'sender_role': ['may vary (e.g., doctor, provider)'], 'recipient': ['individual'], 'recipient_role': ['data'], 'subject': ['individual'], 'subject_role': ['data subject (e.g., patient)'], 'information_type': ['protected health information'], 'consent_form': '""', 'purpose': ['to provide information to the individual'], 'sender_is_subject': '"NO"', 'recipient_is_subject': '"YES"', 'regulation_id': '"164.502(a)(1)(i)"', 'regulation_content': '"\\n\\u00a7 164.502 Uses and disclosures of protected health information: General rules.\\n(a) Standard.  A covered entity or business associate may not use or disclose protected health information, except as permi

## Load the GDPR cases and KB

In [7]:
# Pick the GDPR split out of the cases loaded earlier.
gdpr_cases = cases["GDPR"]

# Bare expression: the notebook shows the object's repr as the cell output.
gdpr_cases

Dataset({
    features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
    num_rows: 3137
})

In [8]:
# Print the labels (followed articles, violated articles) of the first ten
# GDPR cases, one case per line.
#
# The rows are fetched once with a slice. Indexing gdpr_cases["..."] inside the
# loop would read the whole column again on every iteration just to use one
# element of it. A slice also copes with a dataset shorter than ten rows,
# where indexing by range(10) would raise IndexError.
first_ten = gdpr_cases[:10]  # slicing returns a dict: column name -> list of values

for followed, violated in zip(first_ten["followed_articles"], first_ten["violated_articles"]):
    print(followed, violated)

[] ['Article 32', 'Article 33', 'Article 34']
[] ['Article 12', 'Article 13']
[] ['Article 6 - Lawfulness of processing', 'Article 7 - Conditions for consent']
[] ['Article 5(1)(f)', 'Article 32']
[] ['Article 33', 'Article 34']
[] ['Article 32']
[] ['Article 12', 'Article 15', 'Article 17']
[] ['Article 17', 'Article 12']
[] ['Article 15', 'Article 31']
[] ['Article 12', 'Article 17']


In [9]:
# Pick the GDPR split out of the KBs loaded earlier.
gdpr_kb = kb["GDPR"]

# Bare expression: the notebook shows the object's repr as the cell output.
gdpr_kb

Dataset({
    features: ['reference', 'norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'node_index'],
    num_rows: 679
})

In [15]:
import unicodedata

# Column access (gdpr_kb["..."]) hands back the whole column as a list, so
# each column is read exactly once here. Reading it inside the loop would
# reload every column once per row, making the loop quadratic in the row count.
references = gdpr_kb["reference"]
print(len(references))
print(type(references))

regulation_ids = gdpr_kb["regulation_id"]
regulation_contents = gdpr_kb["regulation_content"]

# Print every regulation id followed by its content, in dataset order.
for regulation_id, regulation_content in zip(regulation_ids, regulation_contents):
    print(regulation_id)
    print(str(regulation_content))

679
<class 'list'>
Article 1
"Subject-matter and objectives"
Article 1(1)
"This Regulation lays down rules relating to the protection of natural persons with regard to the processing of personal data and rules relating to the free movement of personal data."
Article 1(2)
"This Regulation protects fundamental rights and freedoms of natural persons and in particular their right to the protection of personal data."
Article 1(3)
"The free movement of personal data within the Union shall be neither restricted nor prohibited for reasons connected with the protection of natural persons with regard to the processing of personal data."
Article 2
"Material scope"
Article 2(1)
"This Regulation applies to the processing of personal data wholly or partly by automated means and to the processing other than by automated means of personal data which form part of a filing system or are intended to form part of a filing system."
Article 2(2)
"This Regulation does not apply to the processing of personal 