In [6]:
from datasets import load_dataset, load_from_disk
import os

BASE_DIR = "/home/data/hlibt/checklist2_WIP/"

HF_cases_path = os.path.join(BASE_DIR, 'HF_cache', 'cases')
HF_KBs_path = os.path.join(BASE_DIR, 'HF_cache', 'KBs')

def load_local_HF_dataset(path):
    '''
    Load a dataset from a local path
    '''
    dataset = load_from_disk(path)
    return dataset

# Explore the cases

In [10]:
cases = load_local_HF_dataset(HF_cases_path)

cases

DatasetDict({
    AI_ACT: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 3000
    })
    GDPR: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 3137
    })
    HIPAA: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
        num_rows: 214
    })
})

In [11]:
print(cases["HIPAA"])

for i, case in enumerate(cases["HIPAA"]):
    if i == 10:
        break
    print(case)

Dataset({
    features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'followed_articles', 'violated_articles', 'case_content'],
    num_rows: 214
})
{'norm_type': 'prohibit', 'sender': ['Public Entity/Administrator'], 'sender_role': ['Public Entity/Administrator'], 'recipient': ['Citizen/Requester of Information'], 'recipient_role': ['Citizen/Requester of Information'], 'subject': ['Subjects of the health insurance plans'], 'subject_role': ['Subjects of the health insurance plans'], 'information_type': ['Health insurance plan details, including coverage scope and claims experience'], 'consent_form': '', 'purpose': 'To fulfill a public records request under OPRA', 'followed_articles': [], 'violated_articles': ['164.502(a)', '164.502'], 'case_content': 'In the City of Plainfield, William H. Michelson, a concerned citizen, submitted a detailed request for access to government records. His re

## Explore the first data in HIPAA dataset

In [12]:
first_case = cases["HIPAA"][0]
print(first_case)
print(first_case["norm_type"])
print(first_case["sender"])
print(first_case["sender_role"])
print(first_case["recipient"])
print(first_case["recipient_role"])
print(first_case["subject"])
print(first_case["subject_role"])
print(first_case["information_type"])
print(first_case["consent_form"])
print(first_case["purpose"])
print(first_case["followed_articles"])
print(first_case["violated_articles"])
print(first_case["case_content"])

{'norm_type': 'prohibit', 'sender': ['Public Entity/Administrator'], 'sender_role': ['Public Entity/Administrator'], 'recipient': ['Citizen/Requester of Information'], 'recipient_role': ['Citizen/Requester of Information'], 'subject': ['Subjects of the health insurance plans'], 'subject_role': ['Subjects of the health insurance plans'], 'information_type': ['Health insurance plan details, including coverage scope and claims experience'], 'consent_form': '', 'purpose': 'To fulfill a public records request under OPRA', 'followed_articles': [], 'violated_articles': ['164.502(a)', '164.502'], 'case_content': 'In the City of Plainfield, William H. Michelson, a concerned citizen, submitted a detailed request for access to government records. His request targeted the health insurance benefits available to city employees, officials, and their dependents over recent years. Michelson sought comprehensive details, including descriptions of health plans, costs, participant names, and claims experi

# Explore the KB

In [13]:
kb = load_local_HF_dataset(HF_KBs_path)

kb

DatasetDict({
    AI_ACT: Dataset({
        features: ['reference', 'norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content'],
        num_rows: 842
    })
    GDPR: Dataset({
        features: ['reference', 'norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'node_index'],
        num_rows: 679
    })
    HIPAA: Dataset({
        features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'reference'],
        num_rows: 591
    })
})

In [14]:
print(kb["HIPAA"])

for i, case in enumerate(kb["HIPAA"]):
    if i == 10:
        break
    print(case)

Dataset({
    features: ['norm_type', 'sender', 'sender_role', 'recipient', 'recipient_role', 'subject', 'subject_role', 'information_type', 'consent_form', 'purpose', 'sender_is_subject', 'recipient_is_subject', 'regulation_id', 'regulation_content', 'reference'],
    num_rows: 591
})
{'norm_type': '"Permit"', 'sender': ['cover entity'], 'sender_role': ['may vary (e.g., doctor, provider)'], 'recipient': ['individual'], 'recipient_role': ['data'], 'subject': ['individual'], 'subject_role': ['data subject (e.g., patient)'], 'information_type': ['protected health information'], 'consent_form': '""', 'purpose': ['to provide information to the individual'], 'sender_is_subject': '"NO"', 'recipient_is_subject': '"YES"', 'regulation_id': '"164.502(a)(1)(i)"', 'regulation_content': '"\\n\\u00a7 164.502 Uses and disclosures of protected health information: General rules.\\n(a) Standard.  A covered entity or business associate may not use or disclose protected health information, except as permi

In [16]:
train_dataset = load_dataset("csv", data_files="data/keywords.csv")

train_dataset["train"]

Dataset({
    features: ['query', 'positive', 'label'],
    num_rows: 1613
})

## Try to load denoise dataset

In [17]:
from sentence_transformers import datasets as sdatasets
from datasets import DatasetDict, Dataset

KB = load_from_disk(HF_KBs_path)
kb_contents = KB["HIPAA"]["regulation_content"]
train_dataset_denoise = sdatasets.DenoisingAutoEncoderDataset(kb_contents)
train_dataset_denoise = Dataset.from_list(train_dataset_denoise)
train_dataset_denoise = DatasetDict({"train": train_dataset_denoise})

train_dataset_denoise["train"]

TypeError: 'InputExample' object is not iterable