# Registering datasets

This notebook has examples on howto register datasets directly from HuggingFace.

## Dependencies

In [1]:
import os
import sys

sys.path.append(os.path.join(os.getcwd(), '..'))


In [12]:
%load_ext autoreload
%autoreload 2

import json
import os
import tempfile
import shutil

from azure.core.exceptions import ResourceNotFoundError
from datasets import load_dataset, Dataset
import logging

from medbench.aml import AzureML


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
aml = AzureML.connect_to_registry(registry_name="azureml-1p")

# Helper functions

In [4]:
def huggingface_dataset_to_jsonl(dataset: Dataset, output_folder: str):
    for split in dataset.keys():
        dataset[split].to_pandas().to_json(
            os.path.join(output_folder, f"{split}.jsonl"), orient="records", lines=True
        )

# BioMistral

In [6]:
bio_mistral_df = aml.get_dataset("Biomistral", read_folder_jsonl=True)

2024-10-16 13:04:31.058 | DEBUG    | medbench.aml:get_dataset:130 - Downloading Biomistral.jsonl to temporary directory `/tmp/tmpym59yoht`...
2024-10-16 13:04:51.189 | INFO     | medbench.aml:get_dataset:135 - Reading Biomistral.jsonl as a pandas DataFrame.
  return pd.read_json(jsonl_str, lines=True)


In [7]:
bio_mistral_df.head()

Unnamed: 0,corpus_name,task_type,classes,identifier,in,out,explanation,options,question,correct_answer_letter,correct_answer_text,context,few_shot_samples[1],few_shot_samples[2],few_shot_samples[3],system_prompt,explanation_prompt
0,MedMCQA,mcqa,"[A, B, C, D]",45258d3d-b974-44dd-a161-c3fccbdadd88,Which of the following is not true for myelina...,A. Impulse through myelinated fibers is slower...,,{'A': 'Impulse through myelinated fibers is sl...,Which of the following is not true for myelina...,A,Impulse through myelinated fibers is slower th...,,"[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...",We are giving you a scientific question and fo...,
1,MedMCQA,mcqa,"[A, B, C, D]",b944ada9-d776-4c2a-9180-3ae5f393f72d,Which of the following is not true about glome...,A. The oncotic pressure of the fluid leaving t...,Ans-a. The oncotic pressure of the fluid leavi...,{'A': 'The oncotic pressure of the fluid leavi...,Which of the following is not true about glome...,A,The oncotic pressure of the fluid leaving the ...,,"[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...",You are presented with a scientific query alon...,Upon what grounds have you formed your judgmen...
2,MedMCQA,mcqa,"[A, B, C, D]",b64a9cd7-d076-4c55-8be1-f9c44fece6cc,A 29 yrs old woman with a pregnancy of 17 week...,C. Amniotic fluid samples plus chromosomal ana...,,{'A': 'No test is required now as her age is b...,A 29 yrs old woman with a pregnancy of 17 week...,C,Amniotic fluid samples plus chromosomal analys...,,"[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...",You'll receive a scientific question and four ...,
3,MedMCQA,mcqa,"[A, B, C, D]",c6365cce-507c-40f6-90a2-46b867f47b6e,Axonal transport is: \n Option A: Antegrade \n...,C. Antegrade and retrograde,Fast anterograde (400 mm/day) transport occurs...,"{'A': 'Antegrade', 'B': 'Retrograde', 'C': 'An...",Axonal transport is:,C,Antegrade and retrograde,,"[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...",Your objective is to match a scientific questi...,What rationale supports your decision?
4,MedMCQA,mcqa,"[A, B, C, D]",72c1c5e0-b64f-4eef-bf22-ecfb60c5c19c,Low insulin to glucagon ratio is seen in all o...,A. Glycogen synthesis,Answer- A. Glycogen synthesisLow insulin to gl...,"{'A': 'Glycogen synthesis', 'B': 'Glycogen bre...",Low insulin to glucagon ratio is seen in all o...,A,Glycogen synthesis,,"[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...","[{'corpus_name': 'MedMCQA', 'task_type': 'mcqa...",We are giving you a scientific question and fo...,On what have you based your judgement on ? Or ...


In [16]:
bio_mistral_df[bio_mistral_df["corpus_name"].str.startswith("MedQA")]["question"].value_counts().unique()

array([2])

In [10]:
bio_mistral_df["corpus_name"].value_counts()

corpus_name
MedMCQA                       4183
MedQA                         1273
MedQA-5_options               1273
PubMedQA                       500
MMLU_professional_medicine     272
MMLU_clinical_knowledge        265
MMLU_college_medicine          173
MMLU_college_biology           144
MMLU_anatomy                   135
MMLU_medical_genetics          100
Name: count, dtype: int64

# MMLU (medical)

In [48]:
mmlu_subsets = [
    "anatomy",
    "clinical_knowledge",
    "college_biology",
    # "college_chemistry",
    "college_medicine",
    # "high_school_biology",
    # "high_school_chemistry",
    # "high_school_psychology",
    # "human_aging",
    # "human_sexuality",
    "medical_genetics",
    "nutrition",
    "professional_medicine",
    "professional_psychology",
    "virology",
]

In [49]:
for subset in mmlu_subsets:
    dataset_name = f"mmlu_{subset}".lower()

    logging.info(f"Registering dataset {dataset_name}.")
    should_register = False
    try:
        aml.get_dataset_latest_version(dataset_name)
    except ResourceNotFoundError:
        should_register = True
    
    if should_register:
        logging.debug("Loading dataset from HuggingFace.")
        ds = load_dataset("cais/mmlu", subset)

        with tempfile.TemporaryDirectory() as temp_dir_name:
            logging.debug(f"Saving dataset as .jsonl in `{temp_dir_name}`.")
            huggingface_dataset_to_jsonl(ds, temp_dir_name)
            aml.register_folder_as_dataset(
                folder_path=temp_dir_name,
                dataset_name=dataset_name,
                dataset_description=(
                    f"MMLU {subset.replace('_', ' ').title()} dataset.\n\n"
                    "This MMLU subset consists of multiple-choice questions with 4 answer options and is designed to evaluate a model's understanding of specific medical and biological domains.\n\n"
                    "Source: https://huggingface.co/datasets/cais/mmlu\n"
                    "Source version: c30699e8356da336a370243923dbaf21066bb9fe (commit from 20240308)"
                )
            )
    else:
        logging.info(f"Dataset {dataset_name} already registered. Skipping.")


2024-10-16 15:09:30.435 | INFO     | __main__:<module>:4 - Registering dataset mmlu_anatomy.
2024-10-16 15:09:31.165 | INFO     | __main__:<module>:29 - Dataset mmlu_anatomy already registered. Skipping.
2024-10-16 15:09:31.165 | INFO     | __main__:<module>:4 - Registering dataset mmlu_clinical_knowledge.
2024-10-16 15:09:31.579 | INFO     | __main__:<module>:29 - Dataset mmlu_clinical_knowledge already registered. Skipping.
2024-10-16 15:09:31.580 | INFO     | __main__:<module>:4 - Registering dataset mmlu_college_biology.
2024-10-16 15:09:31.967 | INFO     | __main__:<module>:29 - Dataset mmlu_college_biology already registered. Skipping.
2024-10-16 15:09:31.969 | INFO     | __main__:<module>:4 - Registering dataset mmlu_college_medicine.
2024-10-16 15:09:32.395 | INFO     | __main__:<module>:29 - Dataset mmlu_college_medicine already registered. Skipping.
2024-10-16 15:09:32.396 | INFO     | __main__:<module>:4 - Registering dataset mmlu_medical_genetics.
2024-10-16 15:09:32.701 | 

# MedMCQA

In [51]:
medmcqa = load_dataset("openlifescienceai/medmcqa")
with tempfile.TemporaryDirectory() as temp_dir_name:
    huggingface_dataset_to_jsonl(medmcqa, temp_dir_name)
    aml.register_folder_as_dataset(
        folder_path=temp_dir_name,
        dataset_name="medmcqa",
        dataset_description=(
            "MedMCQA dataset.\n\n"
            "MedMCQA is a large-scale, Multiple-Choice Question Answering (MCQA) dataset designed to address real-world medical entrance exam questions.\n"
            "Test set's ground truth is not available. To verify performance we must submit this form https://forms.gle/xLJHNbuvaRa2FXbD8.\n\n"
            "Dataset repository: https://github.com/MedMCQA/MedMCQA"
            "\n"
            "Source: https://huggingface.co/datasets/openlifescienceai/medmcqa"
            "\n"
            "Source version: 91c6572c454088bf71b679ad90aa8dffcd0d5868 (commit from 20240104)"
        )
    )

Generating train split: 100%|██████████| 182822/182822 [00:00<00:00, 253088.99 examples/s]
Generating test split: 100%|██████████| 6150/6150 [00:00<00:00, 391336.87 examples/s]
Generating validation split: 100%|██████████| 4183/4183 [00:00<00:00, 261788.05 examples/s]
2024-10-16 15:42:09.643 | INFO     | medbench.aml:_get_dataset_next_version:251 - Could not find dataset medmcqa, creating new with version "1".
Subtype value SAS has no mapping, use base class DataReferenceCredentialDto.
Your file exceeds 100 MB. If you experience low speeds, latency, or broken connections, we recommend using the AzCopyv10 tool for this file transfer.

Example: azcopy copy '/tmp/tmpbsy9scf4' 'https://azml1p5efskuse01.blob.core.windows.net/azureml-1p-925a44e3-469c-5a0c-8ea1-d61ddb47f7a4/tmpbsy9scf4' 

See https://docs.microsoft.com/azure/storage/common/storage-use-azcopy-v10 for more information.
[32mUploading tmpbsy9scf4 (151.72 MBs): 100%|██████████| 151722150/151722150 [00:05<00:00, 27158627.54it/s]


In [61]:
medmcqa["train"][0]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'cop': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract'}

In [58]:
medmcqa["test"][0]

{'id': '84f328d3-fca4-422d-8fb2-19d55eb31503',
 'question': 'Which of the following is derived from fibroblast cells ?',
 'opa': 'TGF-13',
 'opb': 'MMP2',
 'opc': 'Collagen',
 'opd': 'Angiopoietin',
 'cop': -1,
 'choice_type': 'single',
 'exp': '',
 'subject_name': 'Pathology',
 'topic_name': None}

In [59]:
biomistral_medmcqa = load_dataset(
    path="BioMistral/BioInstructQA",
    name="MedMCQA",
    trust_remote_code=True,
)

In [60]:
biomistral_medmcqa["test"][0]

{'identifier': '57f4102b-4c7f-41b0-8ee5-a221747da8ab',
 'corpus_name': 'MedMCQA',
 'task_type': 'mcqa',
 'classes': ['A', 'B', 'C', 'D'],
 'prompt_no_answer': 'The following are multiple choice questions (with answers) about medical knowledge. \n **Question:** It is likely that cell mediated immune reactions (delayed\nhypersensitivity) occur in periodontitis because subjects with periodontitis have \n (A) High levels of histamine in involved gingival tissue \n (B) IgG antibodies reactive with plaque bacterial antigens \n (C) Ttymphocytes sensitized to bacterial plaque antigens \n (D) High Levels of collagenase in gingival fluids \n **Answer:**(',
 'prompt': 'The following are multiple choice questions (with answers) about medical knowledge. \n **Question:** It is likely that cell mediated immune reactions (delayed\nhypersensitivity) occur in periodontitis because subjects with periodontitis have \n (A) High levels of histamine in involved gingival tissue \n (B) IgG antibodies reactive 

# `mtsamples`

In [8]:
mtsamples_path = "/mnt/c/Users/lschettini/OneDrive - Microsoft/Shared Documents - BabelBench for Healthcare/Summarization/jsonl/"
rounding_output = "rounding_output.jsonl"

os.path.join(mtsamples_path, rounding_output)

'/mnt/c/Users/lschettini/OneDrive - Microsoft/Shared Documents - BabelBench for Healthcare/Summarization/jsonl/rounding_output.jsonl'

In [14]:
output_clean = "output-clean-gpt4.jsonl"

with open(os.path.join(mtsamples_path, output_clean), "r") as f:
    jsonl_str = f.read()
    output_clean_df = pd.read_json(jsonl_str, lines=True)

output_clean_df.head()

  output_clean_df = pd.read_json(jsonl_str, lines=True)


Unnamed: 0,Keywords,Description,Filename,Clinical_Note,Medical_Specialty,output_gpt-4
0,"pediatrics - neonatal, 1-year-old, naps, mama...",Health maintenance exam for 1-year-old female.,1-year-old Exam - H&P.txt,CHIEF COMPLAINT: This 1-year-old female presen...,Pediatrics - Neonatal,"{\n""indication"": ""Routine well child care, Acu..."
1,"radiology, 2-d study, doppler, tricuspid regu...","Normal left ventricle, moderate biatrial enla...",2-D Doppler.txt,"2-D STUDY1. Mild aortic stenosis, widely calci...",Radiology,"{\n""indication"": ""Mild aortic stenosis, Mild l..."
2,"radiology, 2-d m-mode, doppler, aortic valve,...",2-D M-Mode. Doppler.,2-D Echocardiogram - 1.txt,2-D M-MODE: 1. Left atrial enlargement with l...,Radiology,"{\n""indication"": ""Left atrial enlargement, Mil..."
3,"radiology, 2-d, doppler, echocardiogram, annu...",2-D Echocardiogram,2-D Echocardiogram - 2.txt,COMMENTS:1. The left ventricular cavity size a...,Radiology,"{\n""indication"": ""Cardiac evaluation"",\n""indic..."
4,"radiology, 2-d echocardiogram, cardiac functi...",2-D Echocardiogram,2-D Echocardiogram - 3.txt,2-D ECHOCARDIOGRAMMultiple views of the heart ...,Radiology,"{\n""indication"": ""Routine check-up"",\n""indicat..."


In [15]:
with open(os.path.join(mtsamples_path, rounding_output), "r") as f:
    jsonl_str = f.read()
    rounding_output_df = pd.read_json(jsonl_str, lines=True)

rounding_output_df.head()

  rounding_output_df = pd.read_json(jsonl_str, lines=True)


Unnamed: 0,Keywords,Description,Filename,Clinical_Note,Medical_Specialty,output_gpt-4
0,"pediatrics - neonatal, 1-year-old, naps, mama,...",Health maintenance exam for 1-year-old female.,1-year-old Exam - H&P.txt,CHIEF COMPLAINT: This 1-year-old female presen...,Pediatrics - Neonatal,"{\n ""History"": ""This 1-year-old female patien..."
1,"radiology, 2-d study, doppler, tricuspid regur...","Normal left ventricle, moderate biatrial enlar...",2-D Doppler.txt,"2-D STUDY1. Mild aortic stenosis, widely calci...",Radiology,"{\n ""History"": ""The patient's chart does not ..."
2,"radiology, 2-d m-mode, doppler, aortic valve, ...",2-D M-Mode. Doppler.,2-D Echocardiogram - 1.txt,2-D M-MODE: 1. Left atrial enlargement with l...,Radiology,"{\n ""History"": ""The patient's chart does not ..."
3,"radiology, 2-d, doppler, echocardiogram, annul...",2-D Echocardiogram,2-D Echocardiogram - 2.txt,COMMENTS:1. The left ventricular cavity size a...,Radiology,"{\n ""History"": ""The patient's chart does not ..."
4,"radiology, 2-d echocardiogram, cardiac functio...",2-D Echocardiogram,2-D Echocardiogram - 3.txt,2-D ECHOCARDIOGRAMMultiple views of the heart ...,Radiology,"{\n ""History"": ""No relevant medical history, ..."


In [19]:
json.loads(output_clean_df.iloc[0]["output_gpt-4"])


{'indication': 'Routine well child care, Acute conjunctivitis',
 'indication_icd10': ['Z00.129', 'H10.9'],
 'diagnosis_icd10': ['H10.9'],
 'LOINC': ['718-7'],
 'RxNorm': ['1739026'],
 'findings': "1-year-old female presents for a health maintenance exam. Developmental milestones reached. Mother concerned about child's red, matted eye and not walking completely alone yet. Physical exam reveals conjunctivitis in left eye. Hb: 12 g/dl.",
 'summary': "1-year-old female presents for a routine check-up. Mother has concerns about child's red, matted eye and delayed walking. Physical exam reveals conjunctivitis in left eye. No other abnormalities noted. Hb: 12 g/dl. Immunizations updated. Tobramycin ophthalmic prescribed for conjunctivitis.",
 'nurse': "Monitor the child's conjunctivitis and response to treatment. Ensure immunizations are up to date. Schedule a follow-up visit in 3 months.",
 'pharmacy': 'Dispense Tobramycin ophthalmic 0.3% ointment. Instructions: Apply to both eyes QID x 5 da

In [18]:
json.loads(rounding_output_df.iloc[0]["output_gpt-4"])

{'History': 'This 1-year-old female patient has no known medical allergies and is not on any medications. Her past medical history is unremarkable with no previous surgeries. She has a family history of cancer associated with maternal aunt, and hypertension associated with paternal grandfather. She lives at home with parents and is in daycare.',
 'CurrentCondition': 'The patient is in no apparent distress, well developed and well nourished. Her vital signs are stable with a temperature of 97.6°F, height of 31 inches, and weight of 28 lbs. She has conjunctivitis in her left eye. Her respiratory effort is even and nonlabored with clear lung fields. Heart rate and rhythm are regular with no murmurs, gallop, rubs or clicks. Abdomen is soft and nontender with no palpable masses. She moves all extremities and her muscle tone is normal.',
 'Findings': "The patient's hemoglobin level is 12 g/dl. She has been diagnosed with acute conjunctivitis. A blood lead test has been ordered.",
 'Plan': "T

In [20]:
with tempfile.TemporaryDirectory() as temp_dir_name:
    # Copy os.path.join(mtsamples_path, rounding_output) to temp_dir with python
    shutil.copy(os.path.join(mtsamples_path, rounding_output), temp_dir_name)
    shutil.copy(os.path.join(mtsamples_path, output_clean), temp_dir_name)
    
    aml.register_folder_as_dataset(
        folder_path=temp_dir_name,
        dataset_name="mtsamples-gpt4-summarization",
        dataset_description=(
            "GPT-4 summaries from mtsamples dataset.\n\n"
            "Dataset created by running GPT-4 on the mtsamples dataset to generate summaries for several specialties."
        )
    )


2024-10-29 18:59:07.858 | INFO     | medbench.aml:_get_dataset_next_version:251 - Could not find dataset mtsamples-gpt4-summarization, creating new with version "1".
Subtype value SAS has no mapping, use base class DataReferenceCredentialDto.
[32mUploading tmpwgcvbdsc (13.99 MBs): 100%|██████████| 13989375/13989375 [00:08<00:00, 1565718.98it/s]
[39m

