In [1]:
from datasets import Dataset, load_dataset, concatenate_datasets
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def _process_doc(doc):
        return {
            "query": doc["question"] + "\n" + \
                    "".join([f" ({k}) {v}" if i else f"({k}) {v}" \
                    for i, (k, v) in enumerate(doc["options"].items())]),
            "choices": list(doc["options"].values()),
            "gold": ord(doc["answer_idx"])-ord("A"),
        }

def doc_to_text(doc):
    """Return the processed fields for ``doc`` plus a ready-to-use prompt.

    The record is first passed through ``_process_doc``; the result is then
    extended with a "prompt" entry made of "Question: ", the processed query,
    a newline and "Answer:".

    Args:
        doc: Raw record accepted by ``_process_doc``.

    Returns:
        A dict holding everything ``_process_doc`` returned, followed by "prompt".
    """
    fields = _process_doc(doc)
    return {**fields, "prompt": f"Question: {fields['query']}\nAnswer:"}

In [6]:
# Load only the "test" split of the "augtoma/medqa_usmle" dataset.
dataset_ = load_dataset("augtoma/medqa_usmle", split="test")

In [7]:
# Apply doc_to_text to every record, so each one also carries the
# "query", "choices", "gold" and "prompt" fields it returns.
dataset_ = dataset_.map(doc_to_text)

In [8]:
# Display the first mapped record as a quick sanity check.
dataset_[0]

{'question': 'A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?',
 'answer': 'Tell the attending that he cannot fail to disclose this mistake',
 'options': {'A': 'Disclose the error to the patient and put it in the operative report',
  'B': 'Tell the attending that he cannot fail to disclose this mistake',
  'C': 'Report the physician to the ethics committee',
  'D': 'Refuse to dictate the operative report'},
 'meta_info': 'step1',
 'ans

In [9]:
# Write the mapped MedQA test split to disk with Dataset.to_json.
# NOTE(review): this path is relative to the current working directory, whereas
# the later cells read and write under the parent directory (project_path / "../").
# Confirm this output location is intended.
dataset_.to_json("datasets/medical_datasets/medqa_usmle.json")

Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 12.42ba/s]


4503397

In [11]:
# project_path is the parent of the current working directory; both input
# files are resolved against it so the cell addresses them in one consistent way.
project_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Pretraining-style data, loaded directly as a `Dataset`.
pretrain_dataset = Dataset.from_json(
    os.path.join(project_path, "datasets/medical_datasets/GenMedGPT-5k.json")
)

# Fine-tuning data, loaded as plain Python objects with `json.load`.
# The encoding is explicit so decoding does not depend on the platform locale.
with open(
    os.path.join(
        project_path, "datasets/medical_datasets/medmcqa-instruction-train.json"
    ),
    "r",
    encoding="utf-8",
) as f:
    ft_dataset = json.load(f)

Generating train split: 5452 examples [00:00, 34424.15 examples/s]


In [12]:
# Keep at most 200 pretraining records (fewer if the dataset is smaller).
n_samples = min(200, len(pretrain_dataset))
pretrain_dataset = pretrain_dataset.select(range(n_samples))

# n_samples carries over the value capped above and is capped again by the
# number of fine-tuning records before slicing them.
n_samples = min(n_samples, len(ft_dataset))
ft_dataset = ft_dataset[:n_samples]


In [13]:
# Transpose the list of records into column lists for Dataset.from_dict.
# The column names are taken from the first record.
data_columns = {}
for column in ft_dataset[0]:
    data_columns[column] = []

# Append each record's values to the matching column, record by record.
for record in ft_dataset:
    for column in record:
        data_columns[column].append(record[column])

ft_dataset = Dataset.from_dict(data_columns)
ft_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 200
})

In [14]:
# Combine the pretraining subset and the fine-tuning subset into one dataset
# (pretrain_dataset is listed first), then display its summary.
merged_dataset = concatenate_datasets([pretrain_dataset, ft_dataset])
merged_dataset

Dataset({
    features: ['input', 'instruction', 'output'],
    num_rows: 400
})

In [15]:
# Write the merged dataset under the project's medical_datasets directory.
# The file name is a plain string literal: it has no placeholders, so it needs
# no f-prefix.
merged_dataset.to_json(
    os.path.join(project_path, "datasets/medical_datasets/gpt_medmcqa.json")
)

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 135.08ba/s]


263091