# Generate Synthetic Dataset From MIMIC-IV-Note using Open-source LLM

First, we create the corpus of text chunks by leveraging LlamaIndex to load some financial PDFs, and parsing/chunking into plain text chunks.

### Generate synthetic queries

Now, we use an LLM (Mistral-7B-Instruct-v0.1) to generate questions using each clinical note in the dataset as context.

Each pair of (generated question, clinical note chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [2]:
import re
import json
import uuid

from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode
from tqdm.notebook import tqdm



In [3]:
DATA_PATH = "/home/75y/data_ragMimic/data/"

In [4]:
TRAIN_CORPUS_FPATH = DATA_PATH+'train_corpus.json'
VAL_CORPUS_FPATH = DATA_PATH+'val_corpus.json'

TRAIN_QUERIES_FPATH = DATA_PATH+'train_queries.json'
TRAIN_RELEVANT_DOCS_FPATH = DATA_PATH+'train_relevant_docs.json'

VAL_QUERIES_FPATH = DATA_PATH+'val_queries.json'
VAL_RELEVANT_DOCS_FPATH = DATA_PATH+'val_relevant_docs.json'

In [5]:
with open(TRAIN_CORPUS_FPATH, 'r+') as f:
    train_corpus = json.load(f)

with open(VAL_CORPUS_FPATH, 'r+') as f:
    val_corpus = json.load(f)

In [6]:
from transformers import AutoModelForCausalLM, pipeline
import torch

model_name = 'mistralai/Mistral-7B-Instruct-v0.1'
pipeline_gen = pipeline(
    "text-generation",
    model=model_name,
    device="cuda:0",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
def generate_queries(
    pipeline,
    corpus,
    num_questions_per_chunk=2,
    prompt_template=None,
    verbose=False,
):
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.
    """
    # llm = AutoModelForCausalLM.from_pretrained('mistralai/Mistral-7B-Instruct-v0.1', use_cache=True)
    # OpenAI(model=)

#     prompt_template = prompt_template or """\
# Context information is below.

# ---------------------
# {context_str}
# ---------------------

# Given the context information and not prior knowledge.
# generate only questions based on the below query.

# You are a Medical Professional. Your task is to come up with \
# {num_questions_per_chunk} questions for finding clinical information about a patient. \
# The questions should be strictly about patient's medical history and diverse across the document. \
# Restrict the questions to the context information provided. \
# In your response, just list {num_questions_per_chunk} questions separated by new lines.
# """
    prompt_template = prompt_template or """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge. \
Generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided. \
In your response, just list {num_questions_per_chunk} questions separated by new lines.
"""
        
    count = 0
    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        prompt = prompt_template.format(context_str=text, num_questions_per_chunk=num_questions_per_chunk)

        response = pipeline(prompt, max_new_tokens=100, pad_token_id=2)[0]["generated_text"][len(prompt) :]
        # print(response)
        
        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0]
        questions = [question for question in questions if question != '---------------------']
        
        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]
        count+=1
        if count > 2: break
    return queries, relevant_docs

In [9]:
train_queries, train_relevant_docs = generate_queries(pipeline_gen, train_corpus)

  0%|          | 0/47284 [00:00<?, ?it/s]

In [10]:
train_queries

{'2b099c90-e685-465e-aa61-c33c4d48aaa9': "What is the patient's name?",
 'c3fd91f4-ad9d-4977-8be2-5a3c608a471c': "What is the patient's major surgical or invasive procedure?",
 '9a80d116-30c5-4f03-8bd5-cd786a8d72ea': "What was the patient's discharge diagnosis?",
 'd7ee8c4b-f8dc-4a29-89eb-16b4f47b79bb': "What was the patient's discharge medication regimen?",
 'ba8232e1-f3dd-46c0-843a-f3636cc07d93': "What is the patient's name?",
 '724792d6-f33a-43d9-b465-3742bdb8e5f6': "What is the patient's major surgical or invasive procedure?"}

In [11]:
val_queries, val_relevant_docs = generate_queries(pipeline_gen, val_corpus)

  0%|          | 0/865 [00:00<?, ?it/s]

In [9]:
with open(TRAIN_QUERIES_FPATH, 'w+') as f:
    json.dump(train_queries, f)

with open(TRAIN_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(train_relevant_docs, f)

with open(VAL_QUERIES_FPATH, 'w+') as f:
    json.dump(val_queries, f)

with open(VAL_RELEVANT_DOCS_FPATH, 'w+') as f:
    json.dump(val_relevant_docs, f)

### Final data

In [10]:
TRAIN_DATASET_FPATH = DATA_PATH+'train_dataset.json'
VAL_DATASET_FPATH = DATA_PATH+'val_dataset.json'

In [11]:
train_dataset = {
    'queries': train_queries,
    'corpus': train_corpus,
    'relevant_docs': train_relevant_docs,
}

val_dataset = {
    'queries': val_queries,
    'corpus': val_corpus,
    'relevant_docs': val_relevant_docs,
}

In [12]:
with open(TRAIN_DATASET_FPATH, 'w+') as f:
    json.dump(train_dataset, f)

with open(VAL_DATASET_FPATH, 'w+') as f:
    json.dump(val_dataset, f)