In [None]:
# ==============================
# STEP A — Install libraries
# ==============================
!pip install transformers torch accelerate

# ==============================
# STEP B — Load libraries
# ==============================
import json
from transformers import pipeline
from tqdm import tqdm

# ==============================
# STEP C — Load your dataset
# ==============================
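# The JSON file is expected to be a list of records shaped like:
# [
#   {"instruction": "Patient presents with cough and fever...", "output": "pneumonia"},
#   {"instruction": "No chest pain. Complains of shortness of breath.", "output": "asthma"}
# ]
# The code below reads the note text from NOTE_FIELD ("instruction") and copies
# the "output" field through unchanged.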

# Option 1: upload interactively
# from google.colab import files
# uploaded = files.upload()  # choose your file from local machine
# input_filename = list(uploaded.keys())[0]
# print("Uploaded:", input_filename)

# Option 2 (alternate): mount Drive and point to file
from google.colab import drive
drive.mount('/content/drive')
input_filename = "/content/drive/MyDrive/Datasets/LLMs/MIMIC/mimic_llm_training_data.json"
print("Input file from Drive:", input_filename)

# Single source of truth for the dataset path: whichever option above set
# `input_filename` (upload or Drive) is the file that actually gets loaded.
# Previously the path was hard-coded a second time here, so switching to the
# upload option would still have silently read the Drive file.
DATA_FILE = input_filename
NOTE_FIELD = "instruction"             # field that contains the clinical note
NEW_OUTFILE = "mimic_with_extracted_symptoms.json" # New output file name

with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Later cells slice and iterate `data` as a list of dict records; fail here
# with a clear message instead of an obscure error further down.
if not isinstance(data, list):
    raise TypeError(
        f"Expected a JSON list of records in {DATA_FILE}, got {type(data).__name__}"
    )

print("Loaded", len(data), "records")

# ==============================
# STEP D — Load Biomedical NER model
# ==============================
# This model recognizes symptoms, signs, diseases, etc.
# `stride` asks the pipeline to run over the whole text in overlapping chunks
# (128 tokens of overlap) instead of a single window. Clinical notes are
# usually longer than the model's maximum input length, so entities late in a
# note would otherwise be missed.
# NOTE(review): relies on the documented `stride` option of the token
# classification pipeline (needs a fast tokenizer and an aggregation strategy
# other than "none") — confirm against the installed transformers version.
ner_pipeline = pipeline(
    "ner",
    model="d4data/biomedical-ner-all",
    aggregation_strategy="simple",
    stride=128,
)

# ==============================
# STEP E — Define extraction function
# ==============================
def extract_symptoms(text):
    """Return the distinct symptom/sign mentions the NER pipeline finds in ``text``.

    Args:
        text: Clinical note text. ``None``, empty, or whitespace-only input
            returns ``[]`` without invoking the model.

    Returns:
        list[str]: The ``word`` of every entity whose ``entity_group`` contains
        "symptom" or "sign" (case-insensitive), de-duplicated and kept in the
        order the pipeline returned them.
    """
    if not text or not text.strip():
        return []

    mentions = []
    for ent in ner_pipeline(text):
        group = ent["entity_group"].lower()
        if "symptom" in group or "sign" in group:
            mentions.append(ent["word"])

    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) produced an order that changes between interpreter runs
    # (string hash randomization), so the saved JSON was not reproducible.
    return list(dict.fromkeys(mentions))

# ==============================
# STEP F — Run extraction and prepare new data
# ==============================
# Cap on how many records are processed (keeps CPU-only trial runs short).
# Set to None to process the full dataset.
MAX_RECORDS = 10

new_data = []

for record in tqdm(data[:MAX_RECORDS], desc="Processing"):
    note_text = record.get(NOTE_FIELD, "")
    # Skip the model call entirely for records with no note text.
    extracted = extract_symptoms(note_text) if note_text else []
    new_data.append({
        "instruction": record.get("instruction", ""),
        "input": extracted,  # extracted symptoms go in the 'input' field of the new record
        "output": record.get("output", ""),
    })


# ==============================
# STEP G — Save results to new file
# ==============================
# Write the processed records as pretty-printed UTF-8 JSON (non-ASCII
# characters are written as-is rather than escaped).
with open(NEW_OUTFILE, mode="w", encoding="utf-8") as out_fh:
    out_fh.write(json.dumps(new_data, ensure_ascii=False, indent=4))

print("✅ Wrote new data to", NEW_OUTFILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Input file from Drive: /content/drive/MyDrive/Datasets/LLMs/MIMIC/mimic_llm_training_data.json
Loaded 13063 records


Device set to use cpu
Processing: 100%|██████████| 10/10 [00:08<00:00,  1.19it/s]

✅ Wrote new data to mimic_with_extracted_symptoms.json





In [None]:
import json
# Show a small, clipped preview instead of dumping every record: the notes are
# long clinical text, and printing them in full bloats the notebook and leaves
# patient narrative stored in the saved output.
PREVIEW_RECORDS = 2    # how many records to show
PREVIEW_CHARS = 200    # max characters shown per string field


def _clip(value):
    """Shorten long strings for display; other values pass through unchanged."""
    if isinstance(value, str) and len(value) > PREVIEW_CHARS:
        return value[:PREVIEW_CHARS] + "..."
    return value


preview = [
    {key: _clip(value) for key, value in rec.items()}
    for rec in new_data[:PREVIEW_RECORDS]
]
print(json.dumps(preview, ensure_ascii=False, indent=4))
print(f"(showing {len(preview)} of {len(new_data)} records)")

[
    {
        "instruction": "     \n\n     Sex:  M\n\nService:  Medicine\n\nCHIEF COMPLAINT:  Admitted from rehabilitation for\nhypotension (systolic blood pressure to the 70s) and\ndecreased urine output.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 76-year-old\nmale who had been hospitalized at the  from  through  of \nafter undergoing a left femoral-AT bypass graft and was\nsubsequently discharged to a rehabilitation facility.\n\nOn , he presented again to the  after being found to have a systolic\nblood pressure in the 70s and no urine output for 17 hours.\nA Foley catheter placed at the rehabilitation facility\nyielded 100 cc of murky/brown urine.  There may also have\nbeen purulent discharge at the penile meatus at this time.\n\nOn presentation to the Emergency Department, the patient was\nwithout subjective complaints.  In the Emergency Department,\nhe was found to have systolic blood pressure of 85.  He was\ngiven 6 liters of intravenous fluids and transiently started\no