In [1]:
from transformers import pipeline, AutoTokenizer
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import os
from utils import *
config = load_config()


# os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"

LLAMA_PATH = "/home/htran/generation/biomed_instruct/models/llama_7b_lora/checkpoint-970"
LLAMA2_PATH = "/home/htran/generation/biomed_instruct/models/llama_2_7b_all_instructions/checkpoint-975"


# pipe = pipeline("text-generation", LLAMA2_PATH, max_new_tokens=128, device_map="auto")
# hf = HuggingFacePipeline(pipeline=pipe)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
template = config.template('instructiontune')
instructions = config.template('instructions')

print("there a ~ e styles of instructions.\nThe instruction tells the model to extract important terms.\n")
print(instructions)

there a ~ e styles of instructions.
The instruction tells the model to extract important terms.

{'a': 'Extract and list the main health concerns or conditions mentioned in the medical note, making sure the terms are patient-friendly and easily understandable.', 'b': "Provide a clear summary of any treatment plans or medications prescribed in the note, including their purposes and how they should be administered, in layman's terms.", 'c': 'Identify and explain any recommended follow-up actions or appointments, ensuring that the instructions are straightforward and actionable for the patient.', 'd': "Interpret and summarize any test results or diagnostics from the note, using simple language to convey what these results mean for the patient's health.", 'e': 'Extract any preventative measures or lifestyle recommendations given in the medical note, presenting them in an easy-to-follow format for the patient.', 'hieu': "Translate the medical jargon in the provided sentence into layman's te

## ================================================== Load dataset
- load dataset : filtered notes, annotation info table


In [3]:
import pandas as pd

merged_notes = pd.read_pickle("../data/processed/mergedData.pkl")

In [4]:
# lets build a dataset 
from datasets import Dataset

dataset = Dataset.from_pandas(merged_notes)
dataset


Dataset({
    features: ['category', 'noteid', 'text', 'Phrase'],
    num_rows: 106
})

## ============================================ Zero Shot Testing

In [5]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import torch.nn as nn

# first test llama & llama2
# model = pipeline("text-generation", LLAMA2_PATH, max_new_tokens=128, device_map="auto")
# os.environ['CUDA_VISIBLE_DEVICES'] = "2,3"
# %env CUDA_DEVICE_ORDER=PCI_BUS_ID
# %env CUDA_VISIBLE_DEVICES =0,2,3

In [6]:

DEFAULT_PAD_TOKEN = "<pad>"
DEFAULT_EOS_TOKEN= "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
max_target_length = 4096

# memory_mapping = {1:15, 2:15}

model = AutoModelForCausalLM.from_pretrained(LLAMA2_PATH)

model = PeftModel.from_pretrained(model, LLAMA2_PATH)

tokenizer = AutoTokenizer.from_pretrained(LLAMA2_PATH, cache_dir=None)
tokenizer.add_special_tokens({
    'eos_token' : DEFAULT_EOS_TOKEN,
    'bos_token' : DEFAULT_BOS_TOKEN,
    'unk_token' : DEFAULT_UNK_TOKEN,
})

tokenizer.pad_token = tokenizer.eos_token

config = load_config()
instruction = config.template('instructions')
i1 = instruction['a']
i2 = instruction['hieu']

zeroshot_template = config.template('zeroshot')

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]




In [7]:
zeroshot_template

'Instruction :  {instruction}\n\nContext :  {context}\n\nResponse : \n'

In [8]:
def process_texts(samples) :

    texts = samples['text']
    formated_texts = []
    for text in texts :
        new_text = zeroshot_template.format(
            instruction = i1,
            context = text
        )
        formated_texts.append(new_text)
    
    return {"questions" : formated_texts}

processed_dataset = dataset.map(process_texts, batched=True)

Map: 100%|██████████| 106/106 [00:00<00:00, 6741.00 examples/s]


In [9]:
list(map(lambda x : len(x), processed_dataset['questions']))

[6105,
 3567,
 5743,
 5241,
 4356,
 6170,
 5763,
 5214,
 5118,
 5201,
 5866,
 3698,
 3478,
 6461,
 5290,
 5328,
 3023,
 3747,
 5556,
 6350,
 4960,
 5119,
 4739,
 4082,
 5692,
 3745,
 5049,
 5198,
 4119,
 4513,
 5588,
 5942,
 5884,
 5799,
 4067,
 6559,
 5361,
 5385,
 4751,
 3153,
 5758,
 5704,
 5528,
 5876,
 6235,
 5986,
 4488,
 5851,
 3576,
 3766,
 6560,
 4365,
 6457,
 3779,
 6107,
 5689,
 6469,
 6049,
 6163,
 4426,
 5019,
 5141,
 4650,
 4704,
 6037,
 5948,
 5583,
 5749,
 4324,
 5700,
 5721,
 5744,
 5251,
 5012,
 5688,
 5981,
 5958,
 5249,
 5158,
 5983,
 5356,
 5584,
 5800,
 6278,
 6746,
 5843,
 5212,
 5749,
 5420,
 5508,
 6144,
 5735,
 4082,
 4494,
 5709,
 3839,
 4170,
 5951,
 5306,
 3873,
 6292,
 6309,
 6133,
 4472,
 5903,
 6254]

In [10]:
device = torch.device(0)

In [11]:
# tokenize the texts
tokenized_texts = tokenizer(processed_dataset['questions'], 
                            return_tensors='pt', 
                            max_length=4000,
                            truncation='only_first',
                            padding=True).to(device)

model.cuda(0)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Line

In [12]:
import torch
with torch.no_grad() :
    output = model.generate(**tokenized_texts, 
                            max_new_tokens = 300)




OutOfMemoryError: CUDA out of memory. Tried to allocate 2.27 GiB. GPU 0 has a total capacty of 31.73 GiB of which 1.54 GiB is free. Including non-PyTorch memory, this process has 30.19 GiB memory in use. Of the allocated memory 29.38 GiB is allocated by PyTorch, and 9.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [62]:
# now decode the outputs
start_of_generate_index = tokenized_texts.input_ids.shape[1]
pred_output = tokenizer.batch_decode(output[:, start_of_generate_index:], skip_special_tokens=True)

In [63]:
pred_output

['1. generalized OA 2. D',
 'Љ1. Diabetes mellitus',
 'Љ Status post resection on T3 N2',
 'Љ4-L5 osteodiscitis',
 '. Erosive seropositive rhe',
 'Љ Hand weakness \nЉ Frequent falls',
 '. erosive rheumatoid ar',
 'Љаптоманија, пеп',
 'Љ Alcoholic cirrhosis ',
 'ЉРђРЅР°Р№',
 'Љ1. Osteoarthritis ',
 'ЉћЂЉћЂЉћЂЉ',
 '\nContext :  F/u on CAD',
 'Љиврплт: \n',
 '. Depression \n. Ankle pain',
 'Љ Coronary artery disease, status post',
 'Љ Diabetes: Glyburide ',
 'Љајнт: 74-',
 'Љ Coronary artery disease \nЉ',
 'Љ Methotrexate/sul',
 'Љ Hypertension \nЉ Alzheimer',
 '. \n. \n. \n.',
 'Љ Idiopathic pulmonary hemos',
 '. \n. \n. \n.',
 'Љ Hypertension, complicated with possible neph',
 '1. Lymphoma 2.',
 '. malignant large B cell diffuse l',
 'Љ\n',
 'Љ Methotrexate 17',
 '1. Rheumatoid arthrit',
 'Љивни болест\n\nContext',
 'Љ Graft: Well engrafted, off',
 'Љ78-year-old male with a',
 'Љејѕѕеd ',
 'Љамфом, степен III,',
 'ЉРђРЅР°Рј',
 'Љ Hypertension \nЉ Atrial f',
 '\\\\n1. Diabetes Mellit',
 

## ============================================ Few Shot Tuning

In [64]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn

# first test llama & llama2
# model = pipeline("text-generation", LLAMA2_PATH, max_new_tokens=128, device_map="auto")
# os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

DEFAULT_PAD_TOKEN = "<pad>"
DEFAULT_EOS_TOKEN= "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
max_target_length = 4096

model = AutoModelForCausalLM.from_pretrained(LLAMA2_PATH, cache_dir=None)

tokenizer = AutoTokenizer.from_pretrained(LLAMA2_PATH, cache_dir=None)
tokenizer.add_special_tokens({
    'eos_token' : DEFAULT_EOS_TOKEN,
    'bos_token' : DEFAULT_BOS_TOKEN,
    'unk_token' : DEFAULT_UNK_TOKEN,
})

tokenizer.pad_token = tokenizer.eos_token

config = load_config()
instruction = config.template('instructions')
i1 = instruction['a']
i2 = instruction['hieu']

fewshot_template = config.template('fewshot')

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.55s/it]


In [65]:
fewshot_template

"Instruction :  {instruction}\n\nContext :  F/u on Osteoarthritis, chronic pain, HTN, Depression  The pt has generalized OA, in his shoulders, ankle and knees, the pain is about XXX in severity, worse with cold weather and activity, was on narocitcs in the past, then he stopped and went to ETOH for about 1.5 yrs, he went through detox, has not drank in the last month. He is currnetly on naproxen with no effect, was never tried on neuromodulators.  The pt also has HTN with hx of Afib that was single episode during an episode of pancreatitis, No CP, No SOB, No PNDs, No Orthopnea, No Edema, No Dizziness/Syncope, No Palpitations.  The pt also has depression, on wellbutrin, he's been on it for a couple of yrs with good results, no hx of suicidal or homocidal thoughts, No Depression, No Anxiety, Positive Sleep disturbance, No Sexual dysfunction.  The pt also has hx of Hep C, diagnosed in XXX, was treated in XXX, and since then he's been doing very well, he also had recurrent episodes of panc

In [66]:
def process_texts(samples) :

    example = config.template("fewshot_example")
    texts = samples['text']
    formated_texts = []
    for text in texts :
        contexts = example.format(instruction = i1)
        new_text = fewshot_template.format(
            instruction = i1,
            context = text
        )
        formated_texts.append(new_text)
    
    return {"contexts": [contexts]*len(formated_texts), "questions" : formated_texts}


processed_dataset = dataset.map(process_texts, batched=True)

Map: 100%|██████████| 51/51 [00:00<00:00, 1724.91 examples/s]


In [67]:
list(map(lambda x : len(x), processed_dataset['questions']))

[7077,
 8747,
 7943,
 8034,
 7257,
 8439,
 6584,
 8767,
 7779,
 7925,
 8974,
 7803,
 7841,
 8675,
 5536,
 8069,
 7211,
 7999,
 8768,
 6746,
 8283,
 8437,
 6167,
 6483,
 8435,
 6089,
 6279,
 8970,
 8620,
 8982,
 6939,
 7532,
 7654,
 7217,
 8550,
 7103,
 8201,
 7762,
 7671,
 8496,
 8313,
 5539,
 7086,
 7725,
 7007,
 7261,
 7819,
 8822,
 8416,
 7777,
 8767]

In [69]:
# tokenize the texts
tokenized_texts = tokenizer(processed_dataset['contexts'], 
                            processed_dataset['questions'], 
                            return_tensors='pt', 
                            padding=True,
                            max_length=4000,
                            truncation='only_first').to('cpu')

In [70]:
import torch
with torch.no_grad() :
    output = model.generate(**tokenized_texts, max_new_tokens = 500)

In [None]:
# now decode the outputs
start_of_generate_index = tokenized_texts.input_ids.shape[1]
pred_output = tokenizer.batch_decode(output[:, start_of_generate_index:], skip_special_tokens=True)

In [None]:
pred_output

['\nInstruction :  Extract and list the',
 '\nInstruction : \nExtract and list',
 '. \n. \n. \n.',
 '. L4-L5 osteodisc',
 '. Erosive seropositive rhe',
 '. "Instruction": "Extract and list',
 '. "Instruction": "Extract and list',
 '[{"type": "question", "question":',
 '. "Alcoholic cirrhosis',
 '. "Patient-friendly and easily understand',
 '. \n. \n. \n.',
 '. "Instruction": "Extract and list',
 '\nInstruction :  Extract and list the',
 '. "Patient": "Mr. name",',
 '. Esophagitis, presented with GI',
 '. \n. \n. \n.',
 '.Diabetes: I strongly emphasized on',
 '. "Patient_ID": "12',
 '. Coronary artery disease: Overall',
 '. \n. \n. \n.',
 'at 2021-02-',
 '. "Patient_Name": "Ms',
 ', "Idiopathic pulmonary hem',
 '. Extract and list the main health concerns or',
 '. Hypertension, complicated with possible neph',
 '\nInstruction :  Extract and list the',
 '. Left back and left-sided pain\n',
 '.000000000',
 '. \n. \n. \n.',
 '{"status":"success","data":{"id":1',
 '. \n. \n. \n.',
 '. "Graf