# Medication Extraction

This notebook is intended as an example of extracting medications and their attributes from text.
You can specify your own dataset (it must contain a `text` column), sampler, schema, model, prompting strategy, and templates, or use the default ones.

In [56]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

import os

import torch

from src import paths
from src.utils import (load_model_and_tokenizer,   
                       check_gpu_memory,
                       get_format_fun,
                        information_retrieval, 
                        get_sampler,
                        get_outlines_generator,
                        get_pydantic_schema,
                        format_prompt,
                        outlines_medication_prompting,
)

import argparse

from datasets import Dataset

import json

import random

In [23]:
# --- Model loading ---
MODEL_NAME = "Llama2-MedTuned-13b"
QUANTIZATION = "4bit"
# Only keep "flash_attention_2" if your GPU is able to use it,
# check https://github.com/Dao-AILab/flash-attention
ATTN_IMPLEMENTATION = "flash_attention_2"

# --- Prompting ---
PROMPT_STRATEGY = "few_shot_instruction"
NUM_EXAMPLES = 10  # how many examples are drawn at random from the examples file

# --- Generation ---
SAMPLER_NAME = "greedy"
MAX_TOKENS = 500  # forwarded as max_tokens to the generator

In [2]:
# Check GPU memory: for a 13B model you should have at least 16 GB of VRAM
# (12 GB could work for short examples).
check_gpu_memory()

GPU 0: NVIDIA GeForce RTX 3090
   Total Memory: 23.69 GB
   Free Memory: 22.67 GB
   Allocated Memory : 0.00 GB
   Reserved Memory : 0.00 GB


In [7]:
# Load the model and tokenizer using the settings from the configuration cell,
# then report GPU memory again to see how much is in use after loading.
model, tokenizer = load_model_and_tokenizer(
    model_name=MODEL_NAME,
    task_type="outlines",
    quantization=QUANTIZATION,
    attn_implementation=ATTN_IMPLEMENTATION,
)
check_gpu_memory()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU 0: NVIDIA GeForce RTX 3090
   Total Memory: 23.69 GB
   Free Memory: 15.55 GB
   Allocated Memory : 6.90 GB
   Reserved Memory : 7.07 GB


In [25]:
# Output schema. You can implement your own; it must follow pydantic BaseModel
# (or check https://github.com/outlines-dev/outlines).
schema = get_pydantic_schema(schema_name="medication")

# Example of what this looks like.
# get_default_pydantic_model is not imported by the import cell, so this line
# raised a NameError after Restart Kernel -> Run All; import it before use.
# NOTE(review): assumed to live in src.utils next to get_pydantic_schema — confirm.
from src.utils import get_default_pydantic_model

get_default_pydantic_model("medication")

MedicationList(medications=[Medication(name='unknown', dose=-99.0, dose_unit='unknown', morning=-99.0, noon=-99.0, evening=-99.0, night=-99.0, extra='')])

In [26]:
# Sampler used by the generator (could also be "multinomial" or "beam")
sampler = get_sampler(SAMPLER_NAME)

# Generator for a task with JSON output according to the schema
generator = get_outlines_generator(
    model,
    sampler,
    task="json",
    schema=schema,
)

In [61]:
# Load your data (the dataset must provide a "text" column)
df = Dataset.load_from_disk(paths.DATA_PATH_PREPROCESSED/"medication/kisim_medication_sample")

# Use a single record as input: index 1, i.e. the second row of the dataset.
# Selecting the row first reads only that record instead of pulling the whole
# "text" column just to take one value from it.
text = df[1]["text"]

In [79]:
# Write your own task instruction, system prompt and examples, or load the default ones.
# The files are opened explicitly as UTF-8 so that non-ASCII characters in the
# prompts and examples are decoded the same way regardless of the platform default.
medication_dir = paths.DATA_PATH_PREPROCESSED/"medication"

with open(medication_dir/"task_instruction.txt", "r", encoding="utf-8") as f:
    task_instruction = f.read()

with open(medication_dir/"system_prompt.txt", "r", encoding="utf-8") as f:
    system_prompt = f.read()

with open(medication_dir/"examples.json", "r", encoding="utf-8") as file:
    examples = json.load(file)

# Can also select a subset of examples at random (fixed seed for reproducibility).
# The sample size is capped at the number of available examples, because
# random.sample raises ValueError when asked for more items than exist; in that
# case all examples are used, in shuffled order.
# NOTE(review): assumes examples.json holds a list — confirm.
random.seed(42)
examples = random.sample(examples, min(NUM_EXAMPLES, len(examples)))

In [62]:
# Pick the prompting template for the configured strategy and format the input with it.
# Note: the name `input` shadows Python's builtin input(); it is kept because the
# following cells refer to it.
format_fun = get_format_fun(prompting_strategy=PROMPT_STRATEGY)
input = format_prompt(
    text=text,
    format_fun=format_fun,
    task_instruction=task_instruction,
    system_prompt=system_prompt,
    examples=examples,
)

# If you don't want to use any of the templates and formatting you can also use the tokenizer to prepare an input:
# input = tokenizer(["This is a test"], return_tensors = "pt")

In [63]:
# Display the formatted prompt that is passed to the generator below
input

['[INST]<<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.\nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.\nPlease ensure that your responses are socially unbiased and positive in nature.\nIf a question does not makeany sense, or is not factually coherent, explain why instead of answering something not correct. \nIf you don’t know the answer to a question, please don’t share false information.\n<</SYS>>\n\n### Instruction:\nYour task is to extract specific information from medication descriptions. \nThe input for this task is a list of medication descriptions, a report or doctors recipe, and the output should be a complete list of dictionaries (one per medication) with the following keys:\n- name (str): The name of the medication.\n- dose (float): The dose of the medication.\n- dose_unit (str): The unit of the dose.\n- morning (float): The dose to be taken in

In [None]:
# Get results (note: you can also use DataLoaders to submit inputs in batches)
result = generator(input, max_tokens = MAX_TOKENS)

In [64]:
# Alternatively you can stream the output and observe the progress:
# each piece is printed as soon as it is produced, without a trailing newline.
stream = generator.stream(input, max_tokens=MAX_TOKENS)
for piece in stream:
    print(piece, end="")

{
"medications": [
{
"name": "Floxal AT",
"dose": 4,
"dose_unit": "tablet",
"morning": 0,
"noon": 0,
"evening": 0,
"night": 1,
"extra": "5 Tage"
},
{
"name": "Vitamine A AS",
"dose": 1,
"dose_unit": "tablet",
"morning": 0,
"noon": 0,
"evening": 0,
"night": 1,
"extra": "zur Nacht"
}
]
}