In [1]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

from pydantic import BaseModel, Field
from enum import Enum, auto
from src.utils import load_model_and_tokenizer, zero_shot_base, zero_shot_instruction, few_shot_base, few_shot_instruction

from outlines import samplers
import outlines

import pandas as pd

import json

from datasets import Dataset

In [2]:
# Load the medication dictionary from the preprocessed data directory.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead of
# relying on the platform's locale-dependent default.
with open(paths.DATA_PATH_PREPROCESSED/"medication/medication_dict.json", "r", encoding="utf-8") as f:
    medication_dict = json.load(f)

# Enum of allowed medication names: one member per dictionary entry
# (member name = dictionary key, member value = dictionary value).
MedicationName = Enum("MedicationName", list(medication_dict.items()))

# Closed set of dose units a Medication may carry.
# Mixing in `str` makes every member a string equal to its value.
class MedicationUnit(str, Enum):
    mg = "mg"
    ug = "ug"
    g = "g"
    unknown = "unknown"  # presumably used when none of the units above applies — TODO confirm

# Field constraint whose pattern is: digits, optionally followed by a decimal point
# and one or two more digits.
# NOTE(review): `intake_amount` is not referenced by any model field in the visible cells.
intake_amount = Field(pattern=r"\d+(\.\d{1,2})?")

# Schema for one extracted medication entry; passed to `outlines.generate.json` further down.
class Medication(BaseModel):
    name: MedicationName  # must be a member of the MedicationName enum
    unit: MedicationUnit  # must be a member of the MedicationUnit enum
    amount: float  # presumably the dose strength expressed in `unit` — TODO confirm
    # The four fields below are presumably the intake per time of day — TODO confirm.
    morning: float
    noon: float
    evening: float
    night: float

# Container model wrapping a list of Medication entries under the key `medications`.
class MedicationList(BaseModel):
    medications: list[Medication]

# Hand-built Medication instance; constructing it runs the model's validation
# on every field.
medication1 = Medication(
    name="Avonex", unit="mg", amount=100.0,
    morning=-99, noon=0.5, evening=0, night=1,
)

In [3]:
# Load the model/tokenizer pair through the project helper, using the
# "outlines" task type and 4-bit quantization.
model, tokenizer = load_model_and_tokenizer(
    "Llama2-MedTuned-13b",
    task_type="outlines",
    quantization="4bit",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# Greedy sampler plus a JSON-constrained generator whose output follows the
# Medication schema. The sampler is bound to the generator here.
sampler = samplers.greedy()
generator = outlines.generate.json(
    model,
    Medication,
    sampler=sampler,
)

In [3]:
# Load the preprocessed dataset from disk. Despite the name `df`, this is a
# `datasets.Dataset`, not a pandas DataFrame.
df = Dataset.load_from_disk(
    paths.DATA_PATH_PREPROCESSED / "medication/kisim_medication"
)

In [4]:
# Read the prompt components. The encoding is pinned to UTF-8 because the prompt text
# contains non-ASCII characters (e.g. the typographic apostrophe in "don’t"), which a
# locale-dependent default encoding could mis-decode.
with open(paths.DATA_PATH_PREPROCESSED/"medication/task_instruction.txt", "r", encoding="utf-8") as f:
    task_instruction = f.read()

with open(paths.DATA_PATH_PREPROCESSED/"medication/system_prompt.txt", "r", encoding="utf-8") as f:
    system_prompt = f.read()

# Outlines encodes prompts with the tokenizer's default settings, so special tokens are
# added automatically and need not be written into the prompt text.


In [18]:
from typing import Callable, Union
from outlines.generate import SequenceGenerator
from outlines.samplers import Sampler
from pydantic import BaseModel

In [23]:
def format_prompt(text: list[str], format_fun: Callable[..., str], *args, **kwargs) -> list[str]:
    """
    Formats a list of texts using a given formatting function. Used for formatting text with a prompt template.

    Args:
        text (list[str]): list of strings to be formatted
        format_fun (Callable[..., str]): formatting function. It is called once per element as
            ``format_fun(t, *args, **kwargs)`` and must return the formatted string.
        *args: additional positional arguments forwarded to ``format_fun`` after the text.
        **kwargs: additional keyword arguments forwarded to ``format_fun``.

    Returns:
        list[str]: list of formatted strings, in the same order as ``text``
    """
    # Forward BOTH positional and keyword extras; previously *args was accepted but dropped.
    return [format_fun(t, *args, **kwargs) for t in text]

def outlines_prompting(text: list[str], generator: SequenceGenerator, sampler: Sampler, batch_size: int = 4)-> list[Union[str, BaseModel]]:
    """
    Generates a list of sequences by feeding the prompts to an outlines generator in batches.

    Args:
        text (list[str]): list of strings to be used as prompts
        generator (outlines.SequenceGenerator): outlines generator; called once per batch with a
            list of prompts and expected to return one result per prompt
        sampler (outlines.Sampler): not used by this function (the sampler is bound when the
            generator is constructed); kept so existing callers keep working
        batch_size (int, optional): number of prompts per generator call. Defaults to 4.

    Returns:
        list[Union[str, pydantic.BaseModel]]: generated sequences, in the same order as ``text``

    Raises:
        ValueError: if ``batch_size`` is not a positive integer
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # The progress bar is cosmetic: import it lazily so the function does not depend on a
    # name leaked from another cell, and fall back to plain iteration if tqdm is missing.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_):
            return iterable

    results = []

    # Plain slicing yields the same ordered, unshuffled batches of prompts as a DataLoader
    # over a list of strings would, without needing torch in scope.
    for start in tqdm(range(0, len(text), batch_size)):
        batch = list(text[start:start + batch_size])
        results.extend(generator(batch))

    return results


In [30]:
# Preview the few-shot prompts for the first five texts.
# NOTE: `examples` is assigned in a cell further down (In [20]); that cell has to be
# run before this one, so this cell fails on a fresh top-to-bottom run.
format_prompt(
    df["text"][:5],
    few_shot_instruction,
    system_prompt=system_prompt,
    task_instruction=task_instruction,
    examples=examples,
)

['[INST]<<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.\nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.\nPlease ensure that your responses are socially unbiased and positive in nature.\nIf a question does not makeany sense, or is not factually coherent, explain why instead of answering something not correct. \nIf you don’t know the answer to a question, please don’t share false information.\n<</SYS>>\n\n### Instruction:\nYour task is to extract specific information from medication descriptions. \nThe input for this task is a medication description, and the output should be the medication name, the dose, the dose unit, and the amount of intake doses over the day in JSON format.\nThe JSON format should look like this:\n{name: MedicationName, dose: MedicationDose, unit: MedicationDoseUnit, morning: MorningDose, noon: NoonDose, evening: EveningDose, night: Nig

In [20]:
# Single few-shot example: the first dataset text paired with a hand-written label string.
# NOTE(review): the label uses unquoted keys, so it is not strictly valid JSON — confirm
# this is the intended demonstration format.
examples = [
    {
        "text": df["text"][0],
        "labels": '{name: "Medrol", unit: "mg", amount: 32, morning: 1, noon: 0, evening: 0, night: 0}',
    }
]
# Render the complete few-shot prompt for that same text to inspect the template.
print(few_shot_instruction(df["text"][0], system_prompt, task_instruction, examples))

[INST]<<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not makeany sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don’t know the answer to a question, please don’t share false information.
<</SYS>>

### Instruction:
Your task is to extract specific information from medication descriptions. 
The input for this task is a medication description, and the output should be the medication name, the dose, the dose unit, and the amount of intake doses over the day in JSON format.
The JSON format should look like this:
{name: MedicationName, dose: MedicationDose, unit: MedicationDoseUnit, morning: MorningDose, noon: NoonDose, evening: EveningDose, night: NightDose}
- The