In [None]:
import pandas as pd

import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

from datasets import Dataset

import json

In [None]:
# Load the medication export and reduce it to usable free-text entries:
# rows without a medication name are dropped, the two relevant columns are
# renamed to "text" / "rid", and empty-string texts are filtered out.
kisim_medication = (
    pd.read_csv(paths.DATA_PATH_SEANTIS/"kisim_medication.csv")
    .dropna(subset=["medication_name"])
    .loc[:, ["medication_name", "research_id"]]
    .rename(columns={"medication_name": "text", "research_id": "rid"})
    .loc[lambda frame: frame["text"] != ""]
)

# Draw a reproducible random sample of 100 entries (fixed seed)
kisim_medication_sample = kisim_medication.sample(n=100, random_state=42)

# Print every sampled text followed by a separator line for manual inspection
for sampled_text in kisim_medication_sample["text"]:
    print(sampled_text, 50*"-", sep="\n")

Some entries do not contain a medication at all but medical aids such as "Bauchgurt" or "Blutdruckmessgerät".
Other entries name a medication but lack a complete intake schedule, e.g. "Sirdalud MR (Ret Kaps 6 mg)".

In [None]:
# Wrap the sampled "text" and "rid" columns in a `datasets.Dataset`
sample_columns = ("text", "rid")
kisim_sample = Dataset.from_dict(
    {column: kisim_medication_sample[column] for column in sample_columns}
)

In [None]:
# Persist the sample twice: as a CSV file and as a `datasets.Dataset` directory
preprocessed_root = paths.DATA_PATH_PREPROCESSED
os.makedirs(preprocessed_root/"medication", exist_ok=True)
kisim_medication_sample.to_csv(
    preprocessed_root/"medication/kisim_medication_sample.csv",
    index=False,
)
kisim_sample.save_to_disk(preprocessed_root/"medication/kisim_medication_sample")

## Prompting with Outlines

The following part creates the files for the prompting task with Outlines: the task instruction, the system prompt, and the few-shot examples.

In [None]:
# Inspect the different medication formats: for each sampled entry print a
# separator line, its position in the dataset, and the raw text.
separator = 5*"---"
for position, medication_text in enumerate(kisim_sample["text"]):
    print(separator, position, medication_text, sep="\n")

In [None]:
# Prompt: task instruction for the medication extraction task.
# The text describes the expected output (one dictionary per medication with the
# keys name, dose, dose_unit, morning, noon, evening, night, extra) and the
# sentinel values to use when a field is not detected ("unknown" for strings,
# -99 for numbers). It is written to task_instruction.txt later in this cell.
# NOTE: this string is sent to the model verbatim, so any edit changes the prompt.
task_instruction ="""Your task is to extract specific information from medication descriptions. 
The input for this task is a list of medication descriptions, a report or doctors recipe, and the output should be a complete list of dictionaries (one per medication) with the following keys:
- name (str): The name of the medication.
- dose (float): The dose of the medication.
- dose_unit (str): The unit of the dose.
- morning (float): The dose to be taken in the morning.
- noon (float): The dose to be taken at noon.
- evening (float): The dose to be taken in the evening.
- night (float): The dose to be taken at night.
- extra (str): Any additional information about the medication, especially its intake.

The output format should look like this:
[
    {name: "MedicationName1", dose: MedicationDose1, dose_unit: "MedicationDoseUnit1", morning: MorningDose1, noon: NoonDose1, evening: EveningDose1, night: NightDose1, extra: "ExtraInfo1"},
    {name: "MedicationName2", dose: MedicationDose2, dose_unit: "MedicationDoseUnit2", morning: MorningDose2, noon: NoonDose2, evening: EveningDose2, night: NightDose2, extra: "ExtraInfo2"},
    ...
]

- The MedicationName can consist of multiple words with whitespace and should be returned as a single string. If you don't find any mentions of medications or drugs, it should be represented as "unknown".
- The MedicationDose is a float value. If the MedicationDose is not detected, it should be represented as -99.
- The MedicationDoseUnit can have various formats (e.g. mg, ug, g, stk, ml, tropfen, IE/ml, mmol, unknown, etc.). If the MedicationDoseUnit is not detected, it should be represented as "unknown".
- The intake doses over the day can be given several ways:
    - If the amount of doses is given in the form of float-float-float, it corresponds to MorningDose-NoonDose-EveningDose with NightDose being 0.
    - If the amount of doses is given in the form float-float-float-float, it corresponds to MorningDose-NoonDose-EveningDose-NightDose.
    - If keywords like "Morgen", "Mittag", "Abend", "Nacht" are used, the corresponding dose should be extracted.
    - If an intake schema like the ones above is not detected, MorningDose, NoonDose, EveningDose and NightDose should all be represented as -99.
- The extra field can contain any additional information about the medication, like its intake (e.g. daily, for 2 weeks). This field can be empty if no additional information is found. 
"""

# System prompt sent alongside the task instruction; it is written to
# system_prompt.txt later in this cell.
# Fixed typo: "makeany" -> "make any".
system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don’t know the answer to a question, please don’t share false information.
"""

# Write both prompt texts into the preprocessed "medication" directory.
# The encoding is set explicitly: the system prompt contains non-ASCII
# characters (typographic apostrophes), and without it open() falls back to the
# platform's locale encoding, so the file bytes would differ between machines.
for relative_path, prompt_text in (
    ("medication/task_instruction.txt", task_instruction),
    ("medication/system_prompt.txt", system_prompt),
):
    with open(paths.DATA_PATH_PREPROCESSED/relative_path, "w", encoding="utf-8") as file:
        file.write(prompt_text)

In [None]:
# Few-shot examples based on the text formats seen in the sample.
# Each entry pairs an input "text" with the expected "labels" string, written in
# the output format described by the task instruction (-99 / "unknown" mark
# fields that cannot be detected).
examples = [
    {
        # Fixed: the text now reads "für 4 Tage" so it matches the label's extra field.
        "text": "1 OP Cipralex 20 mg 0.5-0-0 für 4 Tage\n 3 OP Propranolol 35g 3-0-1-0",
        # Fixed: "3-0-1-0" is morning-noon-evening-night, so the night dose is 0 (was 1).
        "labels": '[{name: "Cipralex", dose: 20, dose_unit: "mg", morning: 0.5, noon: 0, evening: 0, night: 0, extra: "für 4 Tage"}, {name: "Propranolol", dose: 35, dose_unit: "g", morning: 3, noon: 0, evening: 1, night: 0, extra: ""}]',
    },
    {
        "text": "Blutdruckmessgerät über einen Zeitraum von 3 Wochen",
        "labels": '[{name: "Blutdruckmessgerät", dose: 1, dose_unit: "unknown", morning: -99, noon: -99, evening: -99, night: -99, extra: "3 Wochen"}]',
    },
    {
        "text": "Rezept für Propranolol, einmal täglich, 0.5mg",
        "labels": '[{name: "Propranolol", dose: 0.5, dose_unit: "mg", morning: -99, noon: -99, evening: -99, night: -99, extra: "einmal täglich"}]',
    },
    {
        "text": "Gültigkeit bis Sommer 2024",
        "labels": '[{name: "unknown", dose: -99, dose_unit: "unknown", morning: -99, noon: -99, evening: -99, night: -99, extra: "bis Sommer 2024"}]',
    },
    {
        "text": "1 Tablette Aspirin 100 mg täglich nach dem Frühstück",
        "labels": '[{name: "Aspirin", dose: 100, dose_unit: "mg", morning: 1, noon: 0, evening: 0, night: 0, extra: "täglich nach dem Frühstück"}]',
    },
    {
        # NOTE(review): the extra field says "2x2" while the text says "2x" - confirm this is intended.
        "text": "2 Sprühstöße Symbicort 2x täglich morgens und abends",
        "labels": '[{name: "Symbicort",  dose: 2, dose_unit: "Sprühstöße", morning: 1, noon: 0, evening: 1, night: 0, extra: "2x2 täglich morgens und abends"}]',
    },
    {
        "text": "Ibuprofen 400mg zur Nacht",
        "labels": '[{name: "Ibuprofen", dose: 400, dose_unit: "mg", morning: 0, noon: 0, evening: 0, night: 1, extra: "zur Nacht"}]',
    },
    {
        "text": "Levocetirizin 5 mg 1-0-1",
        "labels": '[{name: "Levocetirizin", dose: 5, dose_unit: "mg", morning: 1, noon: 0, evening: 1, night: 0, extra: ""}]',
    },
    {
        "text": "1 Tablette Pantoprazol 40 mg morgens",
        "labels": '[{name: "Pantoprazol", dose: 40, dose_unit: "mg", morning: 1, noon: 0, evening: 0, night: 0, extra: "morgens"}]',
    },
    {
        "text": "Stützstrümpfe beidseitig nach Bedarf",
        "labels": '[{name: "Stützstrümpfe", dose: 2, dose_unit: "unknown", morning: -99, noon: -99, evening: -99, night: -99, extra: "nach Bedarf"}]',
    },
]
           
# Serialise the few-shot examples as indented JSON
with open(paths.DATA_PATH_PREPROCESSED/"medication/examples.json", "w") as examples_file:
    examples_file.write(json.dumps(examples, indent=4))