In [1]:
import pandas as pd

import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

from datasets import Dataset

import json

In [2]:
# Load the medication export and reduce it to what the prompting task needs:
# rows without a medication name are dropped, only the free-text column and the
# research id are kept (renamed to "text" / "rid"), and empty strings are removed.
kisim_medication = (
    pd.read_csv(paths.DATA_PATH_SEANTIS/"kisim_medication.csv")
    .dropna(subset=["medication_name"])
    .loc[:, ["medication_name", "research_id"]]
    .rename(columns={"medication_name": "text", "research_id": "rid"})
    .loc[lambda frame: frame["text"] != ""]
)

# Draw a reproducible random sample of 100 rows (fixed seed)
kisim_medication_sample = kisim_medication.sample(n=100, random_state=42)

# Print every sampled text followed by a separator line for manual inspection
for medication_text in kisim_medication_sample["text"]:
    print(medication_text)
    print("-" * 50)

Prednison 100 mg	1-0-0		24.09. - 07.10.2016
Prednison 80 mg		1-0-0		08.10. - 14.10.2016
Prednison 60 mg		1-0-0		15.10. - 21.10.2016
Prednison 40 mg		1-0-0		22.10.2016 bis auf weiteres

Pantozol 40 mg		1-0-0 		für die Dauer der Prednison-Behandlung
--------------------------------------------------
Auge rechts:
Floxal AT 4x/d für 5 Tage
Vitamine A AS zur Nacht
--------------------------------------------------
Volare Handgelenksschiene zur Nacht, bitte 1x für beide Hände

Dg.: CTS bds
--------------------------------------------------
Ebrufen 200 mg
--------------------------------------------------
Paracetamol 500 Hänseler neue Formel Tabl 20 (teilbar)
bei Bedarf


Ibuprofen Adico Filmtabl 400 mg 50 
bei Bedarf

Dauerrezept
--------------------------------------------------
Nexium MUPS-Tabl 40 mg 14 Stück Einnahme: 1-0-0
Zolpidem Winthrop Filmtabl 10mg 10 Stück (teilbar) Einnahme: 0-0-1
--------------------------------------------------
Einlagen
----------------------------------------

Some entries do not contain any medication at all but medical aids such as "Bauchgurt" or "Blutdruckmessgerät".
Other entries name a medication without a complete intake schedule, e.g. "Sirdalud MR (Ret Kaps 6 mg)".

In [3]:
# Wrap the sampled rows in a Hugging Face Dataset with the columns "text" and "rid"
sample_columns = {
    column: kisim_medication_sample[column]
    for column in ("text", "rid")
}
kisim_sample = Dataset.from_dict(sample_columns)

In [4]:
# Persist the sample twice inside the "medication" output folder:
# once as a plain CSV and once as a Hugging Face dataset directory.
medication_dir = paths.DATA_PATH_PREPROCESSED/"medication"
os.makedirs(medication_dir, exist_ok=True)

kisim_medication_sample.to_csv(medication_dir/"kisim_medication_sample.csv", index=False)
kisim_sample.save_to_disk(medication_dir/"kisim_medication_sample")

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

## Prompting with Outlines

The following part creates the files for the prompting task with Outlines: the task instruction, the system prompt and the few-shot examples.

In [5]:
# Inspect the different medication formats: for each entry emit a separator,
# its running position within the sample and the raw text, one per line.
separator = 5*"---"
for idx, text in enumerate(kisim_sample["text"]):
    print(separator, idx, text, sep="\n")

---------------
0
Prednison 100 mg	1-0-0		24.09. - 07.10.2016
Prednison 80 mg		1-0-0		08.10. - 14.10.2016
Prednison 60 mg		1-0-0		15.10. - 21.10.2016
Prednison 40 mg		1-0-0		22.10.2016 bis auf weiteres

Pantozol 40 mg		1-0-0 		für die Dauer der Prednison-Behandlung
---------------
1
Auge rechts:
Floxal AT 4x/d für 5 Tage
Vitamine A AS zur Nacht
---------------
2
Volare Handgelenksschiene zur Nacht, bitte 1x für beide Hände

Dg.: CTS bds
---------------
3
Ebrufen 200 mg
---------------
4
Paracetamol 500 Hänseler neue Formel Tabl 20 (teilbar)
bei Bedarf


Ibuprofen Adico Filmtabl 400 mg 50 
bei Bedarf

Dauerrezept
---------------
5
Nexium MUPS-Tabl 40 mg 14 Stück Einnahme: 1-0-0
Zolpidem Winthrop Filmtabl 10mg 10 Stück (teilbar) Einnahme: 0-0-1
---------------
6
Einlagen
---------------
7
Amoxicillin Sandoz (Disp Tabl 1000 mg)
---------------
8
Fampyra 10mg 1-0-1 per os
---------------
9
Burgerstein Vitamin D3 (Kaps)
---------------
10
Novalgin (Filmtabl 500 mg)
---------------
11
Cetall

In [6]:
# Prompt texts for the medication extraction task.
#
# task_instruction: describes the expected output schema and the fallback
#   values ("unknown" for strings, -99 for numbers) to the model.
# system_prompt: generic assistant behaviour preamble.
#
# Fixes compared to the previous version:
# - The descriptions of MedicationDose (float, fallback -99) and
#   MedicationDoseUnit (string, fallback "unknown") were swapped, contradicting
#   the key list at the top of the prompt.
# - "doctors recipe" -> "doctor's prescription"; "makeany" -> "make any".
# - Both files are written explicitly as UTF-8 because the prompts contain
#   non-ASCII characters and the platform default encoding is not guaranteed
#   to be UTF-8.
task_instruction ="""Your task is to extract specific information from medication descriptions.
The input for this task is a list of medication descriptions, a report or a doctor's prescription, and the output should be a complete list of dictionaries (one per medication) with the following keys:
- name (str): The name of the medication.
- dose (float): The dose of the medication.
- unit (str): The unit of the dose.
- morning (float): The dose to be taken in the morning.
- noon (float): The dose to be taken at noon.
- evening (float): The dose to be taken in the evening.
- night (float): The dose to be taken at night.
- extra (str): Any additional information about the medication, especially its intake.

The output format should look like this:
[
    {name: "MedicationName1", dose: MedicationDose1, unit: "MedicationDoseUnit1", morning: MorningDose1, noon: NoonDose1, evening: EveningDose1, night: NightDose1, extra: "ExtraInfo1"},
    {name: "MedicationName2", dose: MedicationDose2, unit: "MedicationDoseUnit2", morning: MorningDose2, noon: NoonDose2, evening: EveningDose2, night: NightDose2, extra: "ExtraInfo2"},
    ...
]

- The MedicationName can consist of multiple words with whitespace and should be returned as a single string. If you don't find any mentions of medications or drugs, it should be represented as "unknown".
- The MedicationDose is a float value. If the MedicationDose is not detected, it should be represented as -99.
- The MedicationDoseUnit can have various formats (e.g. mg, ug, g, stk, ml, tropfen, IE/ml, mmol, etc.). If the MedicationDoseUnit is not detected, it should be represented as "unknown".
- The intake doses over the day can be given several ways:
    - If the amount of doses is given in the form of float-float-float, it corresponds to MorningDose-NoonDose-EveningDose with NightDose being 0.
    - If the amount of doses is given in the form float-float-float-float, it corresponds to MorningDose-NoonDose-EveningDose-NightDose.
    - If keywords like "Morgen", "Mittag", "Abend", "Nacht" are used, the corresponding dose should be extracted.
    - If an intake schema like the ones above is not detected, MorningDose, NoonDose, EveningDose and NightDose should all be represented as -99.
- The extra field can contain any additional information about the medication, like its intake (e.g. daily, for 2 weeks). This field can be empty if no additional information is found.
"""

system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don’t know the answer to a question, please don’t share false information.
"""

with open(paths.DATA_PATH_PREPROCESSED/"medication/task_instruction.txt", "w", encoding="utf-8") as file:
    file.write(task_instruction)

with open(paths.DATA_PATH_PREPROCESSED/"medication/system_prompt.txt", "w", encoding="utf-8") as file:
    file.write(system_prompt)

In [7]:
# Few-shot examples based on the text formats seen in the sample.
# Each entry pairs an input "text" with the expected "labels" string
# (a list with one dictionary per medication / item).
#
# Fixes compared to the previous version:
# - First example: the schedule "3-0-1-0" is morning-noon-evening-night, so the
#   night dose is 0 (it was labelled 1).
# - First example: the input now reads "für 4 Tage" so that it matches the
#   "extra" value of its label.
#
# NOTE(review): the labels use the key "amount" while the task instruction names
# the same field "dose" (and lists it before "unit") — confirm which key the
# downstream schema expects and align both.
examples = [
    {"text": "1 OP Cipralex 20 mg 0.5-0-0 für 4 Tage\n 3 OP Propranolol 35g 3-0-1-0", "labels": '[{name: "Cipralex", unit: "mg", amount: 20, morning: 0.5, noon: 0, evening: 0, night: 0, extra: "für 4 Tage"}, {name: "Propranolol", unit: "g", amount: 35, morning: 3, noon: 0, evening: 1, night: 0, extra: ""}]'},
    {"text": "Blutdruckmessgerät über einen Zeitraum von 3 Wochen", "labels": '[{name: "Blutdruckmessgerät", unit: "unknown", amount: 1, morning: -99, noon: -99, evening: -99, night: -99, extra: "3 Wochen"}]'},
    {"text": "Rezept für Propranolol, einmal täglich, 0.5mg", "labels": '[{name: "Propranolol", unit: "mg", amount: 0.5, morning: -99, noon: -99, evening: -99, night: -99, extra: "einmal täglich"}]'},
    {"text": "Gültigkeit bis Sommer 2024", "labels": '[{name: "unknown", unit: "unknown", amount: -99, morning: -99, noon: -99, evening: -99, night: -99, extra: "bis Sommer 2024"}]'},
    {"text": "1 Tablette Aspirin 100 mg täglich nach dem Frühstück", "labels": '[{name: "Aspirin", unit: "mg", amount: 100, morning: 1, noon: 0, evening: 0, night: 0, extra: "täglich nach dem Frühstück"}]'},
    {"text": "2 Sprühstöße Symbicort 2x täglich morgens und abends", "labels": '[{name: "Symbicort", unit: "Sprühstöße", amount: 2, morning: 1, noon: 0, evening: 1, night: 0, extra: "2x2 täglich morgens und abends"}]'},
    {"text": "Ibuprofen 400mg zur Nacht", "labels": '[{name: "Ibuprofen", unit: "mg", amount: 400, morning: 0, noon: 0, evening: 0, night: 1, extra: "zur Nacht"}]'},
    {"text": "Levocetirizin 5 mg 1-0-1", "labels": '[{name: "Levocetirizin", unit: "mg", amount: 5, morning: 1, noon: 0, evening: 1, night: 0, extra: ""}]'},
    {"text": "1 Tablette Pantoprazol 40 mg morgens", "labels": '[{name: "Pantoprazol", unit: "mg", amount: 40, morning: 1, noon: 0, evening: 0, night: 0, extra: "morgens"}]'},
    {"text": "Stützstrümpfe beidseitig nach Bedarf", "labels": '[{name: "Stützstrümpfe", unit: "unknown", amount: 2, morning: -99, noon: -99, evening: -99, night: -99, extra: "nach Bedarf"}]'},
]

# Save the examples as JSON (non-ASCII characters are escaped by json.dump's default settings)
with open(paths.DATA_PATH_PREPROCESSED/"medication/examples.json", "w") as file:
    json.dump(examples, file, indent=4)