In [1]:
import pandas as pd

import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

from datasets import Dataset

import json

In [2]:
# Medication file
kisim_medication = pd.read_csv(paths.DATA_PATH_SEANTIS/"kisim_medication.csv")

# Drop nan values
kisim_medication = kisim_medication.dropna(subset=["medication_name"])

# Check if \n is in the medication name
line_breaks = kisim_medication["medication_name"].apply(lambda x: "\n" in x)
kisim_medication[~line_breaks]["medication_name"]

4       Keppra 500 mg 3-0-2 max. 1 Woche, dann 2-0-2 f...
8                             1 OP Rivotril 0.5 mg\t1-1-1
9                             1 OP Rivotril 0.5 mg\t1-1-1
14                      1 Op Antra 40 mg 1-0-0 für 7 Tage
17                1 Blutdruckmessgertät zur Selbstmessung
                              ...                        
4428                        1x Bauchgurt bei Mobilisation
4429                                Fampyra 10 mg \t1-0-1
4430                  Fampyra (ImpD) Retardtabletten 10mg
4432                            Fampyra 10mg 1-0-1 per os
4434                          Sirdalud MR (Ret Kaps 6 mg)
Name: medication_name, Length: 2762, dtype: object

Some entries don't have medications in them but stuff like: "Bauchgurt" or "Blutdruckmessgerät".
Other entries don't have a full medication intake, like "Sirdalud MR (Ret Kaps 6 mg)"

In [3]:
# Select columns
kisim_medication = kisim_medication[["medication_name", "research_id"]].rename(columns={"medication_name": "text", "research_id": "rid"})

# Split text into lines
# kisim_medication = kisim_medication.assign(text=kisim_medication["text"].str.split("\n"))
# kisim_medication = kisim_medication.explode("text")

# Remove empty strings
kisim_medication = kisim_medication[kisim_medication["text"] != ""]

In [4]:
# Create Dataset
kisim = Dataset.from_dict({
    "text": kisim_medication["text"],
    "rid": kisim_medication["rid"],
})

In [5]:
# Save datasets
os.makedirs(paths.DATA_PATH_PREPROCESSED/"medication", exist_ok=True)
kisim_medication.to_csv(paths.DATA_PATH_PREPROCESSED/"medication/kisim_medication.csv", index=False)
kisim.save_to_disk(paths.DATA_PATH_PREPROCESSED/"medication/kisim_medication")

Saving the dataset (0/1 shards):   0%|          | 0/4410 [00:00<?, ? examples/s]

## Prompting with Outlines

The following part is used to create the files for the prompting task with outlines. First we create a list of known MS medications (existed in the old project). Then we write a suitable prompt.


In [37]:
# Medication List from old project, can be extended if other medications should be detected
medication_names = [
    "Avonex",
    "Betaferon",
    "Plegridy",
    "Copaxone",
    "Glatiramyl",
    "Aubagio",
    "Tecfidera",
    "Gilenya",
    "Tysabri",
    "Ocrevus",
    "Lemtrada",
    "Novantron",
    "Endoxan",
    "MabThera",
    "Imurek",
    "Mayzent",
    "Medrol",
    "Solu-Medrol",
    "Solumedrol",
    "Cortison",
    "Interferon beta-1a",
    "Interferon beta-1b",
    "Peginterferon beta-1a",
    "Glatirameracetat",
    "Teriflunomid",
    "Dimethylfumarat",
    "Fingolimod",
    "Natalizumab",
    "Ocrelizumab",
    "Alemtuzumab",
    "Mitoxantron",
    "Cyclophosphamid",
    "Rituximab",
    "Azathioprin",
    "Siponimod",
    "Glucocorticosteroid",
    "Kalium",
    "unknown",
]

swiss_medication = pd.read_excel(paths.DATA_PATH/"raw/medications/SwissMedic_Zugelassene_Arzneimittel_HAM.xlsx", skiprows=6)
# swiss_medication = swiss_medication["Unnamed: 2"].dropna().tolist()

# Medication keys
medication_keys = [medication.replace("-", "_").replace(" ", "_").lower() for medication in medication_names]

# Create dictionary
medication_dict = dict(zip(medication_keys, medication_names))

# Save dictionary
with open(paths.DATA_PATH_PREPROCESSED/"medication/medication_dict.json", "w") as file:
    json.dump(medication_dict, file, indent=4)

In [39]:
# Drop nan from swiss medication
swiss_medication["Heilmittelcode\n\n\nCatégorie du médicament "].unique()

array(['Impfstoffe', 'Blutprodukte', 'Bakterien- und Hefepräparate',
       'Synthetika', 'Phytoarzneimittel',
       'Anthroposophische Arzneimittel', 'Homöopathische Arzneimittel',
       'Biotechnologika', 'Bonbons', 'Antidota',
       'Tibetische Arzneimittel', 'Radiopharmazeutika',
       'Ayurvedische Arzneimittel', 'Allergen: Therapeutikum',
       'Generator', 'Allergen: Diagnostikum',
       'Transplantat: Gewebeprodukt', 'Antivenine',
       'Transplantat: Gentherapieprodukt',
       'Transplantat: Zelltherapieprodukt'], dtype=object)

In [48]:
synthetika = swiss_medication[swiss_medication["Heilmittelcode\n\n\nCatégorie du médicament "] == "Synthetika"]["Bezeichnung des Arzneimittels\n\n\nDénomination du médicament"].tolist()

In [42]:
kisim["text"]

['1 OP Medrol 32 mg \t1-0-0\n2 OP Keppra 500 mg\t3-0-3\n1 OP Urbanyl 10 mg \t0-0-0.5 noch für eine Woche\n1 OP Rivotril 0.5 mg\t1-1-0 noch für eine Woche, dann 1-1-1',
 '1 OP Propanolol 20 mg 1-0-1\n2 OP Medrol 32 mg 1-0-0\n1 OP Calcimagon D3 forte 1-0-0',
 '1 OP Propranolol 40 mg 1-0-0.5\n2 OP Medrol 32 mg 1-0-0\n1 OP Calcimagon D3 forte 1-0-0\n1 OP (100 Stk) Keppra 500 mg 3-0-3\n1 OP (100 Stk.) Pantozol 40 mg\n1 OP Kalium Brause, 100 Stk.\n\n\n',
 '1 OP Rivotril 0.5 mg\t1-1-1\n1 OP Propranolol 40 mg 1-0-0.5\n2 OP Medrol 16 mg 1-0-0',
 'Keppra 500 mg 3-0-2 max. 1 Woche, dann 2-0-2 für eine Woche',
 '2 OP Medrol 4 mg 1-0-0\n1 OP (100 Stk) Keppra 500 mg 2-0-2',
 '1 OP Keppra 500 mg\t\t1-1-1\n2 OP Medrol 4mg\t\tnoch eine Woche 10 mg Medrol täglich, dann 3\n\t\t\t\t\t\tWochen 8 mg, dann 3 Wochen 6 mg \n2 OP Kalium Brause\t\t1-0-1\t\n',
 '1 OP Rivotril 0.5 mg\t1-1-1\nPantoprazol Sandoz 40  1-0-0',
 '1 OP Rivotril 0.5 mg\t1-1-1',
 '1 OP Rivotril 0.5 mg\t1-1-1',
 'Keppra 500 mg\t\t1-1-1\nPro

In [None]:
# Prompt
task_instruction = """Your task is to extract specific information from medication descriptions. 
The input for this task is a medication description, and the output should be the medication name, the dose, the dose unit, and the amount of intake doses over the day in JSON format.
The JSON format should look like this:
{name: MedicationName, dose: MedicationDose, unit: MedicationDoseUnit, morning: MorningDose, noon: NoonDose, evening: EveningDose, night: NightDose}
- The MedicationName can consist of multiple words with whitespaces and should be returned as a single string. If the MedicationName is not detected or if there is no mentions of medications, it should be represented as \"unknown\". "
- The MedicationDose should be in one of the following formats: mg, ug, g, stk. If the MedicationDose is not detected, it should be represented as "unknown".
- The MedicationDoseUnit is a float value. If the MedicationDoseUnit is not detected, it should be represented as -99.
- The intake doses over the day can be given in two ways:
    - If the amount of doses is given in the format of float-float-float, it corresponds to MorningDose-NoonDose-EveningDose with NightDose being 0.
    - If it is given in the form float-float-float-float, it corresponds to MorningDose-NoonDose-EveningDose-NightDose.
    - If a schema like the two above is not detected, all doses should be represented as -99.
"""

system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not makeany sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don’t know the answer to a question, please don’t share false information.
"""

with open(paths.DATA_PATH_PREPROCESSED/"medication/task_instruction.txt", "w") as file:
    file.write(task_instruction)

with open(paths.DATA_PATH_PREPROCESSED/"medication/system_prompt.txt", "w") as file:
    file.write(system_prompt)

In [52]:
synthetika_keys = [med.strip().replace(" ", "_").replace("-", "_").replace(",","") for med in synthetika]

In [53]:
synthetika_keys

['Rennie_Peppermint_Lutschtabletten',
 'Alka_Seltzer_Brausetabletten',
 'Luvos_Heilerde_Ultra_zum_Einnehmen_Pulver',
 'Luvos_Heilerde_1_zum_Einnehmen_Pulver',
 'Luvos_Heilerde_2_äusserlich_Pulver_für_Umschläge',
 'Neutroses_Kautabletten',
 'Vi_De_3_solution_de_goutte',
 'Zeller_Balsam_Salbe',
 'Emser_Salz_Pulver_zur_Herstellung_einer_Nasenspülung/Mundspülung/Gurgellösung_oder_Lösung_zu_Inhalation',
 'Alcacyl_Tabletten',
 'Lebewohl_Hühneraugenpflaster',
 'Flügge_Kieselerde_Pulver',
 'Magnesia_San_Pellegrino_Pulver_zum_Einnehmen_ohne_Aroma',
 'Pyralvex_Lösung_zur_Anwendung_in_der_Mundhöhle',
 'Otalgan_solution',
 'Ichtholan_10%_Zugsalbe_Salbe',
 'Ichtholan_20%_Zugsalbe_Salbe',
 'Ichtholan_50%_Zugsalbe_Salbe',
 'Dermophil_Indien_Balsam_Stift',
 'Benerva_soluzione_iniettabile',
 'Fortalis_Baume/Balsam_Salbe',
 'Malveol_émulsion_buccale',
 'Leucen_Zugsalbe',
 'Geli_Stop_Tabletten',
 'Enavive_Injektionslösung',
 'Aphenylbarbit_50_mg_Tabletten',
 'Aphenylbarbit_100_mg_Tabletten',
 'Aphenylbar

In [None]:
# Examples for few shot
examples = [{"text": kisim["text"][0], "labels" : '{name: "Medrol", unit: "mg", amount: 32, morning: 1, noon: 0, evening: 0, night: 0}'},
            {"text": kisim["text"][12], "labels" : '{name: "Kalium Brause", unit: "stk.", amount: 100, morning: -99, noon: -99, evening: -99, night: -99}'},
]


In [58]:
from enum import Enum

medication_dict = dict(zip(synthetika_keys, synthetika))

MedicationName = Enum("MedicationName", [(key, value) for key, value in medication_dict.items()])


{'Rennie_Peppermint_Lutschtabletten': 'Rennie Peppermint, Lutschtabletten',
 'Alka_Seltzer_Brausetabletten': 'Alka-Seltzer, Brausetabletten',
 'Luvos_Heilerde_Ultra_zum_Einnehmen_Pulver': 'Luvos Heilerde Ultra zum Einnehmen, Pulver',
 'Luvos_Heilerde_1_zum_Einnehmen_Pulver': 'Luvos Heilerde 1 zum Einnehmen, Pulver',
 'Luvos_Heilerde_2_äusserlich_Pulver_für_Umschläge': 'Luvos Heilerde 2 äusserlich, Pulver für Umschläge',
 'Neutroses_Kautabletten': 'Neutroses, Kautabletten',
 'Vi_De_3_solution_de_goutte': 'Vi-De 3, solution de goutte',
 'Zeller_Balsam_Salbe': 'Zeller Balsam, Salbe',
 'Emser_Salz_Pulver_zur_Herstellung_einer_Nasenspülung/Mundspülung/Gurgellösung_oder_Lösung_zu_Inhalation': 'Emser Salz, Pulver zur Herstellung einer Nasenspülung/Mundspülung/Gurgellösung oder Lösung zu Inhalation',
 'Alcacyl_Tabletten': 'Alcacyl, Tabletten',
 'Lebewohl_Hühneraugenpflaster': 'Lebewohl, Hühneraugenpflaster',
 'Flügge_Kieselerde_Pulver': 'Flügge Kieselerde, Pulver',
 'Magnesia_San_Pellegrino_Pu