In [1]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

import torch
import json
import pandas as pd

from src.utils import get_default_pydantic_model

import re

In [2]:
pd.set_option('display.max_colwidth', None)

In [3]:
# Default model for the "medication" task. fix_json falls back to its JSON dump when a broken
# answer cannot be repaired (repair = cut after the last "}" and append "]}").
default_model = get_default_pydantic_model("medication")
def fix_json(json_str):
    """
    Repair a truncated JSON answer by cutting it after the last "}" and closing it with "]}".

    Parameters:
        json_str (str): Possibly incomplete JSON string.

    Returns:
        str: Everything up to and including the last "}" followed by "]}". If json_str
        contains no "}" at all, the JSON dump of default_model is returned instead.
    """
    cut = json_str.rfind("}")
    if cut == -1:
        # Nothing can be salvaged, fall back to the default model
        return default_model.model_dump_json()
    return json_str[:cut + 1] + "]}"
    
# If all the values of morning, noon, evening, night are 0, then set them all to -99
def check_and_replace(df, cols_to_check):
    """
    Replace the values of cols_to_check with -99 in every row where all of them are 0.

    The check is vectorised and positional, so it works for any number of columns and
    also for frames whose index contains duplicate labels. The frame is modified in
    place and returned.

    Parameters:
        df (pandas.DataFrame): Input DataFrame.
        cols_to_check (list): List of column names to check.

    Returns:
        pandas.DataFrame: DataFrame with replacements.
    """
    # An empty frame (e.g. nothing could be parsed) may not even contain the columns
    if df.empty:
        return df

    all_zero = (df[cols_to_check] == 0).all(axis=1).to_numpy()
    df.loc[all_zero, cols_to_check] = -99
    return df

def prepare_results(path: str)->pd.DataFrame:
    """
    Load stored model outputs and flatten them into one row per extracted medication.

    Parameters:
        path (str): Path to a file readable by torch.load. Its content is turned into a
            DataFrame that must provide the columns "successful", "model_answers" and "text".

    Returns:
        pandas.DataFrame: One row per medication, holding the medication fields plus
        "text" (the source text) and "id" (position of the source text in the loaded
        results). All values are lower-cased strings with a trailing ".0" removed.
    """
    # NOTE(review): torch.load unpickles the file, only use it on trusted result files
    results = torch.load(path)
    df_fixed = pd.DataFrame(results)

    # Fix model_answers wherever successful is False. Series.map always yields a Series,
    # so this does not depend on how a row-wise apply behaves for an empty selection.
    failed = ~df_fixed["successful"]
    df_fixed.loc[failed, "model_answers"] = df_fixed.loc[failed, "model_answers"].map(fix_json)

    rows = []
    for idx, (answer, text) in enumerate(zip(df_fixed["model_answers"], df_fixed["text"])):
        try:
            medications = json.loads(answer)["medications"]
            for med in medications:
                med["text"] = text
                med["id"] = idx
                rows.append(med)
        except (ValueError, KeyError, TypeError):
            # Best effort: answers that are still malformed (invalid JSON, missing
            # "medications", unexpected structure) are reported and skipped.
            # json.JSONDecodeError is a subclass of ValueError.
            print(f"Error at index {idx}")
    res = pd.DataFrame(rows)

    # If all the values of morning, noon, evening, night are 0, then set them all to -99
    res = check_and_replace(res, ["morning", "noon", "evening", "night"])

    # Convert everything to string and lowercase
    res = res.map(lambda x: str(x).lower())

    # Remove .0+ from every string
    expression = r"\.0+$"
    res = res.replace(expression, "", regex=True)

    return res

# Evaluation

In [5]:
def prepare_labels(path: str)->pd.DataFrame:
    """
    Read the label sheet and normalise it the same way as the model results.

    Parameters:
        path (str): Path to the Excel file with the labels.

    Returns:
        pandas.DataFrame: The sheet with every cell converted to a lower-cased string
        and a trailing ".0" (or ".00", ...) stripped.
    """
    raw = pd.read_excel(path)

    # Every cell becomes a lower-cased string
    lowered = raw.map(lambda cell: str(cell).lower())

    # Drop the ".0" suffix of floats that are whole numbers
    return lowered.replace(r"\.0+$", "", regex=True)

def calculate_precision_recall(ground_truth, predicted, relevant_columns=None):
    """
    Calculate per-field precision, recall and F1 score for the medications of one text.

    Each predicted medication is matched to the first not yet matched ground truth
    medication whose name contains the predicted name or is contained in it. For a
    matched pair the name counts as true positive and every other field is a true
    positive if the values are equal, otherwise both a false positive and a false
    negative. Unmatched predictions are false positives for all their fields,
    unmatched ground truth entries are false negatives for all their fields.

    The inputs are not modified.

    Parameters:
        ground_truth (list): List of dicts, one per labelled medication. Needs the key "name".
        predicted (list): List of dicts, one per predicted medication. Needs the key "name".
        relevant_columns (list): Fields to report. Defaults to all keys that occur in
            the given records, in order of first appearance.

    Returns:
        tuple: Three dicts (precision, recall, f1_score) mapping each field to its score.
        A small epsilon (1e-10) in the denominators avoids division by zero, so a score
        without any counts is 0.
    """
    if relevant_columns is None:
        # dict.fromkeys keeps the order in which the keys are first seen
        relevant_columns = list(dict.fromkeys(
            key for record in (*ground_truth, *predicted) for key in record
        ))

    # Copy of the list so matched entries can be removed without touching the caller's list
    unmatched_truth = list(ground_truth)
    true_positives = {}
    false_positives = {}
    false_negatives = {}

    def _count(counter, key):
        counter[key] = counter.get(key, 0) + 1

    for pred in predicted:
        pred_name = pred["name"]

        # First we match the medication to the corresponding ground truth.
        # NOTE(review): the first name match wins, even if a later ground truth entry
        # with the same name would agree on more fields.
        match_position = next(
            (i for i, truth in enumerate(unmatched_truth)
             if truth["name"] in pred_name or pred_name in truth["name"]),
            None,
        )

        if match_position is None:
            # No medication in the ground truth matches: false positive for all keys
            for key in pred:
                _count(false_positives, key)
            continue

        truth = unmatched_truth.pop(match_position)
        _count(true_positives, "name")
        for key, value in pred.items():
            if key == "name":
                continue
            if value == truth[key]:
                _count(true_positives, key)
            else:
                # A wrong value means a ground truth value without prediction (false
                # negative) and a prediction without ground truth (false positive)
                _count(false_positives, key)
                _count(false_negatives, key)

    # Whatever is left in the ground truth was never predicted
    for truth in unmatched_truth:
        for key in truth:
            _count(false_negatives, key)

    precision = {}
    recall = {}
    f1_score = {}
    for key in relevant_columns:
        tp = true_positives.get(key, 0)

        # Precision: TP / (TP + FP)
        precision[key] = tp / (tp + false_positives.get(key, 0) + 1e-10)

        # Recall: TP / (TP + FN)
        recall[key] = tp / (tp + false_negatives.get(key, 0) + 1e-10)

        # F1 Score: 2 * (Precision * Recall) / (Precision + Recall)
        f1_score[key] = 2 * (precision[key] * recall[key]) / (precision[key] + recall[key] + 1e-10)

    return precision, recall, f1_score

def evaluate_df(ground_truth: pd.DataFrame, predicted: pd.DataFrame, relevant_columns: list)->pd.DataFrame:
    """
    Evaluates the predicted DataFrame against the ground truth DataFrame.

    The scores are calculated separately for every unique id of the ground truth.

    Parameters:
        ground_truth (pandas.DataFrame): DataFrame with the ground truth. Needs the column "id".
        predicted (pandas.DataFrame): DataFrame with the predicted values. Needs the columns "id" and "text".
        relevant_columns (list): List of relevant columns to evaluate.

    Returns:
        pandas.DataFrame: DataFrame with the evaluated scores, one row per id, with the
        columns precision_<col>, recall_<col>, f1_score_<col>, "text" and "id".
    """

    evaluated = []
    assert len(ground_truth.id.unique()) == len(predicted.id.unique()), "The number of unique ids (texts) in the ground truth and predicted DataFrames do not match."

    truth_has_text = "text" in ground_truth.columns

    for idx in ground_truth.id.unique():
        # Select the rows of this text once and reuse them below
        truth_rows = ground_truth[ground_truth.id == idx]
        predicted_rows = predicted[predicted.id == idx]

        ground_truth_dict = truth_rows[relevant_columns].to_dict("records")
        predicted_dict = predicted_rows[relevant_columns].to_dict("records")

        precision, recall, f1_score = calculate_precision_recall(ground_truth_dict, predicted_dict)

        merged = {
            **{f"precision_{key}": value for key, value in precision.items()},
            **{f"recall_{key}": value for key, value in recall.items()},
            **{f"f1_score_{key}": value for key, value in f1_score.items()},
        }

        # The same number of unique ids does not guarantee that every id has
        # predictions, so fall back to the ground truth text instead of failing
        if not predicted_rows.empty:
            merged["text"] = predicted_rows["text"].values[0]
        elif truth_has_text:
            merged["text"] = truth_rows["text"].values[0]
        else:
            merged["text"] = None
        merged["id"] = idx
        evaluated.append(merged)

    return pd.DataFrame(evaluated)

def aggregate_scores(evaluated: pd.DataFrame, columns_to_drop: list = None)->pd.DataFrame:
    """
    Aggregates the metrics by averaging metrics over all unique texts. Also aggregates intake dosage metrics.

    For each of precision, recall and f1_score the averaged morning, noon, evening and
    night scores are replaced by their mean, stored as <metric>_intake.

    Parameters:
        evaluated (pandas.DataFrame): DataFrame with the evaluated scores.
        columns_to_drop (list): List of columns to drop. Default is ["id", "text"].

    Returns:
        pandas.Series: The aggregated scores, indexed by metric name. The input is left unchanged.
    """
    # None instead of a list literal avoids a mutable default argument
    if columns_to_drop is None:
        columns_to_drop = ["id", "text"]

    metrics = ("precision", "recall", "f1_score")
    intake_parts = ("morning", "noon", "evening", "night")

    agg_df = evaluated.drop(columns=columns_to_drop).mean()
    for metric in metrics:
        part_columns = [f"{metric}_{part}" for part in intake_parts]
        agg_df[f"{metric}_intake"] = agg_df[part_columns].mean()

    # The per-daytime scores are only needed for the intake averages
    per_daytime = [f"{metric}_{part}" for metric in metrics for part in intake_parts]
    return agg_df.drop(per_daytime)

- True Positives (TP): Predicted and present in the ground truth.
- False Positives (FP): Predicted but not present in the ground truth. This happens when a field (e.g. the dose) of a matched medication is predicted wrongly, or when a medication is predicted that is not in the ground truth (in which case all of its fields, such as the dose, count as false positives as well).
- False Negatives (FN): Present in the ground truth but not predicted. This happens when a medication is not predicted at all (so none of its fields are predicted either), or when a wrong value (e.g. the dose) is predicted for a matched medication, because the true value is then missing from the prediction.

In [6]:
# Ground truth labels, normalised to lower-cased strings
labels = prepare_labels(paths.RESULTS_PATH/"medication/labels.xlsx")

# Predictions of one run, one row per extracted medication
res = prepare_results(paths.RESULTS_PATH/"medication/medication_outlines_Llama2-MedTuned-13b_4bit_few_shot_instruction_examples_10.pt")

# Choose only the relevant columns
relevant_columns = ["name", "dose", "dose_unit", "morning", "noon", "evening", "night"]

# Scores per text (id)
evaluated = evaluate_df(labels, res, relevant_columns)

# Scores averaged over all texts, with the four daytime columns merged into "intake"
agg_df = aggregate_scores(evaluated)

type(agg_df)

pandas.core.series.Series

In [7]:
res

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
0,prednison,100,mg,1,0,0,0,24.09. - 07.10.2016,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
1,prednison,80,mg,1,0,1,0,08.10. - 14.10.2016,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
2,prednison,60,mg,1,0,1,0,15.10. - 21.10.2016,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
3,prednison,40,mg,1,0,0,0,22.10.2016 bis auf weiteres,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
4,pantozol,40,mg,1,0,1,0,für die dauer der prednison-behandlung,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
...,...,...,...,...,...,...,...,...,...,...
130,burfen,600,mg,1,0,1,0,b.b.,burfen 600mg b.b.,96
131,gilenya,0.5,mg,1,0,1,0,einmal täglich oral,"gültig für 12 monate\ngilenya®\nfingolimod.\n\neine kapsel mit 0.5 mg einmal täglich oral\n\nfür\nherr\nschumacher roger\nhirzenbachstrasse 32\nch-8051, zürich\n\nversichert bei\nconcordia\nvers.-nr. 989.08.07.009.7 (hauptgarant)",97
132,fingolimod,0.5,mg,1,0,1,0,für 12 monate,"gültig für 12 monate\ngilenya®\nfingolimod.\n\neine kapsel mit 0.5 mg einmal täglich oral\n\nfür\nherr\nschumacher roger\nhirzenbachstrasse 32\nch-8051, zürich\n\nversichert bei\nconcordia\nvers.-nr. 989.08.07.009.7 (hauptgarant)",97
133,eprex,30000,ie/0.75ml,-99,-99,-99,-99,fertspr,eprex (inj lös 30000 ie/0.75ml) (protecs) fertspr,98


# Examples for Thesis and Presentation

In [7]:
# Spelling mistakes in the input
display(res[res["id"] == "40"]) # the input is also very imprecise about the dose unit here, does it mean 1 g?
display(res[res["id"] == "79"]) # the schema for medrol is given as 10-0-0, it should probably be 1-0-0-0

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
52,defalgan,1,unknown,-99,-99,-99,-99,2 tagen pro woche,defalgan 1 an 2 tagen pro woche,40


Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
105,medrol,100,mg,10,0,0,0,po 4 tage,medrol 100 mg 10-0-0 po 4 tage\npantoprazol 40 mg 1-0-0- 7 tage,79
106,pantoprazol,40,mg,1,0,0,0,7 tage,medrol 100 mg 10-0-0 po 4 tage\npantoprazol 40 mg 1-0-0- 7 tage,79


In [8]:
# Difficulty with medication names, this would have to be discussed with the people interested in the task
display(res[res["id"] == "25"].iloc[[-1]]) # "Excipial U Lipolotion" would be more precise, but the model only extracts "excipial"

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
35,excipial,1,g,-99,-99,-99,-99,oder alternativ,"diprosalic lösung: 1mal täglich (morgens und abends) tropfenweise auf die kopfhaut auftragen und leicht einreiben, dann 3x/woche weiter\n\ndaivobet salbe, \nb. bedarf auf psoriasistellen auf dem körper \n\ndermovate creme\nstärker wirksam, auf die infiltrierten psoriasisstellen im schub tgl für 10-14 tage, dann 2-3x/woche für 4 wochen\n\nexcipial u lipolotion \noder alternativ \nexcipial u hydrolotio",25


In [9]:
# Example where I did a bad job
display(res[res["id"] == "47"]) # the dose should be split into 800/160, but that was not possible because the field is restricted to float

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
61,bactrim,800,mg,1,0,0,0,tabl,bactrim forte (tabl) 800/160mg,47


In [10]:
# Where the model struggles (if the information is not given explicitly)
display(res[res["id"] == "68"]) # like the unit mg here (I had to google it as well)
display(res[res["id"] == "77"]) # "2h vor schlafengehen" means night/evening, but the model didn't see any schema
display(res[res["id"] == "87"]) # for us it is clear that lioresal is meant for every line, but the model struggles

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
85,pk-merz,100,filmtabl,1,0,0,0,erste woche,pk-merz filmtabl 100 \n\nerste woche 1-0-0\nbei bedarf zweite woche 1-1-0 oder 2-0-0,68
86,pk-merz,100,filmtabl,1,1,0,0,bei bedarf zweite woche,pk-merz filmtabl 100 \n\nerste woche 1-0-0\nbei bedarf zweite woche 1-1-0 oder 2-0-0,68


Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
100,fampyra,10.0,mg,1,1,0,0,1-1-0,fampyra 10 mg \t\t1-1-0\nsifrol 0.125 mg \t\t2h vor schlafegehen,77
101,sifrol,0.125,mg,-99,-99,-99,-99,2h vor schlafegehen,fampyra 10 mg \t\t1-1-0\nsifrol 0.125 mg \t\t2h vor schlafegehen,77


Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
115,lioresal,10,mg,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87
116,1/2,0,unknown,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87
117,1/2-0-1/2,0,unknown,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87
118,1/2-1/2-1/2,0,unknown,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87
119,1/2-1/2-1,0,unknown,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87
120,1,1,unknown,-99,-99,-99,-99,3 tage,lioresal 10 mg\ns: 0-0-1/2 \t3 tage\n1/2-0-1/2 \t3 tage\n1/2-1/2-1/2 \t3 tage\n1/2-1/2-1 \t3 tage\n1-1/2-1 \t\t3 tage\ndann 1-1-1\n\nbei abnahme der kraft oder nebenwirkungen wie schläfrigkeit zurück auf die vorherige dosisstufe\n,87


In [11]:
# Example for model hallucination (Avanex is insurance not medication)
display(res[res["id"] == "78"].iloc[[0]])

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
102,avanex,1,unknown,-99,-99,-99,-99,für 1 jahr,für 1 jahr \n\navanex: vers.-nr. 96472969 \n\ntecfidera 120mg 1-0-1 \tfür 1 woche\nab 2. woche tecfidra 240mg 1-0-1 \n,78


In [12]:
# Example where LLM excels compared to rule-based
display(res[res["id"] == "48"]) # 3x3 täglich
display(res[res["id"] == "60"]) # 1 tbl. täglich

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
62,otrivin,3,spray,1,1,1,0,für 5 tage,1 op otrivin spray 3x3/d für 5 tage\n1 op mefenacid 500mg 1-1-1,48
63,mefenacid,500,mg,1,1,1,0,1-1-1,1 op otrivin spray 3x3/d für 5 tage\n1 op mefenacid 500mg 1-1-1,48


Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
77,detrusitol,4,mg,1,0,0,0,sr,detrusitol sr 4mg\n1tbl. tgl.,60


In [13]:
# Rows shown in the thesis and presentation, in display order.
# For id "25" only the last extracted medication is kept.
example_ids = ["0", "48", "60", "40", "79", "25", "47", "68", "77", "87", "78"]
example_frames = []
for example_id in example_ids:
    subset = res[res["id"] == example_id]
    if example_id == "25":
        subset = subset.iloc[[-1]]
    example_frames.append(subset)
examples = pd.concat(example_frames, axis=0)
examples.to_csv(paths.THESIS_PATH/"presentation_examples.csv")

Problems in input:
- Some inputs are structured with one line per medication, e.g. df["text"][0], others are medical prescriptions like df["text"][38]. The model sometimes struggles a bit with inputs that are not as well structured (but is still really good).
- The problem above seems to be solvable by providing appropriate examples, but I don't know whether I can cover all the different input formats.
- Sometimes medications are misspelled (like Propanolol) and the model extracts the name the way it was written (which I think is the desired behaviour, because I don't have the medical expertise to correct it). I am unsure what the best way to correct it would be.
- Often the intake schema changes over time, so after 2 weeks the dose may be lower or higher. Extracting this in detail as well could be very hard and might negatively affect the performance on the other outputs (which seem more important to me, but I am no doctor). As far as I can tell, this is also only the case for a few of the examples.
- If the text just mentions "once daily" or similar, I told the model to map it all to the morning (so once daily is 1-0-0), but I am not sure whether that is the desired behaviour.
- How should I evaluate the performance (spelling mistakes, forgotten medications etc.)? I could evaluate a test set myself (100 examples), but I can't guarantee that the criteria I set would be reasonable from a medical point of view.

# Results 13

In [21]:
# Llama 13B: evaluate every stored 13b run and collect the aggregated scores
results_13b = []
# NOTE: os.listdir returns the files in arbitrary order, so the row order of the table is not fixed
filenames13b = [filename for filename in os.listdir(paths.RESULTS_PATH/"medication") if filename.startswith("medication_outlines_Llama2-MedTuned-13b")]
for filename in filenames13b:
    # NOTE: res, evaluated and agg_df are notebook globals and are overwritten in every iteration
    res = prepare_results(paths.RESULTS_PATH/"medication"/filename)
    evaluated = evaluate_df(labels, res, relevant_columns)
    agg_df = aggregate_scores(evaluated)
    results_13b.append(agg_df)
# One column per run, named after the part of the filename that follows "4bit_"
results_13b = pd.concat(results_13b, axis=1).round(2)
results_13b.columns = [filename.split("4bit_")[1] for filename in filenames13b]

# Reorder rows, then transpose so that every run becomes one row
results_13b = results_13b.reindex(["precision_name", "precision_dose", "precision_dose_unit", "precision_intake", "recall_name", "recall_dose", "recall_dose_unit", "recall_intake", "f1_score_name", "f1_score_dose", "f1_score_dose_unit", "f1_score_intake"]).transpose()
results_13b.to_csv(paths.THESIS_PATH/"medication_results_13b.csv")

In [20]:
results_13b

Unnamed: 0,precision_name,precision_dose,precision_dose_unit,precision_intake,recall_name,recall_dose,recall_dose_unit,recall_intake,f1_score_name,f1_score_dose,f1_score_dose_unit,f1_score_intake
few_shot_vanilla_examples_10.pt,0.94,0.75,0.77,0.58,0.96,0.76,0.79,0.6,0.94,0.75,0.77,0.59
few_shot_instruction_examples_1.pt,0.93,0.79,0.83,0.66,0.92,0.78,0.83,0.66,0.92,0.78,0.82,0.65
zero_shot_vanilla.pt,0.9,0.64,0.71,0.59,0.91,0.65,0.73,0.6,0.89,0.64,0.71,0.59
few_shot_instruction_examples_4.pt,0.93,0.82,0.87,0.63,0.94,0.83,0.88,0.63,0.93,0.82,0.87,0.63
few_shot_instruction_examples_2.pt,0.94,0.75,0.76,0.48,0.95,0.76,0.77,0.49,0.93,0.75,0.76,0.48
few_shot_instruction_examples_10.pt,0.98,0.87,0.91,0.6,0.99,0.88,0.92,0.61,0.98,0.87,0.91,0.6
zero_shot_instruction.pt,0.95,0.67,0.76,0.64,0.94,0.67,0.76,0.64,0.94,0.67,0.76,0.64
few_shot_instruction_examples_8.pt,0.93,0.81,0.81,0.64,0.94,0.82,0.82,0.65,0.94,0.81,0.81,0.64


# Results 7B

In [23]:
# Llama 7B: evaluate every stored 7b run and collect the aggregated scores
results_7b = []
# NOTE: os.listdir returns the files in arbitrary order, so the row order of the table is not fixed
filenames7b = [filename for filename in os.listdir(paths.RESULTS_PATH/"medication") if filename.startswith("medication_outlines_Llama2-MedTuned-7b")]
for filename in filenames7b:
    # NOTE: res, evaluated and agg_df are notebook globals and are overwritten in every iteration
    res = prepare_results(paths.RESULTS_PATH/"medication"/filename)
    evaluated = evaluate_df(labels, res, relevant_columns)
    agg_df = aggregate_scores(evaluated)
    results_7b.append(agg_df)
# One column per run, named after the part of the filename that follows "4bit_"
results_7b = pd.concat(results_7b, axis=1).round(2)
results_7b.columns = [filename.split("4bit_")[1] for filename in filenames7b]

# Reorder rows, then transpose so that every run becomes one row
results_7b = results_7b.reindex(["precision_name", "precision_dose", "precision_dose_unit", "precision_intake", "recall_name", "recall_dose", "recall_dose_unit", "recall_intake", "f1_score_name", "f1_score_dose", "f1_score_dose_unit", "f1_score_intake"]).transpose()
results_7b.to_csv(paths.THESIS_PATH/"medication_results_7b.csv")

In [24]:
results_7b

Unnamed: 0,precision_name,precision_dose,precision_dose_unit,precision_intake,recall_name,recall_dose,recall_dose_unit,recall_intake,f1_score_name,f1_score_dose,f1_score_dose_unit,f1_score_intake
few_shot_instruction_examples_1.pt,0.93,0.77,0.76,0.64,0.84,0.69,0.7,0.62,0.87,0.72,0.72,0.63
zero_shot_instruction.pt,0.77,0.63,0.49,0.37,0.73,0.59,0.47,0.35,0.73,0.6,0.47,0.35
few_shot_instruction_examples_10.pt,0.91,0.71,0.68,0.59,0.84,0.65,0.63,0.55,0.85,0.66,0.64,0.56
few_shot_vanilla_examples_10.pt,0.91,0.71,0.68,0.64,0.84,0.64,0.64,0.62,0.85,0.65,0.64,0.62
zero_shot_vanilla.pt,0.72,0.61,0.49,0.29,0.67,0.57,0.47,0.26,0.67,0.56,0.47,0.26
few_shot_instruction_examples_4.pt,0.92,0.73,0.7,0.59,0.86,0.67,0.66,0.55,0.88,0.68,0.67,0.56
few_shot_instruction_examples_8.pt,0.91,0.7,0.65,0.65,0.86,0.65,0.62,0.62,0.86,0.66,0.62,0.62
few_shot_instruction_examples_2.pt,0.91,0.71,0.71,0.51,0.86,0.66,0.66,0.47,0.87,0.67,0.68,0.48


# Rule Based Approach
From old project

In [20]:
def load_list_medi_ms():

    '''
    read the MS medication names from the resource file of the old project

    output:
    - list with one entry per line of the file, surrounding whitespace removed
    '''

    medi_file = paths.PROJECT_ROOT/"resources/old_project/medication_for_ms.txt"
    with open(medi_file, "r") as f:
        return [line.strip() for line in f]

def _split_dose_and_unit(test_str, list_unit):
    
    '''
    split strings for dose and unit which aren't separated by a space, e.g. '120mg' and '0.5mg'
    
    '''
    
    def _contains_alpha_and_numeric(test_str):
        '''
        helper function to determine whether string could represent a dose and a unit and if it contains a dot
        '''

        # initiliaze
        status = 'no'
        
        # dose and unit start with a digit and end with a letter
        if (test_str[0].isdigit()) & (test_str[-1].isalpha()):

            # does it contain a dot
            if '.' in test_str:
                status = 'w/_dot'
            else:
                status = 'w/o_dot'

        return status

    # initialize
    list_tokens = [test_str]

    # get status whether it could be a dose and unit
    status = _contains_alpha_and_numeric(test_str)
    
    # if it contains 1 dot but no other special characters, and all letters represent a unit
    if status == 'w/_dot':

        if (test_str.count('.') == 1) & (len(re.findall('[\W]', test_str.replace('.', ''))) == 0):
            
            if re.findall('[a-zA-Z]+', test_str)[0] in list_unit:
    
                list_tokens = list(re.findall('(\d+)\.(\d+)?(\w+)', test_str)[0])
                list_tokens = ['.'.join(list_tokens[:2]), list_tokens[-1]]

    # if it doesn't contain a dot and all characters are either digits or letters
    if status == 'w/o_dot':
        
        if test_str.isalnum():
    
            list_tokens = list(re.findall('(\d+)(\w+)', test_str)[0])
    
    return list_tokens

def extract_dose_and_unit(list_tokens, list_unit_match):

    '''
    extract the unit and the dose, i.e. the token directly in front of the unit

    input:
    - list_tokens: tokens of one text line
    - list_unit_match: units that were found in list_tokens

    output:
    - dose, unit: both '' unless exactly one unit was matched; dose is also ''
      if the unit is the first token of the line
    '''

    # only unambiguous if there is exactly one match
    if len(list_unit_match) != 1:
        return '', ''

    unit = list_unit_match[0]
    position = list_tokens.index(unit)

    # without this check position - 1 would be -1 and pick the last token of the line
    dose = list_tokens[position - 1] if position > 0 else ''

    return dose, unit

def extract_dosage_across_day(list_dose_match):

    '''
    read the intake schema (e.g. 1-0-1) from the first token that contains a '-'

    input:
    - list_dose_match: tokens that contain a '-'

    output:
    - morning, noon, evening, night: all '' if there is no token or if the first
      token does not consist of 3 or 4 parts; for 3 parts night is 0
    '''

    nothing_found = ('', '', '', '')

    if not list_dose_match:
        return nothing_found

    # only the first entry is looked at
    parts = list_dose_match[0].split('-')

    # 3 entries, e.g 1-1-1
    if len(parts) == 3:
        return parts[0], parts[1], parts[2], 0

    # 4 entries, e.g. 1-0-0-0
    if len(parts) == 4:
        return parts[0], parts[1], parts[2], parts[3]

    return nothing_found

   
def flatten_listoflists(listoflists):
    '''
    turn a nested list into a flat one, one level deep

    input:
    - listoflists: nested list

    output:
    - flat_list: list with the items of all sublists in their original order
    '''

    flat_list = []
    for sublist in listoflists:
        flat_list.extend(sublist)

    return flat_list


In [21]:
# Medication sample; the row index is used as the id of each text
df_medi = pd.read_csv(paths.DATA_PATH_PREPROCESSED/"medication/kisim_medication_sample.csv")
df_medi["id"] = df_medi.index

In [22]:
# Rule based extraction: at most one medication per text line, and only if the
# line contains a name from the MS medication list.

# list of units and MS medications
list_unit = ['mg', 'ug', 'g']
list_medi_ms = load_list_medi_ms()

# initialize
list_output = list()

# for each row
for _, row in df_medi.iterrows():
    
    # get research id, etc.
    # NOTE: rid is not used below; id shadows the builtin of the same name
    rid = row['rid']
    text_all = row['text']
    id = row['id']
    
    # split text into lines
    list_text_all = text_all.splitlines()
    
    # for each text line
    for text in list_text_all:
        
        # get tokens and split dose and unit, e.g. '120mg'
        list_tokens = text.split()
        list_tokens = flatten_listoflists([_split_dose_and_unit(item, list_unit) for item in list_tokens])
        
        # match medication names, units and dosing (e.g. 1-1-1)
        # names and units have to match a token exactly; every token with a '-' is a dosing candidate
        list_name_match = list(set(list_tokens).intersection(list_medi_ms))
        list_unit_match = list(set(list_tokens).intersection(list_unit))
        list_dose_match = [item for item in list_tokens if '-' in item]

        # if an MS medication name was matched
        if len(list_name_match) >=  1:

            # get (first) medication name (there are very few cases with > 1 name)
            # NOTE: the matches come from a set, so "first" is arbitrary if there are several
            name = list_name_match[0]

            # get dose and unit
            dose, unit = extract_dose_and_unit(list_tokens, list_unit_match)
              
            # get dosage across day
            morning, noon, evening, night = extract_dosage_across_day(list_dose_match)

            # extra field (not extracted by the rules, kept for the same columns as the model output)
            extra = ""

            # append (with the whole text, not only the current line)
            list_output.append((name, dose, unit, morning, noon, evening, night, extra, text_all, id))       
            
# generate output data frame
df_results = pd.DataFrame(list_output, 
                         columns = [ 
                                    'name', 'dose', 'dose_unit', 
                                    'morning', 'noon', 'evening','night',
                                    'extra',
                                    'text', "id"])
output_ids = set(df_results.id.unique())
print("Number of reports that were processed:", len(output_ids))
left_over_ids = set(df_medi.id.unique()) - output_ids

# Texts without any extracted medication get one row with the values of the default model,
# so that every id is present in the results
left_over_dfs = []
for id in left_over_ids:
    _df = {**default_model.model_dump()["medications"][0], "text": df_medi[df_medi.id == id].text.values[0], "id": id}
    left_over_dfs.append(_df)
left_over_df = pd.DataFrame(left_over_dfs)

df_results = pd.concat([df_results, left_over_df]).sort_values("id").reset_index(drop=True)

# To be comparable with the labels, everything is mapped to lower-cased strings
# and a trailing ".0" is removed (same normalisation as in prepare_labels)

df_results = df_results.map(lambda x: str(x).lower())
expression = r"\.0+$"
df_results = df_results.replace(expression, "", regex=True)


df_results.head()

Number of reports that were processed: 4


Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
0,unknown,-99,unknown,-99,-99,-99,-99,,prednison 100 mg\t1-0-0\t\t24.09. - 07.10.2016\nprednison 80 mg\t\t1-0-0\t\t08.10. - 14.10.2016\nprednison 60 mg\t\t1-0-0\t\t15.10. - 21.10.2016\nprednison 40 mg\t\t1-0-0\t\t22.10.2016 bis auf weiteres\n\npantozol 40 mg\t\t1-0-0 \t\tfür die dauer der prednison-behandlung,0
1,unknown,-99,unknown,-99,-99,-99,-99,,auge rechts:\nfloxal at 4x/d für 5 tage\nvitamine a as zur nacht,1
2,unknown,-99,unknown,-99,-99,-99,-99,,"volare handgelenksschiene zur nacht, bitte 1x für beide hände\n\ndg.: cts bds",2
3,unknown,-99,unknown,-99,-99,-99,-99,,ebrufen 200 mg,3
4,unknown,-99,unknown,-99,-99,-99,-99,,paracetamol 500 hänseler neue formel tabl 20 (teilbar)\nbei bedarf\n\n\nibuprofen adico filmtabl 400 mg 50 \nbei bedarf\n\ndauerrezept,4


In [23]:
df_results[df_results.id == "93"]

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
93,gilenya,5,mg,1,0,0,0,,"gilenya 0,5 mg p.o.\t1-0-0\n\nassura kranken- und unfallversicherung\nvers.-nr. 00000914176\n\n1 jahr gültig",93


In [24]:
res[res["id"] == "78"]

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
89,avanex: : ot,1,unknown,-99,-99,-99,-99,für : ot 1 : ot 1 : ot 1,für 1 jahr \n\navanex: vers.-nr. 96472969 \n\ntecfidera 120mg 1-0-1 \tfür 1 woche\nab 2. woche tecfidra 240mg 1-0-1 \n,78


In [25]:
labels[labels["id"] == "78"]

Unnamed: 0,name,dose,dose_unit,morning,noon,evening,night,extra,text,id
103,tecfidera,120,mg,1,0,1,0,für 1 woche,für 1 jahr \n\navanex: vers.-nr. 96472969 \n\ntecfidera 120mg 1-0-1 \tfür 1 woche\nab 2. woche tecfidra 240mg 1-0-1 \n,78
104,tecfidera,240,mg,1,0,1,0,ab 2. woche,für 1 jahr \n\navanex: vers.-nr. 96472969 \n\ntecfidera 120mg 1-0-1 \tfür 1 woche\nab 2. woche tecfidra 240mg 1-0-1 \n,78


In [26]:
# Scores of the rule based approach over all texts, including those without any extraction
evaluated_rule = evaluate_df(labels, df_results, relevant_columns)
agg_df_rule = aggregate_scores(evaluated_rule)
agg_df_rule

precision_name         0.070000
precision_dose         0.060000
precision_dose_unit    0.070000
recall_name            0.060000
recall_dose            0.050000
recall_dose_unit       0.060000
f1_score_name          0.063333
f1_score_dose          0.053333
f1_score_dose_unit     0.063333
precision_intake       0.070000
recall_intake          0.060000
f1_score_intake        0.063333
dtype: float64

In [27]:
# For the ones it predicted:
# output_ids were collected before df_results was converted to strings, so they are
# converted with str() here to match the ids in evaluated_rule
predicted_examples_ids = [str(id) for id in output_ids]
aggregate_scores(evaluated_rule[evaluated_rule.id.isin(predicted_examples_ids)])

precision_name         1.000000
precision_dose         0.750000
precision_dose_unit    1.000000
recall_name            0.750000
recall_dose            0.500000
recall_dose_unit       0.750000
f1_score_name          0.833333
f1_score_dose          0.583333
f1_score_dose_unit     0.833333
precision_intake       1.000000
recall_intake          0.750000
f1_score_intake        0.833333
dtype: float64

In [39]:
# Two rows: scores over the whole test set and scores over the texts with an extraction only
res_rule = pd.DataFrame([aggregate_scores(evaluated_rule), aggregate_scores(evaluated_rule[evaluated_rule.id.isin(predicted_examples_ids)])]).round(2)
res_rule["sample"] = ["whole test set", "extracted"]
res_rule.to_csv(paths.THESIS_PATH/"medication_results_rule.csv")

In [40]:
res_rule

Unnamed: 0,precision_name,precision_dose,precision_dose_unit,recall_name,recall_dose,recall_dose_unit,f1_score_name,f1_score_dose,f1_score_dose_unit,precision_intake,recall_intake,f1_score_intake,sample
0,0.07,0.06,0.07,0.06,0.05,0.06,0.06,0.05,0.06,0.07,0.06,0.06,whole test set
1,1.0,0.75,1.0,0.75,0.5,0.75,0.83,0.58,0.83,1.0,0.75,0.83,extracted


Notes for rule based:
- Only the first medication of each text line is extracted.
- Only 4 out of 100 examples were detected at all. (In the original project, too, only around 6% of the examples were detected.)
- Even for the examples it detected, it won't extract all medications if there are several, so the recall is not as high.
- The precision is of course very high, but there are mistakes here as well. For example, a dose written as "0,5" does not match "0.5": the LLM handles this because it outputs the dose as a float, while the rule based approach only does text matching.

## Intermezzo
Just to check whether the original project also extracted a medication for so few examples.

In [30]:
# Run the rule based extraction on the data of the old project to see for which
# share of the reports it extracts a medication
df_medi1 = pd.read_csv(paths.DATA_PATH_RSD/'reports_kisim_medication.csv')
# drop empty medication text
df_medi1 = df_medi1[df_medi1['medication_name'].notnull()]
# list of units and MS medications
list_unit = ['mg', 'ug', 'g']
list_medi_ms = load_list_medi_ms()

# initialize
list_output = list()

# for each row
for _, row in df_medi1.iterrows():

    # get research id, etc. (date and prescription are read but not used below)
    rid = row['research_id']
    date = row['medication_prescription_date']
    prescription = row['medication_prescription_name']
    text_all = row['medication_name']

    # split text into lines
    list_text_all = text_all.splitlines()

    # for each text line
    for text in list_text_all:

        # get tokens and split dose and unit, e.g. '120mg'
        list_tokens = text.split()
        list_tokens = flatten_listoflists([_split_dose_and_unit(item, list_unit) for item in list_tokens])

        # match medication names, units and dosing (e.g. 1-1-1)
        list_name_match = list(set(list_tokens).intersection(list_medi_ms))
        list_unit_match = list(set(list_tokens).intersection(list_unit))
        list_dose_match = [item for item in list_tokens if '-' in item]

        # if an MS medication name was matched
        if len(list_name_match) >= 1:

            # get (first) medication name (there are very few cases with > 1 name)
            name = list_name_match[0]

            # get dose and unit
            dose, unit = extract_dose_and_unit(list_tokens, list_unit_match)

            # get dosage across day
            morning, noon, evening, night = extract_dosage_across_day(list_dose_match)

            # append
            list_output.append((rid, name, dose, unit, morning, noon, evening, night, text, text_all))

# generate output data frame
# the column names have to follow the order of the tuples above (noon before evening)
df_results1 = pd.DataFrame(list_output,
                         columns = ['rid',
                                    'name', 'dose', 'unit',
                                    'morning', 'noon', 'evening', 'night',
                                    'text_line', 'text_all'])
# extracted medication lines per report
len(df_results1)/len(df_medi1)

0.06009885150784365