In [1]:
import pandas as pd
import csv
from FuzzySubstanceMatching import get_substances
import make_train_data as tr
import train_and_predict as tp
import numpy as np

In [2]:
URL_to_Example_Data = "https://raw.githubusercontent.com/robert-koch-institut/Bundesweiter_klinischer_Krebsregisterdatensatz-Datenschema_und_Klassifikationen/refs/heads/main/Beispieldaten/csv/substanz.csv"
free_text_data = pd.read_csv(URL_to_Example_Data, sep=";")
free_text_data[free_text_data["TypOfSYST_TypSubstanz"] == "Bezeichnung"]
example_data = free_text_data["Bezeichnung"] 
example_data.head()

0     Methotrexat
1    Atezolizumab
2      Filgrastim
3     Bevacizumab
4     Fulvestrant
Name: Bezeichnung, dtype: object

In [3]:
URL_to_list = "https://gitlab.opencode.de/robert-koch-institut/zentrum-fuer-krebsregisterdaten/cancerdata-references/-/raw/main/data/v2/Klassifikationen/substanz.csv?ref_type=heads"
reference_list = pd.read_csv(URL_to_list, sep = ";")
reference_series = reference_list["substanz"].to_list()
print(reference_series[0:3])

['Abarelix', 'Abatacept', 'Abemaciclib']


In [4]:
ouput_string_matching = get_substances(input_col = example_data,
                                        reference_series = reference_series,
                                        split_string = True,
                                        split_pattern = r"[;,]",
                                        fuzzy_threshold = 90)
ouput_string_matching.to_csv("output_string_matchting.csv", index=False, sep = ";", quoting=csv.QUOTE_NONNUMERIC)
ouput_string_matching.head()

Unnamed: 0,ID,Original,Predicted,Similarity
0,1,Methotrexat,Methotrexat,100.0
1,2,Atezolizumab,Atezolizumab,100.0
2,3,Filgrastim,Filgrastim,100.0
3,4,Bevacizumab,Bevacizumab,100.0
4,5,Fulvestrant,Fulvestrant,100.0


In [5]:
subset_uncertain_rows = ouput_string_matching[ouput_string_matching["Similarity"] < 90]
df_for_predictions = subset_uncertain_rows.copy()
df_for_predictions = df_for_predictions.rename(columns={"Original": "input_text"})
filtered_df = df_for_predictions[(df_for_predictions['input_text'] != "") & (df_for_predictions['input_text'].notna())]
filtered_df.to_csv("df_for_predictions.csv", sep = ";", index = False, quoting=csv.QUOTE_NONNUMERIC)

In [6]:
all_subs = reference_list["substanz"].unique().tolist()

train_data = pd.DataFrame({
    "input_text": all_subs,
    "label": all_subs
})

word_list = ["(o.n.a.)", "(wöchentlich)", "(i.v.)", "(n.n.)", "(version)", "(lokal)", "(zyklus)"]

labeled_train_data = tr.create_labeled_train_data(train_data=train_data, word_list=word_list)
labeled_train_data.to_csv("labeled_train_data.csv", sep=";", index=False)

In [7]:
df = tp.load_data("labeled_train_data.csv")
df['input_text'] = df['input_text'].astype(str)
df['label'] = df['label'].astype(str)

train_df, val_df = tp.train_test_split(df, test_size=0.2)
train_examples = tp.prepare_train_examples(train_df, add_negative_samples=False)
val_examples = tp.prepare_train_examples(val_df, add_negative_samples=True)
 
   
model = tp.train_model(train_examples, val_examples, epochs=1) #change to more epochs, this is just a test

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val-eval Pearson Cosine,Val-eval Spearman Cosine
62,No log,No log,0.794648,0.811167


In [8]:
df_for_predictions = tp.load_data("df_for_predictions.csv")
reference_list_input = reference_list["substanz"].str.strip().unique().tolist()
tp.predict_substances_batch(df_for_predictions, model, reference_list_input)

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Predictions saved to predictions.csv


In [9]:
model_matches = pd.read_csv("predictions.csv", sep = ";")
model_matches["Method"] = "ML_Model"
string_matches = pd.read_csv("output_string_matchting.csv", sep = ";")
string_matches["Method"] = "String_Matching"

extracted_substances = pd.concat([string_matches, model_matches], ignore_index=True).sort_values(by="ID", ascending=True)
extracted_substances["Flag"] = np.where(extracted_substances["Similarity"] <= 90, "yes", "no")

print(extracted_substances)

      ID      Original     Predicted  Similarity           Method Flag
0      1   Methotrexat   Methotrexat       100.0  String_Matching   no
1      2  Atezolizumab  Atezolizumab       100.0  String_Matching   no
2      3    Filgrastim    Filgrastim       100.0  String_Matching   no
3      4   Bevacizumab   Bevacizumab       100.0  String_Matching   no
4      5   Fulvestrant   Fulvestrant       100.0  String_Matching   no
..   ...           ...           ...         ...              ...  ...
195  196    Ribociclib    Ribociclib       100.0  String_Matching   no
196  197  Fluorouracil  Fluorouracil       100.0  String_Matching   no
197  198           NaN           NaN         0.0  String_Matching  yes
198  199           NaN           NaN         0.0  String_Matching  yes
199  200           NaN           NaN         0.0  String_Matching  yes

[216 rows x 6 columns]
