In [1]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

import pandas as pd

In [2]:
# Read the test rids
test_rids = pd.read_csv(os.path.join(paths.DATA_PATH_PREPROCESSED, "ms-diag/test_rids.txt"), header=None).values.flatten()

In [3]:
# Kisim diagnoses
kisim_diagnoses = pd.read_csv(os.path.join(paths.DATA_PATH_SEANTIS, "kisim_diagnoses.csv")).rename(columns={"diagnosis_label": "text"})

# Some text will be doubled in dataset as doctors just append, and there is now way to know from this data what text belongs to what.
# LastUpdateDateTime is shared with first part of diagnosis id and then the second part is the indicator of the individual texts so 4085680|1 for example

# Number of nan values in text
print(f"Number of nan values in text: {kisim_diagnoses['text'].isna().sum()}")
kisim_diagnoses.dropna(subset=["text"], inplace=True)

# Split the text into lines
kisim_diagnoses["text"] = kisim_diagnoses["text"].str.split("\n")

def preprocess_text(text):
    # Remove empty strings and whitespace-only strings
    try:
        lines = [str(item) for item in text if not (not item or item.isspace())]
    except:
        print(text)
    # Add 'INDENT' to lines starting with '-', '·', '··', and remove '·'
    lines = ['INDENT ' + item.replace('·', '') if item.startswith(('-', '·', '··')) else item for item in lines]
    return lines

kisim_diagnoses["text"] = kisim_diagnoses["text"].apply(preprocess_text)

# Select longest text for each research_id
kisim_diagnoses_to_remove = kisim_diagnoses[kisim_diagnoses["research_id"].isin(test_rids)]
remove_ids = kisim_diagnoses_to_remove.groupby("research_id").apply(lambda x: x.loc[x["text"].apply(len).idxmax()])["diagnosis_id"]

kisim_diagnoses = kisim_diagnoses[~kisim_diagnoses["diagnosis_id"].isin(remove_ids)]

kisim_diagnoses["text"] = kisim_diagnoses.apply(lambda x: "\n".join(x["text"]), axis=1)

Number of nan values in text: 2


  remove_ids = kisim_diagnoses_to_remove.groupby("research_id").apply(lambda x: x.loc[x["text"].apply(len).idxmax()])["diagnosis_id"]


In [4]:
# Reports kisim diagnoses also contains more texts
kisim_reports_diagnoses = pd.read_csv(os.path.join(paths.DATA_PATH_RSD, "reports_kisim_diagnoses.csv")).rename(columns={"diagnosis_label": "text"})
kisim_reports_diagnoses["text"] = kisim_reports_diagnoses["text"].str.split("\n")
kisim_reports_diagnoses.dropna(subset=["text"], inplace=True)
kisim_reports_diagnoses["text"] = kisim_reports_diagnoses["text"].apply(preprocess_text)
kisim_reports_diagnoses["text"] = kisim_reports_diagnoses.apply(lambda x: "\n".join(x["text"]), axis=1)

# Combine the two datasets
df_all = pd.concat([kisim_diagnoses, kisim_reports_diagnoses], axis=0)

# Drop double diagnoses ids
df_all.drop_duplicates(subset="diagnosis_id", inplace=True)

In [42]:
# Prepare for training
context_length = 128

# tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/"Llama2-MedTuned-13b")

# Tokenize the text
def tokenize(example):
    outputs = tokenizer(
    example["text"],
    max_length = context_length,
    return_overflowing_tokens=True,
    return_length = True,
    truncation = True,
    )
    return outputs


# outputs = tokenizer(
#     df_all["text"].tolist(),
#     truncation=True,
#     max_length=context_length,
#     return_overflowing_tokens=True,
#     return_length=True,
# )

from datasets import Dataset
df = Dataset.from_dict({"text": df_all["text"]})

In [43]:
df = df.select(range(2)).map(tokenize, remove_columns=df.column_names)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [44]:
print(f"Input IDs length: {len(df['input_ids'])}")
print(f"Input chunk lengths: {(df['length'])}")
print(f"Chunk mapping: {df['overflow_to_sample_mapping']}")

Input IDs length: 2
Input chunk lengths: [[128, 8], [70]]
Chunk mapping: [[0, 0], [0]]


In [7]:
from src.utils import load_line_label_data
# Comparison medication kisim reports
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 20)
line_labels = pd.DataFrame(load_line_label_data()["train"])
line_labels[line_labels["class_agg"] == "medms"]["text"].iloc[10]
# It seems that in reports they don't have dosis and when, but more like start date and what medication
line_labels[line_labels["class_agg"] == "medo_unk_do_so"]["text"]

37                                                                                               INDENT EDSS 08/2016: 3.0
80                                                                                    2. Rezidivierende Uveitiden EM 1993
81                                                                                    3. Lymphopenie Grad 1-2, ED 02/2018
82                                                                 DD Interferon-/Rituximab-assoziiert, DD unklare Genese
131                                                        INDENT Modasomil (10/2019 Valens), Fampyra, Lioresal (12/2019)
                                                              ...                                                        
899                                   INDENT Stomatitis aphthosa mit schmerzhafter Aphthe an Zungenspitze seit 06.03.2020
900                                                                              INDENT Re-Soor-Stomatitis, ED 11.03.2020
901                     

In [12]:
# Medications
kisim_medications = pd.read_csv(os.path.join(paths.DATA_PATH_SEANTIS, "kisim_medication.csv"))
kisim_medications_text = kisim_medications["medication_name"]
print(kisim_medications_text.iloc[0])

1 OP Medrol 32 mg 	1-0-0
2 OP Keppra 500 mg	3-0-3
1 OP Urbanyl 10 mg 	0-0-0.5 noch für eine Woche
1 OP Rivotril 0.5 mg	1-1-0 noch für eine Woche, dann 1-1-1


In [9]:
# medications.csv not really helpful as of now
medications = pd.read_csv(os.path.join(paths.DATA_PATH_SEANTIS, "medications.csv"))
medications.head()

NameError: name 'kisim_diagnoses_text' is not defined

# Dataset

In [None]:
# Dataset for unsupervised pretrain will only consist of kisim_diagnoses_text

In [None]:

# I will adapt the code from original project, that also produced preprocessed data for previous tasks.
# Then c

def _get_diag_lines(diag):
    
    '''
    get list of text lines for a kisim_diagnoses.csv 'diag_label' entry
    
    input:
    - diag: str with text from one 'diag_label' entry
    
    ouptut:
    - diag_lines: list of text lines
    
    '''
    
    diag_lines = diag.splitlines()
    diag_lines = [item for item in diag_lines if not (not item or item.isspace())]
    diag_lines = ['INDENT ' + item.replace('·', '') if item.startswith(('-', '·', '··')) else item for item in diag_lines]
    
    return diag_lines


def extract_longest_diag_per_rid(df, var_date, var_diag):
    '''
    function to extract longest lines of all diagnosis texts for each research id
    
    input:
    - df: dataframe containing all information
    - var_date: column name of date column
    - var_diag: column name of diagnosis text column
    
    ouptut:
    - dict_diags: dictionary of diagnosis text (key: research id-diag_index-date, value: list of text lines)   
    '''
    
    dict_diags = dict()
    list_vars = [var_date, var_diag]

    for rid in df['research_id'].unique():

        list_lines = list()
        date = ''
        _df = df[df['research_id'] == rid].sort_values([var_date])[list_vars]

        for diag_index, row in _df.iterrows():

            diag = row[var_diag]

            if isinstance(diag, str):
                
                diag_lines = _get_diag_lines(diag)
                
                if len(diag_lines) >= len(list_lines):
                    
                    list_lines = diag_lines[:]
                    date = str(row[var_date])[:11]
                    
        key = '_'.join((rid, str(diag_index), date))
                
        dict_diags[key] = list_lines

    return dict_diags