In [None]:
# Polars import
# Import the data
def load_line_labelling():
    """Load all labelled line-level report files into a single dataframe.

    Walks ``paths.DATA_PATH_LABELLED`` recursively and reads every file whose
    name ends with "rev.csv" and does not contain "mri". From each file only
    the "text" and "class" columns are kept, and a "rid" column is added that
    holds the part of the filename before the first underscore.

    Files that cannot be stacked onto the frames collected so far are reported
    on stdout and skipped; reading errors are not caught and propagate.

    Returns:
        pl.DataFrame: Stacked rows of all accepted files with the columns
        "text", "class" and "rid". Stays an empty, column-less dataframe when
        no file matches.
    """
    df = pl.DataFrame()

    for root, _dirs, files in os.walk(paths.DATA_PATH_LABELLED):
        for file in files:
            # Only labelled, non-MRI reports are of interest
            if not file.endswith("rev.csv") or "mri" in file:
                continue

            # Get the research id from filename
            rid = file.split("_")[0]

            # Create a dataframe from the csv file
            _df = pl.read_csv(os.path.join(root, file))

            # Keep the relevant columns and add the rid to the dataframe
            _df = _df.select(
                pl.col("text").alias("text"),
                pl.col("class").alias("class"),
                pl.lit(rid).alias("rid"),
            )

            # Append the dataframe to the main dataframe. Only ordinary errors
            # are treated as "skip this file"; KeyboardInterrupt and SystemExit
            # are deliberately not swallowed.
            try:
                df = df.vstack(_df)
            except Exception as exc:
                print("Error with file: ", file)
                print("Error: ", exc)
                print("df head: ", df.head(5))
                print("_df head: ", _df.head(5))
                continue
    return df

def clean_line_text(df: pl.DataFrame):
    """Clean the "text" column of the dataframe from load_line_labelling.

    Text is cleaned by:
    1) removing every "·" character (this also covers "··"),
    2) collapsing runs of two or more spaces into a single space,
    3) removing whitespace from beginning and end of text,
    4) dropping rows whose text is null or empty after cleaning.

    NOTE(review): a pandas function with the same name is defined further down
    in this cell and replaces this one when the cell is run top to bottom.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "text".

    Returns:
        pl.DataFrame: Dataframe with cleaned "text" and without empty lines.
    """
    import re

    def _clean(line):
        # Bullets go first so that the whitespace they leave behind is
        # collapsed and stripped as well.
        line = line.replace("·", "")
        line = re.sub(r" {2,}", " ", line)
        return line.strip()

    df = df.with_columns(
        pl.col("text").map_elements(_clean, return_dtype=pl.Utf8)
    ).filter(pl.col("text").is_not_null() & (pl.col("text") != ""))

    return df

def clean_line_class(df: pl.DataFrame):
    """Normalise the labels in "class" and derive the column "class_agg".

    Steps applied to every non-null label:
    1) Strip whitespace from beginning and end
    2) Correct known spelling mistakes
    3) Replace labels that are not part of the original approach by null
       (the rows themselves are kept)
    4) Create a new column "class_agg" holding the aggregated class; labels
       without an aggregation rule are copied unchanged.

    No encoding of "class_agg" happens here.

    Args:
        df (pl.DataFrame): Input dataframe with a string column "class"

    Returns:
        pl.DataFrame: Dataframe with cleaned "class" and new "class_agg"
    """
    # Known misspellings -> correct abbreviation
    spelling_fixes = {
        'memds': 'medms',
    }

    # Abbreviations of the classes of the original approach
    allowed_labels = ["dm", "do", "cu", "his", "sym", "so", "mr", "labr", "labo", "medms", "medo", "tr", "head", "unk"]

    # Original class -> aggregated class.
    # NOTE(review): 'to' is not in the allowed list above, so it is already
    # nulled before this mapping is applied - confirm whether that is intended.
    aggregation = {
        'his': 'his_sym_cu',
        'sym': 'his_sym_cu',
        'cu': 'his_sym_cu',
        'labr': 'labr_labo',
        'labo': 'labr_labo',
        'to': 'to_tr',
        'tr': 'to_tr',
        'medo': 'medo_unk_do_so',
        'unk': 'medo_unk_do_so',
        'do': 'medo_unk_do_so',
        'so': 'medo_unk_do_so',
    }

    def _normalise(label):
        # strip -> fix spelling -> keep only labels of the original approach
        label = label.strip()
        label = spelling_fixes.get(label, label)
        if label in allowed_labels:
            return label
        return None

    def _aggregate(label):
        return aggregation.get(label, label)

    # Clean the class column first; the aggregation has to see the cleaned labels
    cleaned = df.with_columns(pl.col("class").map_elements(_normalise))

    # Derive the aggregated classes from the cleaned labels
    return cleaned.with_columns(
        pl.col("class").map_elements(_aggregate).alias("class_agg"),
    )

# Cleaning Text
def clean_line_text(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the "text" column of the dataframe from load_line_labelling.

    Missing values in all columns are replaced by empty strings first. The
    text is then cleaned by:
    1) removing every "·" character (this also covers "··"),
    2) collapsing runs of two or more spaces into a single space,
    3) removing whitespace from beginning and end of text.

    No rows are removed: lines that are empty after cleaning stay in the
    dataframe as "".

    NOTE(review): this definition replaces the polars function of the same
    name defined earlier in this cell.

    Args:
        df (pd.DataFrame): Input dataframe with a string column "text"

    Returns:
        pd.DataFrame: New dataframe with cleaned "text"; the input is not
        modified.
    """
    filled = df.fillna("")

    cleaned_text = (
        filled["text"]
        # Bullets go first so that the whitespace they leave behind is
        # collapsed and stripped as well.
        .str.replace("·", "", regex=False)
        .str.replace(r" {2,}", " ", regex=True)
        .str.strip()
    )

    return filled.assign(text=cleaned_text)

In [None]:
# Split data in train, validation and test:
# 20% of the lines are held out as test set, the remaining 80% are split
# 80/20 again into train and validation.
_split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}

train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df['text'],
    df['class_agg'],
    **_split_options,
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    **_split_options,
)

In [None]:
# Split the feature columns and the "class_agg" labels into train+validation
# and test. train_test_split returns two objects per input array, so two
# inputs need four targets on the left-hand side.
train_val_data, test_data, train_val_labels, test_labels = train_test_split(
    df.drop("class_agg"),
    df["class_agg"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Wrap every split in a huggingface dataset with the columns "text" and "label"
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
val_dataset = Dataset.from_dict(dict(text=val_texts, label=val_labels))
test_dataset = Dataset.from_dict(dict(text=test_texts, label=test_labels))

# Bundle the splits into one dataset and cast "label" to class labels
dataset = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
).class_encode_column("label")

# Save the dataset
dataset.save_to_disk(os.path.join(paths.DATA_PATH_PREPROCESSED, "line_labelling_clean_dataset"))

Casting to class labels:   0%|          | 0/1219 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/305 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/381 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1219 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/305 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/381 [00:00<?, ? examples/s]

In [None]:
# Relevant columns, renamed to rid / date / text. The index is reset so that
# the row labels used for the per-rid selection below are unique.
text_cleaned = (
    text[["research_id", "diagnosis_date", "diagnosis_label"]]
    .rename(columns={"research_id": "rid", "diagnosis_date": "date", "diagnosis_label": "text"})
    .reset_index(drop=True)
)

# Date column to YYYY-MM-DD format with default 2023-01-01 then set latest date per rid if date is missing
# NOTE(review): dateutil reads ambiguous dates month-first by default - confirm
# against the format of the source data.
text_cleaned["date"] = text_cleaned["date"].apply(lambda x: parser.parse(str(x), default=datetime.datetime(2023, 1, 1)).strftime("%Y-%m-%d") if pd.notna(x) else x)
text_cleaned['date'] = pd.to_datetime(text_cleaned['date'])
text_cleaned['date'] = text_cleaned.groupby('rid')['date'].transform(lambda x: x.fillna(x.max()))

# Extract longest text per rid.
# Selecting rows by label keeps the "rid" column and the column dtypes, and
# avoids groupby.apply on frames that include the grouping column. Missing
# texts count as length -1, so a rid without any text still keeps one row
# instead of breaking idxmax.
_text_length = text_cleaned["text"].str.len().fillna(-1)
_longest_idx = _text_length.groupby(text_cleaned["rid"]).idxmax()
text_cleaned = text_cleaned.loc[_longest_idx].reset_index(drop=True)

In [None]:
# German free-text diagnoses -> canonical label of the MS subtype.
# The variant with a trailing space is listed as its own key on purpose,
# because replace() matches whole values exactly.
map_dict = {
    "Schubförmig remittierende Multiple Sklerose (RRMS)": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose ": "relapsing_remitting_multiple_sclerosis",
    "Multiple Sklerose mit V.a. sekundär chronisch-progredienten Verlauf": "secondary_progressive_multiple_sclerosis",
    "Multiple Sklerose a.e. primär progredient": "primary_progressive_multiple_sclerosis",
    "Multiple Sklerose mit a.e. primär-progredientem Verlauf": "primary_progressive_multiple_sclerosis",
}

# Labels that are kept; every other disease value is dropped below
label_list = ["relapsing_remitting_multiple_sclerosis", "secondary_progressive_multiple_sclerosis", "primary_progressive_multiple_sclerosis"]

labels_cleaned = labels.replace(map_dict)
# Explicit copy: columns of labels_cleaned are assigned to later on, which
# would otherwise raise a SettingWithCopyWarning on the filtered slice.
labels_cleaned = labels_cleaned[labels_cleaned["disease"].isin(label_list)].copy()

_patient_ids = labels_cleaned['research_id']
_n_unique_patients = len(_patient_ids.unique())

print("Label distribution:")
print(labels_cleaned["disease"].value_counts(), "\n")
print("Unique patient IDs: ", _n_unique_patients)
print("Duplicate patient IDs: ", len(_patient_ids) - _n_unique_patients)

Label distribution:
disease
relapsing_remitting_multiple_sclerosis      286
secondary_progressive_multiple_sclerosis     28
primary_progressive_multiple_sclerosis       21
Name: count, dtype: int64 

Unique patient IDs:  328
Duplicate patient IDs:  7


In [None]:
# Date column to YYYY-MM-DD format with default 2023-01-01 for the parts of
# the date that are not given. Missing values are passed through unchanged and
# non-string values are converted with str() first, the same way the date
# column of the text data is parsed.
labels_cleaned["diagnosis_date"] = labels_cleaned["diagnosis_date"].apply(
    lambda x: parser.parse(str(x), default=datetime.datetime(2023, 1, 1)).strftime("%Y-%m-%d") if pd.notna(x) else x
)

In [None]:
# Keep only the rows with a confirmed diagnosis and only the columns
# rid, date, label and diagnosis_reliability (the first three are renamed)
_is_confirmed = labels_cleaned["diagnosis_reliability"] == "confirmed"
_kept_columns = ["research_id", "diagnosis_date", "disease", "diagnosis_reliability"]
labels_cleaned = labels_cleaned.loc[_is_confirmed, _kept_columns].rename(
    columns={"research_id": "rid", "diagnosis_date": "date", "disease": "label"}
)

In [None]:
# Show all rows of patients that occur more than once, ordered by rid and date
print("Examples Patients with multiple labels:")
_has_multiple_labels = labels_cleaned["rid"].duplicated(keep=False)
display(labels_cleaned[_has_multiple_labels].sort_values(by=["rid", "date"]))

# Short overview: count/unique rows of describe(), shape, label counts, missing values
print("Summary of cleaned dataset: \n")
for _report in (
    (labels_cleaned.describe().iloc[:2],),
    ("shape: ", labels_cleaned.shape),
    ("label distribution: \n", labels_cleaned["label"].value_counts()),
    ("missing: \n", labels_cleaned.isna().sum()),
):
    print(*_report, "\n")

Examples Patients with multiple labels:


Unnamed: 0,rid,date,label,diagnosis_reliability


Summary of cleaned dataset: 

        rid date label diagnosis_reliability
count   260  260   260                   260
unique  260  149     3                     1 

shape:  (260, 4) 

label distribution: 
 label
relapsing_remitting_multiple_sclerosis      228
secondary_progressive_multiple_sclerosis     17
primary_progressive_multiple_sclerosis       15
Name: count, dtype: int64 

missing: 
 rid                      0
date                     0
label                    0
diagnosis_reliability    0
dtype: int64 



In [3]:
import pandas as pd
import os
import sys
# Put the directory two levels above the working directory on the import
# path so that the `src` package can be imported
sys.path.append(os.getcwd()+"/../..")
from src import paths

# Write a small list of strings to test.csv in the results directory,
# one value per line, without index and header
a = ["bla", "blub", "blabla"]
_test_csv_path = paths.RESULTS_PATH/"test.csv"
pd.Series(a).to_csv(_test_csv_path, index=False, header=False)