# MS Prediction Data Cleaning

In [None]:
import pandas as pd

import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths

from dateutil import parser
import datetime

from sklearn.model_selection import train_test_split

## Diagnoses labels

Found in `seantis/diagnoses.csv` in the `disease` column.

In [None]:
# Load the diagnosis table from the Seantis data directory.
labels = pd.read_csv(paths.DATA_PATH_SEANTIS/'diagnoses.csv')

In [None]:
# Preview the first rows and report the (rows, columns) shape of the label table.
display(labels.head())
print(labels.shape)

In [None]:
# Frequency of each diagnosis label, plus the number of rows without a label.
print("Label distribution:")
print(labels["disease"].value_counts())
print("None: ", labels["disease"].isna().sum(), "\n")

# Distinct patients, and how many rows repeat an already-seen patient id.
print("Unique patient IDs: ", labels["research_id"].nunique(dropna=False))
print("Duplicate patient IDs: ", labels["research_id"].duplicated().sum())

In [None]:
# Frequency of each value in the diagnosis_reliability column.
print("Diagnosis reliability:")
print(labels['diagnosis_reliability'].value_counts())

In [None]:
# Count entries that have no diagnosis date at all.
print("Missing dates:")
print(labels["diagnosis_date"].isna().sum())
print("\n")

# Infer the date formats in use from the string length of each entry.
# Missing dates are dropped first: they are NaN floats and have no len().
print("Date column formats:")
print(labels["diagnosis_date"].dropna().map(len).value_counts())
print("\n")

# Show one 10-character date (the second such entry) as a format example.
# `.str.len()` yields NaN for missing dates, so they never match the mask.
print("Example date:")
print(labels.loc[labels["diagnosis_date"].str.len() == 10, "diagnosis_date"].iloc[1])

In [None]:
# Guessing the date format from the length and example. Seems to follow the
# English (ISO-like) format YYYY-MM-DD.
date_map = {
    7: "YYYY-MM",
    10: "YYYY-MM-DD",
    4: "YYYY",
}

# Translate each length to its format name *before* counting. Doing it after
# `value_counts().reset_index()` depends on the column names that call
# produces, which differ between pandas versions. Lengths without a known
# format are kept as-is; missing dates are skipped.
print(
    labels["diagnosis_date"]
    .dropna()
    .map(len)
    .map(lambda length: date_map.get(length, length))
    .value_counts()
    .rename_axis("format")
    .reset_index(name="count")
)

In [None]:
# Show every row of patients that appear more than once in the label table,
# grouped by patient and ordered by disease onset date.
print("Examples Patients with multiple labels:")
display(
    labels.loc[labels["research_id"].duplicated(keep=False)]
    .sort_values(by=["research_id", "disease_onset_date"])
)

### Problems

- Mix of German and English labels.
- Irrelevant labels
- Class imbalance
- There could be multiple diagnoses per patient
- The date column contains multiple formats, YYYY-MM, YYYY, YYYY-MM-DD
- Some diagnoses are not confirmed, which could be detrimental to model training. The old approach excluded these.

### Conclusion

- In the old project they mention that only English labels are reliable, and they only use confirmed diagnoses for training. Manually check whether the German labels are accurate.
- Exclude irrelevant labels (like clinically_isolated_syndrome). Keep relevant labels (relapsing_remitting_multiple_sclerosis (RRMS), secondary_progressive_multiple_sclerosis (SPMS), primary_progressive_multiple_sclerosis (PPMS))
- Do dataset-splits stratified
- Match with text from `kisim_diagnosis` based on research_id, check again how much remains.
- Multiple diagnoses per patient are reasonable, but problem is automatically solved by excluding non-confirmed diagnoses as all of the RRMS diagnoses were made status_post which shouldn't be info in the text.
- Clean date column by using YYYY-MM-DD format, mapping YYYY to the YYYY-01-01 and YYYY-MM to YYYY-MM-01
- Exclude non-confirmed diagnoses like in old approach.

## Diagnosis text

The diagnosis text is in `kisim_diagnoses.csv`

In [None]:
# Load the diagnosis text table from the Seantis data directory.
text = pd.read_csv(paths.DATA_PATH_SEANTIS/'kisim_diagnoses.csv')

In [None]:
# Preview the first rows of the text table (shown as the cell output).
text.head()

In [None]:
# First two rows of the describe() summary of the text table.
print("Description of text dataset: \n")
print(text.describe().head(2), "\n")

# Overall (rows, columns) shape.
print("Shape of text dataset: \n")
print(text.shape, "\n")

# Number of missing values per column.
print("None values: \n")
print(text.isna().sum(), "\n")

In [None]:
# Unique patient IDs
print("Unique patient IDs: ", len(text['research_id'].unique()))

# Is there at least one date per research_id?
# GroupBy.count() gives the number of non-missing dates per patient; the
# answer is reduced to one boolean that is True only if every patient has at
# least one date (instead of printing one boolean per patient).
print(
    "There is at least one date per rid: ",
    text.groupby("research_id")["diagnosis_date"].count().gt(0).all(),
)

In [None]:
# Distribution of string lengths of the non-missing dates, as a proxy for
# the date formats in use.
print("Date column formats:")
print(text["diagnosis_date"].dropna().map(len).value_counts())

In [None]:
# Show the first non-missing date of the text table as a format example.
print("Example date:")
print(text["diagnosis_date"].dropna().iloc[0])

### Problems
- There are multiple text entries per rid (diagnosis_label column). This is because doctors mostly just appended to existing reports, meaning the longest one should be the newest, or fullest report. 
- The LastUpdateDate is not useful for the dates, as different rows, that correspond to the same entries, share this date
- The diagnosis_date column has a lot of missing values, which makes matching entries by date hard.
- The date column is formatted in the YYYY-MM-DD hh:mm:ss.ms format and could be reformatted.

### Conclusions
- Multiple entries per rid, I will use the longest. This was already done for the old project. The entries are stored in /midatams/preprocessed_nlp/midata-text-extraction/data/diagnoses/diags_seantis_kisim_longest.json
- Date column might not be used for analysis but might prove useful for time-series follow-up. After join, this column should be cleaned.

## Joining Datasets

We can join the datasets based on the rid. For rids with multiple entries we will use the date column to match them exactly. 

In [None]:
# How many distinct patients each table has, and how many appear in both.
print("Unique patient IDs in labels: ", labels["research_id"].nunique(dropna=False))
print("Unique patient IDs in text: ", text["research_id"].nunique(dropna=False))
print(
    "Overlap of research_id between labels and text: ",
    len(set(labels["research_id"]) & set(text["research_id"])),
)

# Patients with more than one label row (every occurrence after the first),
# and how many of those patients also have a text entry.
double_labeled = labels.loc[labels["research_id"].duplicated(keep="first"), "research_id"]
print(
    "Number of double labeled patients in both datasets: ",
    len(set(double_labeled) & set(text["research_id"])),
)

### Processing steps
Steps: 
1. Merge, rename, drop non-confirmed diagnoses and irrelevant columns
2. German labels could be unreliable. Check manually, then relabel
3. Drop all entries that are not one of RRMS, PPMS or SPMS. 
4. primary progressive and secondary progressive have low counts. Very important that text matches these. Check manually.
5. Try to mine more SPMS or PPMS by checking entries that might have been wrongly labelled as RRMS and remap them.

#### 1. Merge, rename, drop

In [None]:
# Loading data. Text data is loaded from json file using the same data as in old approach
import json

labels = pd.read_csv(paths.DATA_PATH_SEANTIS/'diagnoses.csv').rename(columns={'research_id': 'rid'})

# Read the JSON inside a context manager so the file handle is closed again
# (a bare `json.load(open(...))` leaves it open).
with open(paths.DATA_PATH_PREPROCESSED/'midatams/diags_seantis_kisim_longest.json', 'r') as json_file:
    text = json.load(json_file)

# Keys look like rid_something_date: keep only the rid part. Each value is
# joined into one string with single spaces (presumably a list of report
# lines -- confirm against the JSON file).
text = {k.split('_')[0]: ' '.join(v) for k, v in text.items()}
text = pd.DataFrame.from_dict(text, orient='index').reset_index().rename(columns={0: 'text', 'index': 'rid'})

# Merge on rid; the inner join keeps only patients present in both tables
df = pd.merge(labels, text, on='rid', how='inner')

# Only use confirmed diagnosis
df = df[df["diagnosis_reliability"] == "confirmed"]

# Remove and rename columns
df = df[["rid", "diagnosis_date", "disease", "text"]].rename(columns={"diagnosis_date": "date", "disease": "labels"})

#### 2. German labels

In [None]:
# Check special cases where I rewrite the label.
# Maps each German free-text label (exact string match, including the variant
# with a trailing space) to the corresponding English label.
map_dict = {
    "Schubförmig remittierende Multiple Sklerose (RRMS)": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose ": "relapsing_remitting_multiple_sclerosis",
    "Multiple Sklerose mit V.a. sekundär chronisch-progredienten Verlauf": "secondary_progressive_multiple_sclerosis",
    "Multiple Sklerose a.e. primär progredient": "primary_progressive_multiple_sclerosis",
    "Multiple Sklerose mit a.e. primär-progredientem Verlauf": "primary_progressive_multiple_sclerosis",
}

# Lift pandas' display limits so full report texts and all rows are shown.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Show the rows whose label is one of the German labels, for manual review.
df[df["labels"].isin(map_dict.keys())]

In [None]:
# It seems the text entry of 252 does not contain enough information to be useful.
# The rest of the entries are fine and can be remapped.
# NOTE(review): 252 is a row index label of the merged frame, not a rid; it
# presumably changes if the input files change -- confirm before re-running.
df.drop(252, inplace=True)

#### 3. Remapping, drop non interesting labels

In [None]:
# Remap the German labels to their English equivalents. The replacement is
# restricted to the label column so that no other column (e.g. the report
# text) can be rewritten by accident.
df = df.assign(labels=df["labels"].replace(map_dict))

# Keep only the relevant labels (non-confirmed diagnoses were already removed
# after the merge). The explicit copy makes the result an independent frame,
# so later in-place edits do not trigger pandas' chained-assignment warning.
labels_list = ["relapsing_remitting_multiple_sclerosis", "secondary_progressive_multiple_sclerosis", "primary_progressive_multiple_sclerosis"]
df = df[df["labels"].isin(labels_list)].copy()

#### 4. SPMS and PPMS manual check

In [None]:
# Because mapping was done manually, check if label matches text for classes with low counts.
# Displays all rows currently labelled as secondary progressive MS.
display(df[df["labels"] == "secondary_progressive_multiple_sclerosis"])

In [None]:
# Manual review result: for entries 47 (RRMS), 210 (unclear), 211 (RRMS) and
# 218 (unclear) the text is not consistent with the label.
# The two unclear entries are removed; 47 and 211 are relabelled as RRMS.
df.drop(index=[210, 218], inplace=True)
df.loc[df.index.isin([47, 211]), "labels"] = "relapsing_remitting_multiple_sclerosis"

In [None]:
# Check primary_progressive_multiple_sclerosis:
# display all rows with that label for manual review.
display(df[df["labels"] == "primary_progressive_multiple_sclerosis"])

In [None]:
# Everything seems fine here

#### 5. Manual Mining of SPMS and PPMS

In [None]:
# Rows labelled RRMS whose report text mentions "SPMS" or "sekundär",
# i.e. candidates that might actually be secondary progressive.
df[
    df["labels"].eq("relapsing_remitting_multiple_sclerosis")
    & df["text"].str.contains("SPMS|sekundär")
]

There are entries where a RRMS is diagnosed, but SPMS is suspected. Keep in mind when looking at results. I won't remap.

In [None]:
# Rows labelled RRMS whose report text mentions "PPMS" or "primär",
# i.e. candidates that might actually be primary progressive.
df[
    df["labels"].eq("relapsing_remitting_multiple_sclerosis")
    & df["text"].str.contains("PPMS|primär")
]

#### 6. Date reformat

In [None]:
# Normalise every diagnosis date to a YYYY-MM-DD string. The `default`
# datetime passed to dateutil supplies the fields a partial date lacks, so
# "YYYY" and "YYYY-MM" entries fall on the first of the year/month.
df["date"] = df["date"].map(
    lambda raw_date: parser.parse(raw_date, default=datetime.datetime(2023, 1, 1)).strftime("%Y-%m-%d")
)

#### 7. Summary

In [None]:
# Summary of cleaned dataset
print("Summary of cleaned dataset: \n")

# DataFrame.info() writes its report itself and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" to the output.
df.info()
print("\n")

print("shape: ", df.shape, "\n")
print("label distribution: \n", df["labels"].value_counts(), "\n")
print("missing: \n", df.isna().sum(), "\n")

In [None]:
# Train test split.
# First hold out 30% of the rows as the test set, then take 10% of the
# remaining rows as the validation set. Both splits are stratified on the
# label and use a fixed seed, so they are reproducible.
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df["labels"])
train, val = train_test_split(train, test_size=0.1, random_state=42, stratify=train["labels"])

# Save to csv (without the row index).
# NOTE(review): assumes the 'ms-diag' directory already exists -- confirm.
train.to_csv(paths.DATA_PATH_PREPROCESSED/'ms-diag/ms-diag_clean_train.csv', index=False)
val.to_csv(paths.DATA_PATH_PREPROCESSED/'ms-diag/ms-diag_clean_val.csv', index=False)
test.to_csv(paths.DATA_PATH_PREPROCESSED/'ms-diag/ms-diag_clean_test.csv', index=False)

In [None]:
# Label counts per split, to verify that the stratification worked.
print("Train label distribution: \n", train["labels"].value_counts(), "\n")
print("Val label distribution: \n", val["labels"].value_counts(), "\n")
print("Test label distribution: \n", test["labels"].value_counts(), "\n")

# Old Project Preprocessing
For comparability with the old project I will reuse parts of its data preprocessing. Specifically, I use the seantis_kisim.csv that they prepared. Their approach:

1. Extract the longest diagnosis per rid (most lines) from the csv and if the rid had a manually line labelled text, they used this instead.
2. Results in dataset consisting of text lines per row with a label for the line.

Further processing:
3. I can use this to create a df with joined text per rid and determine what content will be in the text. Later I can filter useful examples based on this.
4. If a content entry is False, we know that there is no such information in the text. This is useful to estimate the accuracy of a model to correctly determine the lack of information. If the entry is NaN we don't know if the information is present.
5. As there are very few entries for SPMS and PPMS I will manually check their lines for dm content. If I make any changes to line labellings I will report this in the "line_label_origin" column as "manual", while original labels are stored as "original". Later I also add a third option for "classifier1" where a Model automatically labels the lines.
6. As there are very few entries for SPMS and PPMS I will manually check their labels for correctness. If I change any of their labels I will remark this in the "label_origin" column as "manual", while original labels will be stored as "original". Later I also add a third option for "classifier2" where a Model automatically labels the text.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the line-labelled reports prepared by the old project.
df = pd.read_csv(paths.DATA_PATH_PREPROCESSED / "line_labelling" / "seantis_kisim.csv")

In [None]:
# Build one row per report (rid) with the joined report text and one boolean
# "contains_<class>" column per line class.

# Map every line class that occurs in the data to the name of its indicator
# column. Missing classes (NaN) are not a content type, so they get no column.
contains_variable_dict = {
    line_class: f"contains_{line_class}"
    for line_class in df["class"].dropna().unique()
}
clean_columns = ["rid", "text", *contains_variable_dict.values(), "line_label_origin"]

# Collect one record per report and build the frame once at the end;
# concatenating onto a growing DataFrame inside the loop copies all previous
# rows on every iteration.
records = []
line_labelled_reports = 0

for rid, rid_data in df.groupby("research_id"):

    # Concatenate the non-missing text lines of the report
    record = {"rid": rid, "text": rid_data["text"].dropna().str.cat(sep="\n")}
    present_classes = rid_data["class"].dropna().unique()

    if len(present_classes) == 0:
        # No line of this report is labelled: the content is unknown (NaN),
        # which is different from known-to-be-absent (False).
        record.update(dict.fromkeys(contains_variable_dict.values(), np.nan))
        record["line_label_origin"] = np.nan
    else:
        # Labelled report: True for every class that occurs, False otherwise
        record.update(dict.fromkeys(contains_variable_dict.values(), False))
        record.update({contains_variable_dict[c]: True for c in present_classes})
        record["line_label_origin"] = "original"
        line_labelled_reports += 1

    records.append(record)

df_clean = pd.DataFrame(records, columns=clean_columns)

In [None]:
# Reports with at least some line labels, reports without any, and the total.
print("Number of line labelled reports: ", line_labelled_reports)
print("Number of reports without line labels: ", len(df_clean) - line_labelled_reports)
print("Number of reports in df_clean: ", len(df_clean))

In [None]:
# Load the diagnosis labels; the date columns are not needed here.
labels = pd.read_csv(paths.DATA_PATH_SEANTIS/'diagnoses.csv').rename(columns={'research_id': 'rid'}).drop(columns=["diagnosis_date", "disease_onset_date"])

# Add label_origin and fill with "original"
labels["label_origin"] = "original"

# Merge on rid; the inner join keeps only patients present in both tables
df_merged = pd.merge(labels, df_clean, on='rid', how='inner')

# Only use confirmed diagnosis
df_merged = df_merged[df_merged["diagnosis_reliability"] == "confirmed"].drop(columns=["diagnosis_reliability"])

# Rename the label column
df_merged = df_merged.rename(columns={"disease": "labels"})

# Special cases where I rewrite the label: German free-text labels (exact
# string match) mapped to the corresponding English label.
map_dict = {
    "Schubförmig remittierende Multiple Sklerose (RRMS)": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose": "relapsing_remitting_multiple_sclerosis",
    "Schubförmig remittierende Multiple Sklerose ": "relapsing_remitting_multiple_sclerosis",
    "Multiple Sklerose mit V.a. sekundär chronisch-progredienten Verlauf": "secondary_progressive_multiple_sclerosis",
    "Multiple Sklerose a.e. primär progredient": "primary_progressive_multiple_sclerosis",
    "Multiple Sklerose mit a.e. primär-progredientem Verlauf": "primary_progressive_multiple_sclerosis",
}

# Remap labels. The replacement is restricted to the label column so that no
# other column (e.g. the report text) can be rewritten by accident.
df_merged = df_merged.assign(labels=df_merged["labels"].replace(map_dict))

# Keep only the relevant labels. The explicit copy makes the result an
# independent frame, so the later `.loc` assignments do not trigger pandas'
# chained-assignment warning.
labels_list = ["relapsing_remitting_multiple_sclerosis", "secondary_progressive_multiple_sclerosis", "primary_progressive_multiple_sclerosis"]
df_merged = df_merged[df_merged["labels"].isin(labels_list)].copy()


In [None]:
# Label counts among the reports whose contains_dm flag is True.
df_merged[df_merged.contains_dm == True].labels.value_counts()

In [None]:
# Missing values per column among the secondary progressive MS rows.
df_merged[df_merged.labels == "secondary_progressive_multiple_sclerosis"].isna().sum()

For PPMS and SPMS there are not enough line labels. I will check these reports manually for the MS diagnosis and set the flag to True if it is present in the report and to False otherwise.

In [None]:
# Show all secondary progressive MS rows for manual inspection.
df_merged[df_merged.labels == "secondary_progressive_multiple_sclerosis"]

In [None]:
# Result of the manual inspection: entries 210 and 218 do not contain dm, all
# other SPMS entries do. Entries 47 and 211 are actually
# relapsing_remitting_multiple_sclerosis and contain dm.

# Flag every SPMS entry as containing dm and mark the line label as manual ...
df_merged.loc[df_merged["labels"] == "secondary_progressive_multiple_sclerosis", "contains_dm"] = True
df_merged.loc[df_merged["labels"] == "secondary_progressive_multiple_sclerosis", "line_label_origin"] = "manual"

# ... then revert the flag for the two entries without dm.
df_merged.loc[df_merged.index.isin([210, 218]), "contains_dm"] = False

# Relabel entries 47 and 211 as relapsing_remitting_multiple_sclerosis and
# record that their label was set manually.
df_merged.loc[df_merged.index.isin([47, 211]), "labels"] = "relapsing_remitting_multiple_sclerosis"
df_merged.loc[df_merged.index.isin([47, 211]), "label_origin"] = "manual"

In [None]:
# Check primary_progressive_multiple_sclerosis:
# show all rows with that label for manual inspection.
df_merged[df_merged.labels == "primary_progressive_multiple_sclerosis"]

In [None]:
# Manual inspection found nothing wrong with PPMS: all entries contain dm.
# Flag them accordingly and mark the line label as manually set.
df_merged.loc[df_merged["labels"].eq("primary_progressive_multiple_sclerosis"), "contains_dm"] = True
df_merged.loc[df_merged["labels"].eq("primary_progressive_multiple_sclerosis"), "line_label_origin"] = "manual"

# Summary

In [None]:
# Label counts over all remaining reports
print("Label distribution all:")
print(df_merged["labels"].value_counts(), "\n\n")

# Label counts restricted to reports flagged as containing dm
print("Label distribution contains_dm:")
print(df_merged.loc[df_merged["contains_dm"] == True, "labels"].value_counts())

In [None]:
# Save to csv (without the row index).
# NOTE(review): assumes the 'ms-diag' directory already exists -- confirm.
df_merged.to_csv(paths.DATA_PATH_PREPROCESSED/'ms-diag/ms-diag_line_annotated.csv', index=False)

In [None]:
# Show the reports whose contains_dm flag is False.
df_merged[df_merged.contains_dm == False]