In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths
import pandas as pd
import importlib
from src.data import data_loader
import json

## diagnoses.csv

- contains the disease labels for a given rid.
- Question: there are sometimes multiple diagnoses per rid

In [None]:
def get_nested_csv(dir_name: str, file_name: str, import_prefix: str = "imported_20210507"):
    """
    Returns a joint pandas dataframe from the files matching file_name
    in the import-date subdirectories of the directory specified by dir_name.

    The directory tree below ``paths.DATA_PATH_RAW/dir_name`` is walked and only
    folders whose own name starts with ``import_prefix`` are read. Folders in
    which the file is missing or cannot be decoded are reported with a printed
    message and skipped.

    :param dir_name: The name of the directory (e.g. "seantis")
    :param file_name: The name of the csv file to be read. (e.g. "diagnoses.csv")
    :param import_prefix: Prefix an import folder name must start with to be read.
        The default selects the same import as before; pass e.g. "imported_" to
        read every folder whose name starts with that prefix.
    :return: The concatenation of all csv files that could be read. The row index
        of every file is kept as-is, so index labels may repeat across files.
    :raises ValueError: If not a single matching csv file could be read.
    """
    base_dir = os.path.join(paths.DATA_PATH_RAW, dir_name)
    list_dfs = []

    for root, _dirs, _files in os.walk(base_dir):
        # Only the folder's own name decides whether it belongs to the wanted import
        if not os.path.basename(root).startswith(import_prefix):
            continue

        try:
            list_dfs.append(pd.read_csv(os.path.join(root, file_name)))
        except FileNotFoundError:
            print(f"File not found in: {root}")
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError in: {root}")

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # raise the same exception type but say what was searched for and where.
    if not list_dfs:
        raise ValueError(
            f"No readable '{file_name}' found in any '{import_prefix}*' folder under {base_dir}"
        )

    return pd.concat(list_dfs)

In [None]:
# Load diagnoses.csv from the selected import and preview it
diagnoses = get_nested_csv("seantis", "diagnoses.csv")
display(diagnoses.head())

# Size of the table: total rows and distinct research_ids
print("Number of rows: ", len(diagnoses))
print("Number of unique research_ids: ", diagnoses["research_id"].nunique())

# Rows whose research_id already appeared in an earlier row
diagnoses_repeated = diagnoses[diagnoses["research_id"].duplicated()]
print("Number of rids that have 2 or more occurences: ", diagnoses_repeated["research_id"].nunique())

# Frequency tables per research_id and per disease label
print("Occurences per rid: ", diagnoses["research_id"].value_counts())
print("Occurences per disease: ", diagnoses["disease"].value_counts())

# Two example rows with a repeated research_id
display(diagnoses_repeated.head(2))

# Distinct research_ids, reused by later cells for the overlap checks
diagnoses_rids = diagnoses["research_id"].unique()

In [None]:
# All diagnoses.csv rows belonging to one example research_id
diagnoses.loc[diagnoses["research_id"] == "2048FE8D-4DFF-4939-9739-1B5A470914DA"]

### kisim_diagnoses.csv
- Nearly all rid have multiple diagnoses
- Some have up to 676 diagnoses. How do I match this to the diagnoses.csv file?

In [None]:
# Load kisim_diagnoses.csv from the selected import and preview it
kisim_diagnoses = get_nested_csv("seantis", "kisim_diagnoses.csv")
display(kisim_diagnoses.head())
print("Lenght of kisim_diagnoses: ", len(kisim_diagnoses))

# diagnosis_id values that repeat an earlier row
kisim_duplicate_ids = kisim_diagnoses["diagnosis_id"].duplicated()
print("Duplicate diagnosis_id: ", kisim_duplicate_ids.sum())

# Distinct research_ids in the table
print("Unique research_ids: ", kisim_diagnoses["research_id"].nunique())

# research_ids that occur in at least two rows
kisim_repeated_rids = kisim_diagnoses.loc[kisim_diagnoses["research_id"].duplicated(), "research_id"]
print("Number of rids that have 2 or more occurences: : ", kisim_repeated_rids.nunique())


In [None]:
# How many kisim diagnoses each research_id has
print("Number of diagnosis per research_id: ")
display(kisim_diagnoses["research_id"].value_counts())

# Select the example research_id once (noted as the one with 676 diagnoses) and reuse it below
example_rid_diagnoses = kisim_diagnoses[kisim_diagnoses["research_id"] == "2048FE8D-4DFF-4939-9739-1B5A470914DA"]
display(example_rid_diagnoses)

# Character length of the longest diagnosis_label of that research_id
longest_label_length = example_rid_diagnoses["diagnosis_label"].str.len().max()
print("Longest Diagnosis Length" ,longest_label_length)

In [None]:
# Show the diagnosis_label text of the first kisim_diagnoses row as an example
kisim_diagnoses["diagnosis_label"].iloc[0]

In [None]:
# Count the kisim_diagnoses rows whose research_id also appears in diagnoses.csv
kisim_in_diagnoses = kisim_diagnoses["research_id"].isin(diagnoses_rids)
print("Overlap with diagnoses.csv rid: ", kisim_in_diagnoses.sum())

### reports_kisim_diagnoses

- How do these differ from seantis/kisim_diagnoses? Sometimes there are duplicates?

In [None]:
# Load reports_kisim_diagnoses.csv from the selected import and preview it
reports_kisim_diagnoses = get_nested_csv("reports_with_struct_data", "reports_kisim_diagnoses.csv")
display(reports_kisim_diagnoses.head())

# Total number of rows
print("Length of reports_kisim_diagnoses: ", len(reports_kisim_diagnoses))

# Distinct research_ids
reports_kisim_rids = reports_kisim_diagnoses["research_id"]
print("Unique research_ids: ", len(reports_kisim_rids.unique()))

# research_ids that occur in at least two rows
reports_kisim_repeated = reports_kisim_diagnoses[reports_kisim_rids.duplicated()]
print("Number of rids that have 2 or more occurences: ", reports_kisim_repeated["research_id"].nunique())

# Diagnoses per research_id
print("Number of diagnosis per research_id: ")
display(reports_kisim_rids.value_counts())

# Rows whose research_id also appears in diagnoses.csv
print("Overlap with diagnoses.csv rid: ", reports_kisim_rids.isin(diagnoses_rids).sum())

In [None]:
# Check whether the report diagnoses are identical to kisim_diagnoses:
# show the untruncated label text stored under the first kisim diagnosis_id
pd.set_option("max_colwidth", None)
first_kisim_diagnosis_id = kisim_diagnoses["diagnosis_id"].iloc[0]
reports_kisim_diagnoses.loc[
    reports_kisim_diagnoses["diagnosis_id"] == first_kisim_diagnosis_id,
    "diagnosis_label",
]

### Reports

- What is important here? Beurteilung? Zusatz? Form?

In [None]:
# Read the all_info_reports.csv table stored below the project root
all_info_reports_path = os.path.join(
    paths.PROJECT_ROOT,
    r"preprocessed_nlp/midata-text-extraction/data/reports_with_struct_data/imported_20210507/all_info_reports.csv",
)
reports = pd.read_csv(all_info_reports_path)

In [None]:
# Preview the first rows of the reports table read from all_info_reports.csv
reports.head()

In [None]:
# List the column names of the reports table
reports.columns

In [None]:
# Load one example report to inspect the structure of the raw JSON
example_report_path = os.path.join(paths.PROJECT_ROOT, r"preprocessed_nlp/midata-text-extraction/data/reports_with_struct_data/imported_20210507/reports/Report_MitrendS_000B5446-F07C-4D9A-A336-39691B65AA7A_2021.05.05-12-39-58.json")
with open(example_report_path) as f:
    file = json.load(f)

# print() renders the indentation and line breaks; leaving the string as the cell's
# last expression would show its repr with literal "\n" escapes instead.
# ensure_ascii=False keeps non-ASCII characters readable rather than as \uXXXX escapes.
print(json.dumps(file, indent=4, ensure_ascii=False))

In [None]:
def flatten_json(d, parent_key='', sep='_'):
    """
    Flattens a nested JSON dict into a single-level dict whose keys are the
    nested keys joined with ``sep``.

    Special case: a nested dict with an "Item" key holding "@num" and "CONTENT"
    values is stored as ``<key><sep>Item_<@num>`` -> ``CONTENT``. "Item" may be a
    single such dict or a list of them. The remaining keys of that dict are
    flattened as usual.

    The input is never modified. Non-dict values (including lists) are copied
    through unchanged.

    :param d: The (possibly nested) dict to flatten.
    :param parent_key: Key prefix for all entries of ``d``; empty at the top level.
    :param sep: Separator placed between nested key parts.
    :return: A flat dict. If an "Item" entry does not have the expected shape,
        a message is printed and that whole sub-dict is skipped.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k

        if not isinstance(v, dict):
            items.append((new_key, v))
            continue

        remainder = v
        if "Item" in v:
            raw_items = v["Item"]
            entries = raw_items if isinstance(raw_items, list) else [raw_items]
            try:
                # Build all pairs first so a malformed entry leaves no partial result behind
                item_pairs = [
                    (new_key + sep + "Item_" + str(entry["@num"]), entry["CONTENT"])
                    for entry in entries
                ]
            except (TypeError, KeyError) as exc:
                print(f"{type(exc).__name__} in {new_key}")
                continue
            items.extend(item_pairs)
            # Work on a copy without "Item" instead of deleting from the caller's dict
            remainder = {key: value for key, value in v.items() if key != "Item"}

        # Recurse for every nested dict, with or without an "Item" key
        items.extend(flatten_json(remainder, new_key, sep=sep).items())
    return dict(items)
        

def get_nested_json(path: str):
    """
    Reads the JSON file at ``path`` and returns its content flattened
    by flatten_json.

    :param path: Location of the JSON file to read.
    """
    with open(path) as handle:
        parsed = json.load(handle)
    flattened = flatten_json(parsed)
    return flattened


def get_reports():
    """
    Returns the reports dataframe.

    Every JSON file found in a folder whose path ends with "reports" below
    ``paths.DATA_PATH_RAW/reports_with_struct_data`` is read, flattened with
    ``pd.json_normalize`` and tagged with the research_id listed for it in
    reportfilename_researchid.csv. Files without a listed research_id are
    reported with a printed message and skipped.

    :return: One dataframe with the rows of all reports that could be matched,
        including a "research_id" column.
    :raises ValueError: If no report could be matched to a research_id.
    """

    # research_id and filename are stored in reports_with_struct_data/reportfilename_researchid.csv files
    rid_filename = get_nested_csv("reports_with_struct_data", "reportfilename_researchid.csv")

    # Build the report name -> research_id lookup once instead of scanning the whole
    # frame for every file. The first listed research_id wins for a repeated name.
    rid_by_report = (
        rid_filename.drop_duplicates(subset="report_filename", keep="first")
        .set_index("report_filename")["research_id"]
        .to_dict()
    )

    # Reports are in JSON format in reports_with_struct_data/reports
    # We need to 1) read the JSON files 2) flatten them to pd.DataFrame() format and 3) add the research_id from the csv files
    reports_root = os.path.join(paths.DATA_PATH_RAW, "reports_with_struct_data")
    json_suffix = ".json"
    ls_reports = []

    for root, _dirs, files in os.walk(reports_root):

        # All reports are in the reports folder
        if not root.endswith("reports"):
            continue

        for file in files:
            # The csv lists report names without the ".json" extension
            report_name = file[: -len(json_suffix)] if file.endswith(json_suffix) else None
            if report_name is None or report_name not in rid_by_report:
                print(f"No research_id found for {file}")
                continue

            # Read the JSON file and flatten it to a dataframe
            with open(os.path.join(root, file)) as f:
                _json = json.load(f)
            _df = pd.json_normalize(_json)

            # Add the research_id
            _df["research_id"] = rid_by_report[report_name]

            ls_reports.append(_df)

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # raise the same exception type with a message that names the cause.
    if not ls_reports:
        raise ValueError(f"No report with a matching research_id found under {reports_root}")

    return pd.concat(ls_reports)

In [None]:
# Build the combined reports frame (JSON reports normalized and tagged with their research_id)
df_reports = get_reports()
# Inspect which fields of the normalized JSON became columns
df_reports.columns

### medication.csv

In [None]:
# Load kisim_medication.csv from the selected import and preview it
kisim_medication = get_nested_csv("seantis", "kisim_medication.csv")
display(kisim_medication.head())

# Total number of rows
print("Length of kisim_medication: ", len(kisim_medication))

# Distinct research_ids
medication_rids = kisim_medication["research_id"]
print("Unique research_ids: ", len(medication_rids.unique()))

# research_ids that occur in at least two rows
medication_repeated = kisim_medication[medication_rids.duplicated()]
print("Number of rids that have 2 or more occurences: ", medication_repeated["research_id"].nunique())

# Medication entries per research_id
print("Number of medication entries per research_id: ")
display(medication_rids.value_counts())

# Rows whose research_id also appears in diagnoses.csv
print("Overlap with diagnoses.csv rid: ", medication_rids.isin(diagnoses_rids).sum())

In [None]:
# Load visits.csv (source of the EDSS score) from the selected import and preview it
visits = get_nested_csv("seantis", "visits.csv")
display(visits.head())

# Table dimensions: rows are visits
print("Number of visits: ", len(visits))
print("Number of columns: ", len(visits.columns))

# Distinct research_ids
visits_rids = visits["research_id"]
print("Unique research_ids: ", visits_rids.nunique())

# research_ids that occur in at least two rows
visits_repeated = visits[visits_rids.duplicated()]
print("Number of rids that have 2 or more occurences: ", visits_repeated["research_id"].nunique())

In [None]:
# Preview the edss_score column of visits.csv
display(visits["edss_score"].head())

### Exacerbations

In [None]:
# Load exacerbations.csv from the selected import and preview it
exacerbations = get_nested_csv("seantis", "exacerbations.csv")
display(exacerbations.head())

# Count exacerbations
print("Number of exacerbations: ", len(exacerbations))

# Count columns (the statement was missing below this comment; same output as the visits and MRI cells)
print("Number of columns: ", len(exacerbations.columns))

### MRI

In [None]:
# Load magnetic_resonance_images.csv from the selected import and preview it
mri = get_nested_csv("seantis", "magnetic_resonance_images.csv")
display(mri.head())

# Table dimensions: rows are MRI entries
print("Number of MRI: ", len(mri))
print("Number of columns: ", len(mri.columns))

# Distinct research_ids
mri_rids = mri["research_id"]
print("Unique research_ids: ", mri_rids.nunique())

# research_ids that occur in at least two rows
mri_repeated = mri[mri_rids.duplicated()]
print("Number of rids that have 2 or more occurences: ", mri_repeated["research_id"].nunique())

# MRI entries per research_id
print("Number of MRI per research_id: ")
display(mri_rids.value_counts())

# Rows whose research_id also appears in diagnoses.csv
print("Overlap with diagnoses.csv rid: ", mri_rids.isin(diagnoses_rids).sum())

## My current understanding
- Taking the newest import (20210507) is sufficient
- I have three sources of input text: seantis/kisim_diagnoses.csv, reports_with_struct_data/reports_kisim_diagnoses.csv, and the reports?
- I have labels per text line in the preprocessed_nlp/labelling directory to classify text lines. But these only have the rid and not diagnosis id.

## Questions

### General
- Different imports, are all relevant?
- Are the dates listed in the files accurate (could they be used for matching)?
- There are a lot of data files, in different directories. Is the original data directory in: dataset/midatams/data? Because there is also some inside the preprocessed_nlp directory and then again in the midata extraction.
- What are the keywords you mention for the classifier 2?
- Is the seantis data done updating?
- The CRF classifier uses probability matrices generated from a random forest. I have never seen this approach; is there a paper or documentation for it? Or can you briefly explain how you use the RF+CRF pipeline for both classifiers 1 & 2? How does it help?

### diagnoses.csv
- What is the purpose of diagnoses.csv? Is this the labelling file for MS diagnosis?
- Is the diagnosis the "disease" column?
- Are you just interested in the 3 types of MS in the PDF (10 PPMS, 128 RRMS, 11 SPMS)?
- Why are there duplicate research_ids? Are there multiple diagnoses per patient?
- What should we do if a patient has multiple diagnoses? Take last (from date)? 
- You used only 149 samples for training for the MS classifier?

### kisim_diagnoses.csv
- What is the purpose of kisim_diagnosis? Is this the text I should base the prediction on (in diagnosis_label)?
- What are the columns of interest here? diagnosis_label?
- Diagnosis ranking primary/secondary etc.? 
- What was labelled in diagnoses.csv? How do I match kisim_diagnosis.csv text to diagnosis.csv?
- What if there are multiple texts per rid, did you embed them, then aggregate and classify?

### reports_with_struct_data/reports_kisim_diagnoses
- Are these identical to the kisim_diagnoses? From the example it seems so. Or do they have overlap?
- What are the columns of interest here? diagnosis_label?

### Reports
- What exactly are the entries in the JSON that are of importance? ZUSATZ? FORM? Visum?
- How do I match the reports to diagnosis if there are multiple ids?

### seantis/medications.csv, seantis/kisim_medications.csv, rsd/kisim_medications.csv
- Are these the label files for the medication task? Should I predict the medications based on seantis/kisim_diagnoses or rsd/kisim_rsd_reports?
- How do I match the report to the medication label if I only have rids?
- Did you do prediction for this, or did you just extract the labels rule-based? Do you want me to construct an approach that is not rule-based and can be applied to seantis/medications.csv, seantis/kisim_medications.csv, rsd/kisim_medications.csv to extract medications? Or should I take these files as the labels and try to extract this info from seantis/kisim_diagnoses or rsd/kisim_rsd_reports?

### seantis/visits.csv
- Does this contain all the info about the EDSS score? Is this the label file for the EDSS score?
- What if there are multiple entries per rid? How do I match it to the correct text?

### mri
- I can see the mri kind (spinal/cranial) but how do I check new/old and KM yes/no?

### Labelling
- There is a subdirectory preprocessed_nlp/labelling containing labelling for some text. What text exactly was labelled (from seantis/kisim_diagnoses)?
- There is only the rid to go off, but how would I match the labelling to patients with multiple diagnoses? How would I match the label to the correct text?

### Goal
- Do you want me to do predictions based on one text file, or aggregated text files (if multiple per rid) and then give one prediction for MS diagnosis, MRI, medication and edss score?

## Tasks:
- MS diagnosis: use diagnoses.csv for labels per rid, then predict from seantis/kisim_diagnoses.csv (diagnosis_label) or reports_with_struct_data/reports_kisim_diagnoses.csv (diagnosis_label) or the reports
- MRI: use magnetic_resonance_images.csv for labels, then predict from seantis/kisim_diagnoses.csv (diagnosis_label) or reports_with_struct_data/reports_kisim_diagnoses.csv (diagnosis_label) or the reports
- Medication: labels in seantis/medications.csv, seantis/kisim_medications.csv, rsd/kisim_medications.csv (need to be extracted rule based) then predict from seantis/kisim_diagnoses.csv (diagnosis_label) or reports_with_struct_data/reports_kisim_diagnoses.csv (diagnosis_label) or the reports
- EDSS Score: labels in seantis/visits (edss_score) seantis/kisim_diagnoses.csv and reports (extract rule based) then predict from seantis/kisim_diagnoses.csv (diagnosis_label) or reports_with_struct_data/reports_kisim_diagnoses.csv (diagnosis_label) or the reports