In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths
import pandas as pd
import importlib
from src.data import data_loader
import json

## Kisim Diagnosis

The data/seantis directory contains multiple subdirectories, one per import date. Two files in them seem to be of interest for the MS task: diagnoses.csv and kisim_diagnoses.csv.

In [None]:
def get_nested_csv(dir_name: str, file_name: str):
    """
    Returns a joint pandas dataframe from the files matching file_name
    in all the different import dates subdirectories of the directory
    specified by dir_name

    Only directories whose name starts with "imported_" are read. They are
    visited in sorted order so that the row order of the result does not
    depend on the file system. Directories in which the file is missing or
    cannot be decoded are reported on stdout and skipped.

    :param dir_name: The name of the directory (e.g. "seantis")
    :param file_name: The name of the csv file to be read. (e.g. "diagnoses.csv")
    :return: The concatenation of all files that could be read. The row index
        of each individual file is kept (no re-indexing).
    :raises ValueError: If not a single file could be read.
    """
    base_dir = os.path.join(paths.DATA_PATH_RAW, dir_name)
    list_dfs = []

    for root, dirs, _files in os.walk(base_dir):
        # Sorting in place makes os.walk descend in a deterministic order,
        # which matters because callers look at rows by position (.iloc[0]).
        dirs.sort()

        if not os.path.basename(root).startswith("imported_"):
            continue

        try:
            list_dfs.append(pd.read_csv(os.path.join(root, file_name)))
        except FileNotFoundError:
            print(f"File not found in: {root}")
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError in: {root}")

    if not list_dfs:
        # pd.concat([]) would raise the same exception type, but with a
        # message that does not say which file or directory was the problem.
        raise ValueError(
            f"No readable '{file_name}' found in any 'imported_*' directory under {base_dir}"
        )

    return pd.concat(list_dfs)

In [None]:
# Load diagnoses.csv from every import-date subdirectory of "seantis"
# and preview the concatenated result.
diagnoses = get_nested_csv("seantis", "diagnoses.csv")
display(diagnoses.head())

# Count rows whose research_id already appeared in an earlier row
# (duplicated() marks every occurrence after the first).
print("Duplicate research_ids: ", diagnoses.research_id.duplicated().sum())

# List the distinct values of the disease column.
print("Unique Diseases:\n", diagnoses.disease.unique())

In [None]:
# Load kisim_diagnoses.csv from every import-date subdirectory of "seantis"
# and preview the concatenated result.
kisim_diagnoses = get_nested_csv("seantis", "kisim_diagnoses.csv")
display(kisim_diagnoses.head())

# Count rows whose diagnosis_id already appeared in an earlier row
# (duplicated() marks every occurrence after the first).
print("Duplicate diagnosis_id: ", kisim_diagnoses.diagnosis_id.duplicated().sum())

In [None]:
# Example for a kisim_diagnosis: show the diagnosis_label of the first row
# (first by position, not by index label).
kisim_diagnoses.diagnosis_label.iloc[0]

In [None]:
# Reports diagnosis: load reports_kisim_diagnoses.csv from every import-date
# subdirectory of "reports_with_struct_data" and preview the first rows.
reports_kisim_diagnoses = get_nested_csv("reports_with_struct_data",
                                         "reports_kisim_diagnoses.csv")
reports_kisim_diagnoses.head()

In [None]:
# Checking if these are identical to kisim_diagnoses
# Show labels at full width instead of truncating them. The full option name
# is used because abbreviated names are resolved by pattern matching and can
# become ambiguous when pandas adds new options.
pd.set_option("display.max_colwidth", None)

# Labels in the reports table that share the diagnosis_id of the first
# kisim_diagnoses row.
reports_kisim_diagnoses.loc[
    reports_kisim_diagnoses.diagnosis_id == kisim_diagnoses.diagnosis_id.iloc[0],
    "diagnosis_label",
]

## Reports

In [None]:
# Load a single example report to inspect the raw JSON structure.
# json is already imported in the first cell, so it is not imported again here.
example_report_path = os.path.join(
    paths.DATA_PATH_RAW,
    "reports_with_struct_data",
    "imported_20201612",
    "reports",
    "RNOSPOL02-29052941.json",
)
with open(example_report_path) as f:
    reports_kisim_diagnoses_json = json.load(f)

In [None]:
def flatten_json(d, parent_key='', sep='_'):
    """
    Flattens a nested JSON dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``sep``.
    Values that are not dicts (including lists) are stored unchanged.

    Special case: if a nested dict has an "Item" key whose value is a dict,
    or a non-empty list of dicts, each with "@num" and "CONTENT" keys, every
    such item is stored as ``<key><sep>Item_<@num>`` -> ``CONTENT``. The
    remaining keys of that dict are flattened as usual. An "Item" value of
    any other shape is treated like an ordinary value.

    The input is not modified.

    :param d: The (possibly nested) dict to flatten.
    :param parent_key: Prefix for all produced keys; used by the recursion.
    :param sep: Separator placed between the parts of a produced key.
    :return: A new dict without nested dicts.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k

        if not isinstance(v, dict):
            items.append((new_key, v))
            continue

        remaining = v
        if "Item" in v:
            entries = v["Item"]
            if isinstance(entries, dict):
                entries = [entries]
            is_item_list = (
                isinstance(entries, list)
                and len(entries) > 0
                and all(
                    isinstance(entry, dict) and "@num" in entry and "CONTENT" in entry
                    for entry in entries
                )
            )
            if is_item_list:
                for entry in entries:
                    items.append((f"{new_key}{sep}Item_{entry['@num']}", entry["CONTENT"]))
                # Work on a copy without "Item" so the caller's data stays intact.
                remaining = {key: val for key, val in v.items() if key != "Item"}

        # Recurse for every nested dict, not only for those that contain "Item".
        items.extend(flatten_json(remaining, new_key, sep=sep).items())

    return dict(items)
        

def get_nested_json(path: str):
    """
    Reads the JSON file at ``path`` and returns its content flattened
    by flatten_json.

    :param path: Path of the JSON file to read.
    """
    with open(path) as json_file:
        parsed = json.load(json_file)
        return flatten_json(parsed)


def get_reports():
    """
    Returns the reports dataframe.

    Every JSON file found in a directory whose path ends with "reports"
    below reports_with_struct_data is normalised with pd.json_normalize and
    gets a research_id column. The research_id is looked up by file name in
    the reportfilename_researchid.csv files. Files without a matching
    research_id are reported on stdout and skipped.

    :return: The concatenation of all normalised reports. The row index of
        each individual report is kept (no re-indexing).
    :raises ValueError: If no report could be matched to a research_id.
    """

    # research_id and filename are stored in reports_with_struct_data/reportfilename_researchid.csv files
    rid_filename = get_nested_csv("reports_with_struct_data", "reportfilename_researchid.csv")

    # Build the file name -> research_id lookup once instead of filtering the
    # whole dataframe for every file. setdefault keeps the first match, like
    # the positional .iloc[0] lookup it replaces.
    rid_by_file = {}
    for json_name, rid in zip(rid_filename.report_filename + ".json", rid_filename.research_id):
        rid_by_file.setdefault(json_name, rid)

    # Reports are in JSON format in reports_with_struct_data/reports
    # We need to 1) read the JSON files 2) flatten them to pd.DataFrame() format and 3) add the research_id from the csv files
    ls_reports = []
    reports_root = os.path.join(paths.DATA_PATH_RAW, "reports_with_struct_data")

    for root, dirs, files in os.walk(reports_root):
        # Deterministic traversal: the row order of the result should not
        # depend on the order in which the file system lists entries.
        dirs.sort()

        # All reports are in the reports folder
        if not root.endswith("reports"):
            continue

        for file in sorted(files):
            if file not in rid_by_file:
                print(f"No research_id found for {file}")
                continue

            # Read the JSON file. Parsing errors are deliberately not caught,
            # so they are not mistaken for a missing research_id.
            with open(os.path.join(root, file)) as f:
                _json = json.load(f)
            _df = pd.json_normalize(_json)

            # Add the research_id
            _df["research_id"] = rid_by_file[file]

            ls_reports.append(_df)

    if not ls_reports:
        # pd.concat([]) would raise the same exception type, but with a
        # message that does not point to the reports directory.
        raise ValueError(f"No report with a known research_id found under {reports_root}")

    return pd.concat(ls_reports)

In [None]:
# Build the combined reports dataframe and list its columns:
# the fields produced by pd.json_normalize plus the added research_id.
df_reports = get_reports()
df_reports.columns

## Questions
### diagnoses.csv
- What is the purpose of diagnoses.csv? 
- Is the diagnosis the value in the disease column? How does it differ from the diagnosis_label in kisim_diagnoses?
- Why are there duplicate research_ids? Are there multiple diagnoses per patient?
- Are the unique labels all different diseases, or are they maybe coded differently for different imports?

### kisim_diagnoses
- What is the purpose of kisim_diagnoses?
- Why are there duplicate diagnosis_ids (shouldn't they be unique)?

### reports_with_struct_data/reports_kisim_diagnoses
- Are these identical to the kisim_diagnoses? From the example it seems so

### Reports
- What exactly are the entries in the JSON that are of importance? ZUSATZ? FORM? Visum?

### Goal
- Do you want me to map from kisim_diagnoses.diagnosis_label (input) to a class in diagnoses.disease (output)?
- How would I know which input corresponds to which output if there are multiple diagnosis_ids and research_ids?