In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths
import pandas as pd
import importlib
from src.data import data_loader

## Kisim Diagnosis

The data/seantis directory contains multiple subdirectories, each relating to an import from a different date. Two files seem to be of interest for the MS task: diagnoses.csv and kisim_diagnoses.csv.

In [None]:
def get_nested_csv(dir_name: str, file_name: str) -> pd.DataFrame:
    """
    Returns a joint pandas dataframe from the files matching file_name
    in all the different import dates subdirectories of the directory
    specified by dir_name

    Only directories whose path contains "imported_" are read. Directories
    are visited in sorted order so that the row order of the result is
    reproducible across machines and runs. Directories in which the file is
    missing or cannot be decoded are reported on stdout and skipped.

    :param dir_name: The name of the directory (e.g. "seantis")
    :param file_name: The name of the csv file to be read. (e.g. "diagnoses.csv")
    :return: The concatenation of all files that could be read; the original
        per-file row index is kept, so index values may repeat
    :raises ValueError: If no matching file could be read at all
    """
    base_dir = os.path.join(paths.DATA_PATH_RAW, dir_name)
    list_dfs = []

    for root, dirs, _files in os.walk(base_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # list in place makes the (top-down) traversal deterministic.
        dirs.sort()

        if "imported_" not in root:
            continue

        try:
            list_dfs.append(pd.read_csv(os.path.join(root, file_name)))
        except FileNotFoundError:
            print(f"File not found in: {root}")
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError in: {root}")

    if not list_dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"No readable '{file_name}' found in any 'imported_' "
            f"subdirectory of {base_dir}"
        )

    return pd.concat(list_dfs)

In [None]:
# Load every imported diagnoses.csv into one frame and preview it
diagnoses = get_nested_csv(dir_name="seantis", file_name="diagnoses.csv")
display(diagnoses.head())

# Number of rows whose research_id already appeared in an earlier row
print("Duplicate research_ids: ", diagnoses["research_id"].duplicated().sum())

# Distinct values found in the disease column
print("Unique Diseases:\n", diagnoses["disease"].unique())

In [None]:
# Load every imported kisim_diagnoses.csv into one frame and preview it
kisim_diagnoses = get_nested_csv(dir_name="seantis", file_name="kisim_diagnoses.csv")
display(kisim_diagnoses.head())

# Number of rows whose diagnosis_id already appeared in an earlier row
print("Duplicate diagnosis_id: ", kisim_diagnoses["diagnosis_id"].duplicated().sum())

In [None]:
# Show the diagnosis_label of the first kisim_diagnoses row as an example
kisim_diagnoses["diagnosis_label"].iloc[0]

In [None]:
# Load the kisim diagnoses shipped with the reports and preview them
reports_kisim_diagnoses = get_nested_csv(
    dir_name="reports_with_struct_data",
    file_name="reports_kisim_diagnoses.csv",
)
reports_kisim_diagnoses.head()

In [None]:
# Compare against kisim_diagnoses: look up the first kisim diagnosis_id in the
# reports frame and show its label(s) without truncating the text
pd.set_option("max_colwidth", None)
reports_kisim_diagnoses.loc[
    lambda frame: frame["diagnosis_id"] == kisim_diagnoses["diagnosis_id"].iloc[0],
    "diagnosis_label",
]

## Questions
### diagnoses.csv
- What is the purpose of diagnoses.csv? 
- Is the diagnosis the value in the `disease` column? How does it differ from the `diagnosis_label` in kisim_diagnoses?
- Why are there duplicate research_ids? Are there multiple diagnoses per patient?
- Are the unique labels all different diseases, or are they maybe coded differently for different imports?

### kisim_diagnoses
- What is the purpose of kisim_diagnoses?
- Why are there duplicate diagnosis_ids (shouldn't they be unique)?

### reports_with_struct_data/reports_kisim_diagnoses
- Are these identical to the kisim_diagnoses? From the example it seems so

### Goal
- Do you want me to map from kisim_diagnoses.diagnosis_label (input) to a class in diagnoses.disease (output)?
- How would I know which input corresponds to which output if there are multiple diagnosis_ids and research_ids?