In [10]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")
from src import paths
import pandas as pd
import importlib
from src.data import data_loader

## Kisim Diagnosis

There are multiple subdirectories in the `data/seantis` directory, each corresponding to an import from a different date. Two files appear to be of interest for the MS task: `diagnoses.csv` and `kisim_diagnoses.csv`.

In [42]:
def get_seantis_csv(file_name: str, base_dir=None, encodings=("utf-8",)) -> pd.DataFrame:
    """
    Returns a joint pandas dataframe from all csv files with
    the specified file name in the seantis folder.

    Every directory below ``base_dir`` whose path (relative to ``base_dir``)
    contains ``"imported_"`` is searched for ``file_name``. The frames that
    could be read are concatenated in sorted directory order; the original
    row index of each file is kept, so index values may repeat.

    :param file_name: The name of the csv file to be read.
    :param base_dir: Directory to search. Defaults to the ``seantis`` folder
        inside ``paths.DATA_PATH``.
    :param encodings: Text encodings to try, in order, for each file. The
        first one that decodes the file is used. The default only tries
        UTF-8; pass e.g. ``("utf-8", "latin-1")`` to also recover files that
        are not valid UTF-8 instead of skipping them.
    :return: The concatenation of all frames that could be read.
    :raises ValueError: If no matching file could be read at all.
    """
    if base_dir is None:
        base_dir = os.path.join(paths.DATA_PATH, "seantis")

    # Accept a single encoding name and guard against an empty sequence.
    if isinstance(encodings, str):
        encodings = (encodings,)
    encodings = tuple(encodings) or ("utf-8",)

    list_dfs = []

    for root, dirs, _files in os.walk(base_dir):
        # os.walk honours in-place edits of `dirs` when walking top-down;
        # sorting makes the traversal (and thus the row order) reproducible.
        dirs.sort()

        # Match on the path below base_dir only, so that an "imported_"
        # somewhere in the base path itself cannot select every directory.
        if "imported_" not in os.path.relpath(root, base_dir):
            continue

        csv_path = os.path.join(root, file_name)
        for encoding in encodings:
            try:
                list_dfs.append(pd.read_csv(csv_path, encoding=encoding))
                break
            except FileNotFoundError:
                print(f"File not found in: {root}")
                break
            except UnicodeDecodeError:
                # Try the next candidate encoding, if any.
                continue
        else:
            # No break: every candidate encoding failed to decode the file.
            print(f"UnicodeDecodeError in: {root}")

    if not list_dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No readable '{file_name}' found in any 'imported_' directory "
            f"under {base_dir}"
        )

    return pd.concat(list_dfs)

In [47]:
# Load diagnoses.csv from every import directory and preview the first rows
diagnoses = get_seantis_csv(file_name="diagnoses.csv")
display(diagnoses.head())

# Number of rows whose research_id already appeared in an earlier row.
# NOTE(review): the frame joins all import directories, so part of this count
# may come from the same record being present in several imports - confirm.
print("Duplicate research_ids: ", diagnoses["research_id"].duplicated().sum())

UnicodeDecodeError in: /cluster/dataset/midatams/data/seantis/imported_20200925
UnicodeDecodeError in: /cluster/dataset/midatams/data/seantis/imported_20200624


Unnamed: 0,research_id,disease,disease_onset_date,diagnosis_date,diagnosis_reliability
0,A0684D32-19C0-4538-AFDE-52959CCD8B63,Progressive multifokale Leukencephalopathie (PML),2011-10,2011-10,confirmed
1,A0684D32-19C0-4538-AFDE-52959CCD8B63,relapsing_remitting_multiple_sclerosis,2002-06,2002-08,confirmed
2,A0684D32-19C0-4538-AFDE-52959CCD8B63,St.n. symptomatischer Epilepsie mit einfach fo...,2011-11,2011-11,confirmed
3,088B7604-2F11-459A-8C8C-8E1A19943333,relapsing_remitting_multiple_sclerosis,2013-01,2015-10,confirmed
4,C36F69FD-ED93-4F53-8BA6-EA155501A965,relapsing_remitting_multiple_sclerosis,,2013-11-07,


Duplicate research_ids:  555


In [48]:
# Load kisim_diagnoses.csv from every import directory and preview the first rows
kisim_diagnoses = get_seantis_csv(file_name="kisim_diagnoses.csv")
display(kisim_diagnoses.head())

# Number of rows whose diagnosis_id already appeared in an earlier row.
# NOTE(review): the frame joins all import directories, so part of this count
# may come from the same record being present in several imports - confirm.
print("Duplicate diagnosis_id: ", kisim_diagnoses["diagnosis_id"].duplicated().sum())

UnicodeDecodeError in: /cluster/dataset/midatams/data/seantis/imported_20200925
UnicodeDecodeError in: /cluster/dataset/midatams/data/seantis/imported_20200624


Unnamed: 0,research_id,data_provider_code,LastUpdateDateTime,diagnosis_id,diagnosis_encoded,diagnosis_code,diagnosis_code_system,diagnosis_code_system_version,diagnosis_date,diagnosis_role,diagnosis_ranking,diagnosis_label
0,E9C0656D-2A3F-4538-8A8F-F6DC09871E87,CHE-108.904.325,2019-10-29 09:17:59.233,6391589|1,no,,,,,discharge,primary,"Schubförmige Multiple Sklerose EM 15.08.2019, ..."
1,E9C0656D-2A3F-4538-8A8F-F6DC09871E87,CHE-108.904.325,2019-09-16 00:53:09.617,6398911|2,no,,,,,discharge,secondary,Neuritis nervi optici rechts ED 19.08.019\r\n·...
2,E9C0656D-2A3F-4538-8A8F-F6DC09871E87,CHE-108.904.325,2019-09-16 00:53:09.617,6398911|1,no,,,,,discharge,primary,Mittelgradige depressive Episode\r\n··i.R. Dia...
3,E9C0656D-2A3F-4538-8A8F-F6DC09871E87,CHE-108.904.325,2019-09-05 06:30:45.030,6376986|1,no,,,,,discharge,primary,Neuritis nervi optici rechts ED 19.08.019\r\n·...
4,A2ABB322-AF69-412C-B293-A6154DC8446B,CHE-108.904.325,2014-04-02 08:58:46.837,4043704|2,no,,,,,discharge,secondary,Familiäre Schwerhörigkeit\r\n


Duplicate diagnosis_id:  8074
