# Preprocessing Old Project

To make the results more comparable to the old project, some parts of its preprocessing are reused here. For the most part, this just means making sure that the same data is used (reports, files, etc.). The data of the old project can be found in data/raw/midatams_preprocessed.

The code in this notebook was mostly written by Lilian Gasser at SDSC for an earlier project that used the same data.

In [None]:
import sys
import os
sys.path.append(os.getcwd()+"/../..")

from src import paths

import pandas as pd

import json

## Seantis Kisim MS Diagnosis
To ensure that the same reports are used, I reuse the selection code of the original project (I only adapted the data paths).

In [None]:
# export (import folder) to work on; the trailing '/' is required because paths are built by concatenation
imported = 'imported_20210507/'

# input whose text is used to generate the output
which_input = 'seantis-kisimdiagnoses'   # seantis/kisim_diagnoses.csv (before, run diagnosis text extraction)
#which_input = 'rsd-kisimdiagnoses'       # reports_with_struct_data/kisim_diagnoses.csv (before, run diagnosis text extraction)
# which_input = 'reports'                  # reports (before, run report extraction and diagnosis text extraction)

# output (label type)
which_output = 'dm'    # MS diagnosis labels
# which_output = 'mri'   # MRI

# True: use pretrained embedding, False: retrain on text lines from selected input
use_pretrained_embedding = True

# directory of the selected seantis export
path_seantis = f"{paths.DATA_PATH!s}/raw/seantis/{imported}"

def load_seantis_file(path_seantis, which_output):
    '''
    load the seantis export table that belongs to the selected output

    input:
    - path_seantis: str, directory of the seantis export; the file name is
      appended directly, so it has to end with a path separator
    - which_output: str, 'dm' (reads diagnoses.csv) or 'mri' (reads
      magnetic_resonance_images.csv)

    output:
    - df_s: data frame read from the selected file; for 'mri', rows with a
      missing mri_date or mri_kind are dropped and a column mri_date_short
      with the first 7 characters of mri_date is added

    raises:
    - ValueError: if which_output is neither 'dm' nor 'mri'
    '''

    if which_output == 'dm':

        # diagnoses
        return pd.read_csv(path_seantis + 'diagnoses.csv')

    if which_output == 'mri':

        # mri
        df_s = pd.read_csv(path_seantis + 'magnetic_resonance_images.csv')

        # drop rows with NAs in date and mri kind
        df_s = df_s[df_s['mri_date'].notnull() & df_s['mri_kind'].notnull()]

        # shorten date to its first 7 characters (yyyy-mm for ISO-formatted dates)
        return df_s.assign(mri_date_short = df_s['mri_date'].apply(lambda x: x[:7]))

    # fail with a clear message instead of an unbound local at the return statement
    raise ValueError(
        "which_output must be 'dm' or 'mri', got {!r}".format(which_output)
    )

# seantis table for the selected output, read from the selected export
df_s = load_seantis_file(path_seantis = path_seantis, which_output = which_output)


def load_data(path_data = './data/', which_diags = 'unique'):
    '''
    load the three diagnosis dictionaries (seantis kisim, rsd kisim, reports)
    where key is the research id and value is a list of lines of the diagnosis

    input:
    - path_data: str, directory with the json files; file names are appended
      directly, so it has to end with '/'. The second-to-last '/'-separated
      part is taken as the name of the import.
    - which_diags: str, 'unique', 'longest' or 'all'; selects the file suffix
      ('_unique.json', '_longest.json', '_all.json')

    output:
    - diags_s_kdiag: content of diags_seantis_kisim<suffix>
    - diags_r_kdiag: content of diags_rsd_kisim<suffix>
    - diags_reports: content of diags_reports<suffix>; an empty dictionary for
      the imports 20201120, 20201126 and 20210302, for which the file is not read

    raises:
    - ValueError: if which_diags is not one of the three supported values
    '''
    suffixes = {
        'unique': '_unique.json',
        'longest': '_longest.json',
        'all': '_all.json',
    }
    # fail early with a clear message; without a valid suffix no file name can be built
    if which_diags not in suffixes:
        raise ValueError(
            "which_diags must be one of {}, got {!r}".format(sorted(suffixes), which_diags)
        )
    str_end = suffixes[which_diags]

    def read_json(stem):
        # read one diagnosis dictionary from <path_data><stem><suffix>
        with open(path_data + stem + str_end) as json_file:
            return json.load(json_file)

    diags_s_kdiag = read_json('diags_seantis_kisim')
    diags_r_kdiag = read_json('diags_rsd_kisim')

    # the reports file is skipped for these imports
    imported = path_data.split('/')[-2]
    if imported in ['imported_20210302', 'imported_20201120', 'imported_20201126']:
        diags_reports = dict()
    else:
        diags_reports = read_json('diags_reports')

    return diags_s_kdiag, diags_r_kdiag, diags_reports


def load_labelled_kdiag_files():
    '''
    collect the seantis/kisim_diagnosis text lines that were labelled by Marc

    The labelled files come from the 20201120 export (set1, set2) and the
    20210302 export (set3) below raw/midatams_preprocessed/labelling/.

    input:
    - none

    output:
    - df_labelled: data frame with labelled texts, 1 row = 1 line; columns are
      index (unique per row), index_within_rid, research_id, text, class

    '''

    # only 'longest' was labelled, there is no 'unique'
    which_diags = 'longest'
    file_suffix = which_diags + '_rev.csv'

    # the three labelled sets, in the order in which they are read
    labelling_root = str(paths.DATA_PATH) + '/raw/midatams_preprocessed/labelling/'
    set_dirs = [
        labelling_root + 'imported_20201120/' + 'set1/',
        labelling_root + 'imported_20201120/' + 'set2/',
        labelling_root + 'imported_20210302/' + 'set3/',
    ]

    # one data frame per file, i.e. all lines belonging to one research id
    frames = []
    for set_dir in set_dirs:
        for fname in os.listdir(set_dir):
            if not fname.endswith(file_suffix):
                continue
            frame = pd.read_csv(set_dir + fname).set_index("Unnamed: 0")
            frame.index.names = ['']
            # the research id is the part of the file name before the first '_'
            frame['research_id'] = fname.split('_')[0]
            frames.append(frame[['research_id', 'text', 'class']])

    # stack everything, keep only rows that have a text, strip blanks from the labels
    df_labelled = pd.concat(frames)
    df_labelled = df_labelled[df_labelled['text'].notnull()]
    df_labelled['class'] = [label.replace(' ', '') for label in df_labelled['class']]

    # first reset keeps the per-file index as 'index_within_rid',
    # second reset adds a unique running 'index' column
    df_labelled.index.name = 'index_within_rid'
    df_labelled = df_labelled.reset_index()
    df_labelled = df_labelled.reset_index()

    return df_labelled

def get_df_diag(dict_diags, imported, df_s_diag = None):

    '''
    get a data frame with one row per diagnosis text line, reusing the
    labelled lines (load_labelled_kdiag_files) for research ids that were labelled

    input:
    - dict_diags: dictionary with diagnosis texts (value: list of text lines).
      For the current imports a key that splits into exactly three parts at
      '_' contributes its first part as research id and its third part as
      date of entry; any other key is used as research id with an empty date.
    - imported: str for import, including the trailing '/'
      ('imported_20201120/', 'imported_20210302/' or 'imported_20210507/')
    - df_s_diag: data frame of seantis/diagnoses.csv (needs the columns
      research_id and disease); only required for 'imported_20201120/'

    output:
    - df_diag: data frame where one row corresponds to a text line, with the
      columns research_id, text, class, date_of_entry plus index_within_rid
      and a unique running index

    raises:
    - ValueError: if imported is not a known import, or if df_s_diag is
      missing for the old import
    '''

    # get df_labelled
    df_labelled = load_labelled_kdiag_files()
    list_rids_labelled = list(df_labelled['research_id'].unique())
    # set for fast membership tests, the list keeps the original order for iteration
    set_rids_labelled = set(list_rids_labelled)

    def get_one_df(rid, diag, date):

        '''
        helper function to get the data frame of one research id: the labelled
        lines if the id was labelled, otherwise the lines in diag with an empty class
        '''

        # copy labelled data frames
        if rid in set_rids_labelled:

            _df = df_labelled.loc[df_labelled['research_id'] == rid, ['research_id', 'text', 'class']]
            _df = _df.reset_index(drop = True)

        # generate unlabelled data frames
        else:

            _df = pd.DataFrame(diag).rename(columns = {0: 'text'})
            _df['research_id'] = rid
            _df['class'] = ''

        _df['date_of_entry'] = date

        return _df

    # initialize
    list_dfs = list()

    # old import
    if imported in ['imported_20201120/']:

        if df_s_diag is None:
            raise ValueError("df_s_diag is required for import 'imported_20201120/'")

        # old: get list of rids which are both in seantis/kisim_diagnoses and seantis/diagnoses
        df_s_kdiag_ids = pd.DataFrame(list(dict_diags.keys()), columns = ['research_id']).assign(in_document = True)
        df_merged = pd.merge(df_s_diag, df_s_kdiag_ids, left_on = 'research_id', right_on = 'research_id', how = 'outer')
        df_merged_inboth = df_merged[(df_merged['disease'].notnull()) & (df_merged['in_document'] == True)]
        set_rids_diag = set(df_merged_inboth['research_id'].unique())

        # old: consider research IDs from seantis/diagnosis.csv plus IDs with diagnosis texts containing "Multiple Sklerose"
        str_MS = "Multiple Sklerose"
        for rid, diag in dict_diags.items():

            if (rid in set_rids_diag) or (str_MS in ' '.join(diag)):

                # keys of this import carry no date, so the date of entry stays empty
                list_dfs.append(get_one_df(rid, diag, ''))

    # current import
    elif imported in ['imported_20210302/', 'imported_20210507/']:

        # consider all research ids in seantis/kisim_diag
        set_rids_added = set()
        for key, diag in dict_diags.items():

            key_parts = key.split('_')
            if len(key_parts) == 3:
                rid = key_parts[0]
                date = key_parts[2]
            else:
                rid = key
                date = ''

            # remember labelled rids that are part of this import
            if rid in set_rids_labelled:
                set_rids_added.add(rid)

            # get dataframe
            list_dfs.append(get_one_df(rid, diag, date))

        # add labelled research ids which aren't in new kisim_diag
        for rid in list_rids_labelled:

            if rid not in set_rids_added:

                # these ids have no entry in dict_diags, hence neither text lines nor a
                # date of their own: pass explicit empties instead of values left over
                # from the loop above
                list_dfs.append(get_one_df(rid, None, ''))

    else:
        raise ValueError("unknown import {!r}".format(imported))

    # create big data frame
    df_diag = pd.concat(list_dfs, sort = True)
    df_diag = df_diag[df_diag['text'].notnull()]

    # reset indices to get unique index for each entry
    df_diag.index.name = 'index_within_rid'
    df_diag = df_diag.reset_index()
    df_diag = df_diag.reset_index()

    return df_diag


def load_text_file(path_diag, imported, which_input):

    '''
    load the diagnosis dictionary of the selected input and turn it into a
    data frame with one row per text line

    input:
    - path_diag: str, directory with the diagnosis json files (passed to load_data)
    - imported: str for import (passed to get_df_diag)
    - which_input: str, 'seantis-kisimdiagnoses', 'rsd-kisimdiagnoses' or 'reports'

    output:
    - df_diag: data frame returned by get_df_diag

    raises:
    - ValueError: if which_input is not one of the three supported values
    '''

    # set diagnosis extraction type
    which_diags = 'longest'   # 'unique' not suitable as Marc labelled longest

    # position of each input in the tuple returned by load_data
    position = {
        'seantis-kisimdiagnoses': 0,
        'rsd-kisimdiagnoses': 1,
        'reports': 2,
    }
    # fail with a clear message instead of an unbound dict_diag further down
    if which_input not in position:
        raise ValueError(
            "which_input must be one of {}, got {!r}".format(sorted(position), which_input)
        )

    # load dictionary of diagnosis
    dict_diag = load_data(path_data = path_diag, which_diags = which_diags)[position[which_input]]

    # get dataframe
    df_diag = get_df_diag(dict_diag, imported)

    return df_diag

# directory with the diagnosis dictionaries of the old project for the selected export
path_diag = f"{paths.DATA_PATH!s}/raw/midatams_preprocessed/diagnoses/{imported}"
df_diag = load_text_file(path_diag = path_diag, imported = imported, which_input = which_input)

In [None]:
# Create the preprocessed midatams directory if needed and write the diagnosis table there
midatams_dir = paths.DATA_PATH_PREPROCESSED/'midatams'
os.makedirs(midatams_dir, exist_ok=True)
output_file = os.path.join(midatams_dir, 'seantis_kisim.csv')
df_diag.to_csv(output_file, index=False)