In [None]:
import pandas as pd
import numpy as np
import os
import string
import random
import json

# Folder holding the MIMIC-III v1.4 CSV files read below
# (NOTEEVENTS.csv, DIAGNOSES_ICD.csv, D_ICD_DIAGNOSES.csv).
MIMIC_DATA_DIR = '/path/to/MIMIC-III v1.4'
# Local copy of the ScAN repository; its 'annotations' and 'ScAN_segmentation'
# sub-folders are read below.
SCAN_REPO_DIR = '/path/to/ScAN'
# Folder (relative to the working directory) for the files this notebook writes.
DATASET_DIR = 'Datasets'

# The notebook shuffles, chooses and samples with the `random` module and writes
# the results to disk; seed once here so Restart & Run All reproduces them.
SEED = 42
random.seed(SEED)

## Read data and annotation

In [None]:
# Load the three MIMIC-III tables used below, in this order: note events,
# per-admission diagnosis codes, and the diagnosis-code dictionary.
notes, diags, diag_dict = (
    pd.read_csv(os.path.join(MIMIC_DATA_DIR, table_file))
    for table_file in ('NOTEEVENTS.csv', 'DIAGNOSES_ICD.csv', 'D_ICD_DIAGNOSES.csv')
)

In [None]:
# The ScAN annotation files live in one folder; each is a JSON object whose keys
# are parsed in the following cells.
annotation_dir = os.path.join(SCAN_REPO_DIR, 'annotations')

with open(os.path.join(annotation_dir, 'val_hadm.json'), 'r') as val_file:
    val = json.load(val_file)

with open(os.path.join(annotation_dir, 'train_hadm.json'), 'r') as train_file:
    train = json.load(train_file)

with open(os.path.join(annotation_dir, 'test_hadm.json'), 'r') as test_file:
    test = json.load(test_file)

In [None]:
# Annotation keys are split on '_'; the second field is kept as an integer
# admission id. Order is val, then train, then test.
SA_hadms = [
    int(stay_key.split('_')[1])
    for annotated_split in (val, train, test)
    for stay_key in annotated_split.keys()
]
# Number of distinct annotated admissions.
len(set(SA_hadms))

In [None]:
# Same keys as above, but keep the first '_'-separated field as an integer
# subject id. Order is val, then train, then test.
SA_subs = [
    int(stay_key.split('_')[0])
    for annotated_split in (val, train, test)
    for stay_key in annotated_split.keys()
]
# Number of distinct annotated subjects.
len(set(SA_subs))

In [None]:
# Substrings searched for (case-insensitively) in the ICD-9 LONG_TITLE of every
# diagnosis of a subject; a single hit excludes the subject from the neutral pool.
SAwords = ['suicide', 'suicidal', 'self-inflicted', 'overdose', 'poison', 'vehicle', 'drowning']

# Number of neutral stays to collect before stopping.
# NOTE(review): the split cell below cuts at index 1000, so with 29 stays the
# neutral validation set is always empty - confirm the intended size.
N_NEUTRAL = 29

neutralHadms, neutralSubs = [], []

# --- Loop invariants, computed once instead of once per candidate subject ---
# Subjects that already appear in the ScAN annotations.
SA_sub_set = set(SA_subs)
# Admissions that have at least one row in NOTEEVENTS (HADM_ID may be missing there).
hadms_with_notes = set(notes['HADM_ID'].dropna().astype(int))
# Subjects with at least one diagnosis title containing a screening word.
diag_titles = diags.merge(diag_dict[['ICD9_CODE', 'LONG_TITLE']], on='ICD9_CODE')
sa_title_mask = diag_titles['LONG_TITLE'].str.lower().str.contains('|'.join(SAwords), na=False)
SA_flagged_subs = set(diag_titles.loc[sa_title_mask, 'SUBJECT_ID'])
# Distinct admissions of every subject, indexed by SUBJECT_ID.
hadms_by_sub = diags.groupby('SUBJECT_ID')['HADM_ID'].unique()

# unique() yields each subject once, so no "already chosen" check is needed.
allSubs = diags['SUBJECT_ID'].unique()
random.shuffle(allSubs)
for sub in allSubs:
    if len(neutralHadms) == N_NEUTRAL:
        break
    if sub in SA_sub_set or sub in SA_flagged_subs:
        continue
    # One random admission per subject; the subject is dropped (not retried)
    # when that admission has no notes.
    chosenHadm = random.choice(list(hadms_by_sub.loc[sub]))
    if int(chosenHadm) not in hadms_with_notes:
        continue
    # Same '<subject>_<hadm>' key format as the ScAN annotation files.
    neutralHadms.append(str(sub) + '_' + str(chosenHadm))
    neutralSubs.append(sub)

In [None]:
# Spot check: show the diagnoses of one randomly chosen neutral stay.
randHadm = random.choice(neutralHadms)
# neutralHadms holds '<subject>_<hadm>' strings; HADM_ID in `diags` is numeric,
# so the key must be parsed before filtering (comparing the raw string matches nothing).
randHadmId = int(randHadm.split('_')[1])
diags[diags['HADM_ID'] == randHadmId].merge(diag_dict[['ICD9_CODE','SHORT_TITLE', 'LONG_TITLE']], on='ICD9_CODE')

In [None]:
# Index at which the neutral stays are cut into train / validation parts.
# NOTE(review): the sampling cell above stops after 29 neutral stays, so with a
# cut at 1000 every stay lands in train_neutral and val_neutral is empty.
# Confirm the intended sample size / split point.
N_TRAIN_NEUTRAL = 1000

# Same layout as the ScAN annotation files: '<subject>_<hadm>' keys, empty dict values.
train_neutral = {item: {} for item in neutralHadms[:N_TRAIN_NEUTRAL]}
val_neutral = {item: {} for item in neutralHadms[N_TRAIN_NEUTRAL:]}

# open(..., 'w') does not create missing parent folders.
os.makedirs(DATASET_DIR, exist_ok=True)

with open(os.path.join(DATASET_DIR, 'train_neutral_hadm.json'), 'w') as outfile:
    json.dump(train_neutral, outfile)
with open(os.path.join(DATASET_DIR, 'val_neutral_hadm.json'), 'w') as outfile:
    json.dump(val_neutral, outfile)

## Assign stay label

In [None]:
def assign_label(folder_path, neutral_folder_path):
    """Derive one stay-level label per hospital admission.

    Parameters
    ----------
    folder_path : str
        Folder of CSV files, one per stay, each with a 'label' and a 'hadmid'
        column. The admission id is taken from the first row.
    neutral_folder_path : str
        Folder of neutral stays. Only the file names are used; the text before
        the first '.' must be the integer admission id.

    Returns
    -------
    dict
        Maps int admission id to 'SA_positive', 'SA_negative' or 'SA_unsure'.

    Notes
    -----
    The label is the strict majority among the 'SA_positive', 'SA_negative' and
    'SA_unsure' rows; any other 'label' value is ignored. A stay without any
    such row is 'SA_negative'. Ties get no label and are printed as
    ``hadmid y unsure n``. Every neutral stay is 'SA_negative'.
    """
    labels = {}

    for filename in os.listdir(folder_path):
        sents = pd.read_csv(os.path.join(folder_path, filename))
        if sents.empty:
            # No rows means no 'hadmid' to key on; report it instead of
            # failing with an IndexError on iloc[0].
            print(filename, 'has no rows; skipped')
            continue

        counts = sents['label'].value_counts()
        y = int(counts.get('SA_positive', 0))
        n = int(counts.get('SA_negative', 0))
        unsure = int(counts.get('SA_unsure', 0))
        first_hadm = sents.iloc[0]['hadmid']

        if y > n and y > unsure:
            stay_label = 'SA_positive'
        elif n > y and n > unsure:
            stay_label = 'SA_negative'
        elif unsure > y and unsure > n:
            stay_label = 'SA_unsure'
        elif y + n + unsure == 0:
            stay_label = 'SA_negative'
        else:
            # Tie between the top classes: leave the stay unlabelled for manual review.
            print(first_hadm, y, unsure, n)
            continue
        labels[int(first_hadm)] = stay_label

    for filename in os.listdir(neutral_folder_path):
        hadm = int(filename.split('.')[0])
        # Must go into the local result: writing to the global `train_label`
        # raised NameError on a fresh kernel and mislabelled the validation call.
        labels[hadm] = 'SA_negative'

    return labels

# Both splits read their annotated stays from the ScAN segmentation folder and
# their neutral stays from the matching folder under DATASET_DIR.
segmentation_dir = os.path.join(SCAN_REPO_DIR, 'ScAN_segmentation')

train_label = assign_label(
    os.path.join(segmentation_dir, 'train'),
    os.path.join(DATASET_DIR, 'train_neutral'),
)
val_label = assign_label(
    os.path.join(segmentation_dir, 'val'),
    os.path.join(DATASET_DIR, 'val_neutral'),
)

In [None]:
# Persist both label dictionaries under matching file names. The original cell
# wrote train_label into val_label.json and never produced the train_label.json
# that the next section reads back.
# json.dump stores the integer admission ids as string keys.
label_dir = os.path.join(DATASET_DIR, 'ScAN_segmentation')
os.makedirs(label_dir, exist_ok=True)

with open(os.path.join(label_dir, 'train_label.json'), 'w') as outfile:
    json.dump(train_label, outfile)
with open(os.path.join(label_dir, 'val_label.json'), 'w') as outfile:
    json.dump(val_label, outfile)

## Split validation set

Hold out a validation set from the training set, and use the original ScAN validation set as the test set.

In [None]:
# Reload the stay labels from disk (keys come back as strings after the JSON
# round trip) and bucket the stays by label.
train_label_path = os.path.join(DATASET_DIR, 'ScAN_segmentation', 'train_label.json')
with open(train_label_path, 'r') as label_file:
    train_label = json.load(label_file)

y_stays, unsure_stays, n_stays = (
    [stay for stay, stay_label in train_label.items() if stay_label == wanted]
    for wanted in ('SA_positive', 'SA_unsure', 'SA_negative')
)

In [None]:
# Draw the held-out validation stays without replacement, class by class:
# 36 positive, 10 unsure and 130 negative stays, concatenated in that order.
val_set = [
    *random.sample(y_stays, 36),
    *random.sample(unsure_stays, 10),
    *random.sample(n_stays, 130),
]

In [None]:
# Reuse a previously saved validation split when one exists so that it stays
# fixed across runs; otherwise persist the split sampled in the previous cell.
# (Nothing else in this notebook writes the file, so an unconditional read
# fails on a fresh checkout.)
val_set_path = os.path.join(DATASET_DIR, 'ScAN_segmentation', 'validationHadms.json')
if os.path.exists(val_set_path):
    with open(val_set_path, 'r') as f:
        val_set = json.load(f)
else:
    with open(val_set_path, 'w') as f:
        json.dump(val_set, f)