In [23]:
### CODE IN THIS FILE IS BASED ON THE FORMAT OF https://github.com/danicaxiao/CONTENT/blob/master/transform.py
### HOWEVER THE CODE IS WRITTEN IN MY OWN FORM FOR BETTER UNDERSTANDING

import zipfile
import os
import urllib.request
import pandas as pd
import csv
import pickle
import numpy as np

In [2]:
# Global Definitions

# Util Functions From https://github.com/danicaxiao/CONTENT/blob/master/util.py
def save_pkl(path, dump):
    """Pickle `dump` and write it to the file at `path`.

    The file is opened in binary write mode, so an existing file at
    `path` is overwritten.
    """
    with open(path, mode='wb') as handle:
        pickle.dump(dump, handle)

def load_pkl(path):
    """Read the file at `path` in binary mode and return the unpickled object."""
    with open(path, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored
    
def save_npy(path, obj):
    """Write `obj` to `path` using `numpy.save`."""
    np.save(file=path, arr=obj)

def load_npy(path):
    """Return whatever `numpy.load` reads from `path`."""
    return np.load(path)

# Embedding Defs
# RARE_WORD: data_to_csv keeps a description in the vocabulary only when it
#            occurs more than this many times.
# STOP_WORD: data_to_csv lists a description in the stop-word file when it
#            occurs more than this many times.
# UNKNOWN:   index convert_format emits for descriptions missing from the
#            vocabulary (vocabulary indices written by data_to_csv start at 2).
RARE_WORD = 100
STOP_WORD = 1e4
UNKNOWN = 1

# General File Paths
# Vocabulary / stop-word text files, the pickled index-to-word mapping, the
# dataset URL, the local zip archive path and the tab-separated input file.
VOCAB_FILE = "./data/vocab.txt"
STOP_FILE = "./data/stop.txt"
VOCAB_PKL = "./data/vocab.pkl"
SIM_DATA_URL = "https://journals.plos.org/plosone/article/file?type=supplementary&id=10.1371/journal.pone.0195024.s001"
SIM_DATA_ZIP = "./txtData.zip"
INPUT_FILE = "./data/S1_File.txt"

# Train/Validation/Test Data File Paths (pickles written by splits)
X_TRAIN_FILE = "./data/X_train.pkl"
X_VALID_FILE = "./data/X_valid.pkl"
X_TEST_FILE = "./data/X_test.pkl"

# Train/Validation/Test Label File Paths (pickles written by splits)
Y_TRAIN_FILE = "./data/Y_train.pkl"
Y_VALID_FILE = "./data/Y_valid.pkl"
Y_TEST_FILE = "./data/Y_test.pkl"

# Train/Validation Split Values
# splits puts the first TRAIN_COUNT items in the training set, the next
# VALID_COUNT items in the validation set and everything after in the test set.
TRAIN_COUNT = 4000
VALID_COUNT = 700

In [7]:
def retrieve_data(print_out=False):
    """Download, unzip and load the input dataset as a DataFrame.

    The archive is downloaded to SIM_DATA_ZIP only when that file is not
    already present, and it is extracted into "./data" only when that
    directory does not exist yet.

    Args:
        print_out: when True, print the first rows of the loaded table.

    Returns:
        pandas.DataFrame read from the tab-separated INPUT_FILE, using its
        first line as the header.
    """
    # Retrieve Data If Not In Our Active Directory
    if not os.path.exists(SIM_DATA_ZIP):
        # The URL constant is SIM_DATA_URL; the previously referenced
        # SIM_DATA_LOC is not defined and raised NameError on a fresh download.
        urllib.request.urlretrieve(SIM_DATA_URL, SIM_DATA_ZIP)

    # Unzip our Data into Usable Form
    if not os.path.exists("./data"):
        with zipfile.ZipFile(SIM_DATA_ZIP, 'r') as zipped_file:
            zipped_file.extractall("./data")

    # Read Our Data into a Pandas Table
    data = pd.read_csv(INPUT_FILE, sep='\t', header=0)

    # Check Our Data
    if print_out:
        print("\n", data.head(), "\n")

    return data

# Load the dataset at cell level and preview its first rows; the resulting
# `data` frame is the argument passed to data_to_csv in the following cell.
print("Retrieving Data...")
data = retrieve_data(True)
print("Done!")

Retrieving Data...

    PID  DAY_ID                               DX_GROUP_DESCRIPTION  \
0    1   73888                                    ANGINA PECTORIS   
1    1   73888  MONONEURITIS OF UPPER LIMB AND MONONEURITIS MU...   
2    1   73888  SYMPTOMS INVOLVING RESPIRATORY SYSTEM AND OTHE...   
3    1   73880                                 ACUTE APPENDICITIS   
4    1   73880                                  DIABETES MELLITUS   

     SERVICE_LOCATION  OP_DATE  
0      DOCTORS OFFICE    74084  
1      DOCTORS OFFICE    74084  
2      DOCTORS OFFICE    74084  
3  INPATIENT HOSPITAL    74084  
4  INPATIENT HOSPITAL    74084   

Done!


In [8]:
def data_to_csv(data):
    """Write the vocabulary and stop-word lists derived from `data`.

    Rows are counted per 'DX_GROUP_DESCRIPTION'. Descriptions occurring more
    than RARE_WORD times are written to VOCAB_FILE as tab-separated
    (index, description) lines, ordered by ascending count with indices
    starting at 2. Descriptions occurring more than STOP_WORD times are
    written, one per line, to STOP_FILE.

    Args:
        data: DataFrame with a 'DX_GROUP_DESCRIPTION' column.
    """
    # Occurrence count of every description
    counts = (
        data.groupby('DX_GROUP_DESCRIPTION')
        .size()
        .to_frame('SIZE')
        .reset_index()
    )

    frequent_enough = counts[counts['SIZE'] > RARE_WORD]
    too_frequent = counts[counts['SIZE'] > STOP_WORD]

    vocab = frequent_enough.sort_values(by='SIZE').reset_index()['DX_GROUP_DESCRIPTION']
    stop_words = too_frequent.reset_index()['DX_GROUP_DESCRIPTION']

    # We will follow the studies format of keeping "Unknown" as 1
    vocab.index += 2

    print("Writing Vocab List to CSV...")
    vocab.to_csv(VOCAB_FILE, sep='\t', header=False, index=True)
    print("Done!")

    print("\nWriting Stop Word List to CSV...")
    stop_words.to_csv(STOP_FILE, sep='\t', header=False, index=False)
    print("Done!")

    print("\nData Successfully Written as {} and {} in CSV Format!".format(VOCAB_FILE, STOP_FILE))
    
# Write the vocabulary and stop-word files from the `data` frame loaded by
# the retrieve_data cell above.
data_to_csv(data)

Writing Vocab List to CSV...
Done!

Writing Stop Word List to CSV...
Done!

Data Successfully Written as ./data/vocab.txt and ./data/stop.txt in CSV Format!


In [9]:
def load_data_from_file():
    """Load the vocabulary written to VOCAB_FILE and pickle its inverse.

    VOCAB_FILE is read as tab-separated (index, word) rows. The inverse
    mapping (index -> word) is saved to VOCAB_PKL.

    Returns:
        dict mapping each word to its integer index.
    """
    # newline='' is the documented way to open a file for csv.reader
    with open(VOCAB_FILE, 'r', newline='') as vocab_file:
        read_in = csv.reader(vocab_file, delimiter='\t')
        # Blank lines come back as empty rows; skip them instead of crashing
        word2ind = {entry[1]: int(entry[0]) for entry in read_in if entry}

    # Save Ind2Word Vec to Pickled File
    # (save_pkl is the helper this notebook defines; save_pickle does not exist)
    save_pkl(VOCAB_PKL, {val: key for key, val in word2ind.items()})

    return word2ind

# load_data_from_file()
# load_pkl(VOCAB_PKL)

In [16]:
# THIS FUNCTION IS ADAPTED FROM https://github.com/danicaxiao/CONTENT/blob/master/transform.py
def convert_format(word_to_index, events):
    """Convert INPUT_FILE into nested index lists plus per-day labels.

    The file is read line by line. A new "document" starts whenever the PID
    column changes from the previous row, and a new "sentence" starts
    whenever DAY_ID changes within the same PID, so rows are expected to be
    grouped by PID and, within a PID, by DAY_ID.

    Args:
        word_to_index: mapping from DX_GROUP_DESCRIPTION text to an integer
            index; descriptions missing from it are encoded as UNKNOWN.
        events: object forwarded unchanged to `tag` to label each
            (PID, DAY_ID) pair.

    Returns:
        (docs, labels): `docs` holds one list per PID, each containing one
        list of word indices per DAY_ID; `labels` holds one list per PID with
        one `tag` result per DAY_ID. Both are empty when the file has no data
        rows.
    """
    docs = []
    labels = []

    # order by PID, DAY_ID
    with open(INPUT_FILE, mode='r') as f:
        # header
        header = f.readline().strip().split('\t')
        print(header)
        pos = {column: index for index, column in enumerate(header)}
        print(pos)

        doc = []
        sent = []
        label = []
        pid = None
        day_id = None

        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no record; indexing their
                # tokens used to raise IndexError.
                continue

            tokens = stripped.split('\t')
            c_pid = tokens[pos['PID']]
            c_day_id = tokens[pos['DAY_ID']]

            if pid is None:
                # first data row: open the first document / sentence
                pid = c_pid
                day_id = c_day_id
                label.append(tag(events, pid, day_id))
            elif c_pid != pid:
                # closure of the previous patient
                doc.append(sent)
                docs.append(doc)
                labels.append(label)
                sent = []
                doc = []
                pid = c_pid
                day_id = c_day_id
                label = [tag(events, pid, day_id)]
            elif c_day_id != day_id:
                # same patient, new day: close the current sentence
                doc.append(sent)
                sent = []
                day_id = c_day_id
                label.append(tag(events, pid, day_id))

            word = tokens[pos['DX_GROUP_DESCRIPTION']]
            try:
                sent.append(word_to_index[word])
            except KeyError:
                sent.append(UNKNOWN)

    if pid is None:
        # Header-only (or empty) file: nothing to close
        return docs, labels

    # closure of the last patient
    doc.append(sent)
    docs.append(doc)
    labels.append(label)

    return docs, labels

# THIS FUNCTION IS DIRECTLY RE-USED FROM https://github.com/danicaxiao/CONTENT/blob/master/transform.py
def tag(events, pid, day_id):
    """Return 1 when `tag_logic(events, pid, day_id)` is truthy, otherwise 0."""
    if tag_logic(events, pid, day_id):
        return 1
    return 0

# THIS FUNCTION IS DIRECTLY RE-USED FROM https://github.com/danicaxiao/CONTENT/blob/master/transform.py
def tag_logic(events, pid, day_id):
    """Tell whether `pid` has an event with DAY_ID in [day_id, day_id + 30).

    Args:
        events: table indexed by integer PID with a DAY_ID column.
        pid: patient id, convertible with int().
        day_id: first day of the window, convertible with int().

    Returns:
        A truthy value when at least one of the patient's events falls inside
        the window, a falsy one otherwise. False when a KeyError is raised,
        e.g. because `pid` is not in the index of `events`.
    """
    try:
        history = events.loc[int(pid)]
        window_start = int(day_id)
        window_end = window_start + 30

        if not isinstance(history, pd.Series):
            # Several events for this patient: keep those inside the window
            inside = (window_start <= history.DAY_ID) & (history.DAY_ID < window_end)
            return history.loc[inside].shape[0] > 0

        # A single event comes back as a Series, so DAY_ID is a scalar here
        return (window_start <= history.DAY_ID) & (history.DAY_ID < window_end)
    except KeyError:
        # the label is not in the [index]
        return False

# THIS FUNCTION IS DIRECTLY RE-USED FROM https://github.com/danicaxiao/CONTENT/blob/master/transform.py
def extract_events():
    """Build the table of 'INPATIENT HOSPITAL' events from INPUT_FILE.

    Returns:
        DataFrame indexed by PID with columns DAY_ID, SERVICE_LOCATION and
        COUNT (number of matching rows per PID / DAY_ID / SERVICE_LOCATION),
        sorted ascending by PID and then DAY_ID.
    """
    # extract event "INPATIENT HOSPITAL"
    target_event = 'INPATIENT HOSPITAL'

    records = pd.read_csv(INPUT_FILE, sep='\t', header=0)
    inpatient = records[records['SERVICE_LOCATION'] == target_event]

    group_keys = ['PID', 'DAY_ID', 'SERVICE_LOCATION']
    counted = inpatient.groupby(group_keys).size().to_frame('COUNT').reset_index()
    ordered = counted.sort_values(by=['PID', 'DAY_ID'], ascending=True)

    return ordered.set_index('PID')

In [20]:
def splits(X, labels):
    """Split the data and labels into train/validation/test pickles.

    The first TRAIN_COUNT items go to the training files, the next
    VALID_COUNT items to the validation files and the remainder to the test
    files. `X` and `labels` are sliced with the same boundaries.

    Args:
        X: sliceable sequence of samples.
        labels: sliceable sequence of labels.
    """
    valid_end = TRAIN_COUNT + VALID_COUNT

    # save_pkl is the helper this notebook defines; save_pickle does not exist
    # and raised NameError on a fresh kernel.
    save_pkl(X_TRAIN_FILE, X[:TRAIN_COUNT])
    save_pkl(X_VALID_FILE, X[TRAIN_COUNT:valid_end])
    save_pkl(X_TEST_FILE, X[valid_end:])
    save_pkl(Y_TRAIN_FILE, labels[:TRAIN_COUNT])
    save_pkl(Y_VALID_FILE, labels[TRAIN_COUNT:valid_end])
    save_pkl(Y_TEST_FILE, labels[valid_end:])

In [21]:
# This Cell Mimics Main() in https://github.com/danicaxiao/CONTENT/blob/master/transform.py
# This Should Setup the Data Needed For Training, Validation, and Testing
def main():
    """Run the preprocessing pipeline end to end.

    Loads the raw table, writes the vocabulary and stop-word files, reloads
    the vocabulary, extracts the events, converts the input file into
    documents and labels, and saves the train/validation/test splits.
    """
    raw_table = retrieve_data(True)
    data_to_csv(raw_table)

    vocabulary = load_data_from_file()
    inpatient_events = extract_events()

    docs, labels = convert_format(vocabulary, inpatient_events)
    splits(docs, labels)

In [22]:
# Run the full preprocessing pipeline defined in the cell above.
main()


    PID  DAY_ID                               DX_GROUP_DESCRIPTION  \
0    1   73888                                    ANGINA PECTORIS   
1    1   73888  MONONEURITIS OF UPPER LIMB AND MONONEURITIS MU...   
2    1   73888  SYMPTOMS INVOLVING RESPIRATORY SYSTEM AND OTHE...   
3    1   73880                                 ACUTE APPENDICITIS   
4    1   73880                                  DIABETES MELLITUS   

     SERVICE_LOCATION  OP_DATE  
0      DOCTORS OFFICE    74084  
1      DOCTORS OFFICE    74084  
2      DOCTORS OFFICE    74084  
3  INPATIENT HOSPITAL    74084  
4  INPATIENT HOSPITAL    74084   

Writing Vocab List to CSV...
Done!

Writing Stop Word List to CSV...
Done!

Data Successfully Written as ./data/vocab.txt and ./data/stop.txt in CSV Format!
['PID', 'DAY_ID', 'DX_GROUP_DESCRIPTION', 'SERVICE_LOCATION', 'OP_DATE']
{'PID': 0, 'DAY_ID': 1, 'DX_GROUP_DESCRIPTION': 2, 'SERVICE_LOCATION': 3, 'OP_DATE': 4}
