In [46]:
import zipfile
import os
import urllib.request
import pandas as pd
import csv
import pickle
import numpy as np


In [52]:
# Global Definitions

# Util Functions
def save_pickle(path, dump):
    """Serialize ``dump`` with pickle and write it to ``path``.

    Parameters
    ----------
    path : str
        Destination file; opened in binary write mode, so an existing file
        is overwritten.
    dump : object
        Any picklable Python object.
    """
    with open(path, 'wb') as sink:
        writer = pickle.Pickler(sink)
        writer.dump(dump)

def load_pickle(path):
    """Read one pickled object from ``path`` and return it.

    Parameters
    ----------
    path : str
        File to read; opened in binary read mode.

    Returns
    -------
    object
        The first object stored in the pickle stream.
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook (or another trusted source) produced.
    with open(path, 'rb') as source:
        reader = pickle.Unpickler(source)
        return reader.load()

# Embedding Defs
# Frequency thresholds are compared against the number of rows per
# DX_GROUP_DESCRIPTION value (strict "greater than" in data_to_csv).
RARE_WORD = 100  # a description must occur more than this many times to be written to the vocab list
STOP_WORD = 1e4  # a description occurring more than this many times is written to the stop list
UNKNOWN = 1  # index reserved for "Unknown"; vocab indices are written starting at 2

# File Names
VOCAB_FILE = "./data/vocab.txt"  # tab-separated "index<TAB>description" vocab list
STOP_FILE = "./data/stop.txt"  # one stop-word description per line
INPUT_FILE = "./data/input.txt"  # tab-separated file with a header row, read by fix_format
VOCAB_PKL = "./data/vocab.pkl"  # pickled index -> description dictionary
# Presumably the download location of the zipped supplementary dataset -- TODO confirm
SIM_DATA_URL = "https://journals.plos.org/plosone/article/file?type=supplementary&id=10.1371/journal.pone.0195024.s001"
SIM_DATA_ZIP = "./txtData.zip"  # local path the archive is downloaded to
SIM_DATA_FILE = "./data/S1_File.txt"  # tab-separated data file loaded with pandas

In [11]:
def retrieve_data(print_out=False):
    """Download (if needed), unpack (if needed) and load the dataset.

    Parameters
    ----------
    print_out : bool, optional
        When True, print the first rows of the loaded table as a sanity
        check.

    Returns
    -------
    pandas.DataFrame
        Contents of SIM_DATA_FILE, parsed as tab-separated text with the
        first line used as the header.
    """
    # Retrieve Data If Not In Our Active Directory
    if not os.path.exists(SIM_DATA_ZIP):
        # The URL constant is SIM_DATA_URL; the previous name (SIM_DATA_LOC)
        # was never defined and raised NameError on a first-time download.
        urllib.request.urlretrieve(SIM_DATA_URL, SIM_DATA_ZIP)

    # Unzip our Data into Usable Form.
    # Guard on the file we actually read, not on the directory, so that a
    # pre-existing (empty or partial) ./data folder does not skip extraction.
    if not os.path.exists(SIM_DATA_FILE):
        with zipfile.ZipFile(SIM_DATA_ZIP, 'r') as zipped_file:
            zipped_file.extractall("./data")

    # Read Our Data into a Pandas Table
    data = pd.read_csv(SIM_DATA_FILE, sep='\t', header=0)

    # Check Our Data
    if print_out:
        print("\n", data.head(), "\n")

    # Hand the table back to the caller; without this the frame was lost
    # when the function returned.
    return data

print("Retrieving Data...")
# Bind the result at module level: later cells read `data` as a global.
data = retrieve_data(True)
print("Done!")

Retrieving Data...

    PID  DAY_ID                               DX_GROUP_DESCRIPTION  \
0    1   73888                                    ANGINA PECTORIS   
1    1   73888  MONONEURITIS OF UPPER LIMB AND MONONEURITIS MU...   
2    1   73888  SYMPTOMS INVOLVING RESPIRATORY SYSTEM AND OTHE...   
3    1   73880                                 ACUTE APPENDICITIS   
4    1   73880                                  DIABETES MELLITUS   

     SERVICE_LOCATION  OP_DATE  
0      DOCTORS OFFICE    74084  
1      DOCTORS OFFICE    74084  
2      DOCTORS OFFICE    74084  
3  INPATIENT HOSPITAL    74084  
4  INPATIENT HOSPITAL    74084   

Done!


In [25]:
def data_to_csv():
    """Write the vocabulary list and the stop-word list to disk.

    Reads the module-level ``data`` DataFrame, counts the rows for each
    ``DX_GROUP_DESCRIPTION`` value and then:

    * writes every description whose count is greater than RARE_WORD to
      VOCAB_FILE as ``index<TAB>description``, ordered by ascending count,
      with indices starting at 2;
    * writes every description whose count is greater than STOP_WORD to
      STOP_FILE, one description per line.

    Progress messages are printed to stdout. Returns None.
    """
    # Occurrence count per description, as a two-column frame.
    counts = (
        data.groupby('DX_GROUP_DESCRIPTION')
        .size()
        .to_frame('SIZE')
        .reset_index()
    )

    # Entries above each frequency threshold.
    above_rare = counts.loc[counts['SIZE'] > RARE_WORD]
    above_stop = counts.loc[counts['SIZE'] > STOP_WORD]

    # Vocabulary: least frequent first, renumbered from zero ...
    vocab = above_rare.sort_values(by='SIZE').reset_index()['DX_GROUP_DESCRIPTION']
    # ... then shifted so the first entry is 2; we follow the study's
    # format of keeping "Unknown" as 1.
    vocab.index = vocab.index + 2

    stop_words = above_stop.reset_index()['DX_GROUP_DESCRIPTION']

    print("Writing Vocab List to CSV...")
    vocab.to_csv(VOCAB_FILE, sep='\t', header=False, index=True)
    print("Done!")

    print("\nWriting Stop Word List to CSV...")
    stop_words.to_csv(STOP_FILE, sep='\t', header=False, index=False)
    print("Done!")

    summary = "\nData Successfully Written as {} and {} in CSV Format!"
    print(summary.format(VOCAB_FILE, STOP_FILE))

data_to_csv()

Writing Vocab List to CSV...
Done!

Writing Stop Word List to CSV...
Done!

Data Successfully Written as ./data/vocab.txt and ./data/stop.txt in CSV Format!


In [55]:
def load_data_from_file():
    """Build the word -> index vocabulary from VOCAB_FILE.

    Each non-empty row of VOCAB_FILE is read as tab-separated text whose
    first field is an integer index and whose second field is the word.
    As a side effect, the inverse mapping (index -> word) is pickled to
    VOCAB_PKL.

    Returns
    -------
    dict
        Maps each word (str) to its integer index.

    Raises
    ------
    ValueError
        If the first field of a row is not an integer.
    IndexError
        If a non-empty row has fewer than two fields.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(VOCAB_FILE, 'r', newline='') as vocab_file:
        read_in = csv.reader(vocab_file, delimiter='\t')
        # csv.reader yields [] for blank lines; skip them instead of
        # failing on entry[1].
        word2ind = {entry[1]: int(entry[0]) for entry in read_in if entry}

    # Save Ind2Word Vec to Pickled File
    save_pickle(VOCAB_PKL, {val: key for key, val in word2ind.items()})

    return word2ind

load_data_from_file()
# load_pickle(VOCAB_PKL)

{'DISEASES OF THE ORAL SOFT TISSUES, EXCLUDING LESIONS SPECIFIC FOR GINGIVA AND TONGUE': 2,
 'ALCOHOL DEPENDENCE SYNDROME': 3,
 'FAMILY HISTORY OF MALIGNANT NEOPLASM': 4,
 'MALIGNANT NEOPLASM OF GALLBLADDER AND EXTRAHEPATIC BILE DUCTS': 5,
 'OTHER DISORDERS OF PROSTATE': 6,
 'SPECIFIC NONPSYCHOTIC MENTAL DISORDERS FOLLOWING ORGANIC BRAIN DAMAGE': 7,
 'ILL-DEFINED INTESTINAL INFECTIONS': 8,
 'OTHER AND UNSPECIFIED INFECTIOUS AND PARASITIC DISEASES': 9,
 'DISORDERS OF MENSTRUATION AND OTHER ABNORMAL BLEEDING FROM FEMALE GENITAL TRACT': 10,
 'TERAZOSIN HYDROCHLORIDE': 11,
 'OTHER PERSONS SEEKING CONSULTATION WITHOUT COMPLAINT OR SICKNESS': 12,
 'DISORDERS OF IRIS AND CILIARY BODY': 13,
 'OTHER DISORDERS OF THYROID': 14,
 'OPEN WOUND OF TOE(S)': 15,
 'FALL ON SAME LEVEL FROM SLIPPING, TRIPPING, OR STUMBLING': 16,
 'TRIAMTERENE': 17,
 'FRACTURE OF PELVIS': 18,
 'MALIGNANT NEOPLASM OF LARYNX': 19,
 'FRACTURE OF TIBIA AND FIBULA': 20,
 'KETOCONAZOLE': 21,
 'CINACALCET HYDROCHLORIDE': 22,
 'AS

In [None]:
def fix_format(word2ind, events):
    with open(INPUT_FILE, 'r') as input_file:
        file_header = input_file.readline().strip().split('\t')
        pos = { entry : i for i, entry in enumerate(file_header) }
        
        docs, doc, sent, labels, label = ([], ) * 5
        read_in = csv.reader(input_file, delimiter='\t')
        
