In [1]:
import sys
import _pickle as pickle
import numpy as np
from datetime import datetime
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
admissionFile = '../sah_dataset/gan_pat_df.csv'
diagnosisFile = '../sah_dataset/top200diag.csv'

# 1. build mapping for patient table 

In [3]:
infd = open(admissionFile, 'r')
infd.readline()

',patient_id,patient_sk,race,gender,marital_status,encounter_id,discharge_disposition_id,discharged_dt_tm,deadOrAlive\n'

In [4]:
print('Building pid-encounter id mapping, discharged-date mapping')
pidEncMap = {}
disDateMap = {}
for line in infd:
    tokens = line.strip().split(',')
    pid = int(tokens[1])
    encId = int(float(tokens[6]))
    disTime = datetime.strptime(tokens[8], '%Y-%m-%d %H:%M:%S')
    disDateMap[encId] = disTime
    if pid in pidEncMap: pidEncMap[pid].append(encId)
    else: pidEncMap[pid] = [encId]
infd.close()

Building pid-encounter id mapping, discharged-date mapping


# 1.2 build convariance table

In [5]:
print('Collecting gender information')
pidSexMap = {}
infd = open(admissionFile, 'r')
infd.readline()
for line in infd:
    tokens = line.strip().split(',')
    pid = int(tokens[1])
    gender = tokens[4]
    if gender == 'Male':
         pidSexMap[pid] = 1
    else:
         pidSexMap[pid] = 0
infd.close()    
    
print('Collecting race information')
pidRaceMap = {}
infd = open(admissionFile, 'r')
infd.readline()
for line in infd:
    tokens = line.strip().split(',')
    pid = int(tokens[1])
    race = tokens[3]
    if race == 'Caucasian':
         pidRaceMap[pid] = 0
    elif race == 'African American':
         pidRaceMap[pid] = 1
    elif race == 'Native American':
         pidRaceMap[pid] = 2
    elif race == 'Asian':
         pidRaceMap[pid] = 3            
    elif race == 'Hispanic':
         pidRaceMap[pid] = 4
    else:
         pidRaceMap[pid] = 5           
infd.close()      

Collecting gender information
Collecting race information


# 2. build mapping for diagnosis table

In [6]:
infd = open(diagnosisFile, 'r')
infd.readline()

',encounter_id,diagnosis_id,diagnosis_priority,diagnosis_type_id,present_on_admit_id,third_party_ind\n'

In [7]:
print('Building discharged-dxList mapping')
encDxMap = {}
for line in infd:
    tokens = line.strip().split(',')
    encId = int(float(tokens[1]))
    dxStr = int(float(tokens[2]))
    if encId in encDxMap: encDxMap[encId].append(dxStr)
    else: encDxMap[encId] = [dxStr]
infd.close()

Building discharged-dxList mapping


# 3. Building pid-sortedVisits mapping

In [8]:
print('Building pid-sortedVisits mapping')
pidSeqMap = {}
for pid, encIdList in pidEncMap.items():
    #if len(encIdList) < 2: continue
    sortedList = sorted([(disDateMap[encId], encDxMap[encId]) for encId in encIdList if encId in encDxMap], reverse = True)
    #### this line did not consider encId mismatch in diagnosis table
    pidSeqMap[pid] = sortedList

Building pid-sortedVisits mapping


In [9]:
len(pidSeqMap)

65988

# 4. Building pids, dates, strSeqs

In [10]:
print('Building pids, dates, strSeqs,convariance')
pids = []
dates = []
seqs = []
races = []
genders = []
for pid, visits in pidSeqMap.items():
    pids.append(pid)
    races.append(pidRaceMap[pid])
    genders.append(pidSexMap[pid])
    seq = []
    date = []
    for visit in visits:
        date.append(visit[0])
        seq.append(visit[1])
    dates.append(date)
    seqs.append(seq)

Building pids, dates, strSeqs,convariance


In [11]:
len(races)

65988

# 5. Converting strSeqs to intSeqs, and making types

In [12]:
print('Converting strSeqs to intSeqs, and making types')
types = {}
newSeqs = []
for patient in seqs:
    newPatient = []
    for visit in patient:
        newVisit = []
        for code in visit:
            if code in types:
                newVisit.append(types[code])
            else:
                types[code] = len(types)
                newVisit.append(types[code])
        newPatient.append(newVisit)
    newSeqs.append(newPatient)

Converting strSeqs to intSeqs, and making types


In [49]:
len(types)

200

# 6. Constructing the matrix

In [50]:
otuFile = "SAH_binary"

In [13]:
print('Constructing the matrix')
numPatients = len(newSeqs)
numCodes = len(types)
matrix = np.zeros((numPatients, numCodes)).astype('float32')
for i, patient in enumerate(newSeqs):
    for visit in patient:
        for code in visit:
                matrix[i][code] = 1.                

Constructing the matrix


In [52]:
matrix.shape

(65988, 200)

In [53]:
#each row is a patient
#each column is a type id(convert from diagnosis id)

In [54]:
matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [55]:
df = pd.DataFrame(data=matrix)

In [56]:
df.insert(200, "race", races)

In [57]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,race
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [58]:
x=np.zeros(65988) 

In [59]:
len(x)

65988

In [60]:
df.insert(201, "causian", x)
df.insert(202, "asian", x)
df.insert(203, "hispanic", x)

In [75]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,194,195,196,197,198,199,race,causian,asian,hispanic
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [87]:
df.loc[df['race'] == 0, 'causian'] = 1
df.loc[df['race'] == 3, 'asian'] = 1
df.loc[df['race'] == 4, 'hispanic'] = 1

In [96]:
covariance=df[df.columns[201:204]] 

In [97]:
cov=covariance.values.tolist()

In [98]:
cov

[[0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0, 0.0],
 [1.0, 0.0

In [22]:
pickle.dump(matrix, open(outFile + '.matrix', 'wb'), -1)

In [23]:
pickle.dump(types, open(outFile+'.types', 'wb'), -1)

In [24]:
pickle.dump(pids, open(outFile+'.pids', 'wb'), -1)

In [99]:
pickle.dump(cov, open(outFile+'.covs', 'wb'), -1)