In [2]:
import numpy as np
import pandas as pd
import tqdm
import scipy.sparse
import pickle
import os

#os.mkdir('matrix')
icd_desc = pd.read_csv('D_ICD_DIAGNOSES.csv.gz')
MIMIC3 = pd.read_csv('DIAGNOSES_ICD.csv.gz')[["SUBJECT_ID", "ICD9_CODE"]].dropna(axis=0)

# preprocess ICD9_CODE to match phecode. Rule: 3 chars --> no change. 4 chars --> add . before last char. 5 chars --> add . before second last char
icd_desc_icd_recoded = [icd if len(icd) == 3 else icd[:3] + '.' + icd[-1] if len(icd) == 4 else icd[:3] + '.' + icd[-2:] for icd in icd_desc['ICD9_CODE']]
MIMIC3_icd_recoded = [icd if len(icd) == 3 else icd[:3] + '.' + icd[-1] if len(icd) == 4 else icd[:3] + '.' + icd[-2:] for icd in MIMIC3['ICD9_CODE'].astype('string')]
icd_desc['ICD9_CODE'] = icd_desc_icd_recoded
MIMIC3['ICD9_CODE'] = MIMIC3_icd_recoded

# only need ICD9 code, PheCode and description
phecode = pd.read_csv('phecode_icd9_rolled.csv')[['ICD9', 'PheCode','ICD9 String', 'Phenotype']]

# join MIMIC3 and phecode based on ICD9 codes, so that each row is an ICD code and its associated PheCode for a patient
MIMIC3_phecode = MIMIC3.join(phecode.set_index('ICD9'), on='ICD9_CODE', how='inner')


patient_list = np.sort(MIMIC3_phecode.SUBJECT_ID.unique().tolist())
D = len(patient_list)

In [3]:
# patient_idx_meta = {k: v for v, k in enumerate(patient_list)}
ICD_list = np.sort(MIMIC3_phecode.ICD9_CODE.unique().tolist())
V = len(ICD_list)

In [4]:
## Link word with index, HERE ORDER MISMATCH BETWEEN META AND N_WD OR N_DK
patient_idx_meta = {k: v for v, k in enumerate(patient_list)}
idx_patient_meta = {v: k for v, k in enumerate(patient_list)}
vocab_idx_meta = {k: v for v, k in enumerate(ICD_list)}
idx_vocab_meta = {v: k for v, k in enumerate(ICD_list)}

In [5]:
## Get document by word matrix (D X V)
# n_wd = pd.crosstab(MIMIC3.loc[MIMIC3["SUBJECT_ID"].isin(patient_list)]["SUBJECT_ID"], MIMIC3.loc[MIMIC3["ICD9_CODE"].isin(ICD_list)]["ICD9_CODE"])
n_dw = pd.crosstab(MIMIC3_phecode["SUBJECT_ID"], MIMIC3_phecode["ICD9_CODE"])
docs = n_dw.values

In [6]:
## mappings and save D X V matrix
# with open('matrix2/vocab_idx_map.pkl', 'wb') as f:
#     pickle.dump(vocab_idx_meta, f)
# with open('matrix2/idx_vocab_map.pkl', 'wb') as f:
#     pickle.dump(idx_vocab_meta, f)
    
# with open('matrix2/patient_idx_map.pkl', 'wb') as f:
#     pickle.dump(patient_idx_meta, f)
# with open('matrix2/idx_patient_map.pkl', 'wb') as f:
#     pickle.dump(idx_patient_meta, f)

# sparse_docs = scipy.sparse.csc_matrix(docs)
# scipy.sparse.save_npz('matrix2/n_dw.npz', sparse_docs)



In [6]:
# Get document by topic count matrix (D X K)
phecode_patient_counts = pd.crosstab(MIMIC3_phecode["SUBJECT_ID"], MIMIC3_phecode["PheCode"])
n_dk = phecode_patient_counts.values

In [7]:
# alpha (K * M)
# M = 4 ## hyper parameter
# m_subtopics = sys.argv[1]

PheCode = np.unique(MIMIC3_phecode['PheCode'].values)
K = PheCode.size

In [8]:
M=4
alpha_dk = np.zeros((len(patient_list), K, M))

# first try assign alpha for phecode-associated subtopics with 1.
flatten_alpha_dk = []

with tqdm.tqdm(total = len(patient_list)) as pbar:
    for idx, pat in enumerate(patient_list):
        # get phecode idx for patient 
        d_k_idx = np.where(n_dk[idx] != 0)
        # assign 1 to the subtopics of phecodes
        alpha_dk[idx][d_k_idx] = 1
        # flatten alpha_dk for patient d
        flatten_alpha_dk.append(alpha_dk[idx].flatten())
        pbar.update(1)

flatten_alpha_dk = np.array(flatten_alpha_dk)



100%|██████████| 46278/46278 [00:04<00:00, 10158.59it/s]


In [9]:
## Save generated alpha
sparse_alpha = scipy.sparse.csc_matrix(flatten_alpha_dk)
# scipy.sparse.save_npz('matrix2/alpha_dk_flattenedM4.npz', sparse_alpha) 
scipy.sparse.save_npz('matrix2/alpha_M4.npz', sparse_alpha)
# with open('matrix2/phecode_list.pkl', 'wb') as f:
#     pickle.dump(PheCode, f)

In [11]:
n_wk = pd.crosstab(MIMIC3_phecode['ICD9_CODE'], MIMIC3_phecode["PheCode"])


In [12]:
icd_phecode_mapping = n_wk.apply(lambda x: np.argmax(x.values != 0), axis=1).values


In [13]:
with open('matrix2/icd_phecode_map.pkl', 'wb') as f:
    pickle.dump(icd_phecode_mapping, f)

In [14]:
alpha_dkM1 = np.zeros((len(patient_list), K))

with tqdm.tqdm(total = len(patient_list)) as pbar:
    for idx, pat in enumerate(patient_list):
        # get phecode idx for patient 
        d_k_idx = np.where(n_dk[idx] != 0)
        # assign 1 to the subtopics of phecodes
        alpha_dkM1[idx][d_k_idx] = 1

  0%|          | 0/46278 [00:01<?, ?it/s]


In [15]:
alpha_dkM1.shape

(46278, 1641)

In [16]:
sparse_alphaM1 = scipy.sparse.csc_matrix(alpha_dkM1)
scipy.sparse.save_npz('matrix2/alpha_M1.npz', sparse_alphaM1) 