# Data Processing

In [1]:
# Imports
import os
import pickle
import torch
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy.io import mmread

---
# iBKH

In [2]:
from utils.datasets import IBKHDataset

In [3]:
# (IMPORTANT) Investigated why row2entity contains duplicate labels. Conclusion: the vocab files themselves contain duplicate icd_9 entries.

# The KGE triplets and the embedding map are cached on disk; they are rebuilt
# from the raw iBKH files only when a cache file cannot be opened.
try: # LOAD FROM CACHE
    # NOTE(review): torch.load(weights_only=False) and pickle.load both unpickle
    # arbitrary objects -- only load cache files produced by this notebook.
    input_triplet = torch.load('inputs/KGE/input_triplet.pt', weights_only=False)
    with open('inputs/GraphETM/embedding_map.pkl', 'rb') as f:
        embedding_map = pickle.load(f)
    print('Data loaded from cache...')

except OSError: # ELSE GENERATE NEW DATASET (IOError is an alias of OSError)
    iKBH_dataset = IBKHDataset(data_dir='data/iBKH')
    input_triplet = iKBH_dataset.build_data()
    embedding_map = iKBH_dataset.row2entity # FIXME: ASAP Save indexes for: genes, diseases, and drugs (filtering by type)

    # On a fresh checkout the output folders do not exist yet and the saves
    # below would fail, so create them first.
    os.makedirs('inputs/KGE', exist_ok=True)
    os.makedirs('inputs/GraphETM', exist_ok=True)

    # Inputs processing
    torch.save(input_triplet, 'inputs/KGE/input_triplet.pt')              # Save triplets
    torch.save(input_triplet.edge_index, 'inputs/GraphETM/edge_index.pt') # Save edge index
    with open('inputs/GraphETM/embedding_map.pkl', 'wb') as f:            # Save embedding map
        pickle.dump(embedding_map, f)

Data loaded from cache...


---
# EHR

In [2]:
# DATA
# Folder holding the MIMIC-III CSV tables; expanduser resolves a leading '~'
# should the path ever be changed to a home-relative one.
filepath = os.path.expanduser('data/MIMIC-III')

In [3]:
# ICD-9 diagnosis dictionary shipped with MIMIC-III, displayed for inspection.
icd_dictionary_csv = os.path.join(filepath, 'D_ICD_DIAGNOSES.csv')
icd_metadata_df = pd.read_csv(icd_dictionary_csv)
icd_metadata_df

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,55,0065,Amebic brain abscess,Amebic brain abscess
1,56,0066,Amebic skin ulceration,Amebic skin ulceration
2,57,0068,Amebic infection NEC,Amebic infection of other sites
3,58,0069,Amebiasis NOS,"Amebiasis, unspecified"
4,59,0070,Balantidiasis,Balantidiasis
...,...,...,...,...
14562,10053,V1368,"Hx-cong malform-skin,ms",Personal history of (corrected) congenital mal...
14563,11201,79508,Unsat cerv cytlogy smear,Unsatisfactory cervical cytology smear
14564,11202,79509,Abn pap cervix HPV NEC,Other abnormal Papanicolaou smear of cervix an...
14565,11203,79510,Abn gland pap smr vagina,Abnormal glandular Papanicolaou smear of vagina


In [13]:
# Load the diagnosis table: one row per recorded (patient, ICD-9 code) pair.
#
# ICD9_CODE is read explicitly as str. The codes carry meaningful leading zeros
# (e.g. '0065'), and with dtype inference a chunk made only of numeric codes
# could be parsed as integers, which would lose those zeros and break the
# string handling in insert_dot_icd9.
#
# Rows with a missing value are dropped, then the columns are renamed to the
# names used in the rest of the notebook.
ehr_df = (
    pd.read_csv(os.path.join(filepath, 'DIAGNOSES_ICD.csv'),
                usecols=['SUBJECT_ID', 'ICD9_CODE'],
                dtype={'ICD9_CODE': str})
    .dropna(ignore_index=True)
    .rename(columns={
        'SUBJECT_ID': 'patient_id',
        'ICD9_CODE': 'icd_9',
    })
)

# FUNCTION TO REPLACE 7100 (MIMIC-III) -> 710.0 (IBKH)
def insert_dot_icd9(code):
    """Insert the implied decimal point into an undotted ICD-9 code.

    MIMIC-III stores codes without the dot ('7100') while iBKH uses the dotted
    form ('710.0'). The dot goes right after the category part of the code:

    * numeric codes: first 3 characters   ('7100'  -> '710.0')
    * V codes:       'V' + 2 digits       ('V1368' -> 'V13.68')
    * E codes:       'E' + 3 digits       ('E8500' -> 'E850.0')

    Parameters
    ----------
    code : str
        ICD-9 code, with or without the decimal point.

    Returns
    -------
    str
        The dotted code. Codes that consist of the category only ('725',
        'V10', 'E850') and codes that already contain a dot are returned
        unchanged.
    """
    # Already dotted: leave as is, so the function is safe to apply twice.
    if '.' in code:
        return code

    # E codes have a 4-character category; numeric and V codes have 3.
    category_len = 4 if code[:1].upper() == 'E' else 3
    if len(code) <= category_len:
        return code
    return code[:category_len] + '.' + code[category_len:]

# Rewrite every MIMIC-III code into the dotted spelling used by iBKH.
dotted_icd9 = ehr_df['icd_9'].apply(insert_dot_icd9)
ehr_df['icd_9'] = dotted_icd9

In [14]:
# Isolate Immune conditions
# Codes are written in the dotted iBKH form.
# NOTE: 710.0 = Lupus for iBKH dataset
# NOTE: 7100 = Lupus in MIMIC-III
immune_icd9_codes = [
    '238.7',
    '279', '279.0', '279.1', '279.2', '279.3', '279.4',
    '288.0', '288.5', '288.6',
    '555.0', '556.0',
    '710.0', '714.0',
    '725',
    '995.2', '995.27', '995.3',
]

# Patients that have at least one of the immune codes on record.
has_immune = ehr_df['icd_9'].isin(immune_icd9_codes) # Boolean mask
immune_patient_id = ehr_df.loc[has_immune, 'patient_id'].unique()

# Keep every diagnosis row of those patients (not only the immune rows).
in_immune_cohort = ehr_df['patient_id'].isin(immune_patient_id)
ehr_df = ehr_df[in_immune_cohort].reset_index(drop=True)

# Make into Bag-of-Words (BoW): one row per patient, one column per code,
# each cell counting how many times the code was recorded for that patient.
# From here on ehr_df is the wide table, so this cell cannot be re-run without
# re-running the loading cell first.
# FIXME: Categorize by disease type.
ehr_df = ehr_df.pivot_table(
    index='patient_id',
    columns='icd_9',
    aggfunc='size',
    fill_value=0,
).sort_index()

In [15]:
# Re-order based on iBKH: keep the codes that are a 'disease' entry of the
# embedding map AND a column of the EHR table, in embedding-map order.
ehr_codes = ehr_df.columns
disease_order = [
    name
    for typ, name in embedding_map
    if typ == 'disease' and name in ehr_codes
]
ehr_df = ehr_df[disease_order]

# Remove samples with 0s -- rows first, then columns of what is left.
patient_has_codes = ehr_df.sum(axis=1) > 0
ehr_df = ehr_df.loc[patient_has_codes]      # Drop empty patients (all-zero rows)
disease_is_used = ehr_df.sum(axis=0) > 0
ehr_df = ehr_df.loc[:, disease_is_used]     # Drop empty diseases (all-zero columns)

# Final inputs: the table itself and its values as a tensor.
ehr_data = ehr_df                     # DataFrame Shape(patient_id, icd_9)
X_ehr = torch.Tensor(ehr_data.values) # Tensor

In [16]:
# Inputs processing: write the EHR tensor and its CSV twin to the GraphETM folder.
input_filepath = 'inputs/GraphETM'

ehr_tensor_out = os.path.join(input_filepath, 'X_ehr.pt')
ehr_csv_out = os.path.join(input_filepath, 'input_EHR.csv')

torch.save(X_ehr, ehr_tensor_out)
ehr_data.to_csv(ehr_csv_out, index=False)  # index=False: the row index is not written

# (Optional) Metadata
# TODO: These codes do not map with the update.
ehr_metadata_out = os.path.join(input_filepath, 'optional/input_EHR_metadata.csv')
icd_metadata_df.to_csv(ehr_metadata_out, index=False)

In [17]:
# Save disease indices (for embeddings): positions in embedding_map of the
# 'disease' entries whose name is still a column of the filtered EHR table.
kept_disease_codes = ehr_df.columns
id_embed_ehr = [
    position
    for position, (typ, name) in enumerate(embedding_map)
    if typ == 'disease' and name in kept_disease_codes
]

id_embed_ehr_out = os.path.join(input_filepath, 'id_embed_ehr.npy')
np.save(id_embed_ehr_out, np.array(id_embed_ehr))

---
# PBMC

In [4]:
# DATA
# Folder holding the PBMC files; expanduser resolves a leading '~' should the
# path ever be changed to a home-relative one.
filepath = os.path.expanduser('data/PBMC')

In [5]:
# Read the Matrix Market file and the header-less names table.
pbmc_matrix = mmread(os.path.join(filepath, 'matrix.mtx')).tocsc()
pbmc_names = pd.read_csv(os.path.join(filepath, 'genes.tsv'), sep='\t', header=None)

# Densify and transpose so the matrix rows become the DataFrame columns,
# labelled with column 1 of genes.tsv.
# NOTE(review): column 1 is presumably the gene symbol -- confirm against the file.
pbmc_df = pd.DataFrame(pbmc_matrix.toarray().T, columns=pbmc_names[1])

In [6]:
# Drop duplicate columns: when a column name repeats, only its first occurrence is kept.
is_first_occurrence = ~pbmc_df.columns.duplicated()
pbmc_df = pbmc_df.loc[:, is_first_occurrence].copy()

# Re-order based on iBKH: keep the names that are a 'gene' entry of the
# embedding map AND a column of the PBMC table, in embedding-map order.
pbmc_genes = pbmc_df.columns
gene_order = [
    name
    for typ, name in embedding_map
    if typ == 'gene' and name in pbmc_genes
]

pbmc_df = pbmc_df[gene_order]

In [7]:
# Filter data: drop empty rows/columns, then filter, log-transform and select
# highly variable genes with scanpy. The scanpy calls below modify pbmc_adata
# in place, so their order matters.
pbmc_df = pbmc_df.loc[pbmc_df.sum(axis=1) > 0]    # Drop empty cells (rows whose sum is not > 0)
pbmc_df = pbmc_df.loc[:, pbmc_df.sum(axis=0) > 0] # Drop empty genes (columns whose sum is not > 0)

pbmc_adata = ad.AnnData(X=pbmc_df) # Wrap the table in an AnnData object for scanpy

sc.pp.filter_genes(pbmc_adata, min_cells=4) # Keep genes expressed in at least 4 cells (drops genes found in fewer than 4)

sc.pp.log1p(pbmc_adata) # log(1 + x) transform. FIXME: Label cell types.
sc.pp.highly_variable_genes(
    pbmc_adata,
    n_top_genes=1000, # Number of highly variable genes to keep
    subset=True, # Drop non-HVG columns in-place
)

# Back to pandas / torch: the rows keep their order but the original row index
# is not carried over (a fresh default index is created).
pbmc_data = pd.DataFrame(data=pbmc_adata.X, columns=pbmc_adata.var.index) # Dataframe
X_sc = torch.Tensor(pbmc_data.values)                                     # Tensor



In [40]:
# PBMC Input Processing: write the single-cell tensor and its CSV twin.
input_filepath = 'inputs/GraphETM'

sc_tensor_out = os.path.join(input_filepath, 'X_sc.pt')
sc_csv_out = os.path.join(input_filepath, 'input_PBMC.csv')

torch.save(X_sc, sc_tensor_out)
pbmc_data.to_csv(sc_csv_out, index=False)  # index=False: the row index is not written

In [41]:
# Save gene indices (for embeddings): positions in embedding_map of the 'gene'
# entries whose name is a column of the final PBMC table.
kept_genes = pbmc_data.columns
id_embed_sc = [
    position
    for position, (typ, name) in enumerate(embedding_map)
    if typ == 'gene' and name in kept_genes
]

id_embed_sc_out = os.path.join(input_filepath, 'id_embed_sc.npy')
np.save(id_embed_sc_out, np.array(id_embed_sc))

In [None]:
# DONE