# Data Processing

In [1]:
# Imports
import os
import pickle
import torch
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import umap
import plotly.express as px
from scipy.io import mmread
import celltypist

---
# iBKH

In [2]:
from utils.datasets import IBKHDataset

In [3]:
try: # LOAD FROM CACHE
    input_triplet = torch.load('inputs/KGE/input_triplet.pt', weights_only=False)
    with open('inputs/GraphETM/embedding_map.pkl', 'rb') as f:
        embedding_map = pickle.load(f)
    print('Data loaded from cache...')

except IOError: # ELSE GENERATE NEW DATASET
    iKBH_dataset = IBKHDataset(data_dir='data/iBKH')
    input_triplet = iKBH_dataset.build_data()
    embedding_map = iKBH_dataset.row2entity

    # Inputs processing
    torch.save(input_triplet, 'inputs/KGE/input_triplet.pt')              # Save triplets
    torch.save(input_triplet.edge_index, 'inputs/GraphETM/edge_index.pt') # Save edge index
    with open('inputs/GraphETM/embedding_map.pkl', 'wb') as f:            # Save embedding map
        pickle.dump(embedding_map, f)

Data loaded from cache...


---
# Drugs

In [4]:
# Inputs processing
input_filepath = 'inputs/GraphETM'

# Save (all) drug indices (for embeddings)
id_embed_drugs = [
    idx for idx, (typ, name) in enumerate(embedding_map)
    if typ == 'drug'  # All drug entities from iBKH
]

np.save(os.path.join(input_filepath, 'id_embed_drugs.npy'), np.array(id_embed_drugs))

---
# Diseases (EHR)

In [5]:
# DATA
filepath = 'data/MIMIC-III'
filepath = os.path.expanduser(filepath)

In [6]:
# FUNCTION TO REPLACE 7100 (MIMIC-III) -> 710.0 (IBKH)
def insert_dot_icd9(code):
    digit_positions = [i for i, c in enumerate(code) if c.isdigit()]
    if len(digit_positions) <= 3:
        return code
    dot_pos = digit_positions[2] + 1
    return code[:dot_pos] + '.' + code[dot_pos:]

In [7]:
icd_metadata_df = pd.read_csv(os.path.join(filepath, 'D_ICD_DIAGNOSES.csv'), usecols=['ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE'])
icd_metadata_df.rename(columns={
    'ICD9_CODE': 'icd_9',
    'SHORT_TITLE': 'short_title',
    'LONG_TITLE': 'long_title',
}, inplace=True)

icd_metadata_df['icd_9'] = icd_metadata_df['icd_9'].apply(insert_dot_icd9)
icd_metadata_df

Unnamed: 0,icd_9,short_title,long_title
0,006.5,Amebic brain abscess,Amebic brain abscess
1,006.6,Amebic skin ulceration,Amebic skin ulceration
2,006.8,Amebic infection NEC,Amebic infection of other sites
3,006.9,Amebiasis NOS,"Amebiasis, unspecified"
4,007.0,Balantidiasis,Balantidiasis
...,...,...,...
14562,V136.8,"Hx-cong malform-skin,ms",Personal history of (corrected) congenital mal...
14563,795.08,Unsat cerv cytlogy smear,Unsatisfactory cervical cytology smear
14564,795.09,Abn pap cervix HPV NEC,Other abnormal Papanicolaou smear of cervix an...
14565,795.10,Abn gland pap smr vagina,Abnormal glandular Papanicolaou smear of vagina


In [8]:
ehr_df = pd.read_csv(os.path.join(filepath, 'DIAGNOSES_ICD.csv'), usecols=['SUBJECT_ID', 'ICD9_CODE']).dropna(ignore_index=True)
ehr_df.rename(columns={
    'SUBJECT_ID': 'patient_id',
    'ICD9_CODE': 'icd_9'
}, inplace=True)

ehr_df['icd_9'] = ehr_df['icd_9'].apply(insert_dot_icd9)

In [9]:
# Isolate Immune conditions
type2codes = {
    'autoimmune': ['250.01', '250.03', '279.4', '340', '446.0', '446.1', '446.4', '446.5', '446.7', '710.0', '710.1', '710.2', '714.0'],
    'immunodeficiency': ['042', '043', '279', '279.0', '279.1', '279.2', '279.3'],
    'allergy': ['995.27', '995.3'],
    'chronic_inflammatory': ['555.0', '555.1', '555.2', '555.9', '556.0'],
    'wbc_disorder': ['288.0', '288.5', '288.6'],
    'other': ['238.4', '238.7', '725']}

codes2type = {item: key for key, values in type2codes.items() for item in values}
immune_icd9_codes = [item for sublist in list(type2codes.values()) for item in sublist]

has_immune = ehr_df['icd_9'].isin(immune_icd9_codes) # Boolean mask
immune_patient_id = ehr_df.loc[has_immune, 'patient_id'].unique()

ehr_df = ehr_df[ehr_df['patient_id'].isin(immune_patient_id)].reset_index(drop=True)

# NOTE: 710.0 = Lupus for iBKH dataset
# NOTE: 7100 = Lupus in MIMIC-III

# Make into Bag-of-Words (BoW) # FIXME: Categorize by disease type.
ehr_df = (ehr_df
          .pivot_table(index='patient_id',
                       columns='icd_9',
                       aggfunc='size',
                       fill_value=0)
          .sort_index())
# Helena and cam were here

In [10]:
# Assign patient_ids to primary disease type
def assign_primary_disease_type(patient_row):
    patient_codes = patient_row[patient_row > 0].index.tolist()
    patient_types = [codes2type.get(code, 'unknown') for code in patient_codes if code in codes2type]

    if not patient_types:
        return 'unknown'

    # Count types and return most common
    type_counts = pd.Series(patient_types).value_counts()
    return type_counts.index[0]
disease_types = ehr_df.apply(assign_primary_disease_type, axis=1)

ehr_df = ehr_df.set_index([ehr_df.index, disease_types])

In [11]:
# Re-order based on iBKH
disease_order = [
    name for typ, name in embedding_map
    if typ == 'disease' and name in ehr_df.columns # IF disease name in (iBKH) AND disease name in (EHR)
]
ehr_df = ehr_df[disease_order]

# Remove samples with 0s
ehr_df = ehr_df.loc[:, ehr_df.sum(axis=0) > 0] # Drop empty diseases (all-zero columns)
ehr_df = ehr_df.loc[ehr_df.sum(axis=1) > 0]    # Drop empty patients (all-zero rows)

ehr_data = ehr_df                     # DataFrame Shape(patient, disease)
X_ehr = torch.Tensor(ehr_data.values) # Tensor

In [12]:
# Get Labels
ehr_labels_num = ehr_df.index.codes[1]                     # Num. labels
ehr_labels_str = ehr_df.index.get_level_values(1).tolist() # Str. labels
ehr_categories = ehr_df.index.levels[1].tolist()           # Category names

In [13]:
ibkh_diseases = {name for typ, name in embedding_map if typ == 'disease'}
immune_in_ibkh = set(immune_icd9_codes) & ibkh_diseases
immune_missing = set(immune_icd9_codes) - ibkh_diseases

print(f'Immune diseases in iBKH: {len(immune_in_ibkh)}/{len(immune_icd9_codes)}')
print(f'Missing immune diseases: {list(immune_missing)}')

Immune diseases in iBKH: 15/33
Missing immune diseases: ['555.0', '279.4', '238.7', '995.27', '288.6', '288.5', '995.3', '043', '556.0', '250.01', '279', '042', '279.0', '250.03', '279.1', '555.2', '555.9', '238.4']


In [14]:
# Inputs processing
input_filepath = 'inputs/GraphETM'

torch.save(X_ehr, os.path.join(input_filepath, 'X_ehr.pt'))
ehr_data.to_csv(os.path.join(input_filepath, 'optional/input_EHR.csv'), index=False)

# Labels
np.save(os.path.join(input_filepath, 'labels/ehr_labels_num.npy'), ehr_labels_num)
np.save(os.path.join(input_filepath, 'labels/ehr_labels_str.npy'), np.array(ehr_labels_str))
np.save(os.path.join(input_filepath, 'labels/ehr_categories.npy'), np.array(ehr_categories))

# (Optional) Metadata
icd_metadata_df.to_csv(os.path.join(input_filepath, 'optional/input_EHR_metadata.csv'), index=False)

In [15]:
# Save disease indices (for embeddings)
id_embed_ehr = [
    idx for idx, (typ, name) in enumerate(embedding_map)
    if typ == 'disease' and name in ehr_df.columns # IF disease name in (iBKH) AND disease name in (EHR)
]

np.save(os.path.join(input_filepath, 'id_embed_ehr.npy'), np.array(id_embed_ehr))

In [19]:
# UMAP clustering
X_umap_ehr = umap.UMAP(random_state=1).fit_transform(ehr_data.values)
umap_ehr_df = pd.DataFrame({'x': X_umap_ehr[:, 0], 'y': X_umap_ehr[:, 1], 'disease_types': ehr_df.index.get_level_values(1).tolist()})

# Plot categories
fig = px.scatter(umap_ehr_df, x='x', y='y', color='disease_types', template='ggplot2', width=1100, height=600,
                 title='EHR Immune Disease Types Clustering')
fig.update_traces(marker=dict(size=4))
fig.show()

---
# Genes (PBMC)

In [20]:
# DATA
filepath = 'data/PBMC'
filepath = os.path.expanduser(filepath)

In [21]:
pbmc_df = mmread(os.path.join(filepath, 'matrix.mtx')).tocsc()
pbmc_names = pd.read_csv(os.path.join(filepath, 'genes.tsv'), sep='\t', header=None)
pbmc_df = pd.DataFrame(pbmc_df.toarray().T, columns=pbmc_names[1])

In [22]:
# Drop duplicate columns
pbmc_df = pbmc_df.loc[:, ~pbmc_df.columns.duplicated()].copy()

# Re-order based on iBKH
gene_order = [
    name for typ, name in embedding_map
    if typ == 'gene' and name in pbmc_df.columns # IF gene name in (iBKH) AND gene name in (PBMC)
]

pbmc_df = pbmc_df[gene_order]

In [23]:
# Filter data
pbmc_df = pbmc_df.loc[pbmc_df.sum(axis=1) > 0]    # Drop empty cells (all-zero rows)
pbmc_df = pbmc_df.loc[:, pbmc_df.sum(axis=0) > 0] # Drop empty genes (all-zero columns)

pbmc_adata = ad.AnnData(X=pbmc_df) # Anndata for scanpy
pbmc_adata.var_names = pbmc_df.columns.values

sc.pp.filter_genes(pbmc_adata, min_cells=4) # Gene expressed in <4 cells

sc.pp.normalize_total(pbmc_adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(pbmc_adata)
sc.pp.highly_variable_genes(
    pbmc_adata,
    n_top_genes=1000,
    subset=True, # Drop non-HVG columns in-place
)

In [24]:
# Cell Type Labels
celltypist.models.download_models(force_update = True, model = 'Healthy_COVID19_PBMC.pkl')
predictions_adata = celltypist.annotate(pbmc_adata, model = 'Healthy_COVID19_PBMC.pkl', majority_voting = True).to_adata()
cell_types = predictions_adata.obs['majority_voting']

sc_labels_categorical = pd.Categorical(cell_types)

📜 Retrieving model list from server https://celltypist.cog.sanger.ac.uk/models/models.json
📚 Total models in list: 54
📂 Storing models in /Users/loicduchesne/.celltypist/data/models
💾 Total models to download: 1
💾 Downloading model [1/1]: Healthy_COVID19_PBMC.pkl
🔬 Input data has 4340 cells and 1000 genes
🔗 Matching reference genes in the model
🧬 563 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 5
🗳️ Majority voting the predictions
✅ Majority voting done!


In [26]:
# Get labels
sc_labels_num = sc_labels_categorical.codes               # Num. labels
sc_labels_str = sc_labels_categorical.tolist()            # Str. labels
sc_categories = sc_labels_categorical.categories.tolist() # Category names

In [27]:
# PBMC Input Processing
input_filepath = 'inputs/GraphETM'

pbmc_data = pd.DataFrame(data=pbmc_adata.X, columns=pbmc_adata.var.index) # Dataframe
X_sc = torch.Tensor(pbmc_data.values)                                     # Tensor

# Labels
np.save(os.path.join(input_filepath, 'labels/sc_labels_num.npy'), sc_labels_num)
np.save(os.path.join(input_filepath, 'labels/sc_labels_str.npy'), np.array(sc_labels_str))
np.save(os.path.join(input_filepath, 'labels/sc_categories.npy'), np.array(sc_categories))

torch.save(X_sc, os.path.join(input_filepath, 'X_sc.pt'))
pbmc_data.to_csv(os.path.join(input_filepath, 'optional/input_PBMC.csv'), index=False)

In [28]:
# Save gene indices (for embeddings)
id_embed_sc = [
    idx for idx, (typ, name) in enumerate(embedding_map)
    if typ == 'gene' and name in pbmc_data.columns # IF gene name in (iBKH) AND gene name in (PBMC)
]

np.save(os.path.join(input_filepath, 'id_embed_sc.npy'), np.array(id_embed_sc))

In [29]:
# UMAP clustering
X_umap_sc = umap.UMAP(random_state=1).fit_transform(predictions_adata.X)
umap_sc_df = pd.DataFrame({'x': X_umap_sc[:, 0], 'y': X_umap_sc[:, 1], 'cell_types': predictions_adata.obs['majority_voting']})

# Plot categories
fig = px.scatter(umap_sc_df, x='x', y='y', color='cell_types', template='ggplot2', width=1100, height=600,
                 title='Single-Cell RNA Cell Types Clustering')
fig.update_traces(marker=dict(size=4))
fig.show()

In [None]:
# DONE