# Pathway Analysis

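Analyze the biological pathways represented by each topic: load the trained multiETM outputs, extract the top-scoring genes per topic, and run pathway enrichment on them.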

In [1]:
import sys
# Put the local scETM source directory on the module search path.
sys.path.append('/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/src/scETM/')

import os
# Point numba's cache at a scratch directory; see the linked scanpy issue.
# NOTE(review): presumably this has to run before anything imports numba - confirm.
os.environ[ 'NUMBA_CACHE_DIR' ] = '/scratch/st-jiaruid-1/yinian/tmp/' # https://github.com/scverse/scanpy/issues/2113

In [2]:
import pickle
import os
import numpy as np
import torch.nn.functional as F 
import torch
import pandas as pd
import anndata as ad
from multiprocessing import Pool

os.getcwd()

'/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/notebooks'

In [4]:
import yaml
from pathlib import Path
import pickle
import matplotlib.pyplot as plt

In [5]:
# Directory that output CSVs are written to, relative to the notebook's working directory.
working_dir = './'

### Load the data

In [7]:
# Read the experiment settings (input files, model parameters, checkpoint dir) from YAML.
config_path = Path('../experiments/4_all.yaml')
config = yaml.safe_load(config_path.read_text())
config

{'files': {'rna': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_13176_raw_rna.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_31800_raw_rna.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_rna.h5ad'],
  'protein': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_13176_raw_protein.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_31800_raw_protein.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_protein.h5ad'],
  'combined': ['/arc/project/st-jiaruid-1/yinian/pbmc/4_13176_raw_combined.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_31800_raw_combined.h5ad',
   '/arc/project/st-jiaruid-1/yinian/pbmc/4_32606_raw_combined.h5ad'],
  'gene_indices': '/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/scripts/gene_indices_1.pkl'},
 'model_params': {'n_epochs': 12000,
  'eval_every': 3000,
  'cell_type_col': 'cell_type',
  'day': 4,
  'donor': 'all',
  'rna_n_vars': 22085},
 'ckpt_dir': '/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/results/'}

In [8]:
# Split the config into its two sections.
files, model_params = config['files'], config['model_params']

# The config may spell "no cell-type column" as the string 'None';
# turn that into a real None.
if model_params['cell_type_col'] == 'None':
    model_params['cell_type_col'] = None

In [9]:
combined_files = files['combined']
# Read every combined .h5ad file and stack them along the cell axis;
# `label` adds an obs column ("batch_indices") identifying which input each cell came from.
adata = ad.concat(list(map(ad.read_h5ad, combined_files)), label="batch_indices")
adata

AnnData object with n_obs × n_vars = 28126 × 22225
    obs: 'day', 'donor', 'cell_type', 'technology', 'batch_indices'

In [10]:
# Load the saved multiETM model outputs (a dict with at least 'delta', 'alpha', 'rho').
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# The `with` block guarantees the file handle is closed once loading finishes.
with open('/scratch/st-jiaruid-1/yinian/my_jupyter/scETM/results/multiETM/multiETM_4_all/outputs.pkl', 'rb') as file:
    outputs = pickle.load(file)
outputs['alpha'].shape, outputs['rho'].shape

((50, 400), (22225, 400))

In [11]:
# Wrap the raw model outputs in DataFrames so they can carry labels.
delta = pd.DataFrame(outputs['delta'])  # one row per cell
alpha = pd.DataFrame(outputs['alpha'])  # one row per topic
rho = pd.DataFrame(outputs['rho'])      # one row per feature in adata.var

# Label rows with the cell / feature names from the AnnData object.
delta.index, rho.index = adata.obs_names, adata.var_names
tuple(frame.shape for frame in (delta, alpha, rho))

((28126, 50), (50, 400), (22225, 400))

In [12]:
print('Get top 30 genes per topic (for enrichment analysis)')
# Score of every gene under every topic.
beta = rho.dot(alpha.T)  # (gene, topic)

# Rank genes within each topic column: argsort is ascending, so flip it to get
# the highest scores first, then keep the leading 30 row positions per topic.
descending_rank = np.argsort(beta.values, axis=0)[::-1]
top_gene_positions = descending_rank[:30]
top_words = pd.DataFrame(adata.var_names.values[top_gene_positions])  # (n_top, topic)
# top_words.to_csv(os.path.join(working_dir, 'beta_top30genes_.csv'))

Get top 30 genes per topic (for enrichment analysis)


In [13]:
# NOTE: the CSV writes in this cell are commented out, so only the messages
# are printed and `theta` is computed; nothing is written to disk here.
print('Saving unnormliazed topic mixture delta')
# delta.to_csv(os.path.join(working_dir, 'delta.csv'))

print('Saving metadata')
## create meta csv (condition, individual_id, cell_type)
# adata.obs.to_csv(os.path.join(working_dir, 'meta.csv'))

print('Saving normalized topic mixture theta')
# Softmax over the topic axis turns each cell's delta row into proportions that sum to 1.
theta_values = F.softmax(torch.tensor(delta.values), dim=-1).detach().cpu().numpy()
theta = pd.DataFrame(theta_values, index=adata.obs_names)
# theta.to_csv(os.path.join(working_dir, 'theta.csv'))

Saving unnormliazed topic mixture delta
Saving metadata
Saving normalized topic mixture theta


In [14]:
# by default, keep all topics
print('Sampling theta')
# Draw a random subset of cells from delta. The seed is fixed so that the sampled
# cells - and therefore topic_kept / delta_kept below - are identical on every
# Restart & Run All; the size is capped so datasets with fewer than 10000 cells
# do not raise.
SAMPLE_SEED = 0
n_sampled_cells = min(10000, len(delta))
delta_sample = delta.sample(n_sampled_cells, random_state=SAMPLE_SEED)

# Keep the topics whose summed delta over the sampled cells is at least 1500.
# NOTE(review): delta is the unnormalized mixture, so whether this threshold really
# "keeps all topics" depends on the model's scale - confirm.
topic_kept = delta_sample.columns[delta_sample.sum(0) >= 1500]  # (topics)
meta_sample = adata.obs.loc[delta_sample.index]
# delta_sample.to_csv(os.path.join(working_dir, 'delta_sampled.csv'))
# meta_sample.to_csv(os.path.join(working_dir, 'meta_sampled.csv'))

delta_kept = delta[topic_kept]  # (cells, topics)

Sampling theta


In [None]:
print("Pathway enrichment analysis")
from pathdip import pathDIP_Http 

# Query settings passed to pathDIP: one request per topic, using that topic's top genes.
n_topics = delta.shape[1]
component = "Literature curated (core) pathway memberships"
sources = "ACSN2,BioCarta,EHMN,HumanCyc,INOH,IPAVS,KEGG,NetPath,OntoCancro,Panther_Pathway,PharmGKB,PID,RB-Pathways,REACTOME,stke,systems-biology.org,SignaLink2.0,SIGNOR2.0,SMPDB,Spike,UniProt_Pathways,WikiPathways"
o = pathDIP_Http()

pathway_rows = []
for topic_idx in range(n_topics):
    gene_symbols = ', '.join(top_words[topic_idx])
    o.searchOnGenesymbols(gene_symbols, component, sources)
    # The first line of the returned text is skipped (treated as a header).
    report_lines = o.getPathwayAnalysis().split('\n')[1:]
    for report_line in report_lines:
        # Drop the last tab-separated field; lines that leave nothing are skipped.
        fields = report_line.split('\t')[:-1]
        if not fields:
            continue
        pathway_rows.append(fields + [topic_idx])

pathway_columns = ['pathway_source','pathway_name','p_val','q_val_BH','q_val_Bonf','topic']
pathway_df = pd.DataFrame(pathway_rows, columns=pathway_columns)  # (pathways, features)

# Values arrive as text; convert the BH-adjusted q-value to float and keep
# only the pathways below the 0.05 cutoff before writing the CSV.
pathway_df = pathway_df.astype({'q_val_BH': float})
pathway_df = pathway_df[pathway_df['q_val_BH'] < 0.05]
pathway_df.to_csv(os.path.join(working_dir, 'pathways.csv'))