# This script analyzes the NHDP results of EGAS00001004809

Cells from the EGAS00001004809 dataset were analyzed using the NHDP algorithm. This script loads the stored results and explores them.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scanpy as sc
import anndata as ad
import rpy2.robjects as robjects
import sys as sys
sys.path.append('/home/xinghua/projects/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *

## Read in the NHDP and AnnData results for cell type

### Identify the model with the highest likelihood for each cell type
We ran the NHDP analysis 6 times for each cell type. The results are stored in the following directory:
"/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_gene_NHDP/NHDP_runs/"
The following script will identify the model with the highest likelihood for each cell type.



In [None]:
# Alternative input directory (original gene set), kept for reference:
# RData_dir = "/data/ICI_exprs/ICI_NHDP/EGAS00001004809_original_gene_NHDP/"
RData_dir = "/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_gene_NHDP/NHDP_runs/"

cell_types = ['B', 'T', 'M', 'Epi', 'Endo', 'Fibro']

# Both dictionaries are keyed by the cell-type token parsed from the file name:
#   best_models     -> file name of the winning run
#   high_likelihood -> likelihood value that made that run win
best_models = {}
high_likelihood = {}

# Only the .RData files in the run directory are candidate models.
files = [name for name in os.listdir(RData_dir) if name.endswith('.RData')]

for rdata_name in files:
    # Load the file into the embedded R session, then fetch the trained model
    # object from it by name.
    robjects.r['load'](RData_dir + rdata_name)
    trained_model = robjects.r['nHDP_trained_mb']

    # A run is scored by the largest value in its 'likelihood' entry.
    likelihood_values = np.array(trained_model.rx('likelihood')[0])
    run_likelihood = max(likelihood_values)

    # The cell type is the fourth '_'-separated token of the file name.
    run_cell_type = rdata_name.split('_')[3]

    # The first run seen for a cell type is accepted as-is; later runs only
    # replace it when their likelihood is strictly higher.
    first_run_for_type = run_cell_type not in high_likelihood
    if first_run_for_type or run_likelihood > high_likelihood[run_cell_type]:
        high_likelihood[run_cell_type] = run_likelihood
        best_models[run_cell_type] = rdata_name

# Report the selected model and its likelihood for every cell type.
for cell_type, model_file in best_models.items():
    print ("Cell type: ", cell_type, " best model: ", model_file, " likelihood: ", high_likelihood[cell_type])


### Read in h5ad files with metadata

In [None]:
h5ad_dir = "/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_gene_NHDP/"
files = os.listdir(h5ad_dir)
h5ad_files = [name for name in files if name.endswith('.h5ad')]

# Map each cell type (fourth '_'-separated token of the file name) to the
# AnnData object read from the corresponding .h5ad file.
adata_dict = {
    name.split('_')[3]: sc.read_h5ad(h5ad_dir + name)
    for name in h5ad_files
}

## Processing T cell data

In [None]:
# Parse the selected T-cell model and take its cell-by-GEM AnnData object.
T_NHDP = parseNHDP_RData(RData_dir + best_models['T'])
adata_T = T_NHDP['cell_by_GEM']

# Carry the annotation slots over from the T-cell AnnData read from the h5ad
# file. The attributes are assigned directly, not copied.
# NOTE(review): assumes both objects list the same cells in the same order — confirm.
annotated_T = adata_dict['T']
for slot in ('obs', 'obsm', 'uns'):
    setattr(adata_T, slot, getattr(annotated_T, slot))

# Keep the current GEM matrix accessible through .raw as well.
adata_T.raw = adata_T

In [None]:
# Two side-by-side histograms of the GEM matrix:
#   left  - sum of each row (axis=1)
#   right - log of (1 + sum of each column) (axis=0)
row_totals = np.sum(adata_T.X, axis=1)
log_column_totals = np.log(np.sum(adata_T.X, axis=0) + 1)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.hist(row_totals, bins=50)
plt.subplot(1, 2, 2)
plt.hist(log_column_totals, bins=50)

In [None]:
### Keep only GEMs whose value exceeds `min_gem_value` in more than
### `min_row_fraction` of the rows (observations) of adata_T.
min_gem_value = 2          # a row counts towards a GEM only if its value is above this
min_row_fraction = .005    # i.e. more than 0.5% of the rows must pass the value cutoff

# Fraction of rows, per GEM (column), that pass the value cutoff.
passing_fraction = np.sum(adata_T.X > min_gem_value, axis=0) / adata_T.shape[0]
nonZeroGEMs = adata_T.var_names[(passing_fraction > min_row_fraction)]
print(nonZeroGEMs)


## Plot GEMs

In [None]:
# UMAP of the annotated T-cell object, colored by clustering, cell type,
# time point and PDCD1.
adata_T_h5ad = adata_dict['T']
# The panel-layout keyword of sc.pl.umap is `ncols`; `ncol` is not a scanpy
# parameter and would be forwarded to the underlying matplotlib call instead.
sc.pl.umap(adata_T_h5ad, color=['leiden', 'cell_type', 'timepoint', 'PDCD1'], ncols=2)

In [None]:
# make mono-color colormap
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

def create_custom_blue_colormap():
    """Build a single-hue colormap running from white through light blue to blue.

    Returns:
        A matplotlib ``LinearSegmentedColormap`` named
        ``'custom_blue_colormap'`` with anchors white (0.0),
        lightblue (0.5) and blue (1.0).
    """
    return mcolors.LinearSegmentedColormap.from_list(
        'custom_blue_colormap',
        # (position, color) anchors; positions span the full 0..1 range
        [(0.0, 'white'), (0.5, 'lightblue'), (1.0, 'blue')],
    )

In [None]:
# One UMAP panel per retained GEM, drawn with the white-to-blue colormap.
blue_cmap = create_custom_blue_colormap()
sc.pl.umap(
    adata_T,
    color=nonZeroGEMs,
    use_raw=False,   # plot values from adata_T.X rather than adata_T.raw
    cmap=blue_cmap,
    ncols=4,
    vmax=25,         # cap the color scale at 25
)



## Transform to pseudo-bulk data and find differentially expressed GEMs



In [None]:
# Convert the single-cell GEM data to pseudo-bulk data with
# scRNA2PseudoBulkAnnData, passing 'sample_id' as the sample column.
# NOTE(review): presumably this aggregates cells per sample — confirm against scRNA_utils.
adata_pseudobulk = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')
adata_pseudobulk

## Plot GEMs expressed in each Leiden cluster and compare 'pre' and 'on' treatment

In [None]:
# Call plotGEMs once per distinct value of the 'leiden' column.
leiden_clusters = adata_T.obs['leiden'].unique()
for cluster in leiden_clusters:
    print("Plotting cluster: ", cluster)
    plotGEMs(adata_T, 'leiden', cluster, ncols=4)

## Plot GEMs expressed in each cell type and compare 'pre' and 'on' treatment

In [None]:
# Call plotGEMs once per distinct value of the 'cell_type' column.
annotated_cell_types = adata_T.obs['cell_type'].unique()
for cell_type_label in annotated_cell_types:
    print ("Plotting cell type: ", cell_type_label)
    plotGEMs(adata_T, 'cell_type', cell_type_label, ncols=4)

## Find top genes for each GEM

In [None]:

# Use the GEM-by-gene matrix of the T-cell model selected above, so this cell
# stays consistent with the rest of the T-cell analysis instead of depending on
# a `results` dictionary and a hard-coded file name.
# NOTE(review): assumes the parseNHDP_RData output exposes 'GEM_by_gene' next
# to 'cell_by_GEM' — confirm against scRNA_utils.
adata_GEM = T_NHDP['GEM_by_gene']
print('Gem-by-gene matrix:', adata_GEM.shape)

# For every column of adata_GEM, list the 50 row labels with the largest values.
# NOTE(review): this reads rows (obs) as genes and columns as GEMs — confirm
# the orientation of the matrix.
for i in range(adata_GEM.shape[1]):
    # np.argsort sorts ascending, so the last 50 positions are the 50 largest
    # values; the printed list therefore ends with the strongest entry.
    top_50 = adata_GEM.obs.index[np.argsort(adata_GEM.X[:,i])[-50:]].tolist()
    print(i, ":", top_50)
    

## 