# EGAS00001004809 scRNA Data Preprocessing


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import sys as sys
sys.path.append('/home/xinghua/projects/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
import operator as op

### Read in data matrix and combine with meta-data

In [None]:
# Load the cohort-1 count matrix from its H5AD file into an AnnData object.
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
data_file = data_dir + '1863-counts_cells_cohort1.h5ad'
adata = sc.read_h5ad(data_file)
print(f"Read in dataset with dimension: {adata.shape!s}")

In [None]:
# Load the per-cell meta-data table; the first CSV column becomes the row index
# and the first CSV row supplies the column names.
meta_file_pathname = os.path.join(data_dir, "1872-BIOKEY_metaData_cohort1_web.csv")
cohort1_meta = pd.read_csv(meta_file_pathname, index_col=0)

In [None]:
# Align the meta-data rows with the cells of adata (same order as adata.obs)
# and install the aligned table as adata.obs, replacing the previous obs columns.
# reindex() silently fills every cell that has no meta-data row with NaN, so
# report how many cells are affected instead of letting them pass unnoticed.
cells_without_meta = adata.obs.index.difference(cohort1_meta.index)
if len(cells_without_meta) > 0:
    print(f"WARNING: {len(cells_without_meta)} cells have no meta-data row; "
          "their obs fields will be NaN")
cohort1_meta = cohort1_meta.reindex(adata.obs.index)
adata.obs = cohort1_meta
# quick sanity check of the available fields and the patients present
print(adata.obs.columns)
print(adata.obs['patient_id'].unique())

In [None]:
# Normalize the "timepoint" labels to lowercase; later cells select cells with
# the lowercase values 'pre' and 'on'.
adata.obs["timepoint"] = adata.obs["timepoint"].str.lower()
# display the distinct timepoint labels as a sanity check
adata.obs["timepoint"].unique()

In [None]:
# "cohort" carries the treatment information, so expose it under the name "treatment".
adata.obs = adata.obs.rename(columns={'cohort': 'treatment'})

Extract sample names from the obs_names

'BIOKEY_13_Pre_AAACCTGCAACAACCT-1'

In [None]:
# The sample id is the first three "_"-separated fields of the cell name,
# e.g. 'BIOKEY_13_Pre_AAACCTGCAACAACCT-1' -> 'BIOKEY_13_Pre'.
adata.obs['sample_id'] = [
    "_".join(cell_name.split("_")[:3]) for cell_name in adata.obs.index
]


### Extract tissue type from the sample name or from meta-data

Drop unwanted columns

In [None]:
# drop the nCount_RNA and nFeature_RNA columns
adata.obs.drop(columns=['nCount_RNA', 'nFeature_RNA'], inplace=True)

# keep only the following obs columns, in this order: "patient_id", "sample_id",
# "timepoint", "treatment", "expansion", "BC_type", "cellType"
# (any other column still present in adata.obs is discarded by this selection)
adata.obs = adata.obs[["patient_id", "sample_id", "timepoint", "treatment", 'expansion', 'BC_type', 'cellType']]   

In [None]:
print(adata.obs.columns)

## 1. Basic Filtering

In [None]:
# Configure scanpy for this notebook: logging level, version header, figure defaults.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# print the versions of scanpy and its dependencies for reproducibility
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

#### Removing cells expressing <500 || >5000 genes:

In [None]:
# removing cells expressing <500 || >5000 genes
# (scanpy's filter_cells accepts a single threshold per call, hence two calls)
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_cells(adata, max_genes=5000)

In [None]:
# Report the matrix shape and the obs columns after the gene-count filter.
print(f"Dimention of adata: {adata.shape!s}")
print(f"columns for adata.obs: {adata.obs.columns!s}")

#### Removing cells containing <400 || >25000 UMIs:

In [None]:
# removing cells containing <400 || >25000 UMIs
# (one threshold per filter_cells call, as for the gene-count filter)
sc.pp.filter_cells(adata, min_counts = 400)
sc.pp.filter_cells(adata, max_counts = 25000)

In [None]:
# Report the matrix shape and the obs columns after the UMI-count filter.
print(f"Dimention of adata: {adata.shape!s}")
print(f"columns for adata.obs: {adata.obs.columns!s}")

# Preprocess with respect to genes (var)
#### Removing genes detected in fewer than 10 cells (counted across the whole dataset)


In [None]:
# removing genes detected in fewer than 10 cells (min_cells=10), counted over
# all cells currently in adata
# NOTE(review): this comment previously said "<3 cells" while the code applies
# a threshold of 10 — confirm which threshold is intended.
sc.pp.filter_genes(adata, min_cells= 10)
# make duplicated gene names unique so genes can be addressed by name
adata.var_names_make_unique()
adata.shape

### Remove cells with high percentage of mitochondrial genes

In [None]:
# label genes as mt (gene names starting with 'MT-')
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# annotate cells with QC metrics, including the percentage of counts coming
# from mt genes ('pct_counts_mt'); this has to run while the mt genes are still
# present in the matrix
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# remove mitochondrial genes from analysis
non_mt_genes = ~adata.var['mt'].values
# Here we keep cells with < 20% mito ratio
low_mito_cells = (adata.obs['pct_counts_mt'] < 20).values

# Subsetting an AnnData returns a view. Materialize the result with .copy() so
# that the in-place steps that follow (normalization, log1p, ...) operate on a
# real object instead of triggering an implicit copy of a view of a view.
adata = adata[:, non_mt_genes]
adata = adata[low_mito_cells, :].copy()
adata.shape

Plot statistics regarding cells

In [None]:
# Violin plots of the per-cell QC metrics after filtering: number of detected
# genes, total counts and percentage of mitochondrial counts (one panel each).
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.0, multi_panel=True)

## 2. Integrating Samples

#### Normalization & Logarithmization:

In [None]:
# Scale every cell to a total of 10,000 counts; the total of the first cell is
# printed before and after as a sanity check.
first_cell_total = adata.X[0, :].sum()
print(f"Before normalization, the sum of first row of X: {first_cell_total!s}")
sc.pp.normalize_total(adata, target_sum=1e4)
first_cell_total = adata.X[0, :].sum()
print(f"After normalization, the sum of first row of X: {first_cell_total!s}")


In [None]:
# Apply log2(1 + x) to the normalized matrix; the total of the first cell is
# printed before and after as a sanity check.
first_cell_total = adata.X[0, :].sum()
print(f"Before log, the sum of first row of X: {first_cell_total!s}")
sc.pp.log1p(adata, base=2)
first_cell_total = adata.X[0, :].sum()
print(f"After log, the sum of first row of X: {first_cell_total!s}")

### load cell cycle markers and score cells for cell cycle

The question is whether this step is necessary for our study. We are trying to look for gene expression modules (GEMs). The cell cycle is an important component of cellular signaling, so removing its signal would distort the cellular states.

In addition, this step seems to introduce 'Inf' values into the data, which prevents the following steps from running.

## Keep high variance genes 

In [None]:
# Flag the n_top_genes most variable genes (stored in adata.var, column
# 'highly_variable', which the next cells use for subsetting) and plot them.
n_top_genes=10000
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes)
sc.pl.highly_variable_genes(adata)

In [None]:
adata.var.columns


### Keep track of original adata and update adata.X to  high variance genes only

In [None]:
# Keep the full log-normalized matrix (all genes) in adata.raw; the per-cell-type
# sections later rebuild their objects from raw.X / raw.var.
adata.raw = adata
# Restrict X to the highly variable genes. .copy() turns the subset view into a
# real AnnData so that PCA, neighbors, Leiden and UMAP can store their results
# in it directly instead of forcing an implicit copy of the view.
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform PCA   
sc.tl.pca(adata, svd_solver='arpack', n_comps=30)

In [None]:
print(adata.obsm['X_pca'].shape)
print(adata.varm['PCs'].shape)
print(adata.uns['pca']['variance_ratio'].shape)
print(adata.obs.columns)

In [None]:
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=30)

In [None]:
# Use the Leiden algorithm to find clusters
sc.tl.leiden(adata, resolution=0.5)


In [None]:
# Rank genes for each Leiden cluster with a t-test and plot the top 25 genes
# per cluster (each panel with its own y-axis). Despite the earlier wording of
# this comment, nothing here is specific to cell-cycle markers or T cells.
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
adata.obs.columns

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform UMAP
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')

In [None]:
# color by gene expression of T cell markers
sc.pl.umap(adata, color= ['CD3D', 'CD3E', 'TRAC', 'TRBC1'])

In [None]:
# color by gene expression of B cell markers
sc.pl.umap(adata, color= ['CD19', 'CD79A', 'CD79B', 'MS4A1'])

In [None]:
# color by gene expression of myeloid cell markers
sc.pl.umap(adata, color= ['CD14', 'CD68', 'CD163', 'CD33'])

In [None]:
# color by gene expression of epi cell markers
sc.pl.umap(adata, color= ['EPCAM', 'CD24','KRT19', 'KRT7'])

In [None]:
# color by gene expression of fibroblast cell markers   
sc.pl.umap(adata, color= ['COL1A1', 'COL1A2', 'COL3A1', 'ACTA2'])

In [None]:
# color by gene expression of endothelial cell markers  
sc.pl.umap(adata, color= ['PECAM1', 'CD34', 'VWF', 'CDH5'])

## Label cells based on cell markers

In [None]:
# Marker genes per major cell type. The dict is used for the dot plot below and
# is passed to labelClusterWithCellType() in the next cell.
cell_type_markers = {
    'T cells': ['CD3D', 'CD3E', 'TRAC', 'TRBC1'],
    'B cells': ['CD79A', 'CD79B', 'MS4A1', 'TNFRSF17', 'MZB1'],
    'Myeloid': ['CD14', 'CD68'],
    'Epithelial': ['EPCAM', 'CD24'],
    'Fibroblast': ['COL1A2', 'COL3A1', 'MYH11', 'ACTA2'],
    'Endothelial': ['VWF', 'PECAM1']
}
# compute the cluster dendrogram first so the dot plot can order clusters by it
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dotplot(adata, cell_type_markers, 'leiden', dendrogram=True)
#sc.pl.umap(adata, color='clusters', legend_loc='on data')


### Function to annotate clusters with cell type

In [None]:
labelClusterWithCellType(adata, cell_type_markers)

In [None]:
# UMAP coloured by annotated cell type, treatment, timepoint and Leiden cluster.
# 'cell_type' is presumably added to adata.obs by labelClusterWithCellType()
# in the previous cell — verify against scRNA_utils.
sc.pl.umap(adata, color=['cell_type', 'treatment', 'timepoint', 'leiden'])

# NOTE(review): no cell of this notebook visibly creates an obs column named
# 'clusters' (the same call is commented out in the dot-plot cell). Unless
# labelClusterWithCellType() adds it, this call fails — confirm, or colour by
# 'leiden' instead.
sc.pl.umap(adata, color='clusters', legend_loc='on data')

## Save adata with cell type labels

In [None]:
# Persist the annotated AnnData next to the input data.
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
adata.write(os.path.join(data_dir, '1863-counts_cells_cohort1_annotated.h5ad'))

In [None]:
# Reload the annotated AnnData (restart point for the downstream analysis).
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
adata = sc.read_h5ad(os.path.join(data_dir, '1863-counts_cells_cohort1_annotated.h5ad'))

## Some statistics of cell distribution pre and post treatment

In [None]:
# Composition of the whole dataset: number of cells per annotated cell type,
# drawn as a pie chart.
cell_type_count = adata.obs.groupby('cell_type').size()
pie_ax = cell_type_count.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Overall cell type distribution')

In [None]:
# Cell type composition of the pre-treatment cells (timepoint == 'pre').
is_pre = adata.obs['timepoint'] == 'pre'
adata_pre = adata[is_pre, :]
cell_type_count_pre = adata_pre.obs.groupby(['cell_type']).size()
pie_ax = cell_type_count_pre.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Cell type distribution in pre')

In [None]:
# Cell type composition of the on-treatment cells (timepoint == 'on').
is_on = adata.obs['timepoint'] == 'on'
adata_on = adata[is_on, :]
cell_type_count_on = adata_on.obs.groupby(['cell_type']).size()
pie_ax = cell_type_count_on.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Cell type distribution in on')


## Extract T cells and create a new AnnData object

In [None]:
## Extract T cells and create a new AnnData object
adata_T = adata[adata.obs['cell_type'] == 'T cells'].copy()
adata_T.shape

In [None]:
# Rebuild the T-cell object from the matrix and gene table stored in .raw, so
# the T cells can be re-processed on their own; obs is carried over unchanged.
adata_T = ad.AnnData(
    X=adata_T.raw.X,
    obs=adata_T.obs,
    var=adata_T.raw.var,
)
# (dropping the 'highly_variable' column of var is left disabled here)
# adata_T.var.drop(['highly_variable'], axis=1, inplace=True)
adata_T.raw = adata_T
print(adata_T.shape)


In [None]:
# Re-cluster the T cells on their own highly variable genes.
n_top_genes=5000
# select high variance genes
sc.pp.highly_variable_genes(adata_T, n_top_genes=n_top_genes)
# filter genes; .copy() materializes the subset so that PCA, neighbors, UMAP and
# Leiden write their results into a real AnnData rather than into a view
adata_T = adata_T[:, adata_T.var['highly_variable']].copy()
# PCA
sc.pp.pca(adata_T, n_comps=50, use_highly_variable=True, svd_solver='arpack')
# neighborhood graph on the 50 PCs, then UMAP embedding
sc.pp.neighbors(adata_T, n_neighbors=15, n_pcs=50)
sc.tl.umap(adata_T)
# clustering
sc.tl.leiden(adata_T, resolution=0.5)
# plot
sc.pl.umap(adata_T, color=['leiden'], legend_loc='on data', title='T cells')


### Reclustering T cells and finding where PD-1 and its target genes are expressed

In [None]:
adata_T = clustering_adata(adata_T, n_top_genes = n_top_genes, resolution = 0.25)

### Label TNK subtypes

In [None]:
# Marker genes for T/NK subtypes and states. Used for the dot plot, the
# per-group UMAPs and the cluster labelling below. Some genes (e.g. KLRB1,
# IL7R, PDCD1, IFNG) deliberately appear in more than one group.
T_cell_makers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': ['IL2', 'GZMA', 'GNLY', 'PRF1', 'GZMB', 'GZMK', 'IFNG', 'NKG7'],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Gamma-delta': ['TRGC1', 'TRGC2', 'TRDC'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' is left out
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

### Plot

In [None]:
sc.tl.dendrogram(adata_T, groupby='leiden')
sc.pl.dotplot(adata_T, T_cell_makers, 'leiden', dendrogram=True)

In [None]:
# One UMAP figure per marker group, preceded by the group name and its genes.
for subtype_name, subtype_genes in T_cell_makers.items():
    print(subtype_name, ":", subtype_genes)
    sc.pl.umap(adata_T, color=subtype_genes)

In [None]:
# Remove the 'cell_type' column inherited from the whole-dataset annotation,
# then label the T-cell Leiden clusters using the T-cell marker dict.
# (presumably labelClusterWithCellType() writes a new 'cell_type' column, since
# the plot below colours by it — verify against scRNA_utils)
adata_T.obs.drop(columns="cell_type", inplace = True )
labelClusterWithCellType(adata_T, T_cell_makers, cluster_column='leiden')
sc.pl.umap(adata_T, color=['cell_type', 'leiden'])

Plot PD-1 and potential target genes

In [None]:
sc.pl.umap(adata_T, color= ['PDCD1', 'CXCL13', 'HAVCR2','CTLA4'])

The enriched genes for each cluster

In [None]:
# Rank genes for each T-cell Leiden cluster with a t-test and plot the top 25
# genes per cluster (this is not specific to cell-cycle markers).
sc.tl.rank_genes_groups(adata_T, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata_T, n_genes=25, sharey=False)

### Examine the distribution of PDCD1 and potential target genes

In [None]:
# Select the T cells that express PDCD1 (PD-1) and copy them to adata_T_pd1.

# Get the index of PDCD1 in the gene names list; fail with a clear message when
# the gene is not present (e.g. it was not retained among the variable genes).
pdcd1_positions = np.where(adata_T.var_names == 'PDCD1')[0]
if pdcd1_positions.size == 0:
    raise KeyError("'PDCD1' is not among adata_T.var_names")
gene_index = pdcd1_positions[0]

# Extract expression values for the gene in all cells. X may be a sparse matrix
# or a dense array depending on the preceding processing, so only call
# .toarray() when it exists.
pdcd1_column = adata_T.X[:, gene_index]
if hasattr(pdcd1_column, 'toarray'):
    pdcd1_column = pdcd1_column.toarray()
gene_expression_values = np.asarray(pdcd1_column).flatten()

# Find cells expressing the gene (pdcd1) by filtering based on expression threshold
expressing_cells_indices = np.where(gene_expression_values >= 1)[0]

adata_T_pd1 = adata_T[expressing_cells_indices].copy()
print(adata_T_pd1.shape)


In [None]:
# compare the distributions of PDCD1, CXCL13, CCL3, HAVCR2 and CTLA4 between
# timepoints, within the PDCD1-expressing T cells
sc.pl.violin(adata_T_pd1, ['PDCD1', 'CXCL13', 'CCL3', 'HAVCR2','CTLA4'], groupby='timepoint', jitter=0)


In [None]:
# plot the umap with the timepoint
sc.pl.umap(adata_T_pd1, color=['timepoint', 'PDCD1', 'CXCL13', 'HAVCR2','CTLA4'])

In [None]:
# show the distribution of the timepoint
sc.pl.umap(adata_T, color = ['leiden', 'timepoint', 'patient_id' ])

## Extract Myeloid and recluster

In [None]:
## Reload the annotated dataset, pull out the myeloid cells and rebuild them
## from the matrix and gene table stored in .raw
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
adata = sc.read_h5ad(os.path.join(data_dir, '1863-counts_cells_cohort1_annotated.h5ad'))
is_myeloid = adata.obs['cell_type'] == 'Myeloid'
adata_M = adata[is_myeloid].copy()
# X and var come from .raw; obs, obsm and uns are carried over
adata_M = ad.AnnData(
    X=adata_M.raw.X,
    obs=adata_M.obs,
    var=adata_M.raw.var,
    obsm=adata_M.obsm,
    uns=adata_M.uns,
)
# record the log base used by log1p earlier in this notebook (base 2);
# presumably the entry is not preserved by the h5ad round trip — verify
adata_M.uns['log1p']['base'] = 2
# remove the 'highly_variable' flag so that it can be re-calculated
adata_M.var.drop(columns=['highly_variable'], inplace=True)
adata_M.raw = adata_M

adata_M.shape

In [None]:
n_top_genes = 3000
adata_M = clustering_adata(adata_M, resolution = 0.2, n_top_genes = n_top_genes)

In [None]:
# plot umap
sc.pl.umap(adata_M, color='leiden',legend_loc='on data')

It seems that there are some cells that are not myeloid cells.  We will remove them. Use the umap position to remove them.

In [None]:
# For each of the two UMAP coordinates, sort the cells by that coordinate and
# keep both the cell positions (index_umap_*) and the sorted values
# (value_umap_*). A stable argsort reproduces the ordering of Python's sorted()
# and, unlike unpacking zip(*sorted(...)), also works when there are no cells.
umap_coords = adata_M.obsm['X_umap']
order_umap_0 = np.argsort(umap_coords[:, 0], kind='stable')
order_umap_1 = np.argsort(umap_coords[:, 1], kind='stable')
# tuples are kept so the results have the same type as before
index_umap_0, value_umap_0 = tuple(order_umap_0.tolist()), tuple(umap_coords[order_umap_0, 0])
index_umap_1, value_umap_1 = tuple(order_umap_1.tolist()), tuple(umap_coords[order_umap_1, 1])

In [None]:
plt.hist(value_umap_0, bins=100)

In [None]:
plt.hist(value_umap_1, bins=100)

### Remove outlier cells

In [None]:
# UMAP-window outlier filter, currently disabled: all myeloid cells are kept.
# adata_M_clean = adata_M[(adata_M.obsm['X_umap'][:, 0] > 1.0) & (adata_M.obsm['X_umap'][:, 0] < 10.0) & (adata_M.obsm['X_umap'][:, 1] > -1) & (adata_M.obsm['X_umap'][:, 1] < 12.0), :].copy()
# adata_M_clean is therefore just another name for adata_M (no copy is made)
adata_M_clean = adata_M
sc.pl.umap(adata_M_clean, color=['leiden', 'timepoint', 'patient_id' ])

In [None]:
adata_M_clean.shape

### Identify DEGs for each cluster of myeloid cells

In [None]:
# Iterate through the myeloid cell clusters and identify the genes that are
# differentially expressed before and after treatment (paired test on the
# 'timepoint' condition, via paird_ttest from scRNA_utils).
# q_val_threshold = 0.2
pval_threshold = 0.05
# NOTE: one data frame is appended per processed cluster, so after a skipped
# cluster the list position no longer equals the cluster label.
cluster_deg_df_list = []
for c in adata_M.obs['leiden'].cat.categories:
    # cluster #7 is too small, skip it
    if c == '7':
        continue

    print("M cluster:", c)
    cell_in_cluster = adata_M[adata_M.obs['leiden'] == c, :]
    cluster_deg_df = paird_ttest(cell_in_cluster, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
    # Filter and sort into a new frame. Sorting the filtered slice in place
    # would modify a possible view of the unfiltered frame and triggers
    # pandas' SettingWithCopyWarning.
    cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold].sort_values(by=['pval'])
    cluster_deg_df_list.append(cluster_deg_df)

In [None]:
cluster_deg_df_list[0].head(15)

### Extract B cells

In [None]:
## Extract B cells and create a new AnnData object

adata_B = adata[adata.obs['cell_type'] == 'B cells'].copy()
print(str(adata_B.shape))
# restore the X to original raw.X for re-processing 
# (X and var come from .raw; obs, obsm and uns are carried over)
adata_B  = ad.AnnData(X=adata_B.raw.X, obs=adata_B.obs, var=adata_B.raw.var, obsm=adata_B.obsm, uns=adata_B.uns)
# drop the highly variable gene column in the var dataframe
adata_B.var.drop(columns='highly_variable', inplace=True)
adata_B.raw = adata_B

print(str(adata_B.shape))


In [None]:
adata_B = clustering_adata(adata_B, resolution=0.50, n_top_genes = n_top_genes)

In [None]:
# plot UMAP with timepoint
sc.pl.umap(adata_B, color=['leiden', 'timepoint', 'patient_id'],show=True)

In [None]:
# plt.hist(adata_B.obsm['X_umap'][:,0], bins=100)

In [None]:
# plt.hist(adata_B.obsm['X_umap'][:, 1], bins=100, label='UMAP 1')

In [None]:
# UMAP-based outlier filter, kept for reference but disabled like the
# corresponding filters for the myeloid and epithelial cells. The filtered copy
# used to be computed and then immediately overwritten by the assignment below,
# so it never had any effect apart from the cost of the copy.
# adata_B_clean = adata_B[(adata_B.obsm['X_umap'][:, 0] < 5.0), :].copy()
adata_B_clean = adata_B
sc.pl.umap(adata_B_clean, color= ['leiden', 'timepoint', 'patient_id'], ncols= 2)

In [None]:
adata_B_clean.shape

## Extract epithelial cells

In [None]:
## Pull out the epithelial cells and rebuild them from the matrix and gene
## table stored in .raw
is_epithelial = adata.obs['cell_type'] == 'Epithelial'
adata_Epi = adata[is_epithelial].copy()
print(adata_Epi.shape)
# X and var come from .raw; obs, obsm and uns are carried over
adata_Epi = ad.AnnData(
    X=adata_Epi.raw.X,
    obs=adata_Epi.obs,
    var=adata_Epi.raw.var,
    obsm=adata_Epi.obsm,
    uns=adata_Epi.uns,
)
# remove the 'highly_variable' flag from the gene table
del adata_Epi.var['highly_variable']
adata_Epi.raw = adata_Epi
adata_Epi.shape


In [None]:
adata_Epi = clustering_adata(adata_Epi, n_top_genes = n_top_genes)

In [None]:
sc.pl.umap(adata_Epi, color=['leiden', 'timepoint', 'patient_id'])

In [None]:
# plot hist of umap 1
# plt.hist(adata_Epi.obsm['X_umap'][:, 0], bins=100)

In [None]:
#plot hist of umap 2
# plt.hist(adata_Epi.obsm['X_umap'][:, 1], bins=100)

In [None]:
# adata_Epi_clean = adata_Epi[(adata_Epi.obsm['X_umap'][:, 0] < 20.0) & (adata_Epi.obsm['X_umap'][:, 0] > 1.0) & (adata_Epi.obsm['X_umap'][:, 1] > 1.0), :].copy()
adata_Epi_clean = adata_Epi
sc.pl.umap(adata_Epi_clean, color=['leiden', 'timepoint', 'patient_id'], ncols=2)

In [None]:
adata_Epi_clean.shape

### Extract fibroblasts


In [None]:
# Extract the fibroblasts and create a new AnnData object
adata_Fibro = adata[adata.obs['cell_type'] == 'Fibroblast'].copy()
print(str(adata_Fibro.shape))
# restore the X to original raw.X for re-processing
adata_Fibro  = ad.AnnData(X=adata_Fibro.raw.X, obs=adata_Fibro.obs, var=adata_Fibro.raw.var, obsm=adata_Fibro.obsm, uns=adata_Fibro.uns)
# drop the highly variable column from var
adata_Fibro.var.drop(columns=['highly_variable'], inplace=True)
adata_Fibro.raw = adata_Fibro
# NOTE(review): this writes the fibroblast object to data_dir before it is
# re-clustered; the equivalent write for the endothelial cells is commented
# out — confirm this file is still wanted.
adata_Fibro.write(data_dir + '1863-counts_cells_cohort1_Fibro_cells.h5ad')


In [None]:
adata_Fibro = clustering_adata(adata_Fibro, resolution = 1.0, n_top_genes = n_top_genes)

In [None]:
sc.pl.umap(adata_Fibro, color=['leiden', 'timepoint', 'patient_id'], ncols= 2)

In [None]:
# plot hist of umap 1
# plt.hist(adata_Fibro.obsm['X_umap'][:, 0], bins=100)

In [None]:
#plot hist of umap 2
# plt.hist(adata_Fibro.obsm['X_umap'][:, 1], bins=100)

In [None]:
adata_Fibro_clean = adata_Fibro
sc.pl.umap(adata_Fibro_clean, color=['leiden', 'timepoint', 'patient_id'])

### Extract Endothelial cells and create a new AnnData object

In [None]:
# Extract the endothelial cells and create a new AnnData object
adata_Endo = adata[adata.obs['cell_type'] == 'Endothelial'].copy()
print(str(adata_Endo.shape))
# restore the X to original raw.X for re-processing
adata_Endo  = ad.AnnData(X=adata_Endo.raw.X, obs=adata_Endo.obs, var=adata_Endo.raw.var, obsm=adata_Endo.obsm, uns=adata_Endo.uns)
# Drop the 'highly_variable' flag inherited from the whole-dataset selection so
# it is re-calculated for this subset, as is done for the myeloid, B,
# epithelial and fibroblast subsets. errors='ignore' keeps this a no-op when
# the column is absent.
adata_Endo.var.drop(columns=['highly_variable'], inplace=True, errors='ignore')
adata_Endo.raw = adata_Endo
# adata_Endo.write(data_dir + '1863-counts_cells_cohort1_Endo_cells.h5ad')


In [None]:
adata_Endo = clustering_adata(adata_Endo, n_top_genes = n_top_genes)

In [None]:
# plot hist of umap 1
# plt.hist(adata_Endo.obsm['X_umap'][:, 0], bins=100)

In [None]:
# plot hist of umap 2
# plt.hist(adata_Endo.obsm['X_umap'][:, 1], bins=100)

In [None]:
adata_Endo_clean = adata_Endo

In [None]:
sc.pl.umap(adata_Endo_clean, color=['leiden', 'timepoint', 'patient_id'])

### Save data in high variance genes space

In [None]:
# Write every per-cell-type object to the high-variance-gene output directory.
data_dir_NHDP = "/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_10k_gene_NHDP/"
#data_dir_NHDP = '/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_gene_NHDP/'
for out_name, cell_subset in (
    ('1863-counts_cells_cohort1_B_cells.h5ad', adata_B_clean),
    ('1863-counts_cells_cohort1_Epi_cells.h5ad', adata_Epi_clean),
    ('1863-counts_cells_cohort1_Fibro_cells.h5ad', adata_Fibro_clean),
    ('1863-counts_cells_cohort1_Endo_cells.h5ad', adata_Endo_clean),
    ('1863-counts_cells_cohort1_T_cells.h5ad', adata_T),
    ('1863-counts_cells_cohort1_M_cells.h5ad', adata_M_clean),
):
    cell_subset.write_h5ad(data_dir_NHDP + out_name)

### Restore adata.X to original data and save files

In [None]:
# data_dir_NHDP = '/data/ICI_exprs/ICI_NHDP/EGAS00001004809_original_gene_NHDP/'
# adata_T_clean = ad.AnnData(X=adata_T.raw.X, obs=adata_T.obs, var=adata_T.raw.var, obsm=adata_T.obsm, uns=adata_T.uns)
# adata_T_clean.raw = adata_T_clean
# adata_T_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
# adata_M_clean = ad.AnnData(X=adata_M.raw.X, obs=adata_M.obs, var=adata_M.raw.var, obsm=adata_M.obsm, uns=adata_M.uns)
# adata_M_clean.raw = adata_M_clean
# adata_M_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad')
# adata_B_clean = ad.AnnData(X=adata_B.raw.X, obs=adata_B.obs, var=adata_B.raw.var, obsm=adata_B.obsm, uns=adata_B.uns)
# adata_B_clean.raw = adata_B_clean
# adata_B_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
# adata_Epi_clean = ad.AnnData(X=adata_Epi_clean.raw.X, obs=adata_Epi_clean.obs, var=adata_Epi_clean.raw.var, obsm=adata_Epi_clean.obsm, uns=adata_Epi_clean.uns)
# adata_Epi_clean.raw = adata_Epi_clean
# adata_Epi_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Epi_cells.h5ad')
# adata_Endo_clean = ad.AnnData(X=adata_Endo_clean.raw.X, obs=adata_Endo_clean.obs, var=adata_Endo_clean.raw.var, obsm=adata_Endo_clean.obsm, uns=adata_Endo_clean.uns)
# adata_Endo_clean.raw = adata_Endo_clean
# adata_Endo_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Endo_cells.h5ad')
# adata_Fibro_clean = ad.AnnData(X=adata_Fibro_clean.raw.X, obs=adata_Fibro_clean.obs, var=adata_Fibro_clean.raw.var, obsm=adata_Fibro_clean.obsm, uns=adata_Fibro_clean.uns) 
# adata_Fibro_clean.raw = adata_Fibro_clean
# adata_Fibro_clean.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Fibro_cells.h5ad')


# Identify DEGs within each cell type and clusters


In [None]:
# Reload the saved T-cell object for EGAS00001004809
data_dir_NHDP = "/data/ICI_exprs/ICI_NHDP/EGAS00001004809_high_variance_10k_gene_NHDP/"
t_cell_file = '1863-counts_cells_cohort1_T_cells.h5ad'
adata_T = sc.read(data_dir_NHDP + t_cell_file)
adata_T.shape

In [None]:
sc.pl.umap(adata_T, color=['leiden', 'timepoint', 'CD8A', 'CD4', 'FOXP3'])

In [None]:
sc.tl.rank_genes_groups(adata_T, groupby='leiden',  method='wilcoxon')
sc.pl.rank_genes_groups(adata_T, n_genes=25, sharey=False)


In [None]:
# For every T-cell Leiden cluster, run the paired pre/on-treatment test and keep
# the complete result table, ordered by p-value (no p-value cut-off is applied).
# q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df_list = []
t_cluster_labels = adata_T.obs['leiden']
for c in t_cluster_labels.cat.categories:
    print("T cluster:", c)
    cell_in_cluster = adata_T[t_cluster_labels == c, :]
    # cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
    cluster_deg_df = paird_ttest(
        cell_in_cluster,
        condition_key='timepoint',
        sample_id_col='sample_id',
        patient_id_col='patient_id',
    ).sort_values(by=['pval'])
    cluster_deg_df_list.append(cluster_deg_df)

In [None]:
# convert the list of dataframes to a single excel file
with pd.ExcelWriter(data_dir + 'EGAS00001004809_T_cell_cluster_deg.xlsx') as writer:
    for i, df in enumerate(cluster_deg_df_list):
        df.to_excel(writer, sheet_name='T_cluster_' + str(i))

### Cluster 3 is CD4 T cells with high expression of PDCD1 and CXCL13
Check DEGs for cluster 3


In [None]:
# Genes of the fourth table in the list (index 3) with p-value below 0.05,
# ordered from the lowest to the highest p-value.
cluster_3 = cluster_deg_df_list[3]
sig_genes = cluster_3.loc[cluster_3['pval'] < 0.05].sort_values(by=['pval'])
sig_genes


### Cluster 4 is CD8 T cells with high expression of PDCD1 and CXCL13 and other "exhaustion" markers such as GZMB, NKG7, CCL5

In [None]:
# Genes of the fifth table in the list (index 4) with p-value below 0.05,
# ordered from the lowest to the highest p-value.
cluster_4 = cluster_deg_df_list[4]
sig_genes = cluster_4.loc[cluster_4['pval'] < 0.05].sort_values(by=['pval'])
sig_genes

In [None]:
# plot umap of DEGs from cluster #4
sc.pl.umap(adata_T, color=['TSC22D3', 'ZFP36L1', 'FASLG', 'CXCR4', 'timepoint'])

### Find DEGs after anti-PD1 treatment in cells expressing PD-1


In [None]:
# Select the T cells that express PDCD1 (PD-1) and copy them to adata_T_pd1.

# Get the index of PDCD1 in the gene names list; fail with a clear message when
# the gene is not present in the reloaded T-cell object.
pdcd1_positions = np.where(adata_T.var_names == 'PDCD1')[0]
if pdcd1_positions.size == 0:
    raise KeyError("'PDCD1' is not among adata_T.var_names")
gene_index = pdcd1_positions[0]

# Extract expression values for the gene in all cells. X may be a sparse matrix
# or a dense array, so only call .toarray() when it exists.
pdcd1_column = adata_T.X[:, gene_index]
if hasattr(pdcd1_column, 'toarray'):
    pdcd1_column = pdcd1_column.toarray()
gene_expression_values = np.asarray(pdcd1_column).flatten()

# Find cells expressing the gene (pdcd1) by filtering based on expression threshold
expressing_cells_indices = np.where(gene_expression_values >= 1)[0]

adata_T_pd1 = adata_T[expressing_cells_indices].copy()
print(adata_T_pd1.shape)


In [None]:
sc.pl.umap(adata_T_pd1, color=[ 'cell_type', 'PDCD1', 'CXCL13', 'PRDM1','TXNIP', 'timepoint'])

In [None]:
sc.pl.umap(adata_T_pd1, color=[ 'cell_type', 'PDCD1', 'CXCL13', 'timepoint'])

In [None]:
pd1_deg_df = paird_ttest(adata_T_pd1, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')

In [None]:
# find DEGs between two groups
pd1_deg_df = pd1_deg_df[pd1_deg_df['qval'] < 0.15]
# sort by pval
pd1_deg_df = pd1_deg_df.sort_values(by=['pval'], ascending=True)
pd1_deg_df.to_csv(os.path.join(data_dir, 'pd1_deg_df.tsv'), sep='\t', index=True)
pd1_deg_df

In [None]:
pd1_deg_df.shape


In [None]:
# plot violin plot of genes, 4 genes in each row
sc.pl.violin (adata_T_pd1, ['PDCD1', 'CXCL13', 'TIGIT', 'HAVCR2', 'PRDM1', 'TXNIP', 'FKBP5',  ], groupby='timepoint', jitter= 0)
sc.pl.violin (adata_T_pd1, ['PDCD1', 'CXCL13', 'TIGIT', 'CD8A', 'HAVCR2', 'PRDM1', 'TXNIP', 'FKBP5' ], groupby='timepoint', jitter= 0)

## Prepare T-cell data for Bryan Andrews

We are to model the target genes of PD-L1 and PD-1 signaling in T cells.
We need to prepare the T cell data for Bryan Andrews to run his analysis. We need to provide him with a file with the following columns:


Extract expression values of 'PDCD1', 'CXCL13', 'PRDM1', 'CD8A' from T cells and create a new dataframe

In [None]:
# extract the expression of the genes of interest (PDCD1, CXCL13, PRDM1, CD8A,
# HAVCR2, TIGIT, CTLA4) from the T cells as a dense cells x genes array; the
# array name is historical and covers all seven genes
gene_of_interest = ['PDCD1', 'CXCL13', 'PRDM1', 'CD8A',  'HAVCR2', 'TIGIT', 'CTLA4']
pdcd1_cxcl13_prdm1_array = adata_T[:, gene_of_interest].X.toarray()
pdcd1_cxcl13_prdm1_array.shape

### Create a dataframe containing the above genes and leiden clustering ids

In [None]:
# One row per T cell (indexed by cell name), one column per gene of interest,
# followed by the Leiden cluster id, sample id and timepoint of the cell.
df_4_bryan = pd.DataFrame(
    pdcd1_cxcl13_prdm1_array,
    index=adata_T.obs.index,
    columns=gene_of_interest,
)
for obs_field in ('leiden', 'sample_id', 'timepoint'):
    df_4_bryan[obs_field] = adata_T.obs[obs_field].values


In [None]:
df_4_bryan

In [None]:
# remove cells with zero expression for all PDCD1, CXCL13, PRDM1, CD8A, HAVCR2
df_4_bryan = df_4_bryan[(df_4_bryan['PDCD1'] > 0) | (df_4_bryan['CXCL13'] > 0) | (df_4_bryan['PRDM1'] > 0) | (df_4_bryan['CD8A'] > 0) | (df_4_bryan['HAVCR2'] > 0)  ]
df_4_bryan.shape

### Extract bulk RNA-seq data for PD-L1 'CD274' and populate the dataframe.
Use the pseudobulk data of pd-l1 and repopulate the df_4_bryan with the bulk RNA-seq data

In [None]:
# Build per-sample pseudobulk expression and extract CD274 (PD-L1) from it.
# NOTE(review): '1863-counts_cells_cohort1_filtered.h5ad' is not written by any
# cell of this notebook — confirm where this file comes from.
adata = sc.read_h5ad(data_dir + '1863-counts_cells_cohort1_filtered.h5ad')
adata.raw = adata
# scRNA2PseudoBulkAnnData comes from scRNA_utils; presumably it aggregates the
# cells of each sample_id into one observation — verify against the helper
adata_bulk = scRNA2PseudoBulkAnnData(adata, sample_id_col='sample_id')
# one CD274 value per pseudobulk observation, indexed by adata_bulk.obs.index
cd274_bulk = adata_bulk[:, adata_bulk.var_names == 'CD274'].X.toarray()
cd274_bulk_df = pd.DataFrame(cd274_bulk, columns = ['CD274'], index = adata_bulk.obs.index)

In [None]:
cd274_bulk_df

In [None]:
# add an empty column for df_4_bryan
df_4_bryan['CD274'] = np.nan
# loop through sample_ids of cd274_bulk_df and add the value to df_4_bryan
for sample_id in cd274_bulk_df.index:
    df_4_bryan.loc[df_4_bryan['sample_id'] == sample_id, 'CD274'] = cd274_bulk_df.loc[sample_id, 'CD274']

# add gaussian noise to the expression of CD274
df_4_bryan['CD274'] = df_4_bryan['CD274'] + np.random.normal(0, 0.025, df_4_bryan.shape[0])
df_4_bryan.loc[:,'CD274*PDCD1'] = df_4_bryan.loc[:,'CD274'] * df_4_bryan.loc[:,'PDCD1']

In [None]:
# reorder the columns and save to tsv
df_4_bryan = df_4_bryan[['CD274', 'PDCD1', 'CD274*PDCD1', 'CXCL13', 'PRDM1', 'CD8A', 'HAVCR2', 'TIGIT', 'CTLA4', 'leiden' , 'timepoint',  'sample_id' ]]
df_4_bryan.to_csv(os.path.join(data_dir, 'df_4_bryan.tsv'), sep='\t', index=True)

In [None]:
data_dir