# EGAS00001004809 scRNA Data Preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import sys as sys
sys.path.append('/home/qiuaodon/Desktop/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
import operator as op

## Read in data matrix and combine with meta-data of obs

In [None]:
# Load the cohort-1 expression data (H5AD) into an AnnData object
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
data_file = f"{data_dir}1863-counts_cells_cohort1.h5ad"
adata = sc.read_h5ad(data_file)
print(f"Read in dataset with dimension: {adata.shape}")

In [None]:
# Load the meta-data CSV; the first row is the header and the first column becomes the index
meta_file_pathname = f"{data_dir}1872-BIOKEY_metaData_cohort1_web.csv"
cohort1_meta = pd.read_csv(meta_file_pathname, index_col=0, header=0)

In [None]:
# Align the meta-data rows with the cell order of adata before replacing adata.obs.
# reindex() fills cells that have no meta-data row with NaN without complaining,
# so report how many cells are affected instead of letting them pass unnoticed.
cells_without_meta = adata.obs.index.difference(cohort1_meta.index)
if len(cells_without_meta) > 0:
    print(f"WARNING: {len(cells_without_meta)} cells have no meta-data row; their obs values will be NaN")
cohort1_meta = cohort1_meta.reindex(adata.obs.index)
adata.obs = cohort1_meta
print(adata.obs.columns)
print(adata.obs['patient_id'].unique())

In [None]:
# show the distinct values of the 'expansion' column
adata.obs['expansion'].unique()

In [None]:
# normalise the "timepoint" labels to lowercase, then list the distinct values
timepoint_lower = adata.obs["timepoint"].str.lower()
adata.obs["timepoint"] = timepoint_lower
adata.obs["timepoint"].unique()

In [None]:
# the field "cohort" is equivalent to "treatment", so rename that obs column in place
adata.obs.rename(columns={'cohort': 'treatment'}, inplace=True)

In [None]:
# build sample_id from the first three "_"-separated fields of each obs index entry
index_fields = adata.obs.index.str.split("_")
adata.obs['sample_id'] = index_fields.str[:3].str.join("_")

In [None]:
# preview the first few derived sample_id values
adata.obs.sample_id.head()

In [None]:
# display the var (gene-level) annotation table
adata.var

### Extract tissue type from the sample name or from meta-data

In [None]:
# remove the nCount_RNA and nFeature_RNA columns from obs
adata.obs.drop(columns=['nCount_RNA', 'nFeature_RNA'], inplace=True)

# keep the annotation columns in a fixed order
obs_column_order = ["patient_id", "sample_id", "timepoint", "treatment", 'expansion', 'BC_type', 'cellType']
adata.obs = adata.obs[obs_column_order]

In [None]:
# confirm the obs columns after the reordering
print(adata.obs.columns)

## Basic Filtering

In [None]:
# configure scanpy logging, print the scanpy header, and set default figure parameters
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# drop cells that express fewer than 500 or more than 5000 genes
min_genes_per_cell, max_genes_per_cell = 500, 5000
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_cells(adata, max_genes=max_genes_per_cell)

In [None]:
# report the matrix size and the obs columns after the per-cell gene-count filter
# (output label spelling fixed: "Dimension")
print(f'Dimension of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

In [None]:
# drop cells with fewer than 400 or more than 25000 UMIs
min_umis_per_cell, max_umis_per_cell = 400, 25000
sc.pp.filter_cells(adata, min_counts=min_umis_per_cell)
sc.pp.filter_cells(adata, max_counts=max_umis_per_cell)

In [None]:
# report the matrix size and the obs columns after the per-cell UMI-count filter
# (output label spelling fixed: "Dimension")
print(f'Dimension of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

## process the var

### Removing genes covered by <10 cells (across the whole dataset)

In [None]:
# drop genes detected in fewer than 10 cells, make the gene names unique, show the new shape
min_cells_per_gene = 10
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)
adata.var_names_make_unique()
adata.shape

### Remove cells with high percentage of mitochondrial genes

In [None]:
# flag mitochondrial genes by their "MT-" name prefix
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# compute QC metrics with 'mt' as a QC variable (pct_counts_mt is used below)
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Remove the mitochondrial genes and keep cells with < 20% mitochondrial counts.
# pct_counts_mt was computed above, i.e. before the mitochondrial genes are removed.
# .copy() materialises the result: the original chained slicing left adata as a
# view of a view, and the in-place normalisation that follows would act on a view.
keep_genes = ~adata.var['mt'].values
keep_cells = (adata.obs['pct_counts_mt'] < 20).values
adata = adata[keep_cells, keep_genes].copy()
adata.shape

In [None]:
# violin plots of three QC metrics, one panel each, with jitter disabled (jitter=0.0)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.0, multi_panel=True)

## Integrating Samples

### Normalization & Logarithmization:

In [None]:
# Normalise the counts with target_sum=1e4 (the log transform is done in the next cell).
# The first-row sum is printed before and after as a quick sanity check.
first_row_sum_before = adata.X[0, :].sum()
print('Before normalization, the sum of first row of X: ' + str(first_row_sum_before))
sc.pp.normalize_total(adata, target_sum=1e4)
first_row_sum_after = adata.X[0, :].sum()
print('After normalization, the sum of first row of X: ' + str(first_row_sum_after))


In [None]:
# log1p-transform adata with base 2, printing the first-row sum before and after
log_input_row_sum = adata.X[0, :].sum()
print('Before log, the sum of first row of X: ' + str(log_input_row_sum))
sc.pp.log1p(adata, base=2)
log_output_row_sum = adata.X[0, :].sum()
print('After log, the sum of first row of X: ' + str(log_output_row_sum))

### keep high variance genes

In [None]:
# flag the 10000 most variable genes in adata.var and plot the result
# NOTE: n_top_genes is a notebook-wide name that later cells reassign
n_top_genes=10000
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes)
sc.pl.highly_variable_genes(adata)

In [None]:
# list the var columns now present (after the highly-variable-gene step)
adata.var.columns

### Keep track of original adata and update adata.X to  high variance genes only

In [None]:
# keep the current adata (all genes) in .raw before restricting X to the highly variable genes
adata.raw = adata
# .copy() turns the slice into a real AnnData; without it adata is only a view,
# and the PCA / neighbors / clustering results that follow would be written to a view
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

## Unsupervised cell clustering & identification of major cell types

In [None]:
# perform PCA with 30 components using the ARPACK solver
sc.tl.pca(adata, svd_solver='arpack', n_comps=30)

In [None]:
# inspect the shapes of the stored PCA outputs and the current obs columns
print(adata.obsm['X_pca'].shape)
print(adata.varm['PCs'].shape)
print(adata.uns['pca']['variance_ratio'].shape)
print(adata.obs.columns)

In [None]:
# build the neighborhood graph from 30 PCs with 80 neighbors per cell
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=30)

In [None]:
# Use the Leiden algorithm to find clusters (resolution 0.5)
sc.tl.leiden(adata, resolution=0.5)

In [None]:
# rank genes for each Leiden cluster with a t-test and plot the top 25 per cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# compute the UMAP embedding and plot it coloured by Leiden cluster, with labels drawn on the data
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')

### Label cells based on cell markers

In [None]:
# marker genes per major cell type, shown per Leiden cluster in a dot plot
cell_type_markers = {
    'T cells': [
        'CD3D', 'CD3E', 'TRAC', 'TRBC1',
    ],
    'B cells': [
        'CD79A', 'CD79B', 'MS4A1', 'TNFRSF17', 'MZB1',
    ],
    'Myeloid': [
        'CD14', 'CD68',
    ],
    'Epithelial': [
        'EPCAM', 'CD24',
    ],
    'Fibroblast': [
        'COL1A2', 'COL3A1', 'MYH11', 'ACTA2',
    ],
    'Endothelial': [
        'VWF', 'PECAM1',
    ],
}
# the dendrogram is computed first because the dot plot is drawn with dendrogram=True
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dotplot(adata, var_names=cell_type_markers, groupby='leiden', dendrogram=True)

In [None]:
# label clusters from the marker dictionary (helper presumably from scRNA_utils and
# presumably filling obs['cell_type'] -- confirm), then draw one UMAP per annotation
labelClusterWithCellType(adata, cell_type_markers)
for obs_field in ('cell_type', 'treatment', 'timepoint', 'leiden'):
    sc.pl.umap(adata, color=[obs_field])

In [None]:
# save the processed whole-cohort adata object as an H5AD file
adata.write('/home/qiuaodon/Desktop/project_data_new/whole_cohort1_processed.h5ad')

In [None]:
# show the UMAP coloured by PDCD1, using the 'viridis' colour map
sc.pl.umap(adata, color='PDCD1', color_map='viridis')


In [None]:
# UMAP coloured by timepoint
sc.pl.umap(adata, color='timepoint')

In [None]:
# split the cells by timepoint ('pre' and 'on') and plot PDCD1 on the UMAP for each subset
is_pre = adata.obs['timepoint'] == 'pre'
adata_pre = adata[is_pre, :]
sc.pl.umap(adata_pre, color='PDCD1', color_map='Spectral_r')

is_on = adata.obs['timepoint'] == 'on'
adata_on = adata[is_on, :]
sc.pl.umap(adata_on, color='PDCD1', color_map='Spectral_r')

## Some statistics of cell distribution pre and post treatment

In [None]:
# count the number of cells in each cell type across all cells
cell_type_count = adata.obs.groupby('cell_type').size()
# plot a pie chart of the counts with percentage labels
cell_type_count.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
plt.title('Overall cell type distribution')

In [None]:
# count the number of cells in each cell type for the 'pre' timepoint only and plot a pie chart
adata_pre = adata[adata.obs['timepoint'] == 'pre', :]
cell_type_count_pre = adata_pre.obs.groupby(['cell_type']).size()
cell_type_count_pre.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
plt.title('Cell type distribution in pre')

In [None]:
# count the number of cells in each cell type for the 'on' timepoint only and plot a pie chart
adata_on = adata[adata.obs['timepoint'] == 'on', :]
cell_type_count_on = adata_on.obs.groupby(['cell_type']).size()
cell_type_count_on.plot.pie(y='cell_type', figsize=(5, 5), autopct='%1.1f%%', startangle=90)
plt.title('Cell type distribution in on')


## Extract T cells, cluster and create a new AnnData object

### use the raw data of T cells to get the clusters

In [None]:
# Rebuild a T-cell AnnData from the matrix kept in adata.raw for re-processing.
# adata.raw is deliberately NOT reassigned here: adata now holds only the highly
# variable genes, so `adata.raw = adata` would overwrite the stored all-gene
# matrix with that subset and there would be nothing left to restore from.
adata_T = adata[adata.obs['cell_type'] == 'T cells', :]
# copy obs/var so the new object does not share annotation tables with the view
adata_T = ad.AnnData(X=adata_T.raw.X, obs=adata_T.obs.copy(), var=adata_T.raw.var.copy())
adata_T.raw = adata_T
print(str(adata_T.shape))

In [None]:
# reclustering T cells
# select high variance genes
n_top_gene_T = 5000
sc.pp.highly_variable_genes(adata_T, n_top_genes=n_top_gene_T)
# keep only the highly variable genes; .copy() makes this a real AnnData so the
# PCA / neighbors / UMAP / Leiden results below are not written to a view
adata_T = adata_T[:, adata_T.var['highly_variable']].copy()
# PCA -- every remaining gene is highly variable, so the (deprecated)
# use_highly_variable flag is redundant and has been dropped
sc.pp.pca(adata_T, n_comps=50, svd_solver='arpack')
# neighborhood graph on all 50 PCs, then UMAP
sc.pp.neighbors(adata_T, n_neighbors=15, n_pcs=50)
sc.tl.umap(adata_T)
# clustering
sc.tl.leiden(adata_T, resolution=0.5)
# plot
sc.pl.umap(adata_T, color=['leiden'], legend_loc='on data', title='T cells')


In [None]:
# display the var table of the T-cell object
adata_T.var

## Extract Myeloid and recluster

In [None]:
# Copy out the cells labelled 'Myeloid'
adata_M = adata[adata.obs['cell_type'] == 'Myeloid'].copy()
# Rebuild the object around raw.X / raw.var for re-processing; obs, obsm and uns are carried over
adata_M = ad.AnnData(
    X=adata_M.raw.X,
    obs=adata_M.obs,
    var=adata_M.raw.var,
    obsm=adata_M.obsm,
    uns=adata_M.uns,
)
# remove the highly_variable column from var
adata_M.var.drop(columns=['highly_variable'], inplace=True)
adata_M.raw = adata_M
adata_M.shape

In [None]:
# recluster the myeloid subset with clustering_adata (presumably a scRNA_utils helper), using 5000 top genes
# NOTE: this reassigns the notebook-wide n_top_genes
n_top_genes = 5000
adata_M = clustering_adata(adata_M, n_top_genes = n_top_genes)

In [None]:
# plot the myeloid UMAP coloured by Leiden cluster, with labels drawn on the data
sc.pl.umap(adata_M, color='leiden',legend_loc='on data')

### It seems that there are some cells that are not myeloid cells?  We will remove them. Use the umap position to remove them.

In [None]:
# For each UMAP axis (column 0, then column 1), sort the cells by their coordinate and
# keep both the original positional indices and the sorted coordinate values (as tuples)
index_umap_0, value_umap_0 = zip(*sorted(enumerate(adata_M.obsm['X_umap'][:, 0]), key=op.itemgetter(1)))  
index_umap_1, value_umap_1 = zip(*sorted(enumerate(adata_M.obsm['X_umap'][:, 1]), key=op.itemgetter(1)))

In [None]:
# Overlay the distributions of the two UMAP coordinates. Both histograms share
# one set of axes, so fade and label each one to make them distinguishable.
plt.hist(value_umap_0, bins=100, alpha=0.5, label='X_umap[:, 0]')
plt.hist(value_umap_1, bins=100, alpha=0.5, label='X_umap[:, 1]')
plt.legend()

In [None]:
# NOTE: no cells are filtered here; adata_M_clean is just a second name for the
# same adata_M object (plain assignment, not a copy)
adata_M_clean = adata_M
sc.pl.umap(adata_M_clean, color=['leiden', 'timepoint', 'patient_id' ])

## Extract B cells and recluster

In [None]:
# Copy out the cells labelled 'B cells' and report the subset size
adata_B = adata[adata.obs['cell_type'] == 'B cells'].copy()
print(adata_B.shape)
# Rebuild the object around raw.X / raw.var for re-processing; obs, obsm and uns are carried over
adata_B = ad.AnnData(
    X=adata_B.raw.X,
    obs=adata_B.obs,
    var=adata_B.raw.var,
    obsm=adata_B.obsm,
    uns=adata_B.uns,
)
# remove the highly_variable column from var
adata_B.var.drop(columns=['highly_variable'], inplace=True)
adata_B.raw = adata_B
print(adata_B.shape)

In [None]:
# recluster the B-cell subset with clustering_adata (presumably a scRNA_utils helper):
# 5000 top genes, resolution 0.50
# NOTE: this reassigns the notebook-wide n_top_genes
n_top_genes = 5000
adata_B = clustering_adata(adata_B, resolution=0.50, n_top_genes = n_top_genes)

In [None]:
# plot the B-cell UMAP coloured by Leiden cluster, timepoint and patient
sc.pl.umap(adata_B, color=['leiden', 'timepoint', 'patient_id'],show=True)

In [None]:
# histogram of the first UMAP coordinate of the B cells
plt.hist(adata_B.obsm['X_umap'][:,0], bins=100)

### Why do we need this step? Keep only the B cells whose first UMAP coordinate is below 5.0

In [None]:
# keep only the B cells whose first UMAP coordinate is below 5.0, as an independent copy
umap0_below_cutoff = adata_B.obsm['X_umap'][:, 0] < 5.0
adata_B_clean = adata_B[umap0_below_cutoff, :].copy()
sc.pl.umap(
    adata_B_clean,
    color=['leiden', 'timepoint', 'patient_id'],
    ncols=2,
)

In [None]:
# compare the B-cell object sizes before and after the UMAP-position filter
print(adata_B.shape)
print(adata_B_clean.shape)

## Extract epithelial cells and recluster

In [None]:
# Copy out the cells labelled 'Epithelial' and report the subset size
adata_Epi = adata[adata.obs['cell_type'] == 'Epithelial'].copy()
print(adata_Epi.shape)
# Rebuild the object around raw.X / raw.var for re-processing; obs, obsm and uns are carried over
adata_Epi = ad.AnnData(
    X=adata_Epi.raw.X,
    obs=adata_Epi.obs,
    var=adata_Epi.raw.var,
    obsm=adata_Epi.obsm,
    uns=adata_Epi.uns,
)
# remove the highly_variable column from var
adata_Epi.var.drop(['highly_variable'], axis=1, inplace=True)
adata_Epi.raw = adata_Epi
adata_Epi.shape

In [None]:
# recluster the epithelial subset with clustering_adata (presumably a scRNA_utils helper), using 5000 top genes
adata_Epi = clustering_adata(adata_Epi, n_top_genes = 5000)

In [None]:
# plot the epithelial UMAP coloured by Leiden cluster, timepoint and patient
sc.pl.umap(adata_Epi, color=['leiden', 'timepoint', 'patient_id'])

In [None]:
# show the shape of the epithelial object after reclustering
adata_Epi.shape

## Extract fibroblasts and recluster

In [None]:
# Copy out the cells labelled 'Fibroblast' and report the subset size
adata_Fibro = adata[adata.obs['cell_type'] == 'Fibroblast'].copy()
print(adata_Fibro.shape)
# Rebuild the object around raw.X / raw.var for re-processing; obs, obsm and uns are carried over
adata_Fibro = ad.AnnData(
    X=adata_Fibro.raw.X,
    obs=adata_Fibro.obs,
    var=adata_Fibro.raw.var,
    obsm=adata_Fibro.obsm,
    uns=adata_Fibro.uns,
)
# remove the highly_variable column from var
adata_Fibro.var.drop(['highly_variable'], axis=1, inplace=True)
adata_Fibro.raw = adata_Fibro
adata_Fibro.shape

In [None]:
# recluster the fibroblast subset with clustering_adata (presumably a scRNA_utils helper):
# 5000 top genes, resolution 1.0; then plot the UMAP by cluster, timepoint and patient
adata_Fibro = clustering_adata(adata_Fibro, resolution = 1.0, n_top_genes = 5000)
sc.pl.umap(adata_Fibro, color=['leiden', 'timepoint', 'patient_id'], ncols= 2)

## Extract endothelial cells and recluster

In [None]:
# Copy out the cells labelled 'Endothelial' and report the subset size
adata_Endo = adata[adata.obs['cell_type'] == 'Endothelial'].copy()
print(str(adata_Endo.shape))
# restore the X to original raw.X for re-processing
adata_Endo  = ad.AnnData(X=adata_Endo.raw.X, obs=adata_Endo.obs, var=adata_Endo.raw.var, obsm=adata_Endo.obsm, uns=adata_Endo.uns)
# drop the highly_variable column carried over in raw.var, matching what the
# Myeloid / B / Epithelial / Fibroblast extraction cells do before reclustering
adata_Endo.var.drop(columns=['highly_variable'], inplace=True, errors='ignore')
adata_Endo.raw = adata_Endo
adata_Endo.shape

In [None]:
# Recluster the endothelial subset with 5000 top genes, passed explicitly like the
# Epithelial / Fibroblast cells. The notebook-wide n_top_genes is set to 10000 in
# one cell and 5000 in others, so relying on it depends on cell execution order.
adata_Endo = clustering_adata(adata_Endo, n_top_genes=5000)

In [None]:
# plot the endothelial UMAP coloured by Leiden cluster, timepoint and patient
sc.pl.umap(adata_Endo, color=['leiden', 'timepoint', 'patient_id'])

## save all the data on my desktop

In [None]:
# Write each reclustered subset to its own H5AD file.
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data/"
# create the output folder first; writing fails if the directory does not exist
os.makedirs(data_dir_NHDP, exist_ok=True)
# NOTE(review): adata_B and adata_M are written here, not adata_B_clean /
# adata_M_clean -- confirm that saving the unfiltered objects is intended.
adata_B.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
adata_Epi.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Epi_cells.h5ad')
adata_Fibro.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Fibro_cells.h5ad')
adata_Endo.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_Endo_cells.h5ad')
adata_T.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
adata_M.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad')