# EGAS00001004809 scRNA Data Preprocessing 


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns
import sys as sys
sys.path.append('/home/xinghua/projects/PanCancer_scRNA_analysis/utils/')

### Read in data matrix and combine with meta-data

In [None]:

# Load the expression matrix, stored as an AnnData .h5ad file
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
data_file = os.path.join(data_dir, '1863-counts_cells_cohort1.h5ad')

adata = sc.read_h5ad(data_file)

print(f"Read in dataset with dimension: {adata.shape}")

In [None]:
# Load the metadata table; the first CSV row is the header and the
# first CSV column becomes the DataFrame index
meta_file_pathname = os.path.join(data_dir, "1872-BIOKEY_metaData_cohort1_web.csv")
cohort1_meta = pd.read_csv(
    meta_file_pathname,
    header=0,
    index_col=0,
)

In [None]:
# Attach the metadata to the cells. Assigning a DataFrame to adata.obs is
# positional (the DataFrame's index replaces the cell names), so reorder the
# metadata rows by cell name first whenever the names can be matched.
if cohort1_meta.index.is_unique and adata.obs_names.isin(cohort1_meta.index).all():
    adata.obs = cohort1_meta.loc[adata.obs_names].copy()
else:
    # NOTE(review): the metadata index does not cover adata.obs_names, so fall
    # back to the original positional assignment -- confirm that the CSV rows
    # are in the same order as the cells of the matrix.
    print('WARNING: metadata index does not match adata.obs_names; assigning metadata by position')
    adata.obs = cohort1_meta
print(adata.obs.columns)
print(adata.obs['patient_id'].unique())

In [None]:
adata.obs["timepoint"].unique()

In [None]:
# NOTE(review): despite its name, nPatients holds the array of unique patient
# IDs, not their count; use len(nPatients) when the number is needed.
nPatients = adata.obs['patient_id'].unique()

In [None]:
# Move the annotation from 'cellType' to 'cell_type': pop() removes the old
# column and hands it back, and the assignment stores it under the new name
adata.obs['cell_type'] = adata.obs.pop('cellType')

## 1. Basic Filtering

In [None]:
# Global scanpy settings for the rest of the notebook
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()             # print the versions of scanpy and its key dependencies
sc.settings.set_figure_params(dpi=80, facecolor='white')  # figure defaults: 80 dpi, white background

#### Removing cells expressing <500 || >5000 genes:

In [None]:
# Gene-count filter: keep only cells with at least 500 and at most 5000
# detected genes, applied as two calls with one threshold each
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_cells(adata, max_genes=5000)

In [None]:
# Report the matrix size and the obs columns after the gene-count filter
print(f'Dimention of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

#### Removing cells containing <400 || >25000 UMIs:

In [None]:
# UMI-count filter: keep only cells with at least 400 and at most 25000
# total counts, applied as two calls with one threshold each
sc.pp.filter_cells(adata, min_counts = 400)
sc.pp.filter_cells(adata, max_counts = 25000)

In [None]:
# Report the matrix size and the obs columns after the UMI-count filter
print(f'Dimention of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

# Preprocess with respect to genes (var)
#### Removing genes detected in fewer than 10 cells


In [None]:
# remove genes detected in fewer than 10 cells (min_cells=10)
# NOTE(review): an earlier note mentioned a "<3 cells" cutoff; the code uses 10 -- confirm the intended value
sc.pp.filter_genes(adata, min_cells= 10)
# make duplicated gene names unique
adata.var_names_make_unique()
adata.shape

### Remove cells with high percentage of mitochondrial genes

In [None]:
# flag mitochondrial genes (gene names starting with 'MT-')
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# compute per-cell QC metrics; with qc_vars=['mt'] this adds
# adata.obs['pct_counts_mt'], which is used for the filter below
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Keep cells with < 20% mitochondrial counts. The .copy() turns the subset
# into a real AnnData instead of a view, so the in-place normalisation that
# follows does not have to copy a view implicitly.
adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy()
adata.shape

### Attention: it appears that a total of 135311 cells have more than 20% mitochondrial (MT) counts. We need to check which cutoff percentage the paper used.

Plot statistics regarding cells

In [None]:
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.0, multi_panel=True)

## 2. Integrating Samples

#### Normalization & Logarithmization:

In [None]:
# Scale every cell to a total of 10,000 counts; the total of the first cell is
# printed before and after as a sanity check (!s keeps the str() rendering)
print(f'Before normalization, the sum of first row of X: {adata.X[0,:].sum()!s}')
sc.pp.normalize_total(adata, target_sum=1e4)
print(f'After normalization, the sum of first row of X: {adata.X[0,:].sum()!s}')


In [None]:
# Apply log(1 + x) to the normalised matrix; the total of the first cell is
# printed before and after as a sanity check (!s keeps the str() rendering)
print(f'Before log, the sum of first row of X: {adata.X[0,:].sum()!s}')
sc.pp.log1p(adata)
print(f'After log, the sum of first row of X: {adata.X[0,:].sum()!s}')

## Save a pre-processed version of the data

In [None]:
adata.obs

In [None]:
# Persist the filtered, normalised and log-transformed object next to the input data
filtered_path = os.path.join(data_dir, '1863-counts_cells_cohort1_filtered.h5ad')
adata.write(filtered_path)

In [None]:
# Shortcut: reload the filtered object written above instead of re-running the
# loading and filtering cells. The path is spelled out in full (it does not use
# data_dir), presumably so this cell can run on its own after a kernel restart.
adata = sc.read('/home/data/ICI_exprs/EGAS00001004809/1863-counts_cells_cohort1_filtered.h5ad')

In [None]:
print(adata)

In [None]:
adata.obs

## Keep high variance genes 

In [None]:
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)

### Keep track of the original adata in `.raw` and restrict adata.X to the highly variable genes only

In [None]:
# Freeze the current object (all genes, normalised and log-transformed) in .raw
# so that the full gene set stays available for plotting and marker scoring
adata.raw = adata
# Keep only the highly variable genes. The .copy() makes adata a real AnnData
# rather than a view, so PCA and the later steps can write into it directly.
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform PCA   
sc.tl.pca(adata, svd_solver='arpack', n_comps=30)

In [None]:
print(adata.obsm['X_pca'].shape)
print(adata.varm['PCs'].shape)
print(adata.uns['pca']['variance_ratio'].shape)
print(adata.obs.columns)

In [None]:
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=30)

In [None]:
# Use the Leiden algorithm to find clusters
sc.tl.leiden(adata, resolution=0.5)


In [None]:
# rank genes per Leiden cluster with a t-test (candidate marker genes for
# each cluster) and plot the 25 top-ranked genes of every cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
adata.obs.columns

## UMAP visualization of the clusters and major cell-type markers

In [None]:
# perform UMAP
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')



In [None]:
# color by gene expression of T cell markers
sc.pl.umap(adata, color= ['CD3D', 'CD3E', 'TRAC', 'TRBC1'])

In [None]:
# color by gene expression of B cell markers
sc.pl.umap(adata, color= ['CD19', 'CD79A', 'CD79B', 'MS4A1'])


In [None]:
# color by gene expression of myeloid cell markers
sc.pl.umap(adata, color= ['CD14', 'CD68', 'CD163', 'CD33'])

In [None]:
# color by gene expression of epi cell markers
sc.pl.umap(adata, color= ['EPCAM', 'CD24','KRT19', 'KRT7'])

In [None]:
# color by gene expression of fibroblast cell markers   
sc.pl.umap(adata, color= ['COL1A1', 'COL1A2', 'COL3A1', 'ACTA2'])

In [None]:
# color by gene expression of endothelial cell markers  
sc.pl.umap(adata, color= ['PECAM1', 'CD34', 'VWF', 'CDH5'])

## Label cells based on cell markers

In [None]:
# Marker genes of the major cell types; used for the dot plot below and by
# labelClusterWithCellType to name each Leiden cluster
cell_type_markers = {
    'T cells': ['CD3D', 'CD3E', 'TRAC', 'TRBC1'],
    'B cells': ['CD79A', 'CD79B', 'MS4A1', 'TNFRSF17', 'MZB1'],
    'Myeloid': ['CD14', 'CD68'],
    'Epithelial': ['EPCAM', 'CD24'],
    'Fibroblast': ['COL1A2', 'COL3A1', 'MYH11', 'ACTA2'],
    'Endothelial': ['VWF', 'PECAM1']
}
# pre-compute the dendrogram of the Leiden clusters used by dendrogram=True below
sc.tl.dendrogram(adata, groupby='leiden')
# dot plot of the marker genes (grouped by cell type) across the Leiden clusters
sc.pl.dotplot(adata, cell_type_markers, 'leiden', dendrogram=True)
# NOTE(review): disabled plot; 'clusters' is not a column created in this notebook (clusters are in 'leiden')
#sc.pl.umap(adata, color='clusters', legend_loc='on data')


### Function to annotate clusters with cell type

In [None]:
def labelClusterWithCellType(adata, cell_type_markers, cluster_column='leiden'):
    '''
    Label each cluster with the cell type whose marker genes are most widely detected in it.

    For every cluster and every cell type the score is the fraction of the cluster's
    cells with non-zero expression of a marker gene, averaged over the markers of that
    cell type. The best-scoring cell type is written to adata.obs['cell_type'] for all
    cells of the cluster, and one summary line per cluster is printed.

    Expression is read from adata.raw when it is set, otherwise from adata itself.
    Marker names that are not found among the gene names are ignored; a cell type
    without any usable marker scores 0.

    Parameters:
        adata: AnnData object (modified in place)
        cell_type_markers: a dictionary where the key is the cell type and the value is a list of markers for that cell type
        cluster_column: the column in adata.obs that contains the cluster labels

    Returns:
        None. adata.obs gets a 'cell_type' column (an existing one is overwritten,
        whatever its dtype) holding the cell type label of each cell.

    Raises:
        ValueError: if cell_type_markers is empty.
    '''
    # local import: the notebook only imports scipy.io at file level
    from scipy import sparse

    if not cell_type_markers:
        raise ValueError('cell_type_markers must contain at least one cell type')

    # prefer the full gene set kept in .raw; fall back to the object itself
    raw = getattr(adata, 'raw', None)
    source = adata if raw is None else raw
    expr = source.X
    gene_names = source.var_names

    # The cells x markers detection matrices do not depend on the cluster,
    # so build them once per cell type instead of once per cluster.
    detected_by_type = {}
    for cell_type, marker_genes in cell_type_markers.items():
        marker_expr = expr[:, gene_names.isin(marker_genes)]
        if sparse.issparse(marker_expr):
            marker_expr = marker_expr.toarray()
        detected_by_type[cell_type] = np.asarray(marker_expr) > 0

    cluster_labels = adata.obs[cluster_column]

    # Collect the labels in a plain object array and assign the column in one go;
    # this also works when an existing 'cell_type' column is categorical.
    # Cells that belong to no cluster keep their previous value (or NaN).
    if 'cell_type' in adata.obs.columns:
        assigned = adata.obs['cell_type'].astype(object).to_numpy().copy()
    else:
        assigned = np.full(adata.obs.shape[0], np.nan, dtype=object)

    for cluster_id in cluster_labels.unique():
        in_cluster = np.asarray(cluster_labels == cluster_id, dtype=bool)
        n_cells = int(in_cluster.sum())
        if n_cells == 0:
            # e.g. a missing (NaN) cluster label, which never compares equal
            continue

        # score of every cell type for this cluster
        scores = {}
        for cell_type, detected in detected_by_type.items():
            n_markers = detected.shape[1]
            if n_markers == 0:
                scores[cell_type] = 0.0
                continue
            # per marker: number of cluster cells expressing it, divided by the marker count
            marker_share = detected[in_cluster].sum(axis=0) / n_markers
            scores[cell_type] = marker_share.sum() / n_cells

        # ties go to the cell type listed first in cell_type_markers
        max_type = max(scores, key=scores.get)
        print('Cluster ' + str(cluster_id) + ' is most likely ' + max_type + ' with ' + str(scores[max_type]) + ' overlap')
        assigned[in_cluster] = max_type

    adata.obs['cell_type'] = assigned
        


In [None]:
# Discard the existing 'cell_type' annotation so that labelClusterWithCellType
# can write a fresh, marker-based 'cell_type' column
adata.obs.drop(columns="cell_type", inplace=True)

In [None]:
labelClusterWithCellType(adata, cell_type_markers)

In [None]:
# UMAP coloured by the marker-based cell type assigned to each cluster
sc.pl.umap(adata, color='cell_type')

# UMAP coloured by cluster; the Leiden labels live in adata.obs['leiden']
# (no 'clusters' column is created in this notebook)
sc.pl.umap(adata, color='leiden', legend_loc='on data')

## Extract T cells and create a new AnnData object

In [None]:
## Extract T cells and create a new AnnData object
adata_T = adata[adata.obs['cell_type'] == 'T cells'].copy()
adata_T.shape

In [None]:
# Rebuild the T-cell object on the full gene set: X and var come from .raw,
# while obs, obsm and uns are carried over from the subset.
# NOTE(review): .raw was assigned from the normalised, log-transformed adata,
# so raw.X presumably holds log-normalised values rather than raw counts -- confirm.
adata_T  = ad.AnnData(X=adata_T.raw.X, obs=adata_T.obs, var=adata_T.raw.var, obsm=adata_T.obsm, uns=adata_T.uns)
# keep the full-gene matrix in .raw of the new object as well
adata_T.raw = adata_T
# save the T-cell subset before it is re-processed
adata_T.write(data_dir + '1863-counts_cells_cohort1_T_cells.h5ad')
print(str(adata_T.shape))


Re-select high variance genes

In [None]:
# re-caculate high variance genes
sc.pp.highly_variable_genes(adata_T, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_T)
len(adata_T.var_names)

### Recluster T cells and find where PD-1 and its potential targets are expressed

In [None]:
# cluster T cells 
sc.tl.pca(adata_T, svd_solver='arpack', n_comps=40)
sc.pp.neighbors(adata_T, n_neighbors=80, n_pcs=40)

In [None]:
sc.tl.leiden(adata_T, resolution=.25)
sc.tl.umap(adata_T)
sc.pl.umap(adata_T, color='leiden',legend_loc='on data')

### Label TNK subtypes

In [None]:
# Sub-type name -> marker genes used to label the re-clustered T/NK cells
T_cell_makers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': ['IL2', 'GZMA', 'GNLY', 'PRF1', 'GZMB', 'GZMK', 'IFNG', 'NKG7'],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Gamma-delta': ['TRGC1', 'TRGC2', 'TRDC'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' left out
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

### Plot

In [None]:
sc.tl.dendrogram(adata_T, groupby='leiden')
sc.pl.dotplot(adata_T, T_cell_makers, 'leiden', dendrogram=True)

In [None]:
for cell_type, markers in T_cell_makers.items():
    print (cell_type, ":", markers)
    sc.pl.umap(adata_T, color=markers)

In [None]:
adata_T.obs.drop(columns="cell_type", inplace = True )
labelClusterWithCellType(adata_T, T_cell_makers, cluster_column='leiden')
sc.pl.umap(adata_T, color='cell_type')

Plot PD-1 and potential target genes

In [None]:
sc.pl.umap(adata_T, color= ['PDCD1', 'CXCL13', 'HAVCR2','CTLA4'])

The enriched genes for each cluster

In [None]:
# rank genes per T-cell Leiden cluster with a t-test (candidate marker genes
# for each cluster) and plot the 25 top-ranked genes of every cluster
sc.tl.rank_genes_groups(adata_T, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata_T, n_genes=25, sharey=False)

### Examine the distribution of PDCD1 and potential target genes

In [None]:
# extract cells belonging to cluster 3, 4
adata_T_34 = adata_T[adata_T.obs['leiden'].isin(['3', '4'])].copy()

In [None]:
# compare the distributions of PDCD1, CXCL13, CCL3, HAVCR2 and CTLA4 in the
# cluster 3/4 subset, with one violin group per value of 'timepoint'
sc.pl.violin(adata_T_34, ['PDCD1', 'CXCL13', 'CCL3', 'HAVCR2','CTLA4'], groupby='timepoint')


In [None]:
# plot the umap with the timepoint
sc.pl.umap(adata_T_34, color=['timepoint', 'PDCD1', 'CXCL13', 'HAVCR2','CTLA4'])

In [None]:
# show the distribution of the timepoint
sc.pl.umap(adata_T, color = ['timepoint'])

In [None]:
adata_T.obs.columns

## Extract Myeloid and B cells  

## myeloid

In [None]:
adata.obs['cell_type']

In [None]:
## Extract myeloid cells and create a new AnnData object
adata_M = adata[adata.obs['cell_type'] == 'Myeloid'].copy()


In [None]:
# Rebuild the myeloid object on the full gene set: X and var come from .raw,
# while obs, obsm and uns are carried over from the subset.
# NOTE(review): raw.X presumably holds log-normalised values, not raw counts -- confirm.
adata_M  = ad.AnnData(X=adata_M.raw.X, obs=adata_M.obs, var=adata_M.raw.var, obsm=adata_M.obsm, uns=adata_M.uns)
# keep the full-gene matrix in .raw of the new object as well
adata_M.raw = adata_M
print(str(adata_M.shape))

# the shape again, shown as the cell output
adata_M.shape

re-calculate high variance genes

In [None]:
# re-caculate high variance genes-- from Dr. Lu's EGAS00001004809

sc.pp.highly_variable_genes(adata_M, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_M)
len(adata_M.var_names)

re-clustering Myeloid

In [None]:
# cluster myeloid 
sc.tl.pca(adata_M, svd_solver='arpack', n_comps=40)
sc.pp.neighbors(adata_M, n_neighbors=80, n_pcs=40)

In [None]:
sc.tl.leiden(adata_M, resolution=.5)
sc.tl.umap(adata_M)
sc.pl.umap(adata_M, color='leiden',legend_loc='on data')

In [None]:
# myeloid_markers = {
#     'Monocyte': ['CD14', 'CD16', 'FCGR3A', 'CSF1R'],
#     'Macrophage': ['CD68', 'CD163', 'CD206', 'CD11b', 'CD11c'],
#     'Dendritic': ['CD11c', 'HLA-DR', 'CD86', 'CD209'],
#     'Neutrophil': ['CD66b', 'CD16', 'CD11b', 'MPO', 'CXCR2'],
#     'Mast Cell': ['FCER1A', 'TPSAB1', 'TPSB2', 'CPA3'],
#     'Eosinophil': ['SIGLEC8', 'CCL26', 'PRG2', 'RNASE2'],
#     'Basophil': ['CD203c', 'TPT1', 'HLA-DR'],
#     'Monocyte-derived DC': ['CD1c', 'CD1a', 'CD14', 'CD16', 'CD11c', 'CD11b'],
#     'Plasmacytoid DC': ['CD303', 'CD304', 'IL3RA', 'HLA-DR'],
#     'Osteoclast': ['ACP5', 'CALCR', 'CSTK', 'CD14', 'CD68'],
#     'Microglia': ['CD11b', 'CD45', 'TMEM119', 'P2RY12'],
#     'Histiocyte': ['CD163', 'CD68', 'CD14'],
#     'MDC-1': ['CLEC9A', 'CD1c', 'CD141', 'CD11c', 'HLA-DR'],
#     'MDC-2': ['CD1a', 'CD207', 'CD11c', 'HLA-DR'],
#     'Langerhans': ['CD207', 'CD1a', 'CD207', 'CD11c', 'HLA-DR']
# }

# Sub-type name -> marker genes for the myeloid compartment.
# Markers are matched against the gene names of the data, so CD/protein aliases
# were replaced by gene symbols (CD206 -> MRC1, CD11b -> ITGAM, CD11c -> ITGAX,
# HLA-DR -> HLA-DRA, CD203c -> ENPP3, CD303 -> CLEC4C, CD304 -> NRP1,
# CD141 -> THBD, CD1a/CD1c -> CD1A/CD1C); aliases never match and were ignored.
# NOTE(review): 'CD16' was mapped to FCGR3A, except for neutrophils where the
# FCGR3B isoform was chosen -- confirm.
myeloid_markers = {
    'Monocyte': ['CD14', 'FCGR3A', 'CSF1R'],
    'Macrophage': ['CD68', 'CD163', 'MRC1', 'ITGAM'],
    'Dendritic': ['HLA-DRA', 'CD86', 'CD209'],
    'Neutrophil': ['FCGR3B', 'ITGAM', 'CXCR2'],
    'Mast Cell': ['FCER1A'],
    'Eosinophil': ['CCL26', 'PRG2'],
    'Basophil': ['ENPP3'],
    'Monocyte-derived DC': ['CD1C', 'CD1A', 'CD14', 'FCGR3A', 'ITGAX', 'ITGAM'],
    'Plasmacytoid DC': ['CLEC4C', 'NRP1', 'IL3RA', 'HLA-DRA'],
    'Histiocyte': ['CD163', 'CD68', 'CD14'],
    'MDC-1': ['CLEC9A', 'CD1C', 'THBD', 'ITGAX', 'HLA-DRA'],
    'MDC-2': ['CD1A', 'CD207', 'ITGAX', 'HLA-DRA'],
    'Langerhans': ['CD207', 'CD1A', 'ITGAX', 'HLA-DRA']
}



In [None]:
adata_M.obs.drop(columns="cell_type", inplace = True )
labelClusterWithCellType(adata_M, myeloid_markers, cluster_column='leiden')
sc.pl.umap(adata_M, color='cell_type')

In [None]:
sc.pl.umap(adata_M, color=['timepoint'], palette = 'Set1')

In [None]:
sc.pl.umap(adata_M, color=['timepoint', 'cell_type'])


In [None]:
adata_M.obs

In [None]:
adata_M.var

Save the annotated myeloid subset

In [None]:
# save adata_M to a file 

adata_M.write('/home/data/ICI_exprs/ICI_Myeloid_cell_collection/1863-counts_cells_cohort1_M_cells.h5ad')

## B-cells

In [None]:
## Extract B cells and create a new AnnData object
adata_B = adata[adata.obs['cell_type'] == 'B cells'].copy()

# Rebuild the B-cell object on the full gene set kept in .raw, as is done for
# the T-cell, myeloid and epithelial subsets; without this step the B cells
# would be re-processed on the highly variable genes of the whole dataset only.
adata_B = ad.AnnData(X=adata_B.raw.X, obs=adata_B.obs, var=adata_B.raw.var, obsm=adata_B.obsm, uns=adata_B.uns)
adata_B.raw = adata_B
print(str(adata_B.shape))

re-calculate high variance genes

In [None]:
# re-caculate high variance genes-- from Dr. Lu's EGAS00001004809

sc.pp.highly_variable_genes(adata_B, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_B)
len(adata_B.var_names)

re-clustering B-cells

In [None]:
# cluster B cells 
sc.tl.pca(adata_B, svd_solver='arpack', n_comps=40)
sc.pp.neighbors(adata_B, n_neighbors=80, n_pcs=40)

In [None]:
sc.tl.leiden(adata_B, resolution=.5)
sc.tl.umap(adata_B)
sc.pl.umap(adata_B, color='leiden',legend_loc='on data')

In [None]:
# Sub-type name -> marker genes for the B-cell compartment.
# Markers are matched against the gene names of the data, so CD/protein aliases
# were replaced by gene symbols (CD20 -> MS4A1, CD138 -> SDC1, CD21 -> CR2,
# CD23 -> FCER2, CD10 -> MME, CD35 -> CR1, CD43 -> SPN, CD1d -> CD1D,
# IgM -> IGHM, IgD -> IGHD). 'MUM1' is an alias of IRF4, which is already
# listed, so it was dropped; duplicates created by the mapping were removed.
b_cell_markers = {
    'B_cell' : ['CD19', 'CD79A', 'CD79B', 'MS4A1', 'IGHM', 'IGLC2', 'IGLC3', 'IGHG1'],
    'Plasma_cell' : ['CD38', 'SDC1', 'XBP1', 'PRDM1', 'IRF4'],
    'Memory_B_cell' : ['CD27', 'CR2', 'FCER2', 'CD24', 'CD5'],
    'Naive_B_cell' : ['CD27', 'CR2', 'FCER2', 'CD24', 'CD5', 'CD38'],
    'Germinal_center_B_cell' : ['BCL6', 'PAX5', 'MME', 'CD38'],
    'Follicular_B_cell' : ['CR2', 'CR1', 'CXCR4', 'FCER2'],
    'Marginal_zone_B_cell' : ['CD27', 'CR2', 'CR1', 'IGHM', 'IGHD'],
    'B1_cell' : ['MS4A1', 'SPN', 'CD5', 'IGHM', 'IGHD'],
    'B_regulatory_cell' : ['CD19', 'MS4A1', 'CD24', 'CD38', 'CD5', 'CD27', 'CD1D', 'CR2'],
    'Plasmablast' : ['CD38', 'SDC1', 'IRF4', 'XBP1', 'PRDM1'],
    'Transitional_B_cell' : ['MME', 'CD24', 'CD38', 'CR2', 'FCER2'],
    'IgM_B_cell' : ['IGHM'],
    'IgD_B_cell' : ['IGHD']
}


In [None]:
adata_B.obs.drop(columns="cell_type", inplace = True )
labelClusterWithCellType(adata_B, b_cell_markers, cluster_column='leiden')
sc.pl.umap(adata_B, color='cell_type')

In [None]:
sc.pl.umap(adata_B, color=['timepoint'], palette = 'Set1')

In [None]:
sc.pl.umap(adata_B, color=['timepoint', 'cell_type'])


In [None]:
adata_B.shape

In [None]:
adata_B.obs

In [None]:
adata_B.var

In [None]:
# save adata_B to a file
# (path now starts with /home like every other data path in this notebook)

adata_B.write('/home/data/ICI_exprs/ICI_B_cell_collection/1863-counts_cells_cohort1_B_cells.h5ad')

# epithelial cells

In [None]:
## Extract epithelial cells and create a new AnnData object
adata_Ep = adata[adata.obs['cell_type'] == 'Epithelial'].copy()


In [None]:
# Rebuild the epithelial object on the full gene set: X and var come from .raw,
# while obs, obsm and uns are carried over from the subset.
# NOTE(review): raw.X presumably holds log-normalised values, not raw counts -- confirm.
adata_Ep  = ad.AnnData(X=adata_Ep.raw.X, obs=adata_Ep.obs, var=adata_Ep.raw.var, obsm=adata_Ep.obsm, uns=adata_Ep.uns)
# keep the full-gene matrix in .raw of the new object as well
adata_Ep.raw = adata_Ep
print(str(adata_Ep.shape))

# the shape again, shown as the cell output
adata_Ep.shape

re-calculate high variance genes

In [None]:
# re-caculate high variance genes-- from Dr. Lu's EGAS00001004809

sc.pp.highly_variable_genes(adata_Ep, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_Ep)
len(adata_Ep.var_names)

re-clustering epithelial

In [None]:
# cluster epithelial 
sc.tl.pca(adata_Ep, svd_solver='arpack', n_comps=40)
sc.pp.neighbors(adata_Ep, n_neighbors=80, n_pcs=40)

In [None]:
sc.tl.leiden(adata_Ep, resolution=.5)
sc.tl.umap(adata_Ep)
sc.pl.umap(adata_Ep, color='leiden',legend_loc='on data')

In [None]:
# Sub-type name -> marker genes for the epithelial compartment.
# Markers are matched against the gene names of the data, so only gene symbols
# can match: 'Foxj1' was upper-cased to FOXJ1, and the aliases 'CC10'
# (= SCGB1A1) and 'CD56' (= NCAM1) were dropped because the gene symbol is
# already listed for the same sub-type.
epithelial_markers = {
    'Epithelial_cell': ['EPCAM', 'KRT5', 'KRT8', 'KRT18', 'KRT19'],
    'Basal_cell': ['KRT5', 'CD44', 'ITGA6', 'TP63'],
    'Luminal_cell': ['KRT8', 'KRT18', 'KRT19', 'CD24', 'CDH1', 'ESR1'],
    'Club_cell': ['SCGB1A1', 'UTP4', 'FOXJ1'],
    'Ciliated_cell': ['FOXJ1', 'DNAH5', 'CCNO', 'RSPH4A'],
    'Goblet_cell': ['MUC5AC', 'MUC2', 'TFF3', 'SPDEF'],
    'Neuroendocrine_cell': ['CHGA', 'SYP', 'NCAM1'],
    'Secretory_cell': ['MUC5B', 'LYZ', 'SPINK4', 'SCGB3A2'],
    'Transitional_cell': ['KRT5', 'KRT8', 'KRT18', 'KRT19', 'KRT14']
}


In [None]:
adata_Ep.obs.drop(columns="cell_type", inplace = True )
labelClusterWithCellType(adata_Ep, epithelial_markers, cluster_column='leiden')
sc.pl.umap(adata_Ep, color='cell_type')

In [None]:
sc.pl.umap(adata_Ep, color=['timepoint'], palette = 'Set1')

In [None]:
sc.pl.umap(adata_Ep, color=['timepoint', 'cell_type'])


In [None]:
adata_Ep.obs

In [None]:
adata_Ep.var

Save the annotated epithelial subset

In [None]:
# save adata_Ep to a file 

adata_Ep.write('/home/data/ICI_exprs/ICI_Epi_cell_collection/1863-counts_cells_cohort1_Epithelial_cells.h5ad')

# function for extracting cell types

In [None]:
def analyze_cell_type(adata, cell_type, markers, adata_name,
                      n_comps=40, n_neighbors=80, resolution=.5):
    '''
    Extract one cell type from adata, re-cluster it and label the sub-clusters.

    The cells whose adata.obs['cell_type'] equals cell_type are copied into a new
    AnnData that is rebuilt from the subset's .raw (full gene set). Highly variable
    genes are re-selected, PCA / neighbours / Leiden / UMAP are run, and every Leiden
    cluster is relabelled by labelClusterWithCellType using `markers`. Diagnostic
    plots are drawn along the way.

    Parameters:
        adata: AnnData with a 'cell_type' column in .obs and .raw set; it is not modified
        cell_type: the value of adata.obs['cell_type'] to extract
        markers: a dictionary where the key is the sub-type and the value is a list of marker genes
        adata_name: name of the global variable that receives the resulting AnnData
        n_comps: number of principal components to compute and to use for the neighbour graph
        n_neighbors: neighbourhood size passed to sc.pp.neighbors
        resolution: resolution passed to sc.tl.leiden

    Returns:
        The AnnData of the extracted cell type; it is also stored in globals()[adata_name],
        so existing callers that ignore the return value keep working.

    Raises:
        ValueError: if no cell carries the requested cell_type label.
    '''

    # set Scanpy plotting parameters
    sc.set_figure_params()

    # select the requested cells; fail early with a clear message instead of
    # letting the clustering steps fail on an empty object
    is_requested = adata.obs['cell_type'] == cell_type
    if not is_requested.any():
        raise ValueError("no cells with cell_type == " + repr(cell_type) + " in adata.obs['cell_type']")

    # copy only the selected cells (the full dataset itself is never modified,
    # so it does not need to be copied first)
    subset = adata[is_requested].copy()

    # restore the X to the original raw.X for re-processing
    adata_type = ad.AnnData(X=subset.raw.X, obs=subset.obs, var=subset.raw.var, obsm=subset.obsm, uns=subset.uns)
    adata_type.raw = adata_type
    print(str(adata_type.shape))

    # re-calculate highly variable genes
    sc.pp.highly_variable_genes(adata_type, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata_type)

    # re-cluster the specified cell type
    sc.tl.pca(adata_type, svd_solver='arpack', n_comps=n_comps)
    sc.pp.neighbors(adata_type, n_neighbors=n_neighbors, n_pcs=n_comps)
    sc.tl.leiden(adata_type, resolution=resolution)

    sc.tl.umap(adata_type)
    sc.pl.umap(adata_type, color='leiden', legend_loc='on data')

    # replace the inherited major cell type label by sub-type labels
    # derived from the marker dictionary
    adata_type.obs.drop(columns="cell_type", inplace=True)
    labelClusterWithCellType(adata_type, markers, cluster_column='leiden')

    # UMAP coloured by the new labels, alone and next to the time point
    sc.pl.umap(adata_type, color='cell_type')
    sc.pl.umap(adata_type, color=['timepoint', 'cell_type'])

    # publish the result under the requested global name (kept for the
    # notebook cells that rely on it) and also hand it back to the caller
    globals()[adata_name] = adata_type
    return adata_type


In [None]:
adata.obs['cell_type']

T-cells

In [None]:
# T cells: extract, re-cluster and label the sub-types

cell_type = 'T cells'

# sub-type name -> marker genes
markers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': ['IL2', 'GZMA', 'GNLY', 'PRF1', 'GZMB', 'GZMK', 'IFNG', 'NKG7'],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Gamma-delta': ['TRGC1', 'TRGC2', 'TRDC'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' left out
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

# the result is stored in the global variable named here
adata_name = 'adata_Tcell'

analyze_cell_type(adata, cell_type, markers, adata_name)

In [None]:
adata_Tcell.obs

# myeloid cells

In [None]:
# myeloid: extract, re-cluster and label the sub-types

cell_type = 'Myeloid'

# Sub-type name -> marker genes. Markers are matched against the gene names of
# the data, so CD/protein aliases were replaced by gene symbols
# (CD206 -> MRC1, CD11b -> ITGAM, CD11c -> ITGAX, HLA-DR -> HLA-DRA,
# CD203c -> ENPP3, CD303 -> CLEC4C, CD304 -> NRP1, CD141 -> THBD,
# CD1a/CD1c -> CD1A/CD1C); aliases never match and were silently ignored.
# NOTE(review): 'CD16' was mapped to FCGR3A, except for neutrophils where the
# FCGR3B isoform was chosen -- confirm.
markers = {
    'Monocyte': ['CD14', 'FCGR3A', 'CSF1R'],
    'Macrophage': ['CD68', 'CD163', 'MRC1', 'ITGAM'],
    'Dendritic': ['HLA-DRA', 'CD86', 'CD209'],
    'Neutrophil': ['FCGR3B', 'ITGAM', 'CXCR2'],
    'Mast Cell': ['FCER1A'],
    'Eosinophil': ['CCL26', 'PRG2'],
    'Basophil': ['ENPP3'],
    'Monocyte-derived DC': ['CD1C', 'CD1A', 'CD14', 'FCGR3A', 'ITGAX', 'ITGAM'],
    'Plasmacytoid DC': ['CLEC4C', 'NRP1', 'IL3RA', 'HLA-DRA'],
    'Histiocyte': ['CD163', 'CD68', 'CD14'],
    'MDC-1': ['CLEC9A', 'CD1C', 'THBD', 'ITGAX', 'HLA-DRA'],
    'MDC-2': ['CD1A', 'CD207', 'ITGAX', 'HLA-DRA'],
    'Langerhans': ['CD207', 'CD1A', 'ITGAX', 'HLA-DRA']
}


# the result is stored in the global variable named here
adata_name = 'adata_My'

analyze_cell_type(adata, cell_type, markers, adata_name)

In [None]:
adata_My.shape

In [None]:
adata_My.obs

# B-cells

In [None]:
# B cells: extract, re-cluster and label the sub-types

cell_type = 'B cells'

# Sub-type name -> marker genes. Markers are matched against the gene names of
# the data, so CD/protein aliases were replaced by gene symbols
# (CD20 -> MS4A1, CD138 -> SDC1, CD21 -> CR2, CD23 -> FCER2, CD10 -> MME,
# CD35 -> CR1, CD43 -> SPN, CD1d -> CD1D, IgM -> IGHM, IgD -> IGHD).
# 'MUM1' is an alias of IRF4, which is already listed, so it was dropped;
# duplicates created by the mapping were removed.
markers = {
    'B_cell' : ['CD19', 'CD79A', 'CD79B', 'MS4A1', 'IGHM', 'IGLC2', 'IGLC3', 'IGHG1'],
    'Plasma_cell' : ['CD38', 'SDC1', 'XBP1', 'PRDM1', 'IRF4'],
    'Memory_B_cell' : ['CD27', 'CR2', 'FCER2', 'CD24', 'CD5'],
    'Naive_B_cell' : ['CD27', 'CR2', 'FCER2', 'CD24', 'CD5', 'CD38'],
    'Germinal_center_B_cell' : ['BCL6', 'PAX5', 'MME', 'CD38'],
    'Follicular_B_cell' : ['CR2', 'CR1', 'CXCR4', 'FCER2'],
    'Marginal_zone_B_cell' : ['CD27', 'CR2', 'CR1', 'IGHM', 'IGHD'],
    'B1_cell' : ['MS4A1', 'SPN', 'CD5', 'IGHM', 'IGHD'],
    'B_regulatory_cell' : ['CD19', 'MS4A1', 'CD24', 'CD38', 'CD5', 'CD27', 'CD1D', 'CR2'],
    'Plasmablast' : ['CD38', 'SDC1', 'IRF4', 'XBP1', 'PRDM1'],
    'Transitional_B_cell' : ['MME', 'CD24', 'CD38', 'CR2', 'FCER2'],
    'IgM_B_cell' : ['IGHM'],
    'IgD_B_cell' : ['IGHD']
}



# the result is stored in the global variable named here
adata_name = 'adata_Bcell'

analyze_cell_type(adata, cell_type, markers, adata_name)

In [None]:
adata_Bcell.shape

In [None]:
adata_Bcell.obs

# epithelial cells

In [None]:
# epithelial: extract, re-cluster and label the sub-types

cell_type = 'Epithelial'

# Sub-type name -> marker genes. Markers are matched against the gene names of
# the data, so only gene symbols can match: 'Foxj1' was upper-cased to FOXJ1,
# and the aliases 'CC10' (= SCGB1A1) and 'CD56' (= NCAM1) were dropped because
# the gene symbol is already listed for the same sub-type.
markers = {
    'Epithelial_cell': ['EPCAM', 'KRT5', 'KRT8', 'KRT18', 'KRT19'],
    'Basal_cell': ['KRT5', 'CD44', 'ITGA6', 'TP63'],
    'Luminal_cell': ['KRT8', 'KRT18', 'KRT19', 'CD24', 'CDH1', 'ESR1'],
    'Club_cell': ['SCGB1A1', 'UTP4', 'FOXJ1'],
    'Ciliated_cell': ['FOXJ1', 'DNAH5', 'CCNO', 'RSPH4A'],
    'Goblet_cell': ['MUC5AC', 'MUC2', 'TFF3', 'SPDEF'],
    'Neuroendocrine_cell': ['CHGA', 'SYP', 'NCAM1'],
    'Secretory_cell': ['MUC5B', 'LYZ', 'SPINK4', 'SCGB3A2'],
    'Transitional_cell': ['KRT5', 'KRT8', 'KRT18', 'KRT19', 'KRT14']
}


# the result is stored in the global variable named here
adata_name = 'adata_Epi'

analyze_cell_type(adata, cell_type, markers, adata_name)

In [None]:
adata_Epi.obs

# endothelial cells

In [None]:
def analyze_cell_type(adata, cell_type, markers, adata_name,
                      n_comps=40, n_neighbors=80, resolution=.5):
    '''
    Extract one cell type from adata, re-cluster it and label the sub-clusters.

    The cells whose adata.obs['cell_type'] equals cell_type are copied into a new
    AnnData that is rebuilt from the subset's .raw (full gene set). Highly variable
    genes are re-selected, PCA / neighbours / Leiden / UMAP are run, and every Leiden
    cluster is relabelled by labelClusterWithCellType using `markers`. Diagnostic
    plots are drawn along the way.

    Parameters:
        adata: AnnData with a 'cell_type' column in .obs and .raw set; it is not modified
        cell_type: the value of adata.obs['cell_type'] to extract
        markers: a dictionary where the key is the sub-type and the value is a list of marker genes
        adata_name: name of the global variable that receives the resulting AnnData
        n_comps: number of principal components to compute and to use for the neighbour graph
        n_neighbors: neighbourhood size passed to sc.pp.neighbors
        resolution: resolution passed to sc.tl.leiden

    Returns:
        The AnnData of the extracted cell type; it is also stored in globals()[adata_name],
        so existing callers that ignore the return value keep working.

    Raises:
        ValueError: if no cell carries the requested cell_type label.
    '''

    # set Scanpy plotting parameters
    sc.set_figure_params()

    # select the requested cells; fail early with a clear message instead of
    # letting the clustering steps fail on an empty object
    is_requested = adata.obs['cell_type'] == cell_type
    if not is_requested.any():
        raise ValueError("no cells with cell_type == " + repr(cell_type) + " in adata.obs['cell_type']")

    # copy only the selected cells (the full dataset itself is never modified,
    # so it does not need to be copied first)
    subset = adata[is_requested].copy()

    # restore the X to the original raw.X for re-processing
    adata_type = ad.AnnData(X=subset.raw.X, obs=subset.obs, var=subset.raw.var, obsm=subset.obsm, uns=subset.uns)
    adata_type.raw = adata_type
    print(str(adata_type.shape))

    # re-calculate highly variable genes
    sc.pp.highly_variable_genes(adata_type, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata_type)

    # re-cluster the specified cell type
    sc.tl.pca(adata_type, svd_solver='arpack', n_comps=n_comps)
    sc.pp.neighbors(adata_type, n_neighbors=n_neighbors, n_pcs=n_comps)
    sc.tl.leiden(adata_type, resolution=resolution)

    sc.tl.umap(adata_type)
    sc.pl.umap(adata_type, color='leiden', legend_loc='on data')

    # replace the inherited major cell type label by sub-type labels
    # derived from the marker dictionary
    adata_type.obs.drop(columns="cell_type", inplace=True)
    labelClusterWithCellType(adata_type, markers, cluster_column='leiden')

    # UMAP coloured by the new labels, alone and next to the time point
    sc.pl.umap(adata_type, color='cell_type')
    sc.pl.umap(adata_type, color=['timepoint', 'cell_type'])

    # publish the result under the requested global name (kept for the
    # notebook cells that rely on it) and also hand it back to the caller
    globals()[adata_name] = adata_type
    return adata_type


In [None]:
# Rename 'cellType' to 'cell_type' in place; with pandas' default settings this
# does nothing when adata.obs has no 'cellType' column
adata.obs.rename(columns={'cellType': 'cell_type'}, inplace=True)


In [None]:
# endothelial: extract, re-cluster and label the sub-types

# NOTE(review): the marker-based labels assigned earlier in this notebook use
# 'Endothelial'; 'Endothelial_cell' only matches if adata.obs['cell_type']
# holds a different annotation at this point -- confirm which one is intended.
cell_type = 'Endothelial_cell'

# Sub-type name -> marker genes. Markers are matched against the gene names of
# the data, so protein/CD aliases were replaced by gene symbols
# (EphrinB2 -> EFNB2, NG2 -> CSPG4, FSP1 -> S100A4, CD206 -> MRC1,
# CD11b -> ITGAM, CD66b -> CEACAM8, CD138 -> SDC1). 'COUP-TFII' (= NR2F2) and
# 'CD56' (= NCAM1) were dropped because the gene symbol is already listed.
markers = {
    'Endothelial_cell': ['PECAM1', 'VWF', 'CD34', 'CDH5'],
    'Arterial_cell': ['DLL4', 'NOTCH1', 'EFNB2', 'HEY2'],
    'Venous_cell': ['NR2F2', 'EFNB2', 'PROX1'],
    'Lymphatic_cell': ['LYVE1', 'PROX1', 'PDPN', 'FLT4'],
    'Capillary_cell': ['CDH5', 'CD34', 'PLVAP', 'CLDN5'],
    'Pericyte': ['PDGFRB', 'CSPG4', 'ACTA2', 'PDGFB'],
    'Smooth_muscle_cell': ['ACTA2', 'TAGLN', 'CNN1', 'MYH11'],
    'Fibroblast': ['COL1A1', 'COL3A1', 'DCN', 'S100A4'],
    'Macrophage': ['CD68', 'CD163', 'MRC1', 'ITGAM'],
    'Mast_cell': ['FCER1A', 'TPSAB1', 'TPSB2', 'KIT'],
    'Neutrophil': ['CEACAM8', 'MPO', 'ELANE', 'ITGAM'],
    'NK_cell': ['NCAM1', 'KLRD1', 'KLRF1'],
    'Plasma_cell': ['CD38', 'SDC1', 'XBP1', 'PRDM1'],
}

# the result is stored in the global variable named here
adata_name = 'adata_End'

analyze_cell_type(adata, cell_type, markers, adata_name)

In [None]:
adata_End.obs

In [None]:
adata_End.write('/home/data/ICI_exprs/ICI_End_cell_collection/1863-counts_cells_cohort1_End_cells_raw.h5ad')

# fibroblast

In [None]:
# fibroblast: extract, re-cluster and label the sub-types

cell_type = 'Fibroblast'

# Sub-type name -> marker genes. Markers are matched against the gene names of
# the data, so the aliases CD90 and CD106 were replaced by their gene symbols
# THY1 and VCAM1.
fibroblast_markers = {
    'Fibroblast': ['COL1A1', 'COL1A2', 'ACTA2', 'FN1', 'POSTN', 'DCN', 'VIM', 'THY1'],
    'Myofibroblast': ['TAGLN', 'CNN1', 'MYH11', 'ACTA2', 'COL1A1'],
    'Fibrocyte': ['CX3CR1', 'CD34', 'COL1A1', 'COL3A1'],
    'Fibroblast-like Synoviocyte': ['CD55', 'THY1', 'CD248', 'VCAM1'],
    'Pericyte': ['PDGFRB', 'RGS5', 'MCAM', 'ACTA2'],
    'Smooth Muscle Cell': ['ACTA2', 'MYH11', 'TAGLN'],
    'Cancer-Associated Fibroblast': ['FAP', 'PDGFRB', 'ACTA2', 'COL1A1', 'COL3A1'],
    'Cardiac Fibroblast': ['COL1A1', 'COL3A1', 'POSTN', 'DCN', 'ACTA2'],
    'Lung Fibroblast': ['COL1A1', 'COL3A1', 'POSTN', 'DCN', 'ACTA2', 'SFTPC', 'SCGB1A1'],
    'Dermal Fibroblast': ['COL1A1', 'COL3A1', 'POSTN', 'DCN', 'ACTA2', 'THY1'],
    'Pancreatic Stellate Cell': ['ACTA2', 'VIM', 'DES', 'PDGFRB'],
    'Hepatic Stellate Cell': ['GFAP', 'DES', 'ACTA2', 'VIM', 'COL1A1'],
    'Intestinal Subepithelial Myofibroblast': ['ACTA2', 'VIM', 'TAGLN', 'COL1A1'],
    'Renal Fibroblast': ['ACTA2', 'VIM', 'COL1A1', 'COL3A1', 'PDGFRB'],
}


# the result is stored in the global variable named here
adata_name = 'adata_F'

# pass the fibroblast dictionary defined above; the global `markers` still
# holds the marker set of the previously analysed cell type
analyze_cell_type(adata, cell_type, fibroblast_markers, adata_name)

In [None]:
adata_F.obs

## write to files

In [None]:
# save adata_Tcell to a file 

adata_Tcell.write('/home/data/ICI_exprs/ICI_T_cell_collection/1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
# save adata_My to a file 

adata_My.write('/home/data/ICI_exprs/ICI_Myeloid_cell_collection/1863-counts_cells_cohort1_Myeloid_cells.h5ad')

In [None]:
# save adata_Bcell to a file 

adata_Bcell.write('/home/data/ICI_exprs/ICI_B_cell_collection/1863-counts_cells_cohort1_B_cells.h5ad')

In [None]:
# save adata_Epi to a file 

adata_Epi.write('/home/data/ICI_exprs/ICI_Epi_cell_collection/1863-counts_cells_cohort1_Epithelial_cells.h5ad')

In [None]:
# save adata_End to a file

In [None]:
# save adata_F to a file