In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns
import sys as sys
sys.path.append('/home/xinghua/projects/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import * 
# from rapids_scanpy_funcs import *


In [None]:
# Load the GSE169246 TNBC expression matrix from its MatrixMarket (.mtx.gz) file.
# The matrix is transposed immediately after reading (presumably the file is
# stored genes x cells while AnnData expects cells x genes -- TODO confirm).
# cache=True is passed to sc.read so that repeated reads can reuse a cached copy.
# Alternative loaders kept for reference:
#adata = load_10X_matrices('/home/data/ICI_exprs/GSE169246')
adata = sc.read('/data/ICI_exprs/GSE169246/GSE169246_TNBC_RNA_matrix.mtx.gz', cache=True).transpose()
#adata = sc.read_h5ad('/home/data/ICI_exprs/GSE169246/GSE169246_filtered.h5ad')
print(adata)

In [None]:
# Attach cell barcodes and gene identifiers to the matrix.
# Both files are header-less, tab-separated tables; only their first column is used.
barcode_table = pd.read_csv(
    '/data/ICI_exprs/GSE169246/GSE169246_TNBC_RNA_barcodes.tsv.gz',
    sep='\t',
    header=None,
)
feature_table = pd.read_csv(
    '/data/ICI_exprs/GSE169246/GSE169246_TNBC_RNA_features.tsv.gz',
    sep='\t',
    header=None,
)
adata.obs_names = barcode_table[0]
adata.var_names = feature_table[0]


In [None]:
# Sanity check: show the first ten cell barcodes and the first ten gene names
# to confirm the labels were attached to the expected axes.
print("Cell barcodes\n", adata.obs_names[0:10])
print("Gene names\n", adata.var_names[0:10])

In [None]:
# De-duplicate gene names and cell barcodes so that label-based indexing is
# unambiguous. Both are methods and must be *called*: the original second line
# lacked the parentheses, so it only referenced the bound method and the
# barcodes were never de-duplicated.
adata.var_names_make_unique()
adata.obs_names_make_unique()

### Extract metadata 


In [None]:
# Derive per-cell metadata from the barcode string.
# The sample id is the second '.'-separated field of the barcode; the sample id
# is then split on '_' into timepoint (field 0), patient id (field 1) and
# sample source (field 2).
adata.obs['sample_id'] = adata.obs.index.str.split('.').str[1]
sample_fields = adata.obs['sample_id'].str.split('_')
for column_name, field_position in (
    ('patient_id', 1),
    ('timepoint', 0),
    ('sample_source', 2),
):
    adata.obs[column_name] = sample_fields.str[field_position]

In [None]:

# Normalise timepoint labels in one pass:
#   "Post" -> "on", "Pre" -> "pre", then lowercase whatever remains.
adata.obs['timepoint'] = (
    adata.obs['timepoint']
    .str.replace('Post', 'on')
    .str.replace('Pre', 'pre')
    .str.lower()
)



Annotate treatment of each patient

In [None]:
# Annotate the treatment arm of every cell.
# Patients in the chemotherapy-only arm are listed explicitly; everyone else
# keeps the default combination label.
chemo_group = ['P022', 'P011', 'P020', 'P008', 'P013', 'P025', 'P018', 'P023', 'P024', 'P003', 'P028']
adata.obs['treatment'] = 'Anti-PD-L1+ Chemo'
is_chemo_only = adata.obs['patient_id'].isin(chemo_group)
adata.obs.loc[is_chemo_only, 'treatment'] = 'Chemo'

In [None]:
# List the per-cell (obs) and per-gene (var) annotation columns currently present.
print(adata.obs.columns)
print(adata.var.columns)

### Note: the dataset includes T cells from PBMC samples; remember to exclude them when studying the clonal expansion of T cells

In [None]:
# Number of cells per sample source (the third '_' field of the sample id).
adata.obs['sample_source'].value_counts()

### Summary of data

In [None]:
# Print the number of distinct values in each metadata column.
summary_columns = (
    ("Number of patients", 'patient_id'),
    ("Number of samples", 'sample_id'),
    ("Number of timepoints", 'timepoint'),
    ("Number of sample sources", 'sample_source'),
    ("Number of treatments", 'treatment'),
)
for summary_label, obs_column in summary_columns:
    distinct_values = adata.obs[obs_column].unique()
    print(summary_label, str(len(distinct_values)))

### Save a copy of h5ad file to speed up loading

In [None]:
# Checkpoint: persist the annotated, still-unfiltered AnnData object so later
# sessions can start from the h5ad file instead of re-parsing the .mtx/.tsv inputs.
adata.write_h5ad('/data/ICI_exprs/GSE169246/GSE169246_TNBC_RNA.h5ad')

In [None]:
# Reload the checkpoint written in the previous cell; this rebinds `adata`.
adata = sc.read_h5ad('/data/ICI_exprs/GSE169246/GSE169246_TNBC_RNA.h5ad')

## Start preprocessing

In [None]:
# # sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# sc.logging.print_header()
# sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# removing genes expressed in fewer than 50 cells
# (the previous comment said "<10 cells", which did not match min_cells = 50)
sc.pp.filter_genes(adata, min_cells = 50)

# removing cells with fewer than 400 genes or more than 8000 genes
sc.pp.filter_cells(adata, min_genes=400)
sc.pp.filter_cells(adata, max_genes=8000)

In [None]:
# removing cells with fewer than 600 or more than 120000 total counts (UMIs);
# filter_cells accepts one threshold per call, hence two calls
sc.pp.filter_cells(adata, min_counts = 600)
sc.pp.filter_cells(adata, max_counts = 120000)

In [None]:
# Flag mitochondrial genes by their 'MT-' name prefix.
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# Compute per-cell QC metrics; with qc_vars=['mt'] this adds 'pct_counts_mt'
# (percentage of a cell's counts that come from the flagged genes) alongside
# 'n_genes_by_counts' and 'total_counts', which are plotted in the next cell.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Keep cells with < 10% mitochondrial counts.
# NOTE(review): the earlier comment said 20% while the code used 10; the code's
# threshold is kept unchanged -- confirm which value is intended.
# .copy() materialises the subset: boolean indexing returns a *view*, and the
# normalisation/log steps that follow modify the data in place.
adata = adata[adata.obs['pct_counts_mt'] < 10, :].copy()
adata.shape

In [None]:
# Visual QC after filtering: distributions of detected genes, total counts and
# mitochondrial percentage per cell, one panel each, without jittered points.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.0, multi_panel=True)

In [None]:
# Library-size normalisation: scale every cell to a total of 10,000 counts.
# (No logarithm is applied here; that happens in the following cell.)
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Log-transform the normalised counts as log(1 + x); base=2 requests log2
# rather than the natural logarithm.
sc.pp.log1p(adata, base=2)

In [None]:
# adata.write('/home/data/ICI_exprs/GSE169246/GSE169246_filtered-lux.h5ad')

In [None]:
# adata = sc.read_h5ad('/home/data/ICI_exprs/GSE169246/GSE169246_filtered-lux.h5ad')

In [None]:
# Flag the 4000 most variable genes using the 'cell_ranger' flavor. The call
# does not subset the data; the resulting adata.var.highly_variable column is
# used for subsetting in the next cell.
sc.pp.highly_variable_genes(adata, n_top_genes = 4000, flavor = 'cell_ranger')
# Previous threshold-based selection, kept for reference:
#sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
sc.pl.highly_variable_genes(adata)

In [None]:
# Freeze the full (normalised, log-transformed) gene set in .raw before
# restricting the working matrix to the highly variable genes. Marker-gene
# plots and the per-cell-type export later read from .raw.
adata.raw = adata

# .copy() turns the column subset into an independent AnnData object instead of
# a view, since the PCA / neighbors / clustering steps below write results into
# it.
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

In [None]:
# Perform PCA with the ARPACK solver, computing 50 components.
# The next cell reads the results from adata.obsm['X_pca'], adata.varm['PCs']
# and adata.uns['pca'].
sc.tl.pca(adata, svd_solver='arpack', n_comps=50)

# Explicit truncation is unnecessary because n_comps=50 is already requested:
#adata.obsm["X_pca"] = adata.obsm["X_pca"][:, :50]

In [None]:
# Inspect the PCA outputs: cell embeddings, gene loadings, the per-component
# variance-ratio vector, and the obs columns accumulated so far.
print(adata.obsm['X_pca'].shape)
print(adata.varm['PCs'].shape)
print(adata.uns['pca']['variance_ratio'].shape)
print(adata.obs.columns)

### Clustering

In [None]:
# Build the neighbourhood graph with 80 neighbours per cell. No representation
# is named explicitly, so scanpy's default applies (presumably the X_pca
# computed above -- confirm).
sc.pp.neighbors(adata, n_neighbors=80)

In [None]:
# Use the Leiden algorithm to find clusters at resolution 0.5; later cells
# refer to the result through the 'leiden' key.
sc.tl.leiden(adata, resolution=0.5)


In [None]:
# Rank genes per Leiden cluster with a t-test, then plot the top 25 genes of
# each cluster on independent y-axes.
# (The previous comment mentioned cell-cycle markers; nothing in this cell is
# cell-cycle specific.)
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Compute the UMAP embedding (relies on the neighbourhood graph built earlier).
sc.tl.umap(adata)


In [None]:
# UMAP coloured by Leiden cluster and by three marker genes (CD3E, CD19, CD68),
# with cluster labels drawn on the plot.
sc.pl.umap(adata, color=['leiden', 'CD3E', 'CD19',  'CD68'] ,legend_loc='on data')
# plot B cells markers
# sc.pl.umap(adata, color=['MS4A1', 'CD79A', 'CD79B', 'CD19', 'CD20', 'CD22', 'CD27', 'CD38', 'CD45RA', 'CD45RO', 'CD69', 'CD80', 'CD86', 'CD95', 'CD274', 'CD276', 'CD319', 'CD320'], legend_loc='on data')

In [None]:
# Marker genes used to assign a coarse cell type to each cluster.
# This dict is reused by the following cells (per-marker UMAPs and cluster labelling).
cell_type_markers = {
    'T cells': ['CD3D', 'CD3E'],
    'B cells': ['CD79A', 'CD79B'],
    'Myeloid': ['CD14', 'CD68']
}
# Compute the cluster dendrogram first so the dot plot can order clusters by it.
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dotplot(adata, cell_type_markers, 'leiden', dendrogram=True)
#sc.pl.umap(adata, color='clusters', legend_loc='on data')


In [None]:
# For each cell type, print its marker list and draw one UMAP panel per marker
# gene so marker expression can be compared with the clusters by eye.
for cell_type, markers in cell_type_markers.items():
    print (cell_type, ":", markers)
    # Optional check of how many markers are present in the raw gene names:
    #print ("number of match in var: ", str(sum(adata.raw.var_names.isin(markers))))
    sc.pl.umap(adata, color=markers, legend_loc='on data')

In [None]:
# labelClusterWithCellType is provided by the star import from scRNA_utils; its
# implementation is not visible in this notebook. Later cells read
# adata.obs['cell_type'], so it presumably writes that column in place based on
# the marker dict -- TODO confirm against scRNA_utils.
labelClusterWithCellType(adata, cell_type_markers)

In [None]:
# plot umap with cell type labels
sc.pl.umap(adata, color=['cell_type', 'timepoint', 'treatment'])

### Write to file.

In [None]:
def _export_cell_type(source, cell_type, out_paths):
    """Extract one annotated cell type into a standalone AnnData and save it.

    The expression matrix and gene table are taken from ``source.raw`` (set
    before the highly-variable-gene subsetting), so the exported object carries
    the full filtered gene set rather than only the highly variable genes.

    Parameters
    ----------
    source : AnnData
        Annotated object with a ``'cell_type'`` column in ``.obs`` and a
        populated ``.raw``.
    cell_type : str
        Value of ``source.obs['cell_type']`` to keep.
    out_paths : iterable of str
        Destination ``.h5ad`` paths; the same object is written to each.

    Returns
    -------
    AnnData
        The newly built object, with ``.raw`` pointing at its own full matrix.
    """
    subset = source[source.obs['cell_type'] == cell_type]
    extracted = ad.AnnData(X=subset.raw.X, obs=subset.obs, var=subset.raw.var)
    extracted.raw = extracted
    for out_path in out_paths:
        extracted.write_h5ad(out_path)
    return extracted


# extract T cells and output AnnData object to file
adata_T = _export_cell_type(adata, 'T cells', [
    '/data/ICI_exprs/ICI_T_cell_collection/GSE169246_T_cells.h5ad',
    '/data/ICI_exprs/ICI_NHDP/GSE169246_T_cells.h5ad',
])

# extract B cells
adata_B = _export_cell_type(adata, 'B cells', [
    '/data/ICI_exprs/ICI_B_cell_collection/GSE169246_B_cells.h5ad',
    '/data/ICI_exprs/ICI_NHDP/GSE169246_B_cells.h5ad',
])

# extract Myeloid cells
adata_M = _export_cell_type(adata, 'Myeloid', [
    '/data/ICI_exprs/ICI_Myeloid_cell_collection/GSE169246_M_cells.h5ad',
    '/data/ICI_exprs/ICI_NHDP/GSE169246_M_cells.h5ad',
])

In [None]:
# Confirm which per-cell annotations were carried over into the T-cell object.
adata_T.obs.columns

## Clustering T cells

In [None]:
# clustering_adata comes from the star import of scRNA_utils and is not visible
# here; it is called with n_top_genes=5000 and its return value replaces adata_T.
# NOTE(review): adata_T.X holds the normalised, log-transformed values taken
# from .raw -- confirm that clustering_adata does not normalise again.
adata_T = clustering_adata(adata_T,n_top_genes=5000)