In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns
from load_10X_matrices import load_10X_matrices

In [16]:
# Load the GSE169246 10x count matrix into an AnnData object. The input files
# in this directory carry the "GSE169246_TNBC_RNA_" filename prefix.
# Earlier loading attempts, kept for reference:
#adata = load_10X_matrices('/home/data/ICI_exprs/GSE169246')
#adata = sc.read_10x_mtx('/home/data/ICI_exprs/GSE169246/', var_names='gene_symbols', cache=True)
adata = sc.read_10x_mtx(
    '/home/data/ICI_exprs/GSE169246/',
    prefix="GSE169246_TNBC_RNA_",
)
print(adata)

KeyboardInterrupt: 

In [None]:
# Make the cell barcodes (obs names) unique. The method has to be *called*:
# a bare attribute reference without parentheses only evaluates to the bound
# method and leaves the names unchanged.
adata.obs_names_make_unique()

In [None]:
# Scanpy session setup: logging level, version banner, and figure defaults.
sc.settings.verbosity = 3  # 0 = errors, 1 = warnings, 2 = info, 3 = hints
sc.logging.print_header()
sc.settings.set_figure_params(facecolor='white', dpi=80)

In [None]:
# Gene-level QC: drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)

# Cell-level QC: drop cells with fewer than 400 or more than 8000 detected
# genes. Each bound is applied in its own filter_cells call, lower bound first.
for gene_bound in ({'min_genes': 400}, {'max_genes': 8000}):
    sc.pp.filter_cells(adata, **gene_bound)

In [None]:
# Cell-level QC on total UMI counts: drop cells with fewer than 600
# or more than 120000 UMIs.
min_umis, max_umis = 600, 120000
sc.pp.filter_cells(adata, min_counts=min_umis)
sc.pp.filter_cells(adata, max_counts=max_umis)

In [None]:
# Flag mitochondrial genes: gene symbols starting with "MT-".
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# Compute QC metrics in place; with qc_vars=['mt'] this includes
# obs['pct_counts_mt'], the percentage of each cell's counts that come from
# the flagged mitochondrial genes.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Keep cells with < 10% mitochondrial counts. The earlier comment here said
# 20%, but the threshold actually applied by the code is 10.
# NOTE(review): confirm that 10% (not 20%) is the intended cutoff.
# .copy() materialises the subset so the following in-place preprocessing
# steps operate on an independent AnnData rather than on a view.
adata = adata[adata.obs['pct_counts_mt'] < 10, :].copy()
adata.shape

In [None]:
# Violin plots of the per-cell QC metrics, one panel per metric, no jitter.
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_columns, jitter=0.0, multi_panel=True)

In [None]:
# Library-size normalisation: rescale every cell to a total of 10,000 counts.
# No log transform happens in this call.
sc.pp.normalize_total(
    adata,
    target_sum=1e4,
)

In [None]:
# Apply a log(1 + x) transform to the expression values of adata
# (the result is not reassigned, so adata itself is updated).
sc.pp.log1p(adata)

In [None]:
# Save the current AnnData object as an .h5ad checkpoint.
filtered_h5ad = '/home/data/ICI_exprs/GSE169246/GSE169246_filtered.h5ad'
adata.write(filtered_h5ad)

In [None]:
# Reload the .h5ad checkpoint so the analysis can resume from this point.
filtered_h5ad = '/home/data/ICI_exprs/GSE169246/GSE169246_filtered.h5ad'
adata = sc.read_h5ad(filtered_h5ad)

In [None]:
# Flag the top 4000 highly variable genes using the 'cell_ranger' flavor,
# then plot the selection.
# Alternative threshold-based selection, kept for reference:
#sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=4000)
sc.pl.highly_variable_genes(adata)

In [None]:
# Keep the full all-gene object reachable through adata.raw, then restrict the
# working object to the genes flagged as highly variable.
adata.raw = adata
# .copy() turns the sliced view into an independent AnnData, so later steps
# that write results onto adata do not have to implicitly convert a view.
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

In [None]:
# perform PCA   
sc.tl.pca(adata, svd_solver='arpack', n_comps=50)

# Keep the top 50 components
#adata.obsm["X_pca"] = adata.obsm["X_pca"][:, :50]

In [None]:
# Sanity-check the PCA outputs: print the shape of the cell embedding, the
# gene loadings and the variance ratios, then list the obs columns.
pca_embedding = adata.obsm['X_pca']
print(pca_embedding.shape)
pca_loadings = adata.varm['PCs']
print(pca_loadings.shape)
pca_variance_ratio = adata.uns['pca']['variance_ratio']
print(pca_variance_ratio.shape)
print(adata.obs.columns)

In [None]:
# Build the neighbourhood graph.
# sc.pp.neighbors has no `batch_key` parameter (passing it raises TypeError),
# so that argument is dropped here; batch handling is left to BBKNN below.
# NOTE(review): use_rep='X' bypasses the PCA computed earlier, and the bbknn
# call that follows is expected to overwrite this graph — confirm whether this
# first call is still needed.
sc.pp.neighbors(adata, n_neighbors=80, use_rep='X')

# Batch-balanced kNN graph on the first 50 PCs, grouping cells by obs['batch'].
# NOTE(review): assumes adata.obs has a 'batch' column; nothing visible in this
# notebook creates it — verify before running.
sc.external.pp.bbknn(adata, batch_key='batch', n_pcs=50)

In [None]:
# Use the Leiden algorithm to find clusters
sc.tl.leiden(adata)
#sc.tl.leiden(adata, resolution=0.5)

In [None]:
# Rank genes for each Leiden cluster with a t-test, then plot the top 25
# genes per cluster on independent y-axes.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
sc.pl.rank_genes_groups(adata, sharey=False, n_genes=25)

In [None]:
# perform UMAP
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')

In [None]:
# find marker genes of each cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

In [None]:
# Flat panel of individual T-cell marker genes (not grouped by cell type).
# This list used to sit inside the same braces as the grouped entries below,
# which is a SyntaxError (set elements mixed with `key: value` pairs), so it
# now lives in its own variable.
t_cell_marker_genes = [
    'CD3G', 'CD4', 'CD81', 'GZMK', 'IFNG', 'KLRG1', 'NKG7',
    'ITGAE', 'CD40LG', 'FGFBP2', 'LEF1', 'TCF7', 'SELL', 'FOXP3',
    'IL7R', 'IL2RA', 'LAYN', 'ICOS',
    # NOTE(review): 'SCL1' looks like a typo (possibly 'XCL1') — confirm.
    'ZNF683', 'SCL1', 'TOX2', 'PRF1', 'GZMB',
    # 'CXCL13' was misspelled 'CSCL13'; corrected to match the Tfh entry below.
    'GNLY', 'KLRB1', 'CXCL13', 'LAG3',
    'HAVCR2', 'PDCD1', 'TIGIT', 'CTLA4', 'RORA',
]

# Marker genes grouped by T-cell (and related) population.
# Keys are the group labels; values are lists of gene symbols.
t_cell_markers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': ['IL2', 'GZMA', 'GNLY', 'PRF1', 'GZMB', 'GZMK', 'IFNG', 'NKG7'],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  #'IL22',
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    # TODO: no proliferation markers have been listed yet.
    'Proliferation': [],
}

In [None]:
# check if the markers are in the var names
for cell_type, markers in t_cell_markers.items():
    print (cell_type, ":", markers)
    print ("number of match in var: ", str(sum(adata.raw.var_names.isin(markers))))

In [None]:
# Hierarchical clustering of the Leiden groups, used by the dot plot below
# (dendrogram=True).
sc.tl.dendrogram(adata, groupby='leiden')

# Restrict every marker group to genes that can actually be looked up, and
# drop groups that end up empty (e.g. 'Proliferation', which lists no markers).
# Genes are looked up in adata.raw when it is set, otherwise in adata itself.
available_genes = adata.raw.var_names if adata.raw is not None else adata.var_names
dotplot_markers = {}
for cell_type, markers in t_cell_markers.items():
    present = [gene for gene in markers if gene in available_genes]
    missing = [gene for gene in markers if gene not in available_genes]
    if missing:
        # Report what is left out instead of dropping it silently.
        print(cell_type, ": markers not found, skipped:", missing)
    if present:
        dotplot_markers[cell_type] = present

sc.pl.dotplot(adata, dotplot_markers, 'leiden', dendrogram=True)

In [None]:
# One UMAP figure per cell type, coloured by that cell type's marker genes.
for cell_type, markers in t_cell_markers.items():
    # Print the actual marker list (this used to print the literal "markers").
    print(cell_type, ":", markers)
    if not markers:
        # Groups without any markers (e.g. 'Proliferation') have nothing to plot.
        continue
    sc.pl.umap(adata, color=markers)