# GSE179994 scRNA Data Preprocessing

This dataset is from:
	
Liu B, Hu X, Feng K, Gao R et al. Temporal single-cell tracing reveals clonal revival and expansion of precursor exhausted T cells during anti-PD-1 therapy in lung cancer. Nat Cancer 2022 Jan;3(1):108-121. PMID: 35121991


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns
import sys as sys
sys.path.append('/home/xinghua/projects/PanCancer_scRNA_analysis/utils')
from scRNA_utils import *


### Read in data matrix and combine with meta-data

In [None]:
### Load the raw T-cell count matrix stored in H5AD format
data_dir = "/data/ICI_exprs/GSE179994/"
data_file = f"{data_dir}GSE179994_all.Tcell.rawCounts.h5ad"

adata = sc.read_h5ad(data_file)

print(f"Read in dataset with dimension: {adata.shape!s}")

In [None]:
# Load the tab-separated per-cell metadata table; the first row holds the
# column names and the first column is used as the row index.
meta_file_pathname = f"{data_dir}GSE179994_Tcell.metadata.tsv.gz"
meta = pd.read_csv(
    meta_file_pathname,
    sep="\t",
    header=0,
    index_col=0,
)

In [None]:
# Display the metadata column names as loaded from the file.
meta.columns

In [None]:
# Store the sample identifier under 'sample_id' instead of 'sample',
# then list the distinct sample ids.
meta = meta.rename(columns={'sample': 'sample_id'})
meta['sample_id'].unique()

In [None]:
# Attach the metadata to the cells. Assigning a DataFrame to `adata.obs`
# takes its rows in the given order (no alignment on barcodes), so reorder
# the metadata to the cell barcodes first whenever every barcode is present.
if meta.index.is_unique and adata.obs_names.isin(meta.index).all():
    adata.obs = meta.loc[adata.obs_names].copy()
else:
    # NOTE(review): barcodes and metadata index do not match one-to-one;
    # fall back to the original row-order assignment -- confirm that the
    # count matrix and the metadata file list the cells in the same order.
    print("WARNING: metadata index does not match adata.obs_names; "
          "assigning metadata by row order")
    adata.obs = meta

# swap column name of patient to patient_id
adata.obs = adata.obs.rename(columns={'patient': 'patient_id'})
patient_ids = adata.obs['patient_id'].unique()
print(patient_ids)
print("Total number of patients: " + str(len(patient_ids)))

In [None]:
# extract treatment time point from sample_id: the second '.'-separated field
timepoint = [x.split('.')[1] for x in adata.obs['sample_id']]
adata.obs['timepoint'] = timepoint
print(adata.obs['timepoint'].unique())
# make 'timepoint' lowercase first, so the replacement below also catches
# capitalised variants such as 'Post' or 'POST'
adata.obs['timepoint'] = adata.obs['timepoint'].str.lower()
# replace 'post' with 'on' for consistency
adata.obs['timepoint'] = adata.obs['timepoint'].replace('post', 'on')
print(adata.obs['timepoint'].unique())

## Create a pseudo-bulk dataset before any other preprocessing

In [None]:
df_pseudo_bulk = pd.DataFrame(index = adata.obs['sample'].unique(), columns = adata.var.index)
df_pseudo_bulk.shape

In [None]:
# group cells according to sample
for sample in adata.obs['sample'].unique():
    tmp = adata.X[adata.obs['sample'] == sample, :].sum(axis = 0)
    tmp = tmp / sum(tmp) * 1000000
    df_pseudo_bulk.loc[sample,:] = np.log2(tmp + 1)

In [None]:
adata_pseudo_bulk = ad.AnnData(df_pseudo_bulk, obs=pd.DataFrame(index = df_pseudo_bulk.index), var=pd.DataFrame(index = df_pseudo_bulk.columns))

## 1. Basic Filtering

In [None]:
# Configure scanpy for this session: logging level, version header, figure defaults.
sc.settings.verbosity = 3            # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print scanpy's header (see scanpy docs for what it reports).
sc.logging.print_header()
# Figure defaults used by the plots below: 80 dpi, white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

#### Removing cells expressing <500 || >5000 genes:

In [None]:
# Drop cells with fewer than 500 or more than 5000 detected genes.
# Each call applies a single bound, lower bound first.
for gene_bound in ({'min_genes': 500}, {'max_genes': 5000}):
    sc.pp.filter_cells(adata, **gene_bound)

In [None]:
# Report the matrix size and the per-cell annotation columns after filtering.
print(f'Dimention of adata: {adata.shape!s}')
print(f'columns for adata.obs: {adata.obs.columns!s}')

#### Removing cells containing <400 || >25000 UMIs:

In [None]:
# Drop cells with fewer than 400 or more than 25000 UMIs.
# Each call applies a single bound, lower bound first.
for count_bound in ({'min_counts': 400}, {'max_counts': 25000}):
    sc.pp.filter_cells(adata, **count_bound)

In [None]:
# Report the matrix size and the per-cell annotation columns after filtering.
print(f'Dimention of adata: {adata.shape!s}')
print(f'columns for adata.obs: {adata.obs.columns!s}')

# Preprocess with respect to genes (var)
#### Removing genes detected in fewer than 10 cells


In [None]:
# Remove genes detected in fewer than 10 cells (the threshold actually passed
# below is min_cells=10), then make gene names unique and show the new shape.
sc.pp.filter_genes(adata, min_cells= 10)
adata.var_names_make_unique()
adata.shape

### Remove cells with high percentage of mitochondrial genes

In [None]:
# label genes as mt (gene names starting with 'MT-')
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# annotate cells with the percent of counts assigned to mt genes
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Here we keep cells with < 20% mito ratio. Take a real copy: a bare subset is
# only a view of the original object, and the in-place normalisation steps
# that follow should work on an independent AnnData.
adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy()
adata.shape

Plot statistics regarding cells

In [None]:
# Violin plots of the per-cell QC metrics, one panel per metric.
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.0, multi_panel=True)

## 2. Integrating Samples

#### Normalization & Logarithmization:

In [None]:
# Scale every cell to a total of 10000 counts; print the first cell's total
# before and after as a sanity check.
first_row_sum = adata.X[0, :].sum()
print(f'Before normalization, the sum of first row of X: {first_row_sum!s}')
sc.pp.normalize_total(adata, target_sum=1e4)
first_row_sum = adata.X[0, :].sum()
print(f'After normalization, the sum of first row of X: {first_row_sum!s}')


In [None]:
# Apply log2(x + 1) to the normalised matrix; print the first cell's total
# before and after as a sanity check.
first_row_sum = adata.X[0, :].sum()
print(f'Before log, the sum of first row of X: {first_row_sum!s}')
sc.pp.log1p(adata, base=2)
first_row_sum = adata.X[0, :].sum()
print(f'After log, the sum of first row of X: {first_row_sum!s}')

### load cell cycle markers and score cells for cell cycle

The question is whether this step is necessary for our study. We are trying to look for gene expression modules (GEMs). The cell cycle is an important component of cellular signaling, so removing its signal would distort the cellular states.

In addition, it seems to introduce 'Inf' values into the data, which prevents the following steps from running.

## Save a pre-processed version of the data

In [None]:
# Checkpoint: save the filtered, normalised, log-transformed data next to the input file.
adata.write(f"{data_dir}GSE179994_all.Tcell.rawCounts.h5ad_filtered.h5ad")

In [None]:
# Reload the checkpoint written above ('GSE179994_all.Tcell.rawCounts.h5ad_filtered.h5ad').
adata = sc.read_h5ad(f"{data_dir}GSE179994_all.Tcell.rawCounts.h5ad_filtered.h5ad")

## Keep high variance genes 

In [None]:
# Flag the 2000 most variable genes; the plot call below reads that annotation.
sc.pp.highly_variable_genes(adata,n_top_genes = 2000)
# Alternative kept for reference: threshold-based selection that subsets in place.
#sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, subset=True)
sc.pl.highly_variable_genes(adata)

### Keep track of original adata and update adata.X to  high variance genes only

In [None]:
# Keep the full gene set reachable through adata.raw, then restrict adata to
# the highly variable genes. Take a real copy: a bare subset is only a view,
# and the PCA / neighbours / clustering steps that follow write into adata.
adata.raw = adata
adata = adata[:, adata.var.highly_variable].copy()
print('adata dimensions of high variance genes: ' + str(adata.shape))

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform PCA with 50 components using the ARPACK solver
sc.tl.pca(adata, svd_solver='arpack', n_comps=50)

In [None]:
# Show the shapes of the PCA outputs (cell embedding, gene loadings,
# explained-variance ratios), then the per-cell annotation columns.
pca_outputs = (
    adata.obsm['X_pca'],
    adata.varm['PCs'],
    adata.uns['pca']['variance_ratio'],
)
for pca_output in pca_outputs:
    print(pca_output.shape)
print(adata.obs.columns)

In [None]:
# Build the neighbourhood graph from the first 50 PCs with 80 neighbours per cell.
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=50)

In [None]:
# Use the Leiden algorithm (resolution 0.5) to find clusters
sc.tl.leiden(adata, resolution=0.5)


In [None]:
# Rank genes for each Leiden cluster with a t-test and plot the
# top 25 genes per cluster (each panel with its own y-axis).
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
# Display the per-cell annotation columns currently stored in adata.obs.
adata.obs.columns

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform UMAP and plot it coloured by Leiden cluster, with labels drawn on the plot
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')



# Find the top-ranked genes as markers for each cluster

In [None]:
# find marker genes of each Leiden cluster (Wilcoxon test)
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# Colour the UMAP by the cluster labels computed above. This notebook never
# creates a 'clusters' column; the clustering is stored under 'leiden'.
sc.pl.umap(adata, color='leiden', legend_loc='on data')

### Label TNK subtypes

In [None]:
# Marker genes per T/NK-cell subtype, used below for dot plots, UMAP colouring
# and cluster labelling. Insertion order is kept as the display order.
T_cell_makers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': [
        'IL2', 'GZMA', 'GNLY', 'PRF1',
        'GZMB', 'GZMK', 'IFNG', 'NKG7',
    ],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' intentionally left out
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

In [None]:
# For every subtype, report how many of its markers occur among the gene
# names of adata.raw.
for cell_type, markers in T_cell_makers.items():
    print (cell_type, ":", markers)
    n_matched = adata.raw.var_names.isin(markers).sum()
    print ("number of match in var: ", str(n_matched))

### Plot

In [None]:
# Compute a dendrogram over the Leiden clusters, then draw a dot plot of the
# marker genes per cluster with that dendrogram attached.
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dotplot(adata, T_cell_makers, 'leiden', dendrogram=True)

In [None]:
# One UMAP figure per subtype, coloured by that subtype's marker genes.
# Genes absent from the data are skipped so a single missing marker does not
# abort the whole loop.
known_genes = adata.raw.var_names if adata.raw is not None else adata.var_names
for cell_type, markers in T_cell_makers.items():
    # print the actual marker list (not the literal word "markers")
    print(cell_type, ":", markers)
    present = [gene for gene in markers if gene in known_genes]
    if present:
        sc.pl.umap(adata, color=present)

In [None]:
# Display the per-cell annotation columns currently stored in adata.obs.
adata.obs.columns

In [None]:
# Uncomment to discard a previous labelling before re-running this cell:
#adata.obs.drop(columns="cell_type", inplace = True )
# labelClusterWithCellType is not defined in this notebook; presumably it comes
# from the star-import of scRNA_utils and writes a 'cell_type' column into
# adata.obs based on the marker sets -- confirm against the helper's source.
labelClusterWithCellType(adata, T_cell_makers, cluster_column='leiden')
# Colour the UMAP by the 'cell_type' annotation.
sc.pl.umap(adata, color='cell_type')

Plot PD-1 and potential target genes

In [None]:
# UMAP coloured by expression of PDCD1, CXCL13, HAVCR2 and CTLA4 (one panel each).
sc.pl.umap(adata, color= ['PDCD1', 'CXCL13', 'HAVCR2','CTLA4'])

The enriched genes for each cluster

### Compare the distribution of Pre and On treatment

In [None]:
# Display the per-cell annotation columns currently stored in adata.obs.
adata.obs.columns

In [None]:
# Number of cells per treatment timepoint.
print(adata.obs['timepoint'].value_counts())

In [None]:
# plot cells coloured by assigned cell type and by treatment timepoint,
# then by expression of CXCL13, PDCD1, TIGIT and HAVCR2
sc.pl.umap(adata, color=['cell_type', 'timepoint'])
sc.pl.umap(adata, color= ['CXCL13', 'PDCD1', 'TIGIT', 'HAVCR2'])

In [None]:
# Save the clustered and annotated object into data_dir.
# NOTE(review): the file name ('1863-counts_cells_cohort1_T_cells.h5ad') does
# not mention GSE179994 and looks carried over from another dataset's
# notebook -- confirm the intended name before other code depends on it.
adata.write(data_dir + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
# UMAP coloured by expression of CD8A, PDCD1 and HAVCR2 (one panel each).
sc.pl.umap(adata, color= ['CD8A', 'PDCD1', 'HAVCR2'])

Compare PD-1 and CXCL13 in pre and post

In [None]:
# Keep an independent copy of the cells whose PDCD1 value in adata.raw exceeds 0.5.
pdcd1_raw_expr = adata.raw[:, 'PDCD1'].X
adata_PDCD1 = adata[pdcd1_raw_expr > 0.5, :].copy()

In [None]:
# Number of PDCD1-high cells per treatment timepoint.
adata_PDCD1.obs['timepoint'].value_counts()

In [None]:
# Violin plots of selected genes in the PDCD1-high cells, grouped by
# timepoint; the y-axis is limited to the range 1-7.
violin_genes = ['CXCL13', 'PDCD1', 'TIGIT', 'HAVCR2']
sc.pl.violin(
    adata_PDCD1,
    violin_genes,
    split=True,
    groupby='timepoint',
    ylim=(1, 7),
    jitter=0.0,
    multi_panel=True,
)

In [None]:
# List the distinct timepoint labels present in adata.obs.
adata.obs['timepoint'].unique()


In [None]:
# Scatter of PDCD1 (x) against CXCL13 (y) for pre-treatment cells, read from adata.raw.
is_pre = adata.obs['timepoint'] == 'pre'
raw_gene_names = adata.raw.var_names
cxcl13_pre = np.array(adata.raw.X[is_pre, raw_gene_names == 'CXCL13'])
pdcd1_pre = np.array(adata.raw.X[is_pre, raw_gene_names == 'PDCD1'])
plt.scatter(pdcd1_pre, cxcl13_pre, c='blue', label='pre', s=1)

In [None]:
# Extract expression values of the genes of interest for on-treatment cells
# and convert them to numpy arrays. The 'post' label was recoded to 'on' when
# the timepoint column was built, so filtering on 'post' selects nothing.
is_on = (adata.obs['timepoint'] == 'on').values
cxcl13_post = np.array(adata.raw.X[is_on, adata.raw.var_names == 'CXCL13'])
pdcd1_post = np.array(adata.raw.X[is_on, adata.raw.var_names == 'PDCD1'])

# plot scatter between PDCD1 (x) and CXCL13 (y), labelled with the timepoint
# that is actually plotted
plt.scatter(pdcd1_post, cxcl13_post, color='blue', label='on', s=1)


### Plot using the cluster IDs assigned by the authors

In [None]:
# UMAP coloured by the 'cluster' column of adata.obs.
sc.pl.umap(adata, color= [ 'cluster'])

### Extract pseudo-bulk and plot

In [None]:
# Per-sample (pseudo-bulk) PDCD1 vs CXCL13 for on-treatment samples.
# The pseudo-bulk rows are samples, so the row mask is derived from the
# sample ids (second '.'-separated field, as for the per-cell timepoint)
# instead of the per-cell adata.obs, whose length does not match.
bulk_timepoint = (
    pd.Series(adata_pseudo_bulk.obs_names.astype(str))
    .str.split('.').str[1]
    .str.lower()
    .replace('post', 'on')
)
is_on_sample = (bulk_timepoint == 'on').to_numpy()
bulk_expr = np.asarray(adata_pseudo_bulk.X, dtype=float)[is_on_sample]
cxcl13_post = bulk_expr[:, adata_pseudo_bulk.var_names == 'CXCL13'].ravel()
pdcd1_post = bulk_expr[:, adata_pseudo_bulk.var_names == 'PDCD1'].ravel()
plt.scatter(pdcd1_post, cxcl13_post, c='blue', label='on', s=1)

In [None]:
# Group cells by sample
