# Merged T cells from 3 ICI studies
EGAS00001004809, GSE179994, and GSE169246



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import seaborn as sns


## Read in T cells of EGAS00001004809


### Read in count matrix first

In [None]:
# Load the cohort-1 T-cell count matrix (h5ad) and print its shape (n_obs, n_vars).
adata_EGAS100010040809 = sc.read_h5ad('/data/ICI_exprs/ICI_T_cell_collection/EGAS00001004809-1863-counts_cells_cohort1_T_cells.h5ad')
print(adata_EGAS100010040809.shape)

Read in the metadata of this dataset

In [None]:
# Load the per-cell metadata table; the first CSV column becomes the row index
# and the first row supplies the column names. Print (n_rows, n_columns).
EGAS100010040809_meta = pd.read_csv('/data/ICI_exprs/EGAS00001004809/1870-BIOKEY_metaData_tcells_cohort1_web.csv', index_col=0, header=0)
print(EGAS100010040809_meta.shape)

### The loaded matrix and the metadata differ in cell number,
so we need to filter out the cells that are not in the metadata

In [None]:
# find common cells from adata and meta
# (np.intersect1d returns the sorted, unique values present in both indexes)
common_cells = np.intersect1d(adata_EGAS100010040809.obs.index, EGAS100010040809_meta.index)

In [None]:
 # Derive sample_id for every cell: the first three '_'-separated fields of the cell name
EGAS100010040809_meta['sample_id'] = [
    '_'.join(cell_name.split('_', 3)[:3])
    for cell_name in EGAS100010040809_meta.index
]

In [None]:
# Keep only the shared metadata columns for the common cells and tag the dataset of origin
EGAS100010040809_to_keep = EGAS100010040809_meta.loc[
    common_cells, ['patient_id', 'timepoint', 'sample_id']
]
# a scalar is broadcast to every row
EGAS100010040809_to_keep['batch'] = 'EGAS100010040809'
EGAS100010040809_to_keep.head()

In [None]:
# Restrict the count matrix to the cells shared with the metadata.
# Slicing an AnnData returns a view; take an explicit copy so that replacing
# .obs below modifies a real object instead of triggering an implicit copy.
EGAS100010040809_adata = adata_EGAS100010040809[common_cells, :].copy()
# Attach the selected metadata, ordered like the cells of the subset
EGAS100010040809_adata.obs = EGAS100010040809_to_keep.loc[common_cells, :]

## Read in GSE179994_all.Tcell.rawCounts.h5ad


In [None]:
# Load the GSE179994 T-cell raw-count matrix and print its shape (n_obs, n_vars).
GSE179994_adata = sc.read_h5ad('/data/ICI_exprs/GSE179994/GSE179994_all.Tcell.rawCounts.h5ad')
print(GSE179994_adata.shape)
# the file holds the count matrix only; obs and var carry no annotations

### Process GSE179994_meta

In [None]:
# Load the gzip-compressed, tab-separated per-cell metadata; the first column
# becomes the row index. Print (n_rows, n_columns).
GSE179994_meta = pd.read_csv('/data/ICI_exprs/GSE179994/GSE179994_Tcell.metadata.tsv.gz', sep = '\t', index_col = 0, compression = 'gzip', header = 0)
print(GSE179994_meta.shape)

In [None]:
# Harmonise column names with the EGAS metadata:
# 'sample' -> 'sample_id' and 'patient' -> 'patient_id'.
# rename() matches whole column names, so columns that merely contain these
# substrings (or already carry the '_id' suffix) are left untouched.
GSE179994_meta.rename(
    columns={'sample': 'sample_id', 'patient': 'patient_id'},
    inplace=True,
)
print(GSE179994_meta.columns)

In [None]:
# The time point is the second '.'-separated field of sample_id
GSE179994_meta['timepoint'] = GSE179994_meta['sample_id'].str.split('.').str[1]
# Relabel 'post' as 'on' (literal match); 'pre' is kept unchanged
# NOTE(review): the resulting labels are lowercase ('pre'/'on') — confirm they
# match the capitalisation used by the other dataset's 'timepoint' column.
GSE179994_meta['timepoint'] = GSE179994_meta['timepoint'].str.replace('post', 'on', regex=False)


In [None]:
# Select the shared metadata columns, with rows ordered like the cells of
# GSE179994_adata, then tag the dataset of origin
GSE179994_meta_to_keep = GSE179994_meta.loc[
    GSE179994_adata.obs_names, ['patient_id', 'timepoint', 'sample_id']
]
GSE179994_meta_to_keep['batch'] = 'GSE179994'
print(GSE179994_meta_to_keep.shape)

In [None]:
# Attach the aligned metadata table as the per-cell annotation
GSE179994_adata.obs = GSE179994_meta_to_keep

### Find common genes in Xs of both datasets

In [None]:
# Genes present in both datasets (np.intersect1d returns them sorted and unique)
common_genes = np.intersect1d(EGAS100010040809_adata.var_names, GSE179994_adata.var_names)
print(len(common_genes))

# Keep only the shared genes, in the same order for both datasets.
# Slicing returns views; copy so later in-place changes (e.g. clearing .raw)
# act on independent objects rather than on views.
EGAS100010040809_adata = EGAS100010040809_adata[:, common_genes].copy()
GSE179994_adata = GSE179994_adata[:, common_genes].copy()


In [None]:
# Clear .raw before merging
# NOTE(review): presumably so no stale raw matrix is carried into the merge — confirm
EGAS100010040809_adata.raw = None

In [None]:
# Clear .raw before merging
# NOTE(review): presumably so no stale raw matrix is carried into the merge — confirm
GSE179994_adata.raw = None

### Merge the EGAS00001004809 and GSE179994 AnnData objects

In [None]:
# Stack the two datasets cell-wise into a single AnnData and print its shape.
# (earlier alternative, kept for reference:)
# adata = EGAS100010040809_adata.concatenate(GSE179994_adata)
adata = ad.concat([EGAS100010040809_adata, GSE179994_adata])
print(adata.shape)

In [None]:
# Sanity check: show the batch label of the cells and the number of cells per batch
print(adata.obs['batch'])
adata.obs['batch'].value_counts()
#print(EGAS100010040809_adata.obs['batch'])

In [None]:
# Save the merged AnnData (counts plus per-cell metadata) to disk
adata.write('/data/ICI_exprs/Merged_GSE179994_EGAS100010040809_T_cell_.h5ad')

## Process combined data


In [None]:
# Configure scanpy logging and figure defaults for the rest of the notebook
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()             # print the versions of scanpy and its dependencies
sc.settings.set_figure_params(dpi=80, facecolor='white')

#### Removing cells expressing <500 || >5000 genes:

In [None]:
# removing cells expressing <500 || >5000 genes
# (two calls: sc.pp.filter_cells accepts only one threshold argument per call)
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_cells(adata, max_genes=5000)

In [None]:
# Report the remaining matrix size and the per-cell annotation columns
print(f'Dimention of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

#### Removing cells containing <400 || >25000 UMIs:

In [None]:
# removing cells containing <400 || >25000 UMIs
# (two calls: sc.pp.filter_cells accepts only one threshold argument per call)
sc.pp.filter_cells(adata, min_counts = 400)
sc.pp.filter_cells(adata, max_counts = 25000)

In [None]:
# Report the remaining matrix size and the per-cell annotation columns
print(f'Dimention of adata: {adata.shape}')
print(f'columns for adata.obs: {adata.obs.columns}')

# Preprocess with respect to genes (var)
#### Removing genes detected in fewer than 10 cells


In [None]:
# removing genes detected in fewer than 10 cells (min_cells=10)
sc.pp.filter_genes(adata, min_cells= 10)
# make duplicated gene names unique
adata.var_names_make_unique()
adata.shape

### Remove cells with high percentage of mitochondrial genes

In [None]:
# label genes as mitochondrial when their name starts with 'MT-'
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# compute per-cell QC metrics, including the percentage of counts that come
# from the genes flagged as mt (stored in adata.obs['pct_counts_mt'])
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Here we keep cells with < 20% mito ratio.
# Boolean slicing returns a view; copy it so the in-place normalisation and
# log transform that follow operate on a real object.
adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy()
adata.shape

Plot statistics regarding cells

In [None]:
# Violin plots of three per-cell QC metrics, one panel each, without jittered points
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.0, multi_panel=True)

## 2. Integrating Samples

#### Normalization & Logarithmization:

In [None]:
# Scale every cell to a total of 10,000 counts; print the first cell's total
# before and after as a sanity check
print(f'Before normalization, the sum of first row of X: {adata.X[0, :].sum()}')
sc.pp.normalize_total(adata, target_sum=1e4)
print(f'After normalization, the sum of first row of X: {adata.X[0, :].sum()}')


In [None]:
# Apply log2(x + 1) to the matrix; print the first cell's total before and after
print(f'Before log, the sum of first row of X: {adata.X[0, :].sum()}')
sc.pp.log1p(adata, base=2)
print(f'After log, the sum of first row of X: {adata.X[0, :].sum()}')

## Keep high variance genes 

In [None]:
# Flag the 5000 most variable genes (adata.var['highly_variable']) and plot the result
sc.pp.highly_variable_genes(adata, n_top_genes=5000)
sc.pl.highly_variable_genes(adata)

### Keep track of original adata and update adata.X to  high variance genes only

In [None]:
# Store the current full-gene object in .raw, then narrow adata to the
# genes flagged as highly variable
adata.raw = adata
adata = adata[:, adata.var['highly_variable']]
print(f'adata dimensions of high variance genes: {adata.shape}')


In [None]:
# Checkpoint: save the highly-variable-gene subset to disk
adata.write('/data/ICI_exprs/ICI_NHDP/Merged_GSE179994_EGAS100010040809_T_cell_5K_hvg.h5ad')

In [None]:
# Reload the saved checkpoint, so the analysis can resume from this cell
adata = sc.read('/data/ICI_exprs/ICI_NHDP/Merged_GSE179994_EGAS100010040809_T_cell_5K_hvg.h5ad')

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform PCA (50 components, ARPACK solver)
sc.tl.pca(adata, svd_solver='arpack', n_comps=50)

In [None]:
# Inspect what PCA stored: the per-cell embedding, the per-gene loadings and
# the variance ratio per component; also list the obs columns
print(adata.obsm['X_pca'].shape)
print(adata.varm['PCs'].shape)
print(adata.uns['pca']['variance_ratio'].shape)
print(adata.obs.columns)

In [None]:
# Build the neighbourhood graph with 80 neighbours per cell, using 50 PCs
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=50)

In [None]:
# Use the Leiden algorithm to find clusters (resolution 0.5);
# the labels are used below as the 'leiden' grouping
sc.tl.leiden(adata, resolution=0.5)


In [None]:
# rank the genes that distinguish each Leiden cluster (t-test)
# and plot the top 25 genes per cluster, each panel with its own y-axis
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

## 3. Unsupervised cell clustering & identification of major cell types

In [None]:
# perform UMAP and plot it coloured by Leiden cluster, with labels drawn on the plot
sc.tl.umap(adata)
sc.pl.umap(adata, color='leiden',legend_loc='on data')



### Check batch effect  

In [None]:
# Colour the UMAP by dataset of origin to see whether cells separate by batch
sc.pl.umap(adata, color='batch')

## Perform Harmony analysis
The data above show that there are significant batch effects. Apply Harmony to re-project the cells into a new PCA space and re-perform the clustering analysis.

In [None]:
# Batch-correct with Harmony, using the 'batch' column as the covariate.
# Harmony works on the PCA coordinates rather than on the original gene space.
from scanpy.external.pp import harmony_integrate
harmony_integrate(adata, 'batch', max_iter_harmony=20, random_state=0)


### Clustering again

In [None]:
# cluster cells again after batch correction:
# rebuild the neighbourhood graph on the Harmony-adjusted embedding
# (80 neighbours per cell, first 30 dimensions)
sc.pp.neighbors(adata, n_neighbors=80, n_pcs=30, use_rep='X_pca_harmony')

In [None]:
# Re-run Leiden clustering on the current neighbourhood graph (resolution 1)
sc.tl.leiden(adata, resolution=1)

In [None]:
# recompute the UMAP from the current neighbourhood graph and
# plot it coloured by cluster, batch and time point
sc.tl.umap(adata)
sc.pl.umap(adata, color=['leiden', 'batch', 'timepoint'])

### Save a copy with cluster labels.

In [None]:
# Checkpoint: save the object together with its cluster labels and embeddings
adata.write('/data/ICI_exprs/ICI_NHDP/Merged_GSE179994_EGAS100010040809_T_cell_5K_hvg_with_cls_meta.h5ad')

### Check the distribution of T markers

In [None]:
# color the UMAP by the expression of the T cell markers CD3D and CD3E
sc.pl.umap(adata, color= ['CD3D', 'CD3E'])

### Label TNK subtypes

In [None]:
# Marker genes per T/NK-cell subtype or state.
# Keys are the group labels; values are gene symbols. Insertion order is kept.
T_cell_makers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': [
        'IL2', 'GZMA', 'GNLY', 'PRF1',
        'GZMB', 'GZMK', 'IFNG', 'NKG7',
    ],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    # 'Gamma-delta': ['TRGC1', 'TRGC2', 'TRDC'],   (disabled)
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' disabled
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

In [None]:
# check if the markers are in the var names
# For every marker set, print its genes and how many of them occur in adata.raw
for cell_type, markers in T_cell_makers.items():
    print(cell_type, ":", markers)
    print("number of match in var: ", str(adata.raw.var_names.isin(markers).sum()))

### Plot

In [None]:
# Keep only the markers that exist in adata.raw: sc.pl.dotplot raises a
# KeyError for gene names it cannot find. Groups left without any marker
# are dropped. If every marker is present the plot is unchanged.
# NOTE(review): assumes dotplot reads from .raw here (its default when .raw is set) — confirm
present_markers = {
    cell_type: [gene for gene in markers if gene in adata.raw.var_names]
    for cell_type, markers in T_cell_makers.items()
}
present_markers = {
    cell_type: genes for cell_type, genes in present_markers.items() if genes
}

# Hierarchically order the Leiden clusters, then draw the marker dot plot
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dotplot(adata, present_markers, 'leiden', dendrogram=True)

In [None]:
# One UMAP figure per marker set, with one panel per gene in the set
# NOTE(review): a gene missing from the data would make sc.pl.umap fail — confirm all markers are present
for cell_type, markers in T_cell_makers.items():
    print (cell_type, ":", markers)
    sc.pl.umap(adata, color=markers)

In [None]:
# List the embeddings stored in adata.obsm.
# keys must be called; the bare attribute only displays the bound method.
adata.obsm.keys()

Plot PD-1 and potential target genes

In [None]:
# UMAP coloured by PDCD1 (PD-1) and the candidate target genes, one panel per gene
sc.pl.umap(adata, color= ['PDCD1', 'CXCL13', 'HAVCR2','CTLA4', "PRDM1"])

The enriched genes for each cluster

In [None]:
# rank the genes that distinguish each current Leiden cluster (t-test)
# and plot the top 25 genes per cluster, each panel with its own y-axis
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

### Examine the distribution of PDCD1 and potential target genes

In [None]:
# plot the umap coloured by time point, by each of the four genes, and by batch
sc.pl.umap(adata, color=['timepoint', 'PDCD1', 'CXCL13', 'HAVCR2','CTLA4', 'batch'])

In [None]:
# List the per-cell annotation columns currently available
adata.obs.columns

In [None]:
# show the distribution of the timepoint and of the batch on the UMAP
sc.pl.umap(adata, color = ['timepoint', 'batch'])

### Compare PDCD1 and CXCL13

In [None]:
# extract cells with PDCD1 > 0.5 (values taken from adata.raw) into an independent copy
adata_PDCD1 = adata[adata.raw[:, 'PDCD1'].X > 0.5, :].copy()
# violin plots of the selected genes within this PDCD1 > 0.5 subset, grouped by time point
sc.pl.violin(adata_PDCD1, ['PDCD1', 'CXCL13', 'TIGIT', 'HAVCR2'], split=True, groupby='timepoint', jitter=0.0, multi_panel=True)

In [None]:
# same violin plots for the PDCD1 > 0.5 subset, grouped by batch instead of time point
sc.pl.violin(adata_PDCD1, ['PDCD1', 'CXCL13', 'TIGIT', 'HAVCR2'], split=True, groupby='batch',  jitter=0.0, multi_panel=True)

It appears that cells expressing these genes exhibit a bimodal distribution. Whether the different modes correspond to different cell types is unclear.

### Examine the relationship between PD-1 and potential target genes pre-treatment



In [None]:
# plot scatter plots for the pre-treatment values: CXCL13 against CD274,
# against PDCD1, and against the product PDCD1 * CD274
# NOTE(review): CD274_pre, pdcd1_pre and cxcl13_pre are not defined in the
# visible part of this notebook — confirm they are created before this cell runs
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
axes[0].scatter(CD274_pre, cxcl13_pre, s=2, color='black')
axes[0].set_xlabel('CD274')
axes[0].set_ylabel('CXCL13')
axes[1].scatter(pdcd1_pre, cxcl13_pre, s=2, color='black')
axes[1].set_xlabel('PDCD1')
# plt.title('Pre')
axes[2].scatter(pdcd1_pre * CD274_pre, cxcl13_pre, s=2, color='black' )
axes[2].set_xlabel('PDCD1 * CD274')
fig.tight_layout()


plt.show()

### Plot relationship on treatment


In [None]:
# From adata_sample_tpm, pull the values of five genes for the rows whose
# 'timepoint' equals 'On'; one variable per gene.
# NOTE(review): adata_sample_tpm is not created in the visible part of this
# notebook, and the label here is capitalised ('On') — confirm both.
cxcl13_on, pdcd1_on, CD274_on, CTLA4_on, GZMK_on = (
    adata_sample_tpm.X[
        adata_sample_tpm.obs['timepoint'] == 'On',
        adata_sample_tpm.var_names == gene,
    ]
    for gene in ('CXCL13', 'PDCD1', 'CD274', 'CTLA4', 'GZMK')
)

In [None]:
# plot scatter plots for the on-treatment values: CXCL13 against CD274,
# against PDCD1, and against the product PDCD1 * CD274
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
axes[0].scatter(CD274_on, cxcl13_on, s=2, color='black')
axes[0].set_xlabel('CD274')
axes[0].set_ylabel('CXCL13')
axes[1].scatter(pdcd1_on, cxcl13_on, s=2, color='black')
axes[1].set_xlabel('PDCD1')
axes[2].scatter(pdcd1_on * CD274_on, cxcl13_on, s=2, color='black' )
axes[2].set_xlabel('PDCD1 * CD274')
fig.tight_layout()


In [None]:
# Side-by-side comparison of CXCL13 against PDCD1 * CD274:
# pre-treatment on the left, on-treatment on the right.
# The panels share the same x label, so titles tell them apart.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
axes[0].scatter(pdcd1_pre * CD274_pre, cxcl13_pre, s=2, color='black')
axes[0].set_title('Pre')
axes[0].set_xlabel('PDCD1 * CD274')
axes[0].set_ylabel('CXCL13')
axes[1].scatter(pdcd1_on * CD274_on, cxcl13_on, s=2, color='black')
axes[1].set_title('On')
axes[1].set_xlabel('PDCD1 * CD274')
fig.tight_layout()
plt.show()