In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import sys as sys
sys.path.append('/home/qiuaodon/Desktop/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
import operator as op
import matplotlib.colors as mcolors

In [None]:
# Directory holding the cohort-1 inputs; the trailing slash is required because
# file paths are built by plain string concatenation.
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data_new/"

# Load the T-cell AnnData object and report its (n_obs, n_vars) shape.
adata_T = sc.read(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad'
)
print(adata_T.shape)

In [None]:
# Overview of the CD4 / CD8 marker genes on the UMAP that ships with the loaded object
# (no embedding is computed for adata_T in this notebook before this point).
sc.pl.umap(
    adata_T,
    color=['CD4', 'CD8A', 'CD8B'],
)

In [None]:
# Split the T cells by marker expression: keep cells whose CD4 (resp. CD8A) value in .X
# is greater than 0. A cell expressing both genes ends up in both subsets.
#
# obs_vector() returns a flat 1-D array of the gene's values, so the comparison yields an
# unambiguous per-cell boolean mask (the previous form indexed with an (n_cells, 1) matrix).
# .copy() turns each subset into a real AnnData object; the PCA / neighbors / Leiden / UMAP
# steps below write into these objects, which would otherwise force an implicit
# view-to-copy conversion with a warning.
adata_T_CD4 = adata_T[adata_T.obs_vector('CD4') > 0, :].copy()
adata_T_CD8 = adata_T[adata_T.obs_vector('CD8A') > 0, :].copy()

## CD4

In [None]:
# Compute the first 30 principal components of the CD4 subset with the ARPACK solver.
sc.tl.pca(
    adata_T_CD4,
    n_comps=30,
    svd_solver='arpack',
)

In [None]:
# Sanity-check what the PCA step stored (cell scores, gene loadings, variance ratios)
# and list the available per-cell annotation columns.
for pca_summary in (
    adata_T_CD4.obsm['X_pca'].shape,
    adata_T_CD4.varm['PCs'].shape,
    adata_T_CD4.uns['pca']['variance_ratio'].shape,
    adata_T_CD4.obs.columns,
):
    print(pca_summary)

In [None]:
# Build the neighborhood graph on all 30 PCs, using 80 neighbors per cell.
sc.pp.neighbors(
    adata_T_CD4,
    n_pcs=30,
    n_neighbors=80,
)

In [None]:
# Cluster the CD4 cells with Leiden at resolution 1; labels are stored in obs['leiden'].
sc.tl.leiden(
    adata_T_CD4,
    resolution=1,
)

In [None]:
# Recompute the UMAP embedding for the CD4 subset from its own neighborhood graph.
sc.tl.umap(
    adata_T_CD4
)

### Remove the proliferating cells

In [None]:
# Keep only cells that are outside Leiden cluster '8' AND carry an explicit expansion
# call ('E' or 'NE'); cells with a missing or other expansion value are dropped.
# NOTE(review): cluster '8' is presumably the proliferating population named in the
# section heading - confirm against the marker plots, since cluster ids depend on the
# clustering run.
outside_cluster_8 = adata_T_CD4.obs['leiden'] != '8'
has_expansion_call = adata_T_CD4.obs['expansion'].isin(['E', 'NE'])
adata_T_CD4 = adata_T_CD4[outside_cluster_8 & has_expansion_call].copy()

In [None]:
# Leiden clusters that remain after the filtering step.
sc.pl.umap(
    adata_T_CD4,
    color='leiden',
)

In [None]:
# Two-stop colormap running from grey (low expression) to blue (high expression).
colors = ["grey", "blue"]
cmap = mcolors.LinearSegmentedColormap.from_list("grey_to_blue", colors)

# PDCD1 expression on a 6x5 inch axis; the colour scale is capped at 1.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_T_CD4,
    color='PDCD1',
    ax=ax,
    color_map=cmap,
    vmax=1,
)
plt.show()

In [None]:
# Leiden clusters of the CD4 subset on the UMAP.
sc.pl.umap(
    adata_T_CD4,
    color=['leiden'],
)

In [None]:
from matplotlib.font_manager import FontProperties

# Create the axis explicitly and hand it to scanpy. scanpy opens its own figure when no
# axis is given, so a bare plt.figure(figsize=...) only left an empty canvas behind and
# the requested 10x8 size never reached the UMAP.
fig, ax = plt.subplots(figsize=(10, 8))

# UMAP coloured by expansion status; show=False keeps the figure open for the tweaks below.
sc.pl.umap(adata_T_CD4, color=['expansion'], size=30, palette='Set1', show=False, legend_fontsize=16, ax=ax)

# Enlarge the legend dots. Matplotlib 3.7 renamed Legend.legendHandles to legend_handles
# and later dropped the old name, so support both spellings.
legend = ax.get_legend()
if legend is not None:
    handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
    for handle in handles:
        handle.set_sizes([180])  # Adjust the value as needed for dot size

# NOTE(review): this font spec is created but never applied to any artist in this cell.
font_properties = FontProperties(family='Calibri', size=12)  # Set to Calibri (Body) with desired size

# Remove the box line around the UMAP plot
for spine_name in ('top', 'right', 'left', 'bottom'):
    ax.spines[spine_name].set_visible(False)

# Show the plot
plt.show()


In [None]:
# Create the axis explicitly and hand it to scanpy. scanpy opens its own figure when no
# axis is given, so a bare plt.figure(figsize=...) only left an empty canvas behind and
# the requested 10x8 size never reached the UMAP.
fig, ax = plt.subplots(figsize=(10, 8))

# UMAP coloured by annotated cell type; show=False keeps the figure open for the tweaks below.
sc.pl.umap(adata_T_CD4, color=['cell_type'], size=30, show=False, legend_fontsize=16, palette='Set3', ax=ax)

# Enlarge the legend dots. Matplotlib 3.7 renamed Legend.legendHandles to legend_handles
# and later dropped the old name, so support both spellings.
legend = ax.get_legend()
if legend is not None:
    handles = legend.legend_handles if hasattr(legend, 'legend_handles') else legend.legendHandles
    for handle in handles:
        handle.set_sizes([180])  # Adjust the value as needed for dot size

# NOTE(review): this font spec is created but never applied to any artist in this cell.
# FontProperties comes from the import in the preceding plotting cell.
font_properties = FontProperties(family='Calibri', size=12)  # Set to Calibri (Body) with desired size

# Remove the box line around the UMAP plot
for spine_name in ('top', 'right', 'left', 'bottom'):
    ax.spines[spine_name].set_visible(False)

# Show the plot
plt.show()


In [None]:
# Separate the CD4 cells into expanded ('E') and non-expanded ('NE') subsets.
# .copy() makes each subset a standalone object: plotting them with an explicit palette
# stores colours in .uns, which on a view would trigger an implicit view-to-copy
# conversion with a warning.
adata_T_CD4_E = adata_T_CD4[adata_T_CD4.obs['expansion'] == 'E'].copy()
adata_T_CD4_NE = adata_T_CD4[adata_T_CD4.obs['expansion'] == 'NE'].copy()

In [None]:
# One UMAP per expansion subset (expanded first, then non-expanded), same styling for both.
for expansion_subset in (adata_T_CD4_E, adata_T_CD4_NE):
    sc.pl.umap(expansion_subset, color=['expansion'], size=30, palette='Set1')

In [None]:
# CD4 cells coloured by sampling timepoint.
sc.pl.umap(
    adata_T_CD4,
    color=['timepoint'],
    size=30,
)

In [None]:
# Panel overview: two annotations (timepoint, cluster) followed by four marker genes.
sc.pl.umap(
    adata_T_CD4,
    color=[
        'timepoint', 'leiden',
        'CCR7', 'CD4', 'PDCD1', 'CTLA4',
    ],
)

In [None]:
# First draft of the CD4 marker panel (cell-type label -> marker genes).
# NOTE: the following cell assigns CD4_markers again, so this version is overwritten
# before any later cell reads it when the notebook is run top to bottom.
CD4_markers = dict([
    ('CD4 Naïve', ['TCF7', 'SELL', 'LEF1', 'CCR7', 'PASK']),
    ('CD4 H1', ['CCL4', 'CCL3', 'PDCD1', 'IFNG', 'GZMB', 'LAG3', 'CTLA4']),
    ('CD4 EM', ['IL7R', 'GZMK', 'ANXA1', 'FOS', 'JUN', 'KLRB1', 'GZMA']),
    ('CD4 FH', ['ICA1', 'GNG4', 'EBI3', 'BCL6', 'CXCL13']),
])

In [None]:

# CD4 marker panel used by the dot plot and the cluster-labelling cells below
# (cell-type label -> marker genes).
CD4_markers = dict([
    ('CD4 Naïve', ['TCF7', 'SELL', 'LEF1', 'CCR7', 'PASK']),
    ('CD4 EX', ['CXCL13', 'CD200', 'PDCD1', 'TNFRSF18']),
    ('CD4 EM', ['IL7R', 'CD40LG', 'ANXA1', 'FOS', 'JUN']),
    ('CD4 REG', ['FOXP3', 'SAT1', 'IL2RA', 'CTLA4']),
])

In [None]:
# Compute the cluster dendrogram first so the dot plot can order the Leiden clusters by it,
# then show the CD4 marker panel per cluster.
sc.tl.dendrogram(
    adata_T_CD4,
    groupby='leiden',
)
sc.pl.dotplot(
    adata_T_CD4,
    CD4_markers,
    'leiden',
    dendrogram=True,
)

In [None]:
# Leiden clusters redrawn with the 'Set1' palette.
sc.pl.umap(
    adata_T_CD4,
    color=['leiden'],
    palette='Set1',
)

In [None]:
# Display the AnnData summary for a quick look at the object's current contents.
adata_T_CD4

In [None]:
# Discard any existing 'cell_type' annotation before relabelling. errors="ignore" keeps
# this cell re-runnable: without it, a second run (or an input without the column) raises
# KeyError before the labelling helper is reached.
adata_T_CD4.obs.drop(columns="cell_type", inplace=True, errors="ignore")

# NOTE(review): labelClusterWithCellType is not defined in this notebook (presumably it
# comes from scRNA_utils); the plot below relies on it creating obs['cell_type'] - confirm.
labelClusterWithCellType(adata_T_CD4, CD4_markers, cluster_column='leiden')

sc.pl.umap(adata_T_CD4, color=['cell_type'])

# scanpy opens its own figure for multi-panel plots, so a preceding
# plt.figure(figsize=(6, 6)) only produced an empty figure. scanpy sizes each panel from
# rcParams['figure.figsize'], so set that for the duration of the call instead.
with plt.rc_context({'figure.figsize': (6, 6)}):
    sc.pl.umap(adata_T_CD4, color=['leiden', 'timepoint'])

In [None]:
# Annotated CD4 cell types with larger dots and the 'Set3' palette.
sc.pl.umap(
    adata_T_CD4,
    color=['cell_type'],
    size=50,
    palette='Set3',
)

In [None]:
# Manual annotation: Leiden cluster id -> CD4 subtype label. Several clusters share a label
# and are therefore merged. Cluster '8' was removed earlier in the notebook, so its entry
# is only a fallback.
cd4_cluster_labels = {
    '0': 'CD4 EM 2',
    '1': 'CD4 EM 1',
    '2': 'CD4 H1',
    '3': 'CD4 Naïve 1',
    '4': 'CD4 EM 1',
    '5': 'CD4 Naïve 2',
    '6': 'CD4 EM 1',
    '7': 'CD4 EM 1',
    '8': 'CD4 H1',
    '9': 'CD4 H1',
}

# Work on string ids rather than casting to int:
#  * re-running the cell is harmless (already-renamed labels match no key and pass through),
#    whereas astype(int) raises once the column holds label strings;
#  * a cluster id missing from the mapping stays a string instead of an int mixed in with
#    strings, which cannot be written to .h5ad;
#  * the result is stored as a categorical, the dtype scanpy expects for grouping/plotting.
adata_T_CD4.obs['leiden'] = (
    adata_T_CD4.obs['leiden']
    .astype(str)
    .map(lambda cluster_id: cd4_cluster_labels.get(cluster_id, cluster_id))
    .astype('category')
)

In [None]:
# UMAP coloured by the renamed (subtype-labelled) 'leiden' column.
sc.pl.umap(
    adata_T_CD4,
    color='leiden',
)

In [None]:
# Persist the annotated CD4 object next to the input data.
adata_T_CD4.write(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_CD4.h5ad'
)

## CD8

In [None]:
# CD8 marker panel (cell-type label -> marker genes) used by the dot plot and the
# cluster-labelling cells below.
CD8_markers = dict([
    ('CD8 Naïve', ['NELL2', 'CD55', 'KLF2', 'CCR7']),
    ('CD8 Exhausted', ['GZMB', 'LAG3', 'CCL4L2']),
    ('CD8 RM', ['TOB1', 'ZNF683']),
    ('CD8 EM', ['GZMK', 'EOMES', 'ITM2C']),
    ('CD8 EMRA', ['CX3CR1', 'GZMH']),
    ('Proliferation', ['MKI67', 'TUBA1B', 'STMN1', 'HIST1H1C']),
])

In [None]:
# Compute the first 30 principal components of the CD8 subset with the ARPACK solver.
sc.tl.pca(
    adata_T_CD8,
    n_comps=30,
    svd_solver='arpack',
)

In [None]:
# Sanity-check what the PCA step stored (cell scores, gene loadings, variance ratios)
# and list the available per-cell annotation columns.
for pca_summary in (
    adata_T_CD8.obsm['X_pca'].shape,
    adata_T_CD8.varm['PCs'].shape,
    adata_T_CD8.uns['pca']['variance_ratio'].shape,
    adata_T_CD8.obs.columns,
):
    print(pca_summary)

In [None]:
# Build the neighborhood graph on all 30 PCs, using 80 neighbors per cell.
sc.pp.neighbors(
    adata_T_CD8,
    n_pcs=30,
    n_neighbors=80,
)

In [None]:
# Cluster the CD8 cells with Leiden at resolution 0.6; labels are stored in obs['leiden'].
sc.tl.leiden(
    adata_T_CD8,
    resolution=0.6,
)

In [None]:
# Recompute the UMAP embedding for the CD8 subset from its own neighborhood graph.
sc.tl.umap(
    adata_T_CD8
)

In [None]:
# PDCD1 expression across the CD8 cells.
sc.pl.umap(
    adata_T_CD8,
    color=['PDCD1'],
    size=25,
)

In [None]:
# CD8 cells coloured by sampling timepoint.
sc.pl.umap(
    adata_T_CD8,
    color=['timepoint'],
    size=25,
)

In [None]:
# CD8 cells coloured by expansion status, using the 'Set1' palette.
sc.pl.umap(
    adata_T_CD8,
    color=['expansion'],
    size=25,
    palette='Set1',
)

In [None]:
# Leiden clusters of the CD8 subset on the UMAP.
sc.pl.umap(
    adata_T_CD8,
    color=['leiden'],
)

In [None]:
# Compute the cluster dendrogram first so the dot plot can order the Leiden clusters by it,
# then show the CD8 marker panel per cluster.
sc.tl.dendrogram(
    adata_T_CD8,
    groupby='leiden',
)
sc.pl.dotplot(
    adata_T_CD8,
    CD8_markers,
    'leiden',
    dendrogram=True,
)

In [None]:
# Discard any existing 'cell_type' annotation before relabelling. errors="ignore" keeps
# this cell re-runnable: without it, a second run (or an input without the column) raises
# KeyError before the labelling helper is reached.
adata_T_CD8.obs.drop(columns="cell_type", inplace=True, errors="ignore")

# NOTE(review): labelClusterWithCellType is not defined in this notebook (presumably it
# comes from scRNA_utils); the plot below relies on it creating obs['cell_type'] - confirm.
labelClusterWithCellType(adata_T_CD8, CD8_markers, cluster_column='leiden')

sc.pl.umap(adata_T_CD8, color=['cell_type'])

# scanpy opens its own figure for multi-panel plots, so a preceding
# plt.figure(figsize=(6, 6)) only produced an empty figure. scanpy sizes each panel from
# rcParams['figure.figsize'], so set that for the duration of the call instead.
with plt.rc_context({'figure.figsize': (6, 6)}):
    sc.pl.umap(adata_T_CD8, color=['leiden', 'timepoint'])

In [None]:
# Manual annotation: Leiden cluster id -> CD8 subtype label. Clusters that share a label
# are merged.
cd8_cluster_labels = {
    '0': 'CD8 EM',
    '1': 'CD8 EX 1',
    '2': 'CD8 Naïve',
    '3': 'CD8 RM',
    '4': 'CD8 EM',
    '5': 'CD8 RM',
    '6': 'CD8 EX 2',
    '7': 'CD8 EX 3',
    '8': 'CD8 EMRA',
}

# Work on string ids rather than casting to int:
#  * re-running the cell is harmless (already-renamed labels match no key and pass through),
#    whereas astype(int) raises once the column holds label strings;
#  * a cluster id missing from the mapping stays a string instead of an int mixed in with
#    strings, which cannot be written to .h5ad;
#  * the result is stored as a categorical, the dtype scanpy expects for grouping/plotting.
adata_T_CD8.obs['leiden'] = (
    adata_T_CD8.obs['leiden']
    .astype(str)
    .map(lambda cluster_id: cd8_cluster_labels.get(cluster_id, cluster_id))
    .astype('category')
)

In [None]:
# Annotated CD8 cell types with the 'Set3' palette.
sc.pl.umap(
    adata_T_CD8,
    color=['cell_type'],
    size=35,
    palette='Set3',
)

In [None]:
# CCR7 expression side by side with the renamed 'leiden' labels.
sc.pl.umap(
    adata_T_CD8,
    color=['CCR7', 'leiden'],
)

In [None]:
# Persist the annotated CD8 object next to the input data
# (only the CD8 object is written in this cell).
adata_T_CD8.write(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_CD8.h5ad'
)

In [None]:
# Split every 'leiden' group by timepoint: the new 'cluster' label is
# "<leiden>_<timepoint>", so a group "1" becomes e.g. "1_on" and "1_pre".
cd8_group = adata_T_CD8.obs['leiden'].astype(str)
cd8_timepoint = adata_T_CD8.obs['timepoint'].astype(str)
adata_T_CD8.obs['cluster'] = cd8_group + '_' + cd8_timepoint

In [None]:
# Timepoint-split clusters next to the unsplit 'leiden' labels.
sc.pl.umap(
    adata_T_CD8,
    color=['cluster', 'leiden'],
    palette='Set1',
)

In [None]:
# Persist the CD8 object that now carries the timepoint-split 'cluster' column.
adata_T_CD8.write(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_CD8_on_pre.h5ad'
)

## Separate the CD4 and CD8 cells based on clusters

In [None]:
# Build the CD4 object from the Leiden clusters of the full T-cell object: clusters
# 0, 2, 3 and 8 are kept.
# NOTE(review): the earlier comment listed clusters 0, 2, 4, 8 while the code selects
# 0, 2, 3, 8 - confirm which set is intended.
# .copy() makes the subset a standalone AnnData; assigning obs columns on a view would
# otherwise trigger an implicit view-to-copy conversion with a warning.
adata_T_CD4 = adata_T[adata_T.obs['leiden'].isin(['0', '2', '3', '8']), :].copy()

# Split every selected cluster by timepoint: label is "<leiden>_<timepoint>".
adata_T_CD4.obs['cluster'] = adata_T_CD4.obs['leiden'].astype(str) + '_' + adata_T_CD4.obs['timepoint'].astype(str)

# Cluster 2 is not split: fold its 'on' and 'pre' halves back into a single label.
adata_T_CD4.obs['cluster'] = adata_T_CD4.obs['cluster'].replace({'2_on': '2', '2_pre': '2'})

In [None]:
# Timepoint-split clusters of the cluster-derived CD4 object.
sc.pl.umap(
    adata_T_CD4,
    color='cluster',
)

In [None]:
# Discard any existing 'cell_type' annotation before relabelling. errors="ignore" keeps
# this cell re-runnable: without it, a second run (or an input without the column) raises
# KeyError before the labelling helper is reached.
adata_T_CD4.obs.drop(columns="cell_type", inplace=True, errors="ignore")

# NOTE(review): labelClusterWithCellType is not defined in this notebook (presumably it
# comes from scRNA_utils); the plot below relies on it creating obs['cell_type'] - confirm.
labelClusterWithCellType(adata_T_CD4, CD4_markers, cluster_column='cluster')

sc.pl.umap(adata_T_CD4, color=['cell_type'])

# scanpy opens its own figure for multi-panel plots, so a preceding
# plt.figure(figsize=(6, 6)) only produced an empty figure. scanpy sizes each panel from
# rcParams['figure.figsize'], so set that for the duration of the call instead.
with plt.rc_context({'figure.figsize': (6, 6)}):
    sc.pl.umap(adata_T_CD4, color=['cluster', 'timepoint'])

In [None]:
# CCR7 expression across all T cells (full object, not a subset).
sc.pl.umap(
    adata_T,
    color='CCR7',
)

In [None]:
# Timepoint-split CD4 clusters redrawn with the 'Set1' palette.
sc.pl.umap(
    adata_T_CD4,
    color='cluster',
    palette='Set1',
)


In [None]:
# Persist the cluster-derived CD4 object with its timepoint-split 'cluster' column.
adata_T_CD4.write(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_CD4_on_pre.h5ad'
)

In [None]:
# Build the CD8 object from the Leiden clusters of the full T-cell object: clusters
# 1, 4, 5 and 6 are kept.
# .copy() makes the subset a standalone AnnData; assigning obs columns on a view would
# otherwise trigger an implicit view-to-copy conversion with a warning.
adata_T_CD8 = adata_T[adata_T.obs['leiden'].isin(['1', '4', '5', '6']), :].copy()

# Split every selected cluster by timepoint: label is "<leiden>_<timepoint>".
adata_T_CD8.obs['cluster'] = adata_T_CD8.obs['leiden'].astype(str) + '_' + adata_T_CD8.obs['timepoint'].astype(str)


In [None]:
# Timepoint-split clusters of the cluster-derived CD8 object, 'Set1' palette.
sc.pl.umap(
    adata_T_CD8,
    color='cluster',
    palette='Set1',
)

In [None]:
# Discard any existing 'cell_type' annotation before relabelling. errors="ignore" keeps
# this cell re-runnable: without it, a second run (or an input without the column) raises
# KeyError before the labelling helper is reached.
adata_T_CD8.obs.drop(columns="cell_type", inplace=True, errors="ignore")

# NOTE(review): labelClusterWithCellType is not defined in this notebook (presumably it
# comes from scRNA_utils); the plot below relies on it creating obs['cell_type'] - confirm.
labelClusterWithCellType(adata_T_CD8, CD8_markers, cluster_column='cluster')

sc.pl.umap(adata_T_CD8, color=['cell_type'])

# scanpy opens its own figure for multi-panel plots, so a preceding
# plt.figure(figsize=(6, 6)) only produced an empty figure. scanpy sizes each panel from
# rcParams['figure.figsize'], so set that for the duration of the call instead.
with plt.rc_context({'figure.figsize': (6, 6)}):
    sc.pl.umap(adata_T_CD8, color=['cluster', 'timepoint'])

In [None]:
# Persist the cluster-derived CD8 object.
# NOTE(review): this is the same output path that the earlier annotated-CD8 section
# writes to, so running the notebook top to bottom overwrites that file - confirm
# this is intended.
adata_T_CD8.write(
    data_dir_NHDP + '1863-counts_cells_cohort1_T_CD8_on_pre.h5ad'
)