In [None]:
import os
import sys

import rpy2.robjects as robjects
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, ttest_rel

sys.path.append('/home/qiuaodon/Desktop/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
# Root folder for the .h5ad inputs and CSV outputs. It can be overridden with the
# PROJECT_DATA_DIR environment variable; the original absolute path remains the default.
# os.path.join(..., '') guarantees a trailing separator because later cells build
# file paths by plain string concatenation (data_dir + 'file').
data_dir = os.path.join(
    os.environ.get('PROJECT_DATA_DIR', '/home/qiuaodon/Desktop/project_data_new/'),
    '',
)

# Load data

In [None]:
# Load the processed whole-cohort AnnData object from the data directory.
cohort_h5ad = data_dir + 'whole_cohort1_processed.h5ad'
adata = sc.read(cohort_h5ad)

In [None]:
# Build adata_pseudo from adata with scRNA2PseudoBulkAnnData. The helper is not defined
# in this notebook; presumably it comes from the `from scRNA_utils import *` star
# import — TODO confirm.
adata_pseudo = scRNA2PseudoBulkAnnData(adata)

In [None]:
# Tally how many rows of adata_pseudo.obs carry each 'BC_type' value.
adata_pseudo.obs['BC_type'].value_counts()

In [None]:
# Display the summary repr of the whole-cohort AnnData object.
adata

In [None]:
# 5x5 inch canvas for the whole-cohort UMAP.
fig, ax = plt.subplots(figsize=(5, 5))

# Colour every cell by its 'cell_type' annotation using the Set3 palette.
sc.pl.umap(
    adata,
    color='cell_type',
    palette='Set3',
    ax=ax,
)

In [None]:
# adata_T is first loaded further down this notebook. Load it here when it is not yet
# in the kernel so this cell also works after Restart Kernel -> Run All.
if 'adata_T' not in globals():
    adata_T = sc.read(data_dir + '1863-counts_cells_cohort1_T_cells.h5ad')

# Per-cell annotation table of the T-cell object.
adata_T.obs

In [None]:
# One whole-cohort UMAP per annotation column: 'expansion' first, then 'timepoint'.
for obs_key in ('expansion', 'timepoint'):
    sc.pl.umap(adata, color=obs_key)

In [None]:
# Two-stop colormap running from grey to blue; later expression plots reuse `cmap`.
colors = ["grey", "blue"]
cmap = mcolors.LinearSegmentedColormap.from_list(name="grey_to_blue", colors=colors)

# CD274 on the whole-cohort UMAP, with the colour scale capped at 1.5.
sc.pl.umap(
    adata,
    color='CD274',
    color_map=cmap,
    vmax=1.5,
)
plt.show()

In [None]:
# Load the six per-lineage AnnData files. They share one file-name pattern and
# differ only in the lineage tag (T, B, M, Epi, Fibro, Endo).
subset_prefix = data_dir + '1863-counts_cells_cohort1_'
adata_T, adata_B, adata_M, adata_Epi, adata_Fibro, adata_Endo = (
    sc.read(subset_prefix + lineage + '_cells.h5ad')
    for lineage in ('T', 'B', 'M', 'Epi', 'Fibro', 'Endo')
)

In [None]:
# ZFP36 on the T-cell UMAP, drawn with the grey-to-blue colormap.
sc.pl.umap(
    adata_T,
    color='ZFP36',
    color_map=cmap,
)

In [None]:
# PDCD1 on the B-cell UMAP, drawn with the grey-to-blue colormap.
sc.pl.umap(
    adata_B,
    color='PDCD1',
    color_map=cmap,
)

In [None]:
# 'cell_type' is coloured as discrete groups everywhere else in this notebook, always
# through `palette`. The original passed the colour set as `cmap`, the
# continuous-colormap argument, so 'Set1' is passed as `palette` here instead.
sc.pl.umap(adata_M, color='cell_type', palette='Set1')

In [None]:
# Split the cohort by the 'timepoint' annotation.
# NOTE(review): these filters use lowercase 'pre'/'on', while the boxplot cells use
# 'Pre'/'On' labels parsed from sample_id — confirm which spelling adata.obs holds.
timepoint_labels = adata.obs['timepoint']
pre_adata = adata[timepoint_labels == 'pre']
on_adata = adata[timepoint_labels == 'on']


## Prepare data for the boxplots

In [None]:
# Patients contributing cells annotated expansion == 'E'. This is computed straight
# from adata because adata_E is only created in the cell after this one, so the
# original `adata_E.obs[...]` lookup failed on a fresh kernel.
adata.obs.loc[adata.obs['expansion'] == 'E', 'patient_id'].unique()

In [None]:
# Subset the cohort to cells whose 'expansion' annotation equals 'E'.
adata_E = adata[adata.obs['expansion'] == 'E']
# NOTE(review): the original comment said patient_id == BIOKEY_5 should be removed
# (its T cells decreased), but no such filter is applied in this cell — confirm
# whether BIOKEY_5 is meant to be excluded.

adata_E


In [None]:
# Subset the cohort to cells whose 'expansion' annotation equals 'NE', then show it.
ne_mask = adata.obs['expansion'] == 'NE'
adata_NE = adata[ne_mask]
adata_NE

In [None]:
# Distinct patient_id values present in the 'NE' subset.
adata_NE.obs['patient_id'].unique()

In [None]:
# Membership check: is patient 'BIOKEY_20' among the patient_id values of adata_NE?
'BIOKEY_20' in adata_NE.obs['patient_id'].unique()

In [None]:
# Pick the annotation table to summarise. The active source is adata_E; the
# commented lines are the alternative sources that can be swapped in.
# plot_data = adata.obs.copy()
plot_data = adata_E.obs.copy()
# plot_data = adata_NE.obs.copy()
# plot_data = adata_T.obs.copy()
# plot_data = adata_B.obs.copy()
# plot_data = adata_M.obs.copy()
# Count the cells for each (cell_type, sample_id) pair; the count goes into 'cell_count'.
cell_counts = plot_data.groupby(['cell_type', 'sample_id']).size().reset_index(name='cell_count')
cell_counts

In [None]:
# Derive 'timepoint' from sample_id: the third '_'-separated token (index 2),
# i.e. the text after the second underscore.
cell_counts['timepoint'] = cell_counts['sample_id'].apply(lambda x: x.split('_')[2])
cell_counts

In [None]:
# Sort by timepoint descending, then cell_type and sample_id ascending.
# Presumably descending is used so that 'Pre' sorts ahead of 'On' — TODO confirm.
cell_counts = cell_counts.sort_values(by=['timepoint','cell_type', 'sample_id'], ascending=[False, True, True])


In [None]:
# Display the sorted long-format count table.
cell_counts

In [None]:
# Reshape to wide form: one row per (sample_id, timepoint), one column per cell type.
# Missing combinations become 0; rows are ordered by timepoint (descending) and
# then by sample_id (ascending).
cell_counts_pivot = (
    cell_counts
    .pivot(index=['sample_id', 'timepoint'], columns=['cell_type'], values='cell_count')
    .fillna(0)
    .sort_values(by=['timepoint', 'sample_id'], ascending=[False, True])
)
cell_counts_pivot

In [None]:
# Transpose so that cell types become rows and samples become columns.
# NOTE(review): the result is assigned back to the same name, so running this cell a
# second time flips the table back to its previous orientation.
cell_counts_pivot = cell_counts_pivot.T
cell_counts_pivot

In [None]:
# Convert counts to percentages: divide every column by its own total, then scale by 100.
column_totals = cell_counts_pivot.sum(axis=0)
cell_counts_pivot = cell_counts_pivot.div(column_totals, axis=1) * 100

cell_counts_pivot


In [None]:
# Write the percentage table to CSV inside the data directory.
export_csv = data_dir + 'cell_counts_E_cells_for_boxplot.csv'
cell_counts_pivot.to_csv(export_csv)

# Alternative target used for the M-cell table:
# cell_counts_pivot.to_csv(data_dir + 'cell_counts_M_cells_for_boxplot.csv')

In [None]:
# Give 'timepoint' an explicit Pre -> On order and sort the rows by it.
cell_counts = cell_counts.assign(
    timepoint=pd.Categorical(cell_counts['timepoint'], ['Pre', 'On'])
).sort_values('timepoint')

# Fix the x-axis order of the cell types, T cells first.
cell_type_order = ['T cells',  'B cells', 'Myeloid', 'Fibroblast', 'Epithelial', 'Endothelial']
cell_counts['cell_type'] = pd.Categorical(cell_counts['cell_type'], cell_type_order)

# One colour per timepoint (Pre: orange, On: blue).
palette = {'Pre': '#FFA500', 'On': '#4169E1'}

# Grouped boxplot of per-sample cell counts, split by timepoint.
fig, ax = plt.subplots(figsize=(4, 4))
sns.boxplot(
    data=cell_counts,
    x='cell_type',
    y='cell_count',
    hue='timepoint',
    palette=palette,
    ax=ax,
)

# Cosmetics: slanted tick labels, title, no x-axis label.
plt.xticks(rotation=45)
ax.set_title('Number of Cells in Different Cell Types Over Timepoints')
ax.set_xlabel('')
ax.set_ylabel('Number of Cells')
plt.tight_layout()
plt.show()

In [None]:
def _paired_pre_on_pvalues(counts, cell_types):
    """Paired t-test (Pre vs On) of per-sample cell counts for each cell type.

    Samples are matched by patient instead of by row position, so the result does
    not depend on how `counts` happens to be sorted, and patients that lack one of
    the two timepoints are left out instead of breaking the test.

    Parameters
    ----------
    counts : pandas.DataFrame
        Needs the columns 'cell_type', 'sample_id', 'timepoint' and 'cell_count'.
    cell_types : iterable
        Cell-type labels to test.

    Returns
    -------
    dict
        Maps each cell type to its p-value; NaN when fewer than two patients have
        both a 'Pre' and an 'On' sample.
    """
    # Assumes the patient id is the first two '_'-separated tokens of sample_id
    # (the timepoint is read from token index 2 elsewhere in this notebook) —
    # TODO confirm against the sample sheet.
    patient = counts['sample_id'].astype(str).str.split('_').str[:2].str.join('_')
    tidy = pd.DataFrame({
        'cell_type': counts['cell_type'].astype(object).to_numpy(),
        'patient': patient.to_numpy(),
        'timepoint': counts['timepoint'].astype(object).to_numpy(),
        'cell_count': counts['cell_count'].to_numpy(),
    })

    p_values = {}
    for cell_type in cell_types:
        subset = tidy[tidy['cell_type'] == cell_type]
        if subset.empty:
            p_values[cell_type] = np.nan
            continue
        # One row per patient, one column per timepoint; keep complete pairs only.
        wide = (
            subset.groupby(['patient', 'timepoint'])['cell_count'].sum()
            .unstack('timepoint')
            .reindex(columns=['Pre', 'On'])
            .dropna()
        )
        if len(wide) > 1:
            p_values[cell_type] = ttest_rel(wide['Pre'], wide['On']).pvalue
        else:
            p_values[cell_type] = np.nan
    return p_values


# x positions must follow the order seaborn draws: category order for a categorical
# column, otherwise order of appearance. `.unique()` alone does not guarantee that.
cell_type_col = cell_counts['cell_type']
if hasattr(cell_type_col, 'cat'):
    x_order = list(cell_type_col.cat.categories)
else:
    x_order = list(cell_type_col.dropna().unique())

significance_dict = _paired_pre_on_pvalues(cell_counts, x_order)

# Put Pre before On in the legend and within each group.
cell_counts['timepoint'] = pd.Categorical(cell_counts['timepoint'], ['Pre', 'On'])

fig, ax = plt.subplots(figsize=(4, 4))
sns.boxplot(x='cell_type', y='cell_count', hue='timepoint', data=cell_counts,
            order=x_order, ax=ax)

# Mark significant cell types with a bar and a star just above their own tallest
# value, replacing the former hard-coded y=3910.
y_pad = 0.03 * cell_counts['cell_count'].max()
for x_pos, cell_type in enumerate(x_order):
    p_val = significance_dict[cell_type]
    if not (p_val < 0.05):  # also skips NaN p-values
        continue
    y_bar = cell_counts.loc[cell_counts['cell_type'] == cell_type, 'cell_count'].max() + y_pad
    ax.hlines(y=y_bar, xmin=x_pos - 0.3, xmax=x_pos + 0.3, color='black', lw=1.5)
    ax.text(x_pos, y_bar, "*", ha='center', va='bottom')

# Customize the plot as necessary
plt.xticks(rotation=45)
ax.set_title('Cell Counts in Different Cell Types Over Timepoints')
ax.set_xlabel('Cell Type')
ax.set_ylabel('Cell Count')
plt.tight_layout()
plt.show()

In [None]:
# Display the per-cell-type p-values computed by the preceding cell.
significance_dict

# label T cell subtypes

In [None]:
# Broad T/NK marker panel: label -> marker genes.
# NOTE(review): the next cell assigns T_cell_markers again with a shorter panel, so
# on a top-to-bottom run this dictionary is replaced before it is used.
# The first key was 'CD4 ' with trailing whitespace; it is now 'CD4'.
T_cell_markers = {
    'CD4': ['CD4', 'IL7R'],
    'CD8': ['CD8A', 'CD8B'],
    'Naïve': ['TCF7', 'SELL', 'LEF1', 'CCR7'],
    'Exhausted': ['LAG3', 'TIGIT', 'PDCD1', 'HAVCR2', 'CTLA4'],
    'Cytotoxic': ['IL2', 'GZMA', 'GNLY', 'PRF1', 'GZMB', 'GZMK', 'IFNG', 'NKG7'],
    'Treg': ['IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18'],
    'Gamma-delta': ['TRGC1', 'TRGC2', 'TRDC'],
    'Th17': ['IL17A', 'CCR6', 'KLRB1'],  # 'IL22' left out
    'MAIT': ['SLC4A10', 'KLRB1', 'IL7R', 'DPP4'],
    'ILC': ['KIT', 'IL1R1'],
    'Th1': ['STAT4', 'IL12RB2', 'IFNG'],
    'Th2': ['GATA3', 'STAT6', 'IL4'],
    'Tfh': ['MAF', 'CXCL13', 'CXCR5', 'PDCD1'],
    'NK': ['XCL1', 'FCGR3A', 'KLRD1', 'KLRF1', 'NCAM1'],
    'Proliferation': ['MKI67', 'PCNA', 'STMN1'],
}

In [None]:
# T-cell subtype panel used for cluster labelling: subtype label -> marker genes.
T_cell_markers = {
    'CD4 EX': [
        'CD4', 'CTLA4', 'PDCD1', 'CXCL13', 'CD200',
    ],
    'CD4 EM': [
        'CD4', 'ANXA1', 'FOS', 'JUN', 'IL7R', 'CD40LG',
    ],
    'CD4 REG': [
        'IL2RA', 'FOXP3', 'IKZF2', 'IKZF4', 'TNFRSF18',
    ],
    'CD8 EX': [
        'CD8A', 'PDCD1', 'HAVCR2', 'CTLA4',
    ],
    'CD8 EM': [
        'GZMK', 'EOMES', 'ITM2C',
    ],
    'Proliferation': [
        'MKI67', 'PCNA', 'STMN1',
    ],
}

In [None]:
# Compute the dendrogram over the leiden clusters, then draw the marker dot plot
# grouped by leiden with that dendrogram attached.
sc.tl.dendrogram(adata_T, groupby='leiden')
sc.pl.dotplot(
    adata_T,
    T_cell_markers,
    'leiden',
    dendrogram=True,
)

In [None]:
# Drop the existing 'cell_type' column before relabelling. errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError once the column is gone.
adata_T.obs.drop(columns="cell_type", inplace=True, errors='ignore')
# Relabel the leiden clusters from the marker panel. labelClusterWithCellType is not
# defined in this notebook; presumably it comes from scRNA_utils — TODO confirm.
labelClusterWithCellType(adata_T, T_cell_markers, cluster_column='leiden')


In [None]:
# For every subtype, print its marker list and plot those genes on the T-cell UMAP.
for cell_type, markers in T_cell_markers.items():
    print(f"{cell_type} : {markers}")
    sc.pl.umap(adata_T, color=markers)

In [None]:
# T-cell UMAP coloured by the assigned 'cell_type' labels (Set3 palette).
sc.pl.umap(
    adata_T,
    color='cell_type',
    palette='Set3',
)

In [None]:
# Manually relabel leiden clusters 5 and 6 as 'CD8 EM'.
# A categorical column rejects values that are not registered categories, so the
# label is registered first when needed (the B-cell section does the same).
if hasattr(adata_T.obs['cell_type'], 'cat') and 'CD8 EM' not in adata_T.obs['cell_type'].cat.categories:
    adata_T.obs['cell_type'] = adata_T.obs['cell_type'].cat.add_categories(['CD8 EM'])
adata_T.obs.loc[adata_T.obs['leiden'].isin(['5', '6']), 'cell_type'] = 'CD8 EM'

sc.pl.umap(adata_T, color='cell_type', palette='Set3')

In [None]:
sc.pl.umap(adata_T, color='CD3D')
# Cluster 7 shows no CD3D expression, i.e. these cells are not T cells, so remove it.
# .copy() turns the filtered view into an independent AnnData object, so later
# cells do not operate on a view of the unfiltered object.
adata_T = adata_T[adata_T.obs['leiden'] != '7'].copy()


In [None]:
# Panel of T-cell UMAPs: lineage genes, the leiden clusters and chemokine receptors.
t_panel_keys = ['CD4', 'CD8B', 'leiden', 'CCR7', 'CXCR3', 'CCR5', 'CCR8', 'CCR6']
sc.pl.umap(adata_T, color=t_panel_keys)

In [None]:
# T-cell UMAP coloured by timepoint on a 6x5 inch canvas.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_T,
    color=['timepoint'],
    ax=ax,
)

In [None]:
# 

In [None]:
# PDCD1 on the T-cell UMAP (grey-to-blue colormap, colour scale capped at 1).
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_T,
    color=['PDCD1'],
    color_map=cmap,
    ax=ax,
    vmax=1,
)

In [None]:
# T-cell UMAP coloured by 'cell_type' (Set3 palette) on a 6x5 inch canvas.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_T,
    color=['cell_type'],
    palette='Set3',
    ax=ax,
)

In [None]:
# Per-sample cell counts for the expansion ('E') subset, drawn as Pre/On boxplots.
# NOTE(review): this cell sits in the T-cell subtype section and labels the x-axis
# 'Cell Subtype', but it reads adata_E.obs rather than adata_T.obs — confirm the
# intended source.
plot_data = adata_E.obs.copy()
cell_counts = plot_data.groupby(['cell_type', 'sample_id']).size().reset_index(name='cell_count')
# Timepoint is the third '_'-separated token of sample_id; order it Pre before On.
cell_counts['timepoint'] = cell_counts['sample_id'].str.split('_').str[2]
cell_counts['timepoint'] = pd.Categorical(cell_counts['timepoint'], ['Pre', 'On'])

# x positions must follow the order seaborn draws: category order for a categorical
# column, otherwise order of appearance.
cell_type_col = cell_counts['cell_type']
if hasattr(cell_type_col, 'cat'):
    x_order = list(cell_type_col.cat.categories)
else:
    x_order = list(cell_type_col.dropna().unique())

fig, ax = plt.subplots(figsize=(4, 4))
palette = {'Pre': '#FFA500', 'On': '#4169E1'}   # Pre: orange, On: blue
sns.boxplot(x='cell_type', y='cell_count', hue='timepoint', palette=palette,
            data=cell_counts, order=x_order, showfliers=False, ax=ax)

# Add significance annotations from significance_dict, which an earlier cell fills.
# A cell type without an entry is skipped instead of raising KeyError, and the bar
# sits just above that cell type's own tallest count, not at a fixed y=3910.
y_pad = 0.03 * cell_counts['cell_count'].max()
for x_pos, cell_type in enumerate(x_order):
    p_val = significance_dict.get(cell_type, float('nan'))
    if not (p_val < 0.05):  # also skips NaN / missing p-values
        continue
    y_bar = cell_counts.loc[cell_counts['cell_type'] == cell_type, 'cell_count'].max() + y_pad
    ax.hlines(y=y_bar, xmin=x_pos - 0.3, xmax=x_pos + 0.3, color='black', lw=1.5)
    ax.text(x_pos, y_bar, "*", ha='center', va='bottom')

# Customize the plot as necessary
plt.xticks(rotation=45)
ax.set_title('Cell Counts of patients with expansion')
ax.set_xlabel('Cell Subtype')
ax.set_ylabel('Cell Count')
plt.tight_layout()
plt.show()

In [None]:
# Display the p-values currently stored in significance_dict.
significance_dict

In [None]:
# TODO: draw the boxplot (the original note ends here without naming the target)

# B cells

In [None]:
# The two B-cell UMAPs were previously drawn onto the same Axes. Give each its own
# panel and show the figure once both are drawn.
fig, (ax_timepoint, ax_pdcd1) = plt.subplots(1, 2, figsize=(12, 5))
sc.pl.umap(adata_B, color=['timepoint'], ax=ax_timepoint, show=False)
# PDCD1 expression, grey-to-blue colormap, colour scale capped at 1.
sc.pl.umap(adata_B, color=['PDCD1'], color_map=cmap, ax=ax_pdcd1, vmax=1, show=False)
plt.tight_layout()
plt.show()

In [None]:
# B-cell UMAP coloured by 'cell_type' with the Set3 palette.
# NOTE(review): color_map and vmax are also passed although the colouring is by a
# label column — presumably they have no effect here; confirm and drop if so.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_B,
    color=['cell_type'],
    color_map=cmap,
    ax=ax,
    vmax=1,
    palette='Set3',
)

In [None]:
# B-cell marker panel: label -> marker genes.
# The immunoglobulin labels are spelled 'Ig...' throughout; two keys previously
# started with a lowercase L ('lgA+', 'lgG+'), inconsistent with 'IgM+'.
B_cell_markers = {
    'Follicular B cell': ['MS4A1', 'IGHD', 'FCER2'],
    'GrB+ Regulatory B cell': ['BSG', 'CD19', 'CD38'],
    'Mature B cell': ['DHX9', 'FCER2'],
    'Memory B cell': ['CD44', 'CD69'],
    'Plasma cell': ['CD38', 'CD27', 'SDC1', 'BCL6'],
    'IgA+ Regulatory B cell': ['CD19', 'IGHA1'],
    'IgG+': ['IGHG1'],
    'IgM+': ['IGHM'],
    'Breg': ['CD274', 'HAVCR1', 'IL10'],
    'Activated B cell': ['CD86'],
    'Trafficking': ['CXCR4', 'CXCR5'],
}

In [None]:
# Compute the dendrogram over the B-cell leiden clusters, then draw the marker
# dot plot grouped by leiden with that dendrogram attached.
sc.tl.dendrogram(adata_B, groupby='leiden')
sc.pl.dotplot(
    adata_B,
    B_cell_markers,
    'leiden',
    dendrogram=True,
)

In [None]:
# Drop the existing 'cell_type' column before relabelling. errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError once the column is gone.
adata_B.obs.drop(columns="cell_type", inplace=True, errors='ignore')
# Relabel the leiden clusters from the B-cell panel. labelClusterWithCellType is not
# defined in this notebook; presumably it comes from scRNA_utils — TODO confirm.
labelClusterWithCellType(adata_B, B_cell_markers, cluster_column='leiden')
sc.pl.umap(adata_B, color=['cell_type'], palette='Set3')
sc.pl.umap(adata_B, color=['timepoint'])

In [None]:
# Manually relabel leiden cluster 1 as 'Follicular B cell'.
# Register the category only when it is missing: add_categories raises if the label
# already exists, which made this cell fail on a re-run or when the automatic
# labelling had already produced that label.
if 'Follicular B cell' not in adata_B.obs['cell_type'].cat.categories:
    adata_B.obs['cell_type'] = adata_B.obs['cell_type'].cat.add_categories(['Follicular B cell'])
adata_B.obs.loc[adata_B.obs['leiden'] == '1', 'cell_type'] = 'Follicular B cell'

In [None]:
# PDCD1 on the UMAP of adata_M, drawn with the grey-to-blue colormap.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_M,
    color=['PDCD1'],
    color_map=cmap,
    ax=ax,
)

In [None]:
# Four marker genes on the B-cell UMAP (grey-to-blue colormap, scale capped at 1).
b_panel_genes = ['IGHM', 'CD27', 'CD38', 'IGHG1']
sc.pl.umap(adata_B, color=b_panel_genes, color_map=cmap, vmax=1)

In [None]:
# For every B-cell label, print its marker list and plot those genes on the UMAP.
for cell_type, markers in B_cell_markers.items():
    print(f"{cell_type} : {markers}")
    sc.pl.umap(adata_B, color=markers)

In [None]:
# Welch's t-test (unequal variances) of Pre vs On cell counts per cell type, then a
# boxplot with a star over every cell type whose p-value is below 0.05.
# NOTE(review): cell_counts is whatever an earlier cell left in the kernel; nothing
# in this B-cell section rebuilds it from adata_B — confirm the intended input.

# x positions must follow the order seaborn draws: category order for a categorical
# column, otherwise order of appearance. `.unique()` alone does not guarantee that.
cell_type_col = cell_counts['cell_type']
if hasattr(cell_type_col, 'cat'):
    x_order = list(cell_type_col.cat.categories)
else:
    x_order = list(cell_type_col.dropna().unique())

significance_dict = {}
for cell_type in x_order:
    is_type = cell_counts['cell_type'] == cell_type
    pre = cell_counts.loc[is_type & (cell_counts['timepoint'] == 'Pre'), 'cell_count']
    on = cell_counts.loc[is_type & (cell_counts['timepoint'] == 'On'), 'cell_count']
    # The test needs at least two observations per group; otherwise record NaN.
    if len(pre) > 1 and len(on) > 1:
        significance_dict[cell_type] = ttest_ind(pre, on, equal_var=False).pvalue
    else:
        significance_dict[cell_type] = np.nan

# Put Pre before On in the legend and within each group.
cell_counts['timepoint'] = pd.Categorical(cell_counts['timepoint'], ['Pre', 'On'])

fig, ax = plt.subplots(figsize=(4, 4))
sns.boxplot(x='cell_type', y='cell_count', hue='timepoint', data=cell_counts,
            order=x_order, ax=ax)

# Star each significant cell type just above its own tallest value. The former
# '"*" if p < 0.05 else "ns"' sat inside an 'if p < 0.05' branch, so "ns" could
# never be drawn.
y_pad = 0.03 * cell_counts['cell_count'].max()
for x_pos, cell_type in enumerate(x_order):
    p_val = significance_dict[cell_type]
    if not (p_val < 0.05):  # also skips NaN p-values
        continue
    y_star = cell_counts.loc[cell_counts['cell_type'] == cell_type, 'cell_count'].max() + y_pad
    ax.text(x_pos, y_star, "*", ha='center', va='bottom')

# Customize the plot as necessary
plt.xticks(rotation=45)
ax.set_title('Cell Counts in Different Cell Types Over Timepoints')
ax.set_xlabel('Cell Type')
ax.set_ylabel('Cell Count')
plt.tight_layout()
plt.show()

# Myeloid (M) cells

In [None]:
# Sanity-check adata_M: shapes of the stored PCA coordinates, PCA loadings and
# variance ratios, then the names of the per-cell annotation columns.
print(adata_M.obsm['X_pca'].shape)
print(adata_M.varm['PCs'].shape)
print(adata_M.uns['pca']['variance_ratio'].shape)
print(adata_M.obs.columns)

In [None]:
# Marker panel for adata_M: label -> marker genes.
# Entries that are commented out are kept for reference only.
M_cell_markers = {
    # 'M1': ['IL1B', 'CD86', 'CD68', 'TLR2'],
    # 'M2': ['MRC1', 'FOLR2', 'CD68', 'CD163'],
    # Other macrophage candidates: 'CD68','CD163', 'MRC1', 'CD163', 'CD274', 'MPEG1','SIGLEC1'
    'Macrophage': [
        'CD68', 'S100A9', 'PLAUR', 'SPP1',
    ],
    'Mast cell': [
        'TPSB2', 'TPSAB1', 'CPA3', 'KIT',
    ],
    # 'Neutrophil' : ['FCGR3A', 'CSF3R', 'MPO'],
    'Dendritic': [
        'CD80', 'CD83', 'ICAM1',
    ],
    'Monocyte': [
        'CD14', 'FCGR3A', 'CSF1R',
    ],
    # 'Langerhans': ['CD207', 'CD1A', 'ITGAX']
}

In [None]:
# UMAPs of adata_M coloured by leiden cluster and by cell type (Set3 palette).
sc.pl.umap(
    adata_M,
    color=['leiden', 'cell_type'],
    palette='Set3',
)

In [None]:
# For every label in the panel, print its marker list and plot those genes on the UMAP.
for cell_type, markers in M_cell_markers.items():
    print(f"{cell_type} : {markers}")
    sc.pl.umap(adata_M, color=markers)

In [None]:
# Compute the dendrogram over the leiden clusters of adata_M, then draw the marker
# dot plot grouped by leiden with that dendrogram attached.
sc.tl.dendrogram(adata_M, groupby='leiden')
sc.pl.dotplot(
    adata_M,
    M_cell_markers,
    'leiden',
    dendrogram=True,
)

In [None]:
# Drop the existing 'cell_type' column before relabelling. errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError once the column is gone.
adata_M.obs.drop(columns="cell_type", inplace=True, errors='ignore')
# Relabel the leiden clusters from the marker panel. labelClusterWithCellType is not
# defined in this notebook; presumably it comes from scRNA_utils — TODO confirm.
labelClusterWithCellType(adata_M, M_cell_markers, cluster_column='leiden')

In [None]:
# UMAP of adata_M coloured by timepoint on a 6x5 inch canvas.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_M,
    color=['timepoint'],
    ax=ax,
)

In [None]:
# UMAP of adata_M coloured by 'cell_type' (Set3 palette) on a 6x5 inch canvas.
fig, ax = plt.subplots(figsize=(6, 5))
sc.pl.umap(
    adata_M,
    color=['cell_type'],
    palette='Set3',
    ax=ax,
)

In [None]:

# Heatmap of the M_cell_markers genes with cells grouped by 'cell_type' and a dendrogram.
# NOTE(review): the original comment described averaging each marker per cell type
# first. sc.pl.heatmap is understood to draw individual cells, whereas
# sc.pl.matrixplot shows group means — confirm which figure is intended.
sc.pl.heatmap(adata_M, M_cell_markers, groupby='cell_type', dendrogram=True)



In [None]:
# Save adata_M with its updated annotations.
# NOTE(review): this is the same path adata_M was read from earlier in this
# notebook, so the input file is overwritten and a later run starts from the
# relabelled object — consider a separate output name.
adata_M.write(data_dir + '1863-counts_cells_cohort1_M_cells.h5ad')

# Epithelial (Epi) cells


In [None]:
# UMAP of adata_Epi coloured by leiden cluster.
sc.pl.umap(
    adata_Epi,
    color='leiden',
)