## scRNAseq CD8 Tm in pancreatic cancer preprocessing and analysis

Author: Maha Alissa Alkhalaf

Figures: Figure 6 (G), Extended Data 21

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import celltypist
import scrublet as scr
import anndata as ad
from scipy.io import mmread

## Data Collection

In [None]:
# Directory containing the per-sample barcode / feature / matrix files that are read below.
path = '../data/Werba_GSE205013/'

In [None]:
import os
import pandas as pd
import scanpy as sc
from scipy.io import mmread
from scipy.sparse import csr_matrix
from tqdm import tqdm

# Directory containing the per-sample barcode / feature / matrix files.
path = '../data/Werba_GSE205013/'

# File prefixes of the 27 samples: GSM6204109_P01 ... GSM6204135_P27.
# The GSM accession and the patient number both increase by one per sample.
file_list = [f"GSM{6204109 + i}_P{i + 1:02d}" for i in range(27)]

# One AnnData object per sample, keyed by the sample label.
# (The name `adata` is rebound to a single AnnData object further down.)
adata = {}

for file_prefix in tqdm(file_list):
    # Barcodes: one per line, used as the obs index.
    barcode_path = os.path.join(path, file_prefix + "_barcodes.tsv.gz")
    barcodes = pd.read_csv(barcode_path, sep='\t', index_col=0, header=None)

    # Features: the first column becomes the index, the second one (label 1) holds
    # the gene names that are later used as var_names.
    # `features` is intentionally left as a plain list: later cells reuse it.
    feature_path = os.path.join(path, file_prefix + "_features.tsv.gz")
    features = pd.read_csv(feature_path, sep='\t', index_col=0, header=None)
    features = list(features[1])

    # Matrix: transposed to cells x genes (scanpy's convention) and stored as CSR,
    # the sparse layout used by anndata's own Matrix Market reader.
    matrix_path = os.path.join(path, file_prefix + "_matrix.mtx.gz")
    matrix = csr_matrix(mmread(matrix_path).T)

    # `var` has to be a DataFrame (or mapping): a bare list is not accepted by
    # current anndata versions. The positional string index keeps variable names
    # unique per sample; the gene names are assigned after concatenation.
    var = pd.DataFrame({'gene': features},
                       index=pd.RangeIndex(len(features)).astype(str))

    # NOTE(review): file_prefix[10:] keeps the leading underscore ('_P01');
    # left unchanged because the labels end up in obs['dataset'] — confirm
    # whether 'P01' was intended.
    adata[file_prefix[10:]] = sc.AnnData(X=matrix, obs=barcodes, var=var)

In [None]:
# Show the per-sample AnnData objects collected above.
adata

In [None]:
# Stack all samples into one AnnData object; the dictionary keys are written to
# obs['dataset'] so every cell keeps track of its sample of origin.
concatenated_adata = ad.concat(adata, label = 'dataset')

In [None]:
concatenated_adata

In [None]:
# Use the gene names read from the feature file as variable names.
# NOTE(review): `features` is the list left over from the last iteration of the
# loading loop; this assumes every sample lists the same genes in the same
# order — confirm.
concatenated_adata.var_names = features

In [None]:
# Make sure the obs index (barcodes) consists of strings.
concatenated_adata.obs.index = concatenated_adata.obs.index.astype(str)

In [None]:
# Remove the name of the obs index.
concatenated_adata.obs.index.name = None
# The former `obs.rename(index={0: 'barcode'})` was dropped: `rename(index=...)`
# maps row *labels*, and the integer 0 never matches the string barcodes, so the
# statement had no effect.

In [None]:
concatenated_adata.var_names

In [None]:
# Keep the gene names as a regular var column in addition to the index.
concatenated_adata.var['gene'] = concatenated_adata.var_names.values

In [None]:
# Store the concatenated data set in the data directory.
concatenated_adata.write(f'{path}concatenated_werba.h5ad')

In [None]:
adata = sc.read_h5ad(f'{path}concatenated_werba.h5ad')

In [None]:
adata.X = np.nan_to_num(adata.X, nan = 0)

In [None]:
sc.pp.filter_cells(adata, min_genes=200) 

In [None]:
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e6)
sc.pp.log1p(adata)

In [None]:
adata

In [None]:
scrub = scr.Scrublet(adata.X)
adata.obs['doublet_scores'], adata.obs['predicted_doublets'] = scrub.scrub_doublets()
scrub.plot_histogram()
print(f'Number of predicted doublet is: {sum(adata.obs["predicted_doublets"])}')

In [None]:
# Keep only the cells that were not flagged as doublets. `.copy()` makes the result
# an independent AnnData object instead of a view of the unfiltered data, so later
# in-place modifications do not trigger implicit copies.
# (`adata.raw` is never set in this notebook, so there are no raw counts to revert to.)
adata = adata[~adata.obs['predicted_doublets'].astype(bool), :].copy()

print(f'Remaining cells {adata.shape[0]}.')
print(f'Remaining genes {adata.shape[1]}.')

In [None]:
# Checkpoint: store the filtered, normalised data set in the data directory.
adata.write(f'{path}preprocessed_werba.h5ad')

In [None]:
# Reload the checkpoint so the following cells can be run without repeating the steps above.
adata = sc.read_h5ad(f'{path}preprocessed_werba.h5ad')
adata

In [None]:
adata.var_names = features

In [None]:
adata.X = np.nan_to_num(adata.X, nan = 0)

In [None]:
# Flag highly variable genes using the mean / dispersion cut-offs given here.
sc.pp.highly_variable_genes(adata, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
print(f'Highly variable genes: {adata.var.highly_variable.sum()}')

# Plot the result of the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

# Keep only the highly variable genes. `.copy()` turns the subset into an
# independent object: the following cells modify it in place, which on a view
# would trigger an implicit copy with a warning.
# NOTE(review): genes outside this selection are no longer available afterwards;
# later cells look up individual genes by name — confirm they survive this step.
adata = adata[:, adata.var['highly_variable']].copy()

In [None]:
# NOTE(review): `adata` was already subset to its highly variable genes in the
# previous cell; this call recomputes the flag on that subset with scanpy's default
# thresholds, and the PCA below (use_highly_variable = True) then only uses the
# genes flagged here — confirm that this second selection is intended.
sc.pp.highly_variable_genes(adata)
# PCA with 30 components, followed by a neighbourhood graph with 20 neighbours per cell.
sc.pp.pca(adata, n_comps = 30, use_highly_variable = True, svd_solver = 'arpack')
sc.pp.neighbors(adata, n_neighbors = 20)

In [None]:
# Overlay the expression distributions of CD3E, CD8A and CD8B on one axis;
# the count axis is log-scaled.
for marker_gene in ('CD3E', 'CD8A', 'CD8B'):
    marker_values = adata.X[:, adata.var_names == marker_gene].toarray().flatten()
    sns.histplot(marker_values)

plt.yscale('log')

In [None]:
# Label every cell as 'unknown', then mark the cells that express all three
# markers (CD3E, CD8A and CD8B > 0) as CD8 T cells.
adata.obs['cell_type'] = 'unknown'

is_cd8_t_cell = np.ones(adata.n_obs, dtype = bool)
for marker_gene in ('CD3E', 'CD8A', 'CD8B'):
    is_cd8_t_cell &= adata.X[:, adata.var_names == marker_gene].toarray().flatten() > 0

# `adata.obs[mask, -1] = ...` is not valid DataFrame indexing (a tuple key is
# treated as a column label); rows are selected with .loc and the column by name.
adata.obs.loc[is_cd8_t_cell, 'cell_type'] = 'CD8 T cells'

In [None]:
adata_cd8 = adata[(adata.X[:, adata.var_names == 'CD3E'].toarray().flatten() > 0) & (adata.X[:, adata.var_names == 'CD8A'].toarray().flatten() > 0) & (adata.X[:, adata.var_names == 'CD8B'].toarray().flatten() > 0), :]

In [None]:
adata_cd8.obs['cell_type'] = 'cd8_t_cells'

In [None]:
adata_cd8

In [None]:
# Marker genes per cell type shown in the dot plot.
markers = {
    'CD8+ T Cells' : ['CD8A', 'CD8B', 'TRBC2', 'CD3D', 'CD3G', 'CD3E', 'IL7R', 'GZMK', 'LTB', 'LEF1'],
    'B Cells' : ['PXK', 'CD19', 'MS4A1', 'CD74', 'CD79A', 'IGHD'],
    'NK Cells' : ['KLRD1', 'NKG7', 'GNLY', 'STYK1', 'GZMA', 'GZMB'],
    'DC' : ['FCER1A', 'CST3', 'ZBTB46', 'ITGAX', 'CX3CR1', 'ITGAM'],
    'Macrophages' : ['ID1', 'FAR2', 'IFITM1', 'NFIL3', 'NPL', 'OTUB2'],
    'Monocytes' : ['LYZ', 'CFP', 'APOBEC3A', 'CD7', 'TET2'],
    'NKT' : ['IL2RB', 'NCAM1', 'CD44', 'IL12RB2', 'CXCR4'],
    'Plasma' : ['MZB1', 'SSR4', 'IGHG1'],
}

# Dot plot of the marker genes for all cells, grouped by obs['celltype_minor'].
# NOTE(review): obs['celltype_minor'] is not created anywhere in the visible
# code — confirm it exists for this data set.
sc.pl.dotplot(
    adata,
    markers,
    groupby = 'celltype_minor',
    dendrogram = True,
    show = False,
)

plt.savefig(
    '../figures/WU__before_filtering_marker_genes_CD8+_T_cells.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)

In [None]:
# Select the cells whose obs['celltype_minor'] equals 'T cells CD8+'; this rebinds
# the `adata_cd8` created from the marker genes above.
# NOTE(review): obs['celltype_minor'] is not created anywhere in the visible
# code — confirm it exists for this data set.
adata_cd8 = adata[adata.obs['celltype_minor'] == 'T cells CD8+']
adata_cd8

In [None]:
# NOTE(review): earlier cells store the gene names in var['gene'] (lower case);
# confirm that a var['Gene'] column exists.
adata_cd8.var_names = adata_cd8.var['Gene']

In [None]:
# Marker genes per cell type shown in the dot plot.
markers = {
    'CD8+ T Cells' : ['CD8A', 'CD8B', 'TRBC2', 'CD3D', 'CD3G', 'CD3E', 'IL7R', 'GZMK', 'LTB', 'LEF1'],
    'B Cells' : ['PXK', 'CD19', 'MS4A1', 'CD74', 'CD79A', 'IGHD'],
    'NK Cells' : ['KLRD1', 'NKG7', 'GNLY', 'STYK1', 'GZMA', 'GZMB'],
    'DC' : ['FCER1A', 'CST3', 'ZBTB46', 'ITGAX', 'CX3CR1', 'ITGAM'],
    'Macrophages' : ['ID1', 'FAR2', 'IFITM1', 'NFIL3', 'NPL', 'OTUB2'],
    'Monocytes' : ['LYZ', 'CFP', 'APOBEC3A', 'CD7', 'TET2'],
    'NKT' : ['IL2RB', 'NCAM1', 'CD44', 'IL12RB2', 'CXCR4'],
    'Plasma' : ['MZB1', 'SSR4', 'IGHG1'],
}

# Same dot plot restricted to the CD8 subset.
# NOTE(review): obs['celltype_minor'] is not created anywhere in the visible
# code — confirm it exists for this data set.
sc.pl.dotplot(
    adata_cd8,
    markers,
    groupby = 'celltype_minor',
    dendrogram = True,
    show = False,
)

plt.savefig(
    '../figures/WU_marker_genes_CD8+_T_cells.pdf',
    dpi = 300,
    bbox_inches = 'tight',
)

In [None]:
# Number of cells per sample (obs['dataset'] holds the sample label assigned during concatenation).
sns.histplot(adata.obs['dataset'])

## Module score

In [None]:
# Differential-expression table (high vs. low salt), ordered from the largest to
# the smallest avg_log2FC.
single_cell = (
    pd.read_csv('DEG_high_vs_low_salt', sep = '\t')
    .sort_values('avg_log2FC', ascending = False)
)
single_cell.head()

In [None]:
# Bulk CD8 salt table, ordered from the largest to the smallest
# LFC_cd8_highsalt_vs_cd8_lowsalt.
bulk = (
    pd.read_csv('../data/salt_data/bulk_CD8_salt.csv')
    .sort_values('LFC_cd8_highsalt_vs_cd8_lowsalt', ascending = False)
)
bulk.head()

In [None]:
# The bulk-derived gene sets are currently disabled:
#bulk_upregulated_sig = list(bulk.query('DE_cd8_highsalt_vs_cd8_lowsalt == "upregulated" and (FDR_cd8_highsalt_vs_cd8_lowsalt < 0.001)')['genename'])
#bulk_downregulated_sig = list(bulk.query('DE_cd8_highsalt_vs_cd8_lowsalt == "downregulated" and (FDR_cd8_highsalt_vs_cd8_lowsalt < 0.001)')['genename'])

# Significant genes (p_val_adj < 0.001) split by the sign of avg_log2FC; the
# entries are taken from the 'Unnamed: 0' column of the table.
upregulated_rows = single_cell.query('(avg_log2FC > 0) and (p_val_adj < 0.001)')
downregulated_rows = single_cell.query('(avg_log2FC < 0) and (p_val_adj < 0.001)')

single_cell_upregulated_sig = list(upregulated_rows['Unnamed: 0'])
single_cell_downregulated_sig = list(downregulated_rows['Unnamed: 0'])

gene_set_significant = {
    #'bulk_upregulated'        : bulk_upregulated_sig,
    #'bulk_downregulated'      : bulk_downregulated_sig,
    'single_cell_upregulated'   : single_cell_upregulated_sig,
    'single_cell_downregulated' : single_cell_downregulated_sig,
}

In [None]:
# Convert the expression matrix to a dense array. The guard makes the cell safe to
# re-run: once X is a NumPy array it no longer has a `toarray` method.
if hasattr(adata_cd8.X, 'toarray'):
    adata_cd8.X = adata_cd8.X.toarray()

In [None]:
# Mirror the variable names in a regular var column. The names are taken from
# `adata_cd8` itself (not from `adata`) so the column always matches this object,
# even if its variables differ from those of the parent data set.
adata_cd8.var['gene'] = list(adata_cd8.var_names)
adata_cd8.var

In [None]:
from collections import Counter

# Tally how often every gene name occurs in var_names ...
gene_name_counts = Counter(adata_cd8.var_names)

# ... and keep only the names that occur more than once, with their counts.
duplicates = dict(
    entry for entry in gene_name_counts.items() if entry[1] > 1
)

print("Duplicate gene names and their counts:", duplicates)


In [None]:
adata_cd8 = adata_cd8[:, ~adata_cd8.var['gene'].isin(duplicates.keys())]

In [None]:
adata_cd8.var_names.is_unique

In [None]:
# Number of top up-regulated single-cell genes used for the module score.
cutof = 50
sc.tl.score_genes(adata_cd8, gene_list=gene_set_significant['single_cell_upregulated'][:cutof], score_name='sc_salt_module_score')

# Scores above this value are called "high salt".
threshold = 0.3

# Bin the module score into three groups:
#   score <= -0.0001             -> low salt
#   -0.0001 < score <= threshold -> neutral
#   score > threshold            -> high salt
adata_cd8.obs['sc_salt_group'] = pd.cut(adata_cd8.obs['sc_salt_module_score'], bins=[-float('inf'), -0.0001, threshold, float('inf')],
                                 labels=['cd8+_low_salt', 'cd8+_neutral_salt', 'cd8+_high_salt'], include_lowest=True)

print(adata_cd8.obs['sc_salt_group'].value_counts())
sns.histplot(adata_cd8.obs['sc_salt_module_score'])
# The data loaded in this notebook is the Werba et al. data set (GSE205013), so the
# title names it accordingly (it previously said "Wu et al.").
plt.title(f'module score for CD8+ T cell in tumor\nbased on first {cutof} salt upregulated scRNAseq genes\n Werba et al. data')
sns.despine()

# The file name used to contain a literal line break copied from the title; it is
# replaced by a space so the path is valid on every file system.
plt.savefig(f'../figures/werber_module score for CD8+ T cell in tumor based on first {cutof} salt upregulated scRNAseq genes.pdf', dpi = 300, bbox_inches = 'tight')

In [None]:
def gene_expression(adata, gene, conditions, condition):
    """Plot and save the expression of one gene in two groups of cells.

    The cells are split by the value of ``adata.obs[condition]``. The expression
    of ``gene`` in the two groups is compared with one-sided Wilcoxon rank-sum
    tests (both directions) and drawn as a violin plot with an overlaid box plot
    and strip plot. The figure is saved below ``../figures/`` and then shown.

    Parameters
    ----------
    adata
        AnnData object; ``adata.X`` may be a dense array or a scipy sparse matrix.
    gene
        Entry of ``adata.var.index`` whose expression is plotted.
    conditions
        Two group labels; ``conditions[0]`` is drawn on the left and is the first
        sample of the rank-sum test, ``conditions[1]`` the second.
    condition
        Name of the ``adata.obs`` column that holds the group labels.

    Raises
    ------
    IndexError
        If ``gene`` is not found in ``adata.var.index``.

    Notes
    -----
    The two plot colours are installed as seaborn's global default palette, so
    they also affect figures created after this call.
    """
    # Resolve the gene before any figure is created, so a failed lookup does not
    # leave an empty figure behind.
    matches = np.where(adata.var.index == gene)[0]
    if len(matches) == 0:
        raise IndexError(f'gene {gene!r} not found in adata.var.index')
    gene_index = matches[0]

    # Expression of the gene in all cells as a flat dense vector (works for dense
    # and sparse X alike).
    expression = adata.X[:, gene_index]
    if hasattr(expression, 'toarray'):
        expression = expression.toarray()
    expression = np.asarray(expression).ravel()

    group_labels = adata.obs[condition]
    in_first_group = np.array([label == conditions[0] for label in group_labels], dtype = bool)
    in_second_group = np.array([label == conditions[1] for label in group_labels], dtype = bool)
    high_salt = expression[in_first_group]
    low_salt = expression[in_second_group]

    # One-sided tests in both directions; both p-values are shown in the title.
    alternatives = ['greater', 'less']
    p_values = []
    for alternative in alternatives:
        _, p = stats.ranksums(high_salt, low_salt, alternative = alternative)
        p_values.append(f'{alternative} : {p:.2e}')

    # sns.set_palette returns None, so the colours reach the plots through the
    # global default palette rather than through a `palette=` argument.
    colors = ['#ffa37b', '#A7C7E7']
    sns.set_palette(sns.color_palette(colors))

    fig = plt.figure(figsize = (2, 2.5), dpi = 300)

    # `kws` is not a violinplot parameter; the black outlines are set on the
    # drawn collections directly.
    ax = sns.violinplot(data = [high_salt, low_salt], saturation = 0.9, width = 0.9, linewidth = 0.3)
    for collection in ax.collections:
        collection.set_edgecolor('black')

    sns.boxplot(data = [high_salt, low_salt], width = 0.4,
                boxprops = {'zorder': 2, 'edgecolor' : 'black'},
                capprops = {'color' : 'black'},
                whiskerprops = {'color' : 'black'},
                medianprops = {'color' : 'black'},
                showfliers = False,
                linewidth = 0.3,
                ax = ax)

    sns.stripplot(data = [high_salt, low_salt], color = 'black', ax = ax, size = 0.4)

    ax.set_ylabel('Log-scaled expression value', fontsize = 4)

    # Pin the current tick positions before assigning fixed labels, so labels and
    # ticks cannot drift apart on a later redraw; the axis limits are restored
    # because fixing ticks may widen them.
    y_ticks = ax.get_yticks()
    y_limits = ax.get_ylim()
    ax.set_yticks(y_ticks)
    ax.set_ylim(y_limits)
    ax.set_yticklabels([str(round(float(tick), 2)) for tick in y_ticks], size = 4)
    ax.set_xticklabels([conditions[0], conditions[1]], size = 4)

    ax.set_title(f'Gene: {gene}\nWilcoxon rank sum, p-values:\n {", ".join(p_values)}', fontsize = 4)
    sns.despine()
    plt.savefig(f'../figures/werber_violin_plot_expression_values_of_{gene}_in_{conditions}.pdf', dpi = 300, bbox_inches = 'tight')
    plt.show()
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Gene-set tables keyed by a short name; the 'genes' column of each table is
# passed on as the gene set below.
genes = {'cyto_list3' : pd.read_csv('../../EX0032_analysis/gene_lists/cyto_list3.csv', sep = '\t'),
        }

# One call per gene set and grouping column, comparing the high-salt with the
# low-salt CD8+ group.
# NOTE(review): `utilis` is not imported anywhere in the visible code — confirm
# the module is available before running this cell.
for geneset_name, geneset in genes.items():
    for group in ['sc_salt_group']:
        utilis.violin_plot(original_adata = adata_cd8,
                colors = ['#ffa37b', '#A7C7E7'],
                group = group, 
                group_conditions = ['cd8+_high_salt', 'cd8+_low_salt'],
                geneset = geneset['genes'],
                geneset_name = f'{geneset_name} - {group}',
                module_score = True)
    print('--------------------------------------------------------')