In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import os as os
import sys as sys
sys.path.append('/home/qiuaodon/Desktop/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
import operator as op

In [None]:
# Directory holding the project's .h5ad files; later cells reuse this variable.
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data_new/"
# Load the T-cell AnnData object and print its shape.
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
print(str(adata_T.shape))

In [None]:
# show the violin plot of PDCD1LG2 expression in `adata_pseudo`
# NOTE(review): `adata_pseudo` is first assigned in a later cell (pseudo-bulk of the
# whole-cell object), so this cell raises NameError on a fresh top-to-bottom run.
sc.pl.violin(adata_pseudo, keys='PDCD1LG2', rotation=90)

In [None]:
# Violin plot of CD274 expression in `adata_pseudo`.
# NOTE(review): `adata_pseudo` is only assigned in a later cell; run that cell first.
sc.pl.violin(adata_pseudo, keys='CD274', rotation=90)

In [None]:
# UMAP of the T cells coloured by CCR7.
# NOTE(review): no UMAP is computed before this cell, so the loaded file presumably
# already stores an embedding — confirm.
sc.pl.umap(adata_T, color='CCR7')

In [None]:
# UMAPs of the T cells coloured by CD4 and by CD8A, one figure each.
sc.pl.umap(adata_T, color = 'CD4')
sc.pl.umap(adata_T, color = 'CD8A')

In [None]:
# Re-cluster the T cells on their most variable genes.
# Flag the top-N highly variable genes (HVGs) in adata_T.var['highly_variable'].
n_top_gene_T = 5000
sc.pp.highly_variable_genes(adata_T, n_top_genes=n_top_gene_T)
# Keep only the HVG columns. `.copy()` materialises the subset: slicing an AnnData
# yields a view, and the in-place steps below (PCA, neighbors, UMAP, Leiden) would
# otherwise write into that view and force an implicit copy with a warning.
# NOTE(review): this drops every non-HVG gene from `adata_T`; later cells that index
# genes by name only work if those genes were among the selected HVGs.
adata_T = adata_T[:, adata_T.var['highly_variable']].copy()
# PCA
sc.pp.pca(adata_T, n_comps=50, use_highly_variable=True, svd_solver='arpack')
# Neighbour graph on the 50 PCs, then the UMAP embedding
sc.pp.neighbors(adata_T, n_neighbors=15, n_pcs=50)
sc.tl.umap(adata_T)
# Leiden clustering; labels are stored in adata_T.obs['leiden']
sc.tl.leiden(adata_T, resolution=0.4)
# plot
sc.pl.umap(adata_T, color=['leiden'], legend_loc='on data', title='T cells')


In [None]:
# clustering
# NOTE(review): this repeats, with the same resolution, the Leiden and UMAP-plot steps
# that end the preceding re-clustering cell.
sc.tl.leiden(adata_T, resolution=0.4)
# plot
sc.pl.umap(adata_T, color=['leiden'], legend_loc='on data', title='T cells')


In [None]:
# Display the T-cell AnnData object.
adata_T

In [None]:
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data_new/"
# Save the T-cell object.
# NOTE(review): this is the same path the notebook's first load cell reads from, so the
# input file is overwritten with the current (gene-filtered, re-clustered) object.
adata_T.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
# Re-read the T-cell file that was just written and display the object.
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
adata_T

In [None]:
# Load the whole-cell object, then rebuild it so that X and var come from its `.raw`
# slot while obs is kept unchanged.
adata = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_whole_cells.h5ad')
adata  = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
# Store the rebuilt object as its own `.raw`.
adata.raw = adata
print(str(adata.shape))

In [None]:
# Pseudo-bulk object for the whole-cell data, keyed on obs['sample_id'].
# scRNA2PseudoBulkAnnData comes from the star-import of scRNA_utils; presumably it
# aggregates cells per sample — verify against that module.
adata_pseudo = scRNA2PseudoBulkAnnData(adata, sample_id_col='sample_id')
print(adata_pseudo.shape) 

In [None]:
# Pseudo-bulk object for the T cells, keyed on obs['sample_id'] (helper from scRNA_utils).
adata_T_pseudo = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')
print(adata_T_pseudo.shape) 

In [None]:
# Scatter plot of PDCD1 (x) against CXCL13 (y) in the T-cell pseudo-bulk object,
# coloured by timepoint.
sc.pl.scatter(adata_T_pseudo, x='PDCD1', y='CXCL13', color='timepoint', size = 100)



## pseudo bulk data without clusters

In [None]:
# Define the genes of interest
genes_of_interest = ['PDCD1', 'HAVCR2', 'PRDM1', 'CXCR4', 'CTLA4','CXCL13','TIGIT']

# Expression of those genes in the T-cell pseudo-bulk object, one row per obs entry.
gene_expr_matrix = adata_T_pseudo[:, genes_of_interest].X
# `.X` may be a scipy sparse matrix; densify so it can populate a DataFrame.
if hasattr(gene_expr_matrix, 'toarray'):
    gene_expr_matrix = gene_expr_matrix.toarray()

# Timepoint labels as an (n, 1) column.
timepoint_column = np.asarray(adata_T_pseudo.obs['timepoint']).reshape(-1, 1)

# Build the frame from the numeric matrix and attach the labels as a separate column.
# Stacking numbers and strings with np.hstack would coerce the whole array to a
# non-numeric dtype, leaving the gene columns non-numeric in the resulting frame.
gene_expr_matrix_with_timepoint = pd.DataFrame(
    np.asarray(gene_expr_matrix),
    columns=genes_of_interest,
    index=pd.Index(adata_T_pseudo.obs['sample_id'].to_numpy(), name='sample_id'),
)
gene_expr_matrix_with_timepoint['timepoint'] = timepoint_column.ravel()

print(gene_expr_matrix_with_timepoint)


In [None]:
# Add the whole-cell pseudo-bulk CD274 + PDCD1LG2 expression as column 'CD274(L)'.
# Values are matched on sample_id instead of row position, because nothing guarantees
# that adata_pseudo and the T-cell table list the same samples in the same order.
# Samples absent from adata_pseudo get NaN.
ligand_expr = adata_pseudo[:, ['CD274', 'PDCD1LG2']].X
ligand_by_sample = pd.Series(
    np.asarray(ligand_expr.sum(axis=1)).ravel(),  # row-wise CD274 + PDCD1LG2
    index=adata_pseudo.obs['sample_id'].astype(str).to_numpy(),
)
gene_expr_matrix_with_timepoint['CD274(L)'] = ligand_by_sample.reindex(
    gene_expr_matrix_with_timepoint.index.astype(str)
).to_numpy()



In [None]:
def calculate_L_R(row, on_scale=0.1):
    """Return the ``CD274(L) * PDCD1`` product for one row, scaled by its timepoint.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'timepoint', 'CD274(L)' and 'PDCD1'.
    on_scale : float, default 0.1
        Factor applied to the product when ``row['timepoint'] == 'on'``.
        NOTE(review): the notebook does not say why 0.1 is used.

    Returns
    -------
    float
        The product for 'pre' rows, ``product * on_scale`` for 'on' rows, and NaN
        for any other timepoint (made explicit instead of an implicit ``None``).
    """
    timepoint = row['timepoint']
    if timepoint == 'pre':
        return row['CD274(L)'] * row['PDCD1']
    if timepoint == 'on':
        return row['CD274(L)'] * row['PDCD1'] * on_scale
    # Unknown timepoint: return an explicit missing value.
    return float('nan')

# Apply calculate_L_R to every row (axis=1) and store the result in column 'L*R'.
gene_expr_matrix_with_timepoint['L*R'] = gene_expr_matrix_with_timepoint.apply(calculate_L_R, axis=1)


In [None]:
# Display the table, which now includes the 'CD274(L)' and 'L*R' columns.
gene_expr_matrix_with_timepoint

In [None]:
# Move the 'timepoint' column to the last position.
# The column is selected by name: rotating "whatever is last" to the front moved 'L*R'
# instead of 'timepoint', and shifted the columns again on every re-run of this cell.
cols = [c for c in gene_expr_matrix_with_timepoint.columns if c != 'timepoint'] + ['timepoint']
gene_expr_matrix_with_timepoint = gene_expr_matrix_with_timepoint[cols]
# Sort rows by timepoint in descending order.
gene_expr_matrix_with_timepoint = gene_expr_matrix_with_timepoint.sort_values(by='timepoint', ascending=False)

In [None]:
# Write the sample-level table (index included) to CSV in the project data directory.
gene_expr_matrix_with_timepoint.to_csv(data_dir_NHDP + 'gene_expr_T_cells_pseudobulk_withoutcluster.csv')

## use the clusters to generate data

In [None]:
# Build one pseudo-bulk object per Leiden cluster, keyed by cluster label.
pseudobulk_data = {}

for cluster_label in adata_T.obs['leiden'].unique():
    # Subset adata_T to the cells assigned to this cluster.
    cluster_data = adata_T[adata_T.obs['leiden'] == cluster_label, :]
    pseudobulk_data[cluster_label] = scRNA2PseudoBulkAnnData(cluster_data, sample_id_col='sample_id')



In [None]:
# Display the cluster-label -> pseudo-bulk object mapping.
pseudobulk_data

In [None]:
# Build one table per cluster and concatenate them once at the end. Growing a frame
# with pd.concat inside the loop re-copies the accumulated rows on every pass and
# starts from an empty frame.
cluster_frames = []

# Iterate over each cluster's pseudo-bulk object
for cluster_label, cluster_data in pseudobulk_data.items():
    # Expression of the genes of interest for this cluster
    gene_expr_matrix = cluster_data[:, genes_of_interest].X
    # `.X` may be a scipy sparse matrix; densify before building the DataFrame.
    if hasattr(gene_expr_matrix, 'toarray'):
        gene_expr_matrix = gene_expr_matrix.toarray()

    gene_expr_df = pd.DataFrame(np.asarray(gene_expr_matrix), columns=genes_of_interest)

    # Per-row metadata taken from the pseudo-bulk obs table
    sample_id_column = pd.Series(cluster_data.obs['sample_id'].values, name='sample_id')
    timepoint_column = pd.Series(cluster_data.obs['timepoint'].values, name='timepoint')
    cluster_label_column = pd.Series([cluster_label] * gene_expr_df.shape[0], name='Cluster')

    # Column order: sample_id, timepoint, <genes>, Cluster
    cluster_matrix = pd.concat(
        [sample_id_column, timepoint_column, gene_expr_df, cluster_label_column], axis=1
    )
    cluster_frames.append(cluster_matrix)

# A single concat; fall back to an empty frame when there are no clusters.
combined_matrix = pd.concat(cluster_frames, ignore_index=True) if cluster_frames else pd.DataFrame()

# Print the combined matrix
print(combined_matrix)


In [None]:
# Add the whole-cell pseudo-bulk CD274 + PDCD1LG2 expression to combined_matrix,
# matched on sample_id.
# One sample_id -> value lookup replaces two AnnData slices per row. A sample_id that
# is missing from adata_pseudo yields NaN rather than an empty per-row array.
ligand_expr = adata_pseudo[:, ['CD274', 'PDCD1LG2']].X
ligand_by_sample = pd.Series(
    np.asarray(ligand_expr.sum(axis=1)).ravel(),  # row-wise CD274 + PDCD1LG2
    index=adata_pseudo.obs['sample_id'].astype(str).to_numpy(),
)
CD274_L = combined_matrix['sample_id'].astype(str).map(ligand_by_sample).to_numpy()
combined_matrix['CD274(L)'] = CD274_L

CD274_L

In [None]:

def calculate_L_R(row):
    """Return CD274(L) * PDCD1 for 'pre' rows, that product times 0.1 for 'on' rows.

    Any other timepoint yields None.
    """
    phase = row['timepoint']
    if phase not in ('pre', 'on'):
        return None
    score = row['CD274(L)'] * row['PDCD1']
    return score if phase == 'pre' else score * 0.1

# Apply calculate_L_R to every row (axis=1) and store the result in column 'L*R'.
combined_matrix['L*R'] = combined_matrix.apply(calculate_L_R, axis=1)

In [None]:
# Display the per-cluster table.
combined_matrix

In [None]:
# Move the 'Cluster' column to the first position.
# It is selected by name: by the time this cell runs, 'CD274(L)' and 'L*R' have been
# appended, so "take the last column" would move 'L*R' instead, and would shift the
# columns again on every re-run.
cols = ['Cluster'] + [c for c in combined_matrix.columns if c != 'Cluster']
combined_matrix = combined_matrix[cols]


In [None]:
# Print the table after the column reordering.
print(combined_matrix)

In [None]:
# List the samples that contribute cells to Leiden cluster 4.
# `adata_T_4` was never assigned anywhere above this cell, so it is built here.
# NOTE(review): assumes the Leiden labels are strings ('4'), as scanpy stores them — confirm.
adata_T_4 = adata_T[adata_T.obs['leiden'] == '4', :]
adata_T_4.obs['sample_id'].unique()


In [None]:
# Write the per-cluster table (index included) to CSV in the project data directory.
combined_matrix.to_csv(data_dir_NHDP + 'gene_expr_T_cells_withcluster.csv')


# Single-cell (sc) data

In [None]:
# Rebinds genes_of_interest (now also containing CD8A and CD4); earlier cells that read
# this name see the new list if they are re-run afterwards.
genes_of_interest = ['PDCD1', 'HAVCR2', 'CD8A', 'CD4', 'PRDM1', 'CXCR4', 'CTLA4','CXCL13','TIGIT']
# Per-cell expression of those genes in the T-cell object.
gene_scexpression = adata_T[:, genes_of_interest].X
# export the gene_scexpression
dense_gene_scexpression = gene_scexpression.toarray()  # For scipy sparse matrices
gene_scexpression_df = pd.DataFrame(dense_gene_scexpression, columns=genes_of_interest)

# Index rows by the cell identifiers of adata_T.
gene_scexpression_df.index = adata_T.obs.index

In [None]:
# Display the per-cell Leiden cluster labels.
adata_T.obs['leiden']

In [None]:
# Copy per-cell metadata from adata_T.obs into the expression table. Assignment is
# positional (.values), relying on the table having been built from adata_T in row order.
gene_scexpression_df['timepoint'] = adata_T.obs['timepoint'].values
gene_scexpression_df['sample_id'] = adata_T.obs['sample_id'].values
gene_scexpression_df['leiden'] = adata_T.obs['leiden'].values

In [None]:
# Shape of the sub-table of cells with CD8A expression greater than 0
# (the first element is the number of such cells).
# NOTE(review): the original comment said CD4, but the filter is on CD8A — confirm
# which gene was intended.
gene_scexpression_df[gene_scexpression_df['CD8A'] > 0].shape

In [None]:
# Count the rows where 'CD4/CD8' equals 1 and CD8A expression is greater than 0.
# Element-wise `&` with parenthesised comparisons is required here: Python's `and`
# tries to reduce each Series to a single bool and raises ValueError.
# NOTE(review): no cell above this one creates a 'CD4/CD8' column; it has to be added
# to gene_scexpression_df before this cell can run.
((gene_scexpression_df['CD4/CD8'] == 1) & (gene_scexpression_df['CD8A'] > 0)).sum()


In [None]:
# add column for CD274(L): whole-cell pseudo-bulk CD274 + PDCD1LG2 of each cell's sample.
# One sample_id -> value lookup replaces two AnnData slices per cell. A sample_id that
# is missing from adata_pseudo yields NaN rather than an empty per-row array.
ligand_expr = adata_pseudo[:, ['CD274', 'PDCD1LG2']].X
ligand_by_sample = pd.Series(
    np.asarray(ligand_expr.sum(axis=1)).ravel(),  # row-wise CD274 + PDCD1LG2
    index=adata_pseudo.obs['sample_id'].astype(str).to_numpy(),
)
CD274_L = gene_scexpression_df['sample_id'].astype(str).map(ligand_by_sample).to_numpy()
gene_scexpression_df['CD274(L)'] = CD274_L

# add column for L*R
   
def calculate_L_R(row):
    """Score one row as CD274(L) * PDCD1, multiplied by 0.1 when the timepoint is 'on'.

    Rows whose timepoint is neither 'pre' nor 'on' produce None.
    """
    timepoint = row['timepoint']
    if timepoint == 'on':
        return row['CD274(L)'] * row['PDCD1'] * 0.1
    if timepoint == 'pre':
        return row['CD274(L)'] * row['PDCD1']
    return None
# Apply calculate_L_R to every cell row (axis=1) and store the result in column 'L*R'.
gene_scexpression_df['L*R'] = gene_scexpression_df.apply(calculate_L_R, axis=1)


In [None]:
# Display the per-cell table.
gene_scexpression_df

In [None]:
# Count the entries of the CD274 column of `adata` that are greater than 0.
cd274_expression = adata[:, 'CD274'].X
num_cells_expressing_cd274 = np.sum(cd274_expression > 0)
print("Number of cells expressing CD274 with expression bigger than 0:", num_cells_expressing_cd274)



In [None]:
# Export gene_scexpression_df to an Excel workbook as sheet 'gene_sc_expression_T_cells'.
# NOTE(review): the original comment also mentioned a separate gene_scexpression_CD274
# sheet, but only this one sheet is written.
with pd.ExcelWriter(data_dir_NHDP + 'gene_sc_expression_data.xlsx') as writer:  
    gene_scexpression_df.to_excel(writer, sheet_name='gene_sc_expression_T_cells')




# Prepare data for Boyang, adding the significant DEGs in my downstream analysis (May 14, 2024)

In [None]:
# Same directory as data_dir_NHDP, under a new name for this section.
data_dir = "/home/qiuaodon/Desktop/project_data_new/"
# Reload the T-cell object from disk.
adata_T = sc.read(data_dir + '1863-counts_cells_cohort1_T_cells.h5ad')
# generate a pseudo-bulk AnnData object of the T cells 
adata_T_pseudo = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')
print(adata_T_pseudo.shape) 

In [None]:
# get the CD274 from the whole cells
# NOTE(review): unlike the earlier whole-cell load cell, X is not rebuilt from `.raw`
# here; the object as stored on disk is used directly — confirm this is intended.
adata = sc.read_h5ad(data_dir + '1863-counts_cells_cohort1_whole_cells.h5ad')
adata.raw = adata
adata_bulk = scRNA2PseudoBulkAnnData(adata, sample_id_col='sample_id')
# Select the CD274 column via a boolean mask over var_names, then densify it.
cd274_bulk = adata_bulk[:, adata_bulk.var_names == 'CD274'].X.toarray()
cd274_bulk_df = pd.DataFrame(cd274_bulk, columns = ['CD274'], index = adata_bulk.obs.index)

In [None]:
# Display the per-cell metadata of the whole-cell object.
adata.obs

In [None]:
# Display the pseudo-bulk CD274 table.
cd274_bulk_df

In [None]:
# Extract PDCD1LG2 from the whole-cell pseudo-bulk object (boolean mask over var_names).
PDCDLG2_bulk = adata_bulk[:, adata_bulk.var_names == 'PDCD1LG2'].X.toarray()
PDCDLG2_bulk_df = pd.DataFrame(PDCDLG2_bulk, columns = ['PDCD1LG2'], index = adata_bulk.obs.index)

In [None]:
# Display the pseudo-bulk PDCD1LG2 table.
PDCDLG2_bulk_df

In [None]:
# Sum the CD274 and PDCD1LG2 pseudo-bulk values element-wise; the result is named PDCD1LG.
PDCD1LG_bulk = cd274_bulk + PDCDLG2_bulk

# Wrap the sum in a DataFrame with column 'PDCD1LG', indexed like adata_bulk.obs.
PDCD1LG_bulk_df = pd.DataFrame(PDCD1LG_bulk, columns = ['PDCD1LG'], index = adata_bulk.obs.index)
PDCD1LG_bulk_df

In [None]:
# Get PDCD1LG (CD274 + PDCD1LG2) from the myeloid cells only.
# Select the cells whose obs['cellType'] is 'Myeloid_cell' and pseudo-bulk them.
adata_M = adata[adata.obs['cellType'] == 'Myeloid_cell', :]
adata_M_pseudo = scRNA2PseudoBulkAnnData(adata_M, sample_id_col='sample_id')

# CD274 and PDCD1LG2 columns of the myeloid pseudo-bulk object, then their sum.
PCD274_M = adata_M_pseudo[:, adata_M_pseudo.var_names == 'CD274'].X.toarray()
PDCDLG2_M = adata_M_pseudo[:, adata_M_pseudo.var_names == 'PDCD1LG2'].X.toarray()
PDCD1LG_M = PCD274_M + PDCDLG2_M
PDCD1LG_M_df = pd.DataFrame(PDCD1LG_M, columns = ['PDCD1LG'], index = adata_M_pseudo.obs.index)



In [None]:
# Extract the genes of interest from the T-cell pseudo-bulk object into a DataFrame
# indexed like adata_T_pseudo.obs.
gene_of_interest = ['PDCD1', 'CXCL13', 'PRDM1',  'HAVCR2', 'TIGIT', 'CTLA4', 'TSC22D3', 'TXNIP', 'DDIT4', 'ERN1', 'IRF1', 'NFKBIA']
Tgene = adata_T_pseudo[:, gene_of_interest].X.toarray()
Tgene_df = pd.DataFrame(Tgene, columns = gene_of_interest, index = adata_T_pseudo.obs.index)
Tgene_df

In [None]:
# Add the whole-cell PDCD1LG column to Tgene_df. Assigning a Series aligns on the
# index, so rows are matched by label rather than by position.
Tgene_df['PDCD1LG'] = PDCD1LG_bulk_df['PDCD1LG']
Tgene_df

In [None]:
# Add one column for timepoint (index-aligned concat).
timepoint = adata_T_pseudo.obs['timepoint']
Tgene_df = pd.concat([Tgene_df, timepoint], axis = 1)
# Encode 'pre' as 0 and 'on' as 1.
Tgene_df['timepoint'] = Tgene_df['timepoint'].replace({'pre':0, 'on':1})
# change the name of the column timepoint to treatment
Tgene_df = Tgene_df.rename(columns = {'timepoint':'treatment'})
Tgene_df

In [None]:
# Add the 'expansion' column from the T-cell pseudo-bulk obs table (index-aligned concat).
expansion = adata_T_pseudo.obs['expansion']
Tgene_df = pd.concat([Tgene_df, expansion], axis = 1)
Tgene_df

In [None]:
# Encode expansion: 'E' as 1 and 'NE' as 0.
Tgene_df['expansion'] = Tgene_df['expansion'].replace({'E':1, 'NE':0})
Tgene_df

In [None]:
# Rename the myeloid column to 'PDCD1LG_of_M' so it does not clash with 'PDCD1LG'.
PDCD1LG_M_df = PDCD1LG_M_df.rename(columns = {'PDCD1LG':'PDCD1LG_of_M'})
# Add it to Tgene_df; the axis=1 concat aligns on the index, so index labels present in
# only one of the two frames get NaN in the other's columns.
Tgene_df = pd.concat([Tgene_df, PDCD1LG_M_df], axis = 1)
Tgene_df

In [None]:
# Write Tgene_df (index included) to CSV in the project data directory.
Tgene_df.to_csv(data_dir + 'Pseudobulkdata_for_CausalModel_May14.csv')