In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import scipy.io as sio
import anndata as ad
import seaborn as sns
import os as os
import sys as sys
sys.path.append('/home/qiuaodon/Desktop/PanCancer_scRNA_analysis/utils/')
from scRNA_utils import *
import operator as op
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data_new/"


In [None]:
adata = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_whole_cells.h5ad')

In [None]:
adata

# get the DEGs of each cluster of T cells

In [None]:
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
sc.pl.umap(adata_T, color = 'leiden')

In [None]:
# Iterate through T cell clusters and identify the genes that are differentially expressed before and after treatment
# paird_ttest is not defined in this notebook; presumably it comes from the `scRNA_utils` star import.
# NOTE(review): q_val_threshold is assigned but never read in this cell, and with
# pval_threshold = 1 the filter below only drops rows whose pval is >= 1 or NaN.
q_val_threshold = 1
pval_threshold = 1
cluster_deg_df_list = []  # one DEG table per leiden category, in category order
for c in adata_T.obs['leiden'].cat.categories:
    print("T cluster:", c)
    cell_in_cluster = adata_T[adata_T.obs['leiden'] == c, :]
    cluster_deg_df = paird_ttest(cell_in_cluster, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 
    cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
    cluster_deg_df_list.append(cluster_deg_df)

In [None]:
# save the DEG results into excel, each cluster a sheet
with pd.ExcelWriter(data_dir_NHDP + 'T_cell_DEG_withcluster.xlsx') as writer:
    for i, cluster_deg_df in enumerate(cluster_deg_df_list):
        cluster_deg_df.to_excel(writer, sheet_name = 'cluster_' + str(i))


In [None]:
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]

In [None]:
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
# Entries 3 and 4 of cluster_deg_df_list, i.e. the 4th and 5th leiden categories
# in the order the list was filled.
# NOTE(review): cluster_deg_df_list is re-assigned for B cells further down the
# notebook, so this cell must run before that one to compare T-cell clusters.
deg_3 = cluster_deg_df_list[3]
deg_4 = cluster_deg_df_list[4]
# sort deg_3 and deg_4 by pval and take the top 50 genes
deg_3 = deg_3.sort_values(by = 'pval').iloc[:50]
deg_4 = deg_4.sort_values(by = 'pval').iloc[:50]
# the DataFrame index is used as the gene identifier
deg_3_genes = set(deg_3.index)
deg_4_genes = set(deg_4.index)
venn2([deg_3_genes, deg_4_genes], set_labels = ('Cluster 3', 'Cluster 4'))
plt.show()


In [None]:
deg_4

## Get the DEGs of PDCD1+ vs. PDCD1- cells

In [None]:
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
sc.pl.umap(adata_T, color = 'leiden')

In [None]:

import matplotlib.colors as mcolors
# change the color to grey-to-blue
colors = ["grey", "blue"]  # Start with grey and end with blue
cmap = mcolors.LinearSegmentedColormap.from_list("grey_to_blue", colors)
sc.pl.umap(adata_T, color = 'LTA', color_map=cmap)

In [None]:
# plot the violin plot of LTA


In [None]:
adata_T_pre = adata_T[adata_T.obs['timepoint'] == 'pre']
adata_T_on = adata_T[adata_T.obs['timepoint'] == 'on']

In [None]:
sc.pl.umap(adata_T_pre, color = 'LTA', color_map='Blues')
sc.pl.umap(adata_T_on, color = 'LTA', color_map='Blues')

In [None]:
sc.pl.umap(adata_T_pre, color = 'LTA', color_map=cmap)
sc.pl.umap(adata_T_on, color = 'LTA', color_map=cmap)

In [None]:
adata_T

In [None]:
# Add a boolean obs column flagging cells with non-zero PDCD1 expression.
# obs_vector returns a dense 1-D array of the gene's values from adata_T.X
# whether X is stored sparse or dense; the previous `.toarray()` call only
# exists on sparse matrices and raised on a dense X.
adata_T.obs['PDCD1exp'] = adata_T.obs_vector('PDCD1') > 0

In [None]:
# convert the boolean PDCD1 flag to the strings 'True' / 'False'
# (later cells select cells by comparing PDCD1exp against these two strings)
adata_T.obs['PDCD1exp'] = adata_T.obs['PDCD1exp'].astype(str)
adata_T.obs['PDCD1exp']

In [None]:
# Take an independent copy of the on-treatment cells: a later cell adds an obs
# column to adata_T_On, and writing to a view forces anndata to copy implicitly
# (with an ImplicitModificationWarning). Same pattern as the other `.copy()`
# subsets in this notebook.
adata_T_On = adata_T[adata_T.obs['timepoint'] == 'on'].copy()

In [None]:
adata_T_On

In [None]:
adata_T_On.obs['sample_id_PD1'] = adata_T_On.obs['sample_id'].astype(str) + '_' + adata_T_On.obs['PDCD1exp']
adata_T_On.obs['sample_id_PD1']

In [None]:
adata_T_On.obs['sample_id_PD1'].value_counts()

In [None]:
adata_T_On.obs['sample_id'].value_counts()

In [None]:
adata_T_On_0 = adata_T_On[adata_T_On.obs['PDCD1exp'] == 'False']
adata_T_On_1 = adata_T_On[adata_T_On.obs['PDCD1exp'] == 'True']

In [None]:
# normalize the adata_T_On_0 value count
# adata_T_On_0 holds the cells with PDCD1exp == 'False', so despite its name
# countPD1 ends up as the per-sample fraction of PDCD1-NEGATIVE on-treatment cells.
countPD1 = adata_T_On_0.obs['sample_id'].value_counts()
countall = adata_T_On.obs['sample_id'].value_counts()

countPD1 = countPD1/countall


In [None]:
# normalize the adata_T_On_1 value count
countPD1_1 = adata_T_On_1.obs['sample_id'].value_counts()
countall_1 = adata_T_On.obs['sample_id'].value_counts()

countPD1_1 = countPD1_1/countall_1

countPD1_df = countPD1_1.reset_index()
countPD1_df.columns = ['sample_id', 'count']

In [None]:
# Build the plotting table from countPD1_1 (fraction of PDCD1-expressing cells
# per sample). The previous version used countPD1, which is derived from
# adata_T_On_0 (PDCD1exp == 'False'), so the bar plot titled
# 'proportion of cells expressing PDCD1' was showing the non-expressing fraction.
countPD1_df = countPD1_1.reset_index()
countPD1_df.columns = ['sample_id', 'count']

In [None]:
sc.pl.violin(adata_T_On, keys='PDCD1', groupby='sample_id', rotation=90)


In [None]:

# Plotting using seaborn's barplot
plt.figure(figsize=(12, 8))
sns.barplot(x='sample_id', y='count', data=countPD1_df, palette='viridis', order=countPD1_df.sort_values('count', ascending=False).sample_id)
plt.xticks(rotation=90)
plt.title('proportion of cells expressing PDCD1 in each sample')
plt.xlabel('Sample ID')
plt.ylabel('proportion')
plt.show()


In [None]:
# plot the barplot of the number of cells in each sample
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.countplot(x='sample_id', data=adata_T_On_1.obs, order=adata_T_On_1.obs['sample_id'].value_counts().index)
plt.title('Number of cells expressing PDCD1 in each sample')
plt.xticks(rotation=90)
plt.show()

In [None]:
def scRNA2PseudoBulkAnnData(adata, sample_id_col = None):
    '''
        Convert a single-cell AnnData object into a pseudo-bulk AnnData object
        with one observation per sample. Expression of all cells belonging to
        the same sample is summed over adata.X and rescaled so that each sample
        totals one million (TPM-style normalisation).

        Parameters:
            adata: AnnData object. adata.raw must be set; note that this is only
                checked -- the values that are aggregated are read from adata.X.
            sample_id_col: name of the adata.obs column that holds the sample id
                of every cell. Defaults to 'sample_id' when omitted.

        Returns:
            AnnData object (float32 X, one row per unique sample id, rows named
            by sample id) annotated with uns["pseudoBulk"] = "tpm" and with .raw
            pointing at itself. For every retained obs column the value of the
            first cell of the sample is kept. QC columns added by sc.pp
            functions are dropped. Samples whose summed expression is zero get
            an all-zero row instead of NaN.
            Returns None (after printing the reason) when the input is not an
            AnnData object, the sample id column cannot be resolved, or
            adata.raw is missing.
    '''
    # check if input adata is AnnData object
    if not isinstance(adata, ad.AnnData):
        print ("Input adata is not an AnnData object")
        return None

    # resolve the grouping column; fall back to 'sample_id' when none is given
    if not sample_id_col:
        if 'sample_id' not in adata.obs.columns:
            print ("sample id column not provided")
            return None
        sample_id_col = 'sample_id'
    elif sample_id_col not in adata.obs.columns:
        print ("sample id column '%s' is not in adata.obs" % sample_id_col)
        return None

    # check if adata have raw data
    if not adata.raw:
        print ("adata.raw is not available")
        return None

    # obs columns added by sc.pp functions are per-cell QC metrics and make no
    # sense at sample level; match case-insensitively but keep original names
    qc_columns = {'ncount_rna', 'nfeature_rna', 'total_counts', 'total_counts_mt',
                  'pct_counts_mt', 'n_genes_by_counts', 'log1p_n_genes_by_counts'}
    obs_columns = [col for col in adata.obs.columns if str(col).lower() not in qc_columns]

    # group by the requested column (previously 'sample_id_PD1' was hard-coded,
    # so any other sample_id_col was ignored or raised a KeyError)
    sample_labels = adata.obs[sample_id_col].to_numpy()
    samples = [s for s in pd.unique(sample_labels) if not pd.isna(s)]  # order of first appearance

    nGenes = adata.n_vars
    tpm = np.zeros((len(samples), nGenes), dtype=np.float32)
    first_cell_of_sample = []

    for row, sample in enumerate(samples):
        in_sample = np.asarray(sample_labels == sample, dtype=bool)

        # .sum(axis=0) yields a 1 x nGenes np.matrix for sparse X and a 1-D
        # array for dense X; flatten both to a plain 1-D vector
        counts = np.asarray(adata.X[in_sample, :].sum(axis=0), dtype=np.float64).ravel()
        total = counts.sum()
        if total > 0:
            tpm[row, :] = counts / total * 1e6  # rescale to one million per sample

        # sample-level metadata is taken from the first cell of the sample
        first_cell_of_sample.append(np.flatnonzero(in_sample)[0])

    df_obs = adata.obs.iloc[first_cell_of_sample][obs_columns].copy()
    df_obs.index = [str(s) for s in samples]

    # Create an AnnData object for the pseudo-bulk RNA data
    adata_sample_tpm = ad.AnnData(tpm, obs=df_obs, var=adata.var.copy())
    adata_sample_tpm.uns["pseudoBulk"] = "tpm"
    adata_sample_tpm.raw = adata_sample_tpm

    return adata_sample_tpm

In [None]:
adata_T_On = scRNA2PseudoBulkAnnData(adata_T_On, 'sample_id_PD1')

In [None]:
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_T_On, condition_key = 'PDCD1exp', sample_id_col = 'sample_id_PD1', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_On_T_cell_comparingbyPDCD1.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

### get the DEGs of filtered cells

In [None]:
# get the list of patient_id with more than 100 cells expressing PDCD1
patient_id_list = adata_T_On_1.obs['patient_id'].value_counts()
patient_id_list = patient_id_list[patient_id_list > 100].index
patient_id_list

In [None]:
adata_T_On_filtered = adata_T_On[adata_T_On.obs['patient_id'].isin(patient_id_list)]

In [None]:
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_T_On_filtered, condition_key = 'PDCD1exp', sample_id_col = 'sample_id_PD1', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/project_data_new/EGAS00001004809_On_T_cell_comparingbyPDCD1_filtered.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
cluster_deg_df = paird_ttest(adata_T,condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')

with pd.ExcelWriter('/home/qiuaodon/Desktop/project_data_new/EGAS00001004809_whole_T_cell_DEG.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
# plot the barplot of the number of cells in each sample
adata_T_On_0 = adata_T_On_0[adata_T_On_0.obs['patient_id'].isin (patient_id_list) ]
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.countplot(x='sample_id', data=adata_T_On_0.obs, order=adata_T_On_0.obs['sample_id'].value_counts().index)
plt.title('Number of cells in each sample')
plt.xticks(rotation=90)
plt.show()

In [None]:
# plot the barplot of the number of cells in each sample
adata_T_On_1 = adata_T_On_1[adata_T_On_1.obs['patient_id'].isin (patient_id_list) ]
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.countplot(x='sample_id', data=adata_T_On_1.obs, order=adata_T_On_1.obs['sample_id'].value_counts().index)
plt.title('Number of PDCD1 cells in each sample')
plt.xticks(rotation=90)
plt.show()

In [None]:
#name first column as genes
# NOTE(review): DataFrame.rename ignores keys that are not present, so this is a
# no-op unless cluster_deg_df really has an 'Unnamed: 0' column -- confirm.
cluster_deg_df = cluster_deg_df.rename(columns = {'Unnamed: 0':'genes'})
# keep genes with pval < 0.001, |log2fc| > 0.5 and qval < 0.1
cluster_deg_df = cluster_deg_df[(cluster_deg_df['pval'] < 0.001) & (abs(cluster_deg_df['log2fc']) > 0.5) & (cluster_deg_df['qval'] < 0.1)]
cluster_deg_df

In [None]:
import mygene
from Bio import Entrez

# Initialize mygene
mg = mygene.MyGeneInfo()

# List of significant DEGs
significant_degs = cluster_deg_df.index.tolist()

# Function to search PubMed for gene associations with PD1
def search_pubmed(gene, query):
    """Search PubMed for records matching ``"<gene> <query>"``.

    Parameters:
        gene: gene symbol placed first in the search term.
        query: additional free-text term appended after the gene.

    Returns:
        The "IdList" entry of the parsed esearch result (at most 10 ids,
        as limited by retmax).
    """
    # NOTE(review): personal e-mail address is hard-coded here; consider moving
    # it to configuration before sharing the notebook.
    Entrez.email = "qqqqq200110@gmail.com"  # Always set your email in Entrez
    handle = Entrez.esearch(db="pubmed", term=f"{gene} {query}", retmax=10)
    try:
        record = Entrez.read(handle)
    finally:
        # release the HTTP handle even if parsing fails; it was never closed before
        handle.close()
    return record["IdList"]

# Search for associations with PD1
pd1_associations = {}
for gene in significant_degs:
    pubmed_results = search_pubmed(gene, "PD1")
    if pubmed_results:
        pd1_associations[gene] = pubmed_results

# Print genes with PubMed associations to PD1
for gene, ids in pd1_associations.items():
    print(f"{gene}: {ids}")

In [None]:
from Bio import Entrez
import pandas as pd

# Set your email for Entrez
Entrez.email = "your.email@example.com"

# Dictionary of unique DEGs by cluster
unique_degs_dict = {
    "T_cluster_0": ["CITED2", "LTA", "TUBB", "CD40LG", "ARL4C", "TGFBR2"],
    "T_cluster_1": ["KMT2A", "PDGFB", "H2AFZ", "NCF1", "GALNT3", "PRR14L"],
    "T_cluster_2": ["BATF", "AHNAK", "TNF", "GADD45B", "IFI44L", "IL2RA", "MIR4435-2HG", "NFKBIZ", "AC097534.2", "MAGEH1", "NPC1", "SLC39A10", "DGKE"],
    "T_cluster_3": ["TUBB2A", "EDEM1", "ADAMTS1", "IL15", "HSPA1B", "IGFBP3", "METTL21A", "OAS2", "PLAC8", "PLEK", "TNFRSF1B"],
    "T_cluster_4": ["KLF6", "GIMAP7", "TUBA1B", "TSC22D3", "NEAT1"],
    "T_cluster_5": ["AREG", "DDIT4", "SOCS1"],
    "T_cluster_6": ["IRF1", "HSPA1A", "TNFAIP3", "DDIT4", "NFKBIA"],
    "T_cluster_7": ["SAMD9", "PHLDA1", "ITGA4", "CCL4", "CXCL2", "ZNF366"],
    "T_cluster_8": ["PTGER4", "IGKC", "NUCB2", "MIGA1", "HSP90AA1", "FAAH2", "ENO1", "SLC38A2", "F5", "PDE4DIP", "BCL2", "BCL6", "PBXIP1", "ZEB2", "ANKRD36C", "CXCL9", "VPS13C", "AZGP1", "RBKS", "AL359220.1", "FXYD7", "HECTD2", "TGIF1", "SELL", "ITGAV", "IVNS1ABP", "CLOCK"]
}

# Function to search PubMed for a given query
def search_pubmed(query):
    """Run a PubMed esearch for ``query`` and return its "IdList" (retmax=5).

    Note: an earlier cell defines ``search_pubmed(gene, query)``; running this
    cell rebinds the name to this one-argument version.
    """
    search_handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
    parsed = Entrez.read(search_handle)
    search_handle.close()
    id_list = parsed["IdList"]
    return id_list

# Create a DataFrame to store the results
columns = ["Cluster", "DEG", "PubMed IDs"]
results = []

# Perform the search for each DEG in each cluster
for cluster, degs in unique_degs_dict.items():
    for deg in degs:
        query = f"PDCD1 AND {deg}"
        pubmed_ids = search_pubmed(query)
        results.append([cluster, deg, ", ".join(pubmed_ids)])

# Convert the results to a DataFrame
results_df = pd.DataFrame(results, columns=columns)



In [None]:
results_df.to_csv(data_dir_NHDP + "deg_pdcd1_search_results.csv", index=False)

In [None]:
# filter DEGs with more than 3 PubMed associations
pd1_associations_filtered = {gene: ids for gene, ids in pd1_associations.items() if len(ids) > 3}
len(pd1_associations_filtered)

In [None]:
# show all the genes in the filtered DEGs
print(pd1_associations_filtered.keys())

## try to get DEGs of B cells

In [None]:
adata_B = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')

sc.tl.rank_genes_groups(adata_B, groupby='leiden',  method='wilcoxon')
sc.pl.rank_genes_groups(adata_B, n_genes=25, sharey=False)

In [None]:
# `x in series` tests the Series *index* (here the cell barcodes), not its
# values, so the previous check could never find a sample id. Compare against
# the values explicitly instead.
is_present = (adata_B.obs['sample_id'] == "BIOKEY_22_Pre").any()
print(is_present)

In [None]:
# Condition to keep cells that are not from patient_id "BIOKEY_22"
condition = adata_B.obs['patient_id'] != 'BIOKEY_22'
# Apply the condition to filter out these cells
adata_B_filtered = adata_B[condition].copy()

In [None]:
print(adata_B_filtered.shape)
print(adata_B.shape)
adata_B = adata_B_filtered

In [None]:
sc.pl.umap(adata_B, color= 'leiden')

In [None]:
# Iterate through B cell clusters and identify the genes that are differentially expressed before and after treatment
q_val_threshold = 0.2
pval_threshold = 0.005
cluster_deg_df_list = []
for c in adata_B.obs['leiden'].cat.categories:
    print("B cluster:", c)
    cell_in_cluster = adata_B[adata_B.obs['leiden'] == c, :]
    cluster_deg_df = paird_ttest(cell_in_cluster, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 
    cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
    cluster_deg_df_list.append(cluster_deg_df)

In [None]:
# Obtain the DEGs of the whole B cells group
q_val_threshold = 0.2
pval_threshold = 1
adata_B = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
cluster_deg_df = paird_ttest(adata_B, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_B_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

### We still have some trouble: the q-values are far too large and too similar to one another

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse

# Extract the 'LTBP2' expression column from adata_B here, so this cell does not
# depend on a variable that is only created by a later cell (on a fresh kernel
# the previous version failed with a NameError on `ltbp2_expression`).
ltbp2_expression = adata_B[:, adata_B.var_names == 'LTBP2'].X

# If 'ltbp2_expression' is in a sparse matrix format, convert it to a dense format
if scipy.sparse.issparse(ltbp2_expression):
    ltbp2_expression_dense = ltbp2_expression.toarray().flatten()
else:
    ltbp2_expression_dense = ltbp2_expression.flatten()

# Create a DataFrame for plotting (one row per cell)
df = pd.DataFrame({
    'LTBP2 Expression': ltbp2_expression_dense,
    'Timepoint': adata_B.obs['timepoint']
})

# Plotting the violin plot
plt.figure(figsize=(10, 6))
sns.violinplot(x='Timepoint', y='LTBP2 Expression', data=df)
plt.title('Violin Plot of LTBP2 Expression Across Timepoints')
plt.show()


In [None]:
# Column position of 'LTBP2' among the genes of adata_B
ltbp2_index = adata_B.var_names.get_loc('LTBP2')

# Mean LTBP2 expression over all cells sampled at the 'pre' timepoint
ltbp2_pre = adata_B[adata_B.obs['timepoint'] == 'pre', ltbp2_index].X
if scipy.sparse.issparse(ltbp2_pre):
    ltbp2_pre_dense = ltbp2_pre.toarray()
else:
    ltbp2_pre_dense = ltbp2_pre
pre_mean = np.mean(ltbp2_pre_dense)

# Mean LTBP2 expression over all cells sampled at the 'on' timepoint
ltbp2_on = adata_B[adata_B.obs['timepoint'] == 'on', ltbp2_index].X
if scipy.sparse.issparse(ltbp2_on):
    ltbp2_on_dense = ltbp2_on.toarray()
else:
    ltbp2_on_dense = ltbp2_on
on_mean = np.mean(ltbp2_on_dense)

print("Pre Mean:", pre_mean)
print("On Mean:", on_mean)


In [None]:
import numpy as np
import scipy.sparse

# Find the expression data for 'LTBP2'
ltbp2_expression = adata_B[:, adata_B.var_names == 'LTBP2'].X

# If the expression data is in a sparse format, convert it to a dense format
if scipy.sparse.issparse(ltbp2_expression):
    ltbp2_expression_dense = ltbp2_expression.toarray()
else:
    ltbp2_expression_dense = ltbp2_expression

# Calculate the mean expression of 'LTBP2'
mean_ltbp2_expression = np.mean(ltbp2_expression_dense)

print("Mean expression of LTBP2:", mean_ltbp2_expression)


In [None]:
ltbp2_expression

In [None]:
def paird_ttest1(adata, condition_key = None, sample_id_col = None, patient_id_col = None, pval_cutoff = 0.05, log2fc_cutoff = 1):
    '''
        Diagnostic helper for the paired comparison: builds, per patient, the
        expression profile under each of the two conditions and, for the gene
        'LTBP2' only, prints the two condition means and value vectors and
        draws a violin plot. No test statistic is computed.

        Only patients that have BOTH conditions contribute. (Previously
        patients lacking one condition were left as all-zero rows in both
        vectors, which pulled the printed means and the violins towards zero.)

        Parameters:
            adata: AnnData object; converted with scRNA2PseudoBulkAnnData when
                uns has no 'pseudoBulk' key. adata.raw must be set; values are
                read from adata.raw.X.
            condition_key: obs column holding exactly two distinct conditions.
            sample_id_col: obs column with the sample id (used for the
                pseudo-bulk conversion).
            patient_id_col: obs column with the patient id used for pairing.
            pval_cutoff, log2fc_cutoff: accepted for signature compatibility;
                not used by this diagnostic.

        Returns:
            None. Prints a message and returns early on invalid input.

        Raises:
            ValueError: if a patient/condition pair does not select exactly one
                row of adata.raw.X.
    '''
    # check inputs
    if not isinstance(adata, ad.AnnData):
        print ("Input adata is not an AnnData object")
        return None
    if not condition_key:
        print ("Condition key not provided")
        return None
    # check if condition to compare is binary
    if len(adata.obs[condition_key].unique()) != 2:
        print ("Condition to compare is not binary")
        return None
    if not sample_id_col:
        print ("sample id column not provided")
        return None
    if not patient_id_col:
        print ("patient id column not provided")
        return None
    # check if adata have raw data
    if not adata.raw:
        print ("adata.raw is not available")
        return None

    # assume data is already pseudo bulk, check
    if 'pseudoBulk' not in adata.uns.keys():
        print ("Input adata is not pseudo-bulk RNA data. Convert to pseudo-bulk RNA data.")
        adata = scRNA2PseudoBulkAnnData(adata, sample_id_col=sample_id_col)
        if adata is None:
            # the conversion already printed why it failed
            return None

    condition_labels = adata.obs[condition_key]
    patient_labels = adata.obs[patient_id_col]

    conditions = condition_labels.unique()
    if len(conditions) != 2:
        print ("Number of conditions is not 2")
        return None
    condition1, condition2 = conditions[0], conditions[1]

    # values come from adata.raw.X, so gene names/count must come from raw too
    gene_names = adata.raw.var_names
    nGenes = len(gene_names)

    def _profile(patient, condition):
        # single pseudo-bulk row of this patient under this condition, as a dense 1-D vector
        mask = ((patient_labels == patient) & (condition_labels == condition)).to_numpy()
        selected = adata.raw.X[mask, :]
        if hasattr(selected, 'toarray'):
            selected = selected.toarray()
        # reshape raises ValueError unless exactly one row was selected
        return np.asarray(selected, dtype=np.float32).reshape(nGenes)

    paired_profiles = []
    for patient in patient_labels.unique():
        # skip patients that do not have both conditions
        if len(condition_labels[patient_labels == patient].unique()) < 2:
            continue
        paired_profiles.append((_profile(patient, condition1), _profile(patient, condition2)))

    if not paired_profiles:
        print ("No patient has both conditions")
        return None

    # X[p, 0, :] / X[p, 1, :]: paired patient p under condition 1 / condition 2
    X = np.array(paired_profiles, dtype=np.float32)

    for i in np.flatnonzero(gene_names == 'LTBP2'):
        x_1 = X[:, 0, i]
        x_2 = X[:, 1, i]

        # nothing to show when the gene is absent in either condition
        if np.sum(x_1) == 0 or np.sum(x_2) == 0:
            continue

        print(np.mean(x_1))
        print(np.mean(x_2))
        print(x_1)
        print(x_2)

        df = pd.DataFrame({
            'Expression': np.concatenate([x_1, x_2]),
            'Condition': ['Condition1'] * len(x_1) + ['Condition2'] * len(x_2)
        })

        # Plotting the violin plot
        plt.figure(figsize=(8, 6))
        sns.violinplot(x='Condition', y='Expression', data=df)
        plt.title('Violin Plot of LTBP2 Expression')
        plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
q_val_threshold = 0.2
pval_threshold = 1
adata_B = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
cluster_deg_df = paird_ttest(adata_B, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 

In [None]:

# Number of raw rows for patient BIOKEY_12 at the on-treatment timepoint.
# Timepoint labels are lower-case ('pre' / 'on') everywhere else in this notebook,
# so 'On' selected nothing; and len() is not defined for scipy sparse matrices,
# whereas .shape[0] works for both sparse and dense raw.X.
print(adata_B.raw.X[(adata_B.obs['patient_id'] == 'BIOKEY_12') & (adata_B.obs['timepoint'] == 'on'), :].shape[0])

In [None]:
# Filter the data and get the shape
filtered_data_shape = adata_B.raw.X[(adata_B.obs['patient_id'] == 'BIOKEY_1') & (adata_B.obs['timepoint'] == 'pre'), :].shape

# Print the number of rows
print(filtered_data_shape[0])


In [None]:
adata_B.obs

In [None]:
genes_to_keep = [
    "TNFSF8", "AC025164.1", "SOCS1", "CEMIP2", "KLHL14", "SLC15A4", "SESN1",
    "CCL4", "TLE1", "CD200", "KLF2", "SLF2", "IFIT1", "AL158850.1", "RGS1",
    "SPINK2", "FUT7", "CCDC6", "CNKSR2", "PAQR3", "KLF9", "PLPP3", "GPR25",
    "RNF144B", "NCK2", "MFSD14C", "NHLRC2", "COL1A1", "ICAM1", "LACTB2-AS1",
    "CENPM", "PDE4D", "HLX", "ARL15", "MTF1", "LMNA", "NRL", "TNFRSF10D",
    "TFAP2C", "IGHD", "FER", "MYC", "HMGB2", "EGR1", "MHENCR", "SPAG5",
    "GALNTL6", "COL1A2", "GPR183", "LINC01480", "DNAJB1", "TSPAN33", "ATG7",
    "HSPA1B", "KLF4", "RSAD2", "ISG15", "AL031733.2", "TLR1", "IGHV3-33",
    "ERCC6", "ACKR3", "ERV3-1", "AZIN1-AS1", "SERHL2", "LILRB2", "TESK1",
    "WDR4", "WBP4", "EMP2", "FBXO15", "CDCA5", "ZBTB32", "REXO5", "THRB",
    "CD2", "PDK4", "LINC02422", "AC013400.1", "SEMA7A", "HAPLN3", "AP1AR",
    "AL627171.1", "AL139089.1", "AC120193.1", "SLC7A6", "RAB37", "RRM2",
    "AF117829.1", "ADAM8", "LPCAT2", "CHRNA1", "BTG1", "LAMB1", "CRIP2",
    "GPR34", "ALKBH1", "CCNA2", "SGCA", "NANP", "ARL5B", "LINC00910", "RUFY2",
    "FAM30A", "STAG3", "OLFM2", "MMP2", "COQ2", "CD72", "HACE1", "OAS3",
    "AL353194.1", "AC090061.1", "CXCL2", "USP44", "MGAT4B", "LINC01685",
    "AL662796.1", "SFI1", "MATK", "TLR7", "NLRP6", "AL121658.1", "SULF1",
    "ARHGEF9", "LINC01588", "BICRAL", "SLC38A11", "CENPH", "SMG6", "CXCR3",
    "CHPT1", "SYNGAP1", "SIGLEC14", "AC022098.4", "APOE", "HSPA1A", "FAS",
    "HIST1H4F", "CEP55", "AC012640.2", "PLK2", "AHNAK", "KIF20A", "PLK1",
    "RASAL1", "ESPL1", "TMEM67", "RAB39A", "DNAAF1", "KCNQ5", "P3H1",
    "SLC12A7", "COL8A1", "HSPA2", "AC103831.1", "ARRDC3-AS1", "CD3D", "KYAT1",
    "RHOB", "HEATR6", "DNAH14", "HIST1H2AK", "GEM", "HBA1", "MAFB", "C2CD3",
    "DUSP2", "ZNF816-ZNF321P", "MTMR9", "TRAF3IP2-AS1", "CCR7", "CXCL10",
    "IGKV1-13", "SATB1", "CR2", "CNTNAP2", "H1FX", "VTI1A", "LINC01678",
    "ENDOV", "AL033527.3", "VCL", "PXDNL", "MICALL1", "WASHC5", "TMEM220-AS1",
    "CAVIN1", "SLC39A10", "AL359643.3", "GALNT3", "FILIP1L", "FBN1",
    "LINC01781", "ZNF683", "FXYD3", "AC105285.1", "NGFR", "TMEM161B-AS1",
    "FN1"
]
# Filter the AnnData object to keep only the specified genes
adata_filtered = adata_B[:, adata_B.var_names.isin(genes_to_keep)].copy()


In [None]:
# Obtain the DEGs of the selected B cells group
q_val_threshold = 0.2
pval_threshold = 1
cluster_deg_df = paird_ttest(adata_filtered, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_selected_B_cell_test.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
# Obtain the DEGs of the whole B cells group
q_val_threshold = 0.2
pval_threshold = 1
adata_B = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
cluster_deg_df = paird_ttest(adata_B, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id') 
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_B_cell_test.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
# generate a pseudo-bulk AnnData object of the T cells 
adata_T_pseudo = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')
print(adata_T_pseudo.shape) 
# save it as a csv file use to_csv
adata_T_pseudo.to_df().to_csv('/home/qiuaodon/Desktop/EGAS00001004809_T_cell_pseudo.csv')





In [None]:
adata_T_pseudo.shape

In [None]:
adata_T_pseudo

## get the DEGs of the whole T cells and other types of cells

In [None]:
#get the DEGs of the whole T cells
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_T, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_T_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)


### get the DEG of expansion and NE T cells

In [None]:
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

In [None]:
adata_T_E = adata_T[adata_T.obs['expansion'] == 'E'].copy()
adata_T_NE = adata_T[adata_T.obs['expansion'] == 'NE'].copy()

In [None]:
adata_T_E.obs['patient_id'].unique()

In [None]:
adata_T_NE.obs['patient_id'].unique()

In [None]:
adata_T_E 

In [None]:
#get the DEGs of the E T cells
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_T_E, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_T_cell_with_expansion.xlsx') as writer:
    cluster_deg_df.to_excel(writer)


In [None]:
#get the DEGs of the NE (non-expanded) T cells
# NOTE(review): q_val_threshold is assigned but not used in this cell.
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_T_NE, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_T_cell_without_expansion.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
#get the DEGs of the whole Myeloid cells
adata_M = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_M, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_M_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
#get the DEGs of the whole Endothelial cells
# Loaded under its own name so that the T-cell object `adata_T` is not silently
# replaced by endothelial data for any cell that is run afterwards.
adata_Endo = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Endo_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_Endo, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_Endo_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
#get the DEGs of the whole Epi cells
# Loaded under its own name so that the T-cell object `adata_T` is not silently
# replaced by epithelial data for any cell that is run afterwards.
adata_Epi = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Epi_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_Epi, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_Epi_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
#get the DEGs of the whole Fibro cells
# Loaded under its own name so that the T-cell object `adata_T` is not silently
# replaced by fibroblast data for any cell that is run afterwards.
adata_Fibro = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Fibro_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_Fibro, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_Fibro_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

In [None]:
#get the DEGs of the whole M cells
# Loaded as `adata_M` (not `adata_T`) so the T-cell object is not silently
# replaced by myeloid data.
# NOTE(review): this repeats the myeloid analysis of an earlier cell and writes
# the same output file again -- one of the two cells is probably redundant.
adata_M = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad')
q_val_threshold = 0.2
pval_threshold = 0.05
cluster_deg_df = paird_ttest(adata_M, condition_key = 'timepoint', sample_id_col = 'sample_id', patient_id_col = 'patient_id')
cluster_deg_df = cluster_deg_df[cluster_deg_df['pval'] < pval_threshold]
with pd.ExcelWriter('/home/qiuaodon/Desktop/EGAS00001004809_whole_M_cell.xlsx') as writer:
    cluster_deg_df.to_excel(writer)

## Get the bilinear regression of five genes of interest

In [None]:
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')

print(str(adata_T.shape))
# check if CD274 is in the var_names
is_present = "CD274" in adata_T.var_names
print(is_present)

In [None]:
# Rebuild adata_T on the raw matrix / raw gene table. The embeddings stored in
# .obsm (e.g. the UMAP coordinates) are carried over explicitly: constructing an
# AnnData from X, obs and var alone drops them, and the sc.pl.umap calls on
# adata_T in the following cells need them.
adata_T  = ad.AnnData(X=adata_T.raw.X, obs=adata_T.obs, var=adata_T.raw.var, obsm=dict(adata_T.obsm))
adata_T.raw = adata_T
print(str(adata_T.shape))

In [None]:
# generate a pseudo-bulk AnnData object of the T cells 
adata_T_pseudo = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')
print(adata_T_pseudo.shape) 

In [None]:
genes_of_interest = ['TXNIP','DDIT4','TSC22D3', 'HAVCR2', 'PRDM1', 'CXCR4', 'CTLA4','CXCL13','TIGIT']
# scanpy opens its own figure for each UMAP, so calling plt.figure(figsize=...)
# beforehand only left an empty 4x3 figure behind per gene. Set the size through
# rcParams for the duration of the loop instead.
with plt.rc_context({'figure.figsize': (4, 3)}):
    for gene in genes_of_interest:
        sc.pl.umap(adata_T, color=gene)

In [None]:
# filter the adata_T to Pre_treatment
condition = adata_T.obs['timepoint'] == 'pre'
adata_T_pre = adata_T[condition].copy()
condition = adata_T.obs['timepoint'] == 'on'
adata_T_on = adata_T[condition].copy()


In [None]:
for gene in genes_of_interest:
    sc.pl.umap(adata_T_pre, color=gene)

In [None]:
for gene in genes_of_interest:
    sc.pl.umap(adata_T_on, color=gene)

In [None]:
# genes_of_interest is a list of gene names

for gene in genes_of_interest:
    sc.pl.scatter(adata_T_pseudo, x='PDCD1', y=gene, title=f'PDCD1 vs {gene}', color='timepoint', show=False, color_map= 'viridis',size=200)


In [None]:
adata = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_whole_cells.h5ad')
adata  = ad.AnnData(X=adata.raw.X, obs=adata.obs, var=adata.raw.var)
adata.raw = adata
adata_pseudo = scRNA2PseudoBulkAnnData(adata, sample_id_col='sample_id')


In [None]:
# CD274 violin per timepoint on the whole-cell pseudo-bulk object, drawn
# twice: first with the strip-plot dots enlarged (size=5), then with the
# default dot size.
pseudo_violin_opts = dict(groupby='timepoint', stripplot=True, jitter=True, multi_panel=True)
sc.pl.violin(adata_pseudo, 'CD274', size=5, **pseudo_violin_opts)
sc.pl.violin(adata_pseudo, 'CD274', **pseudo_violin_opts)

In [None]:
# Load the Epi-cell object, rebuild it on the matrix/gene table kept in .raw,
# and derive its pseudo-bulk counterpart.
epi_cells_path = data_dir_NHDP + '1863-counts_cells_cohort1_Epi_cells.h5ad'
adata_Epi = sc.read(epi_cells_path)
adata_Epi = ad.AnnData(
    X=adata_Epi.raw.X,
    obs=adata_Epi.obs,
    var=adata_Epi.raw.var,
)
adata_Epi.raw = adata_Epi
adata_Epi_pseudo = scRNA2PseudoBulkAnnData(adata_Epi, sample_id_col='sample_id')
print(adata_Epi_pseudo.shape)

In [None]:
# CD274 in the Epi pseudo-bulk object, grouped by timepoint, with enlarged dots.
sc.pl.violin(adata_Epi_pseudo, 'CD274', groupby='timepoint', stripplot=True, jitter=True, multi_panel=True, size=5)

In [None]:
# Load the M-cell object, rebuild it on the matrix/gene table kept in .raw,
# derive its pseudo-bulk counterpart and plot CD274 per timepoint.
m_cells_path = data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad'
adata_M = sc.read(m_cells_path)
adata_M = ad.AnnData(
    X=adata_M.raw.X,
    obs=adata_M.obs,
    var=adata_M.raw.var,
)
adata_M.raw = adata_M
adata_M_pseudo = scRNA2PseudoBulkAnnData(adata_M, sample_id_col='sample_id')
print(adata_M_pseudo.shape)
sc.pl.violin(
    adata_M_pseudo,
    'CD274',
    groupby='timepoint',
    stripplot=True,
    jitter=True,
    multi_panel=True,
    size=5,
)

In [None]:
# Start the combined object as an independent copy of the T-cell pseudo-bulk
# data. A plain assignment would only give adata_T_pseudo a second name, so
# the CD274 column written in the next cell would also land on adata_T_pseudo.
# The CD274 values are then replaced with those of adata_pseudo.
adata_pseudo_combined = adata_T_pseudo.copy()


In [None]:
# Store the CD274 values of adata_pseudo as an .obs column on the combined object.
# The guard tests adata_pseudo, which is the object indexed below and the one
# named in the message.
# NOTE(review): values are copied by position, so this assumes adata_pseudo and
# adata_pseudo_combined list their samples in the same order — TODO confirm.
if 'CD274' in adata_pseudo.var_names:
    adata_pseudo_combined.obs['CD274'] = adata_pseudo[:, 'CD274'].X.flatten()
else:
    print("Column 'CD274' does not exist in 'adata_pseudo'.")


In [None]:
# Print the AnnData summary to confirm the new 'CD274' entry shows up under obs.
print(adata_pseudo_combined)

In [None]:
# PDCD1 against CD274 on the combined pseudo-bulk object, coloured by timepoint.
# NOTE(review): 'CD274' may exist both as a variable name and as an .obs column
# on this object; which one scanpy picks is not visible here — verify.
sc.pl.scatter(
    adata_pseudo_combined,
    x='PDCD1',
    y='CD274',
    title='PDCD1 vs CD274',
    color='timepoint',
    show=False,
    color_map='viridis',
    size=200,
)

In [None]:
# Per-timepoint violin for each gene of interest on the T-cell pseudo-bulk
# data, one figure per gene, followed by the same plot for PDCD1.
t_pseudo_violin_opts = dict(groupby='timepoint', stripplot=True, jitter=True, multi_panel=True)
for gene in genes_of_interest:
    sc.pl.violin(adata_T_pseudo, gene, **t_pseudo_violin_opts)
sc.pl.violin(adata_T_pseudo, 'PDCD1', **t_pseudo_violin_opts)

In [None]:
# Display the (n_obs, n_vars) shape of the T-cell pseudo-bulk object.
adata_T_pseudo.shape

In [None]:
# Confirm PDCD1 is among the pseudo-bulk variable names before it is indexed
# by name in the cells below.
is_present = "PDCD1" in adata_T_pseudo.var_names
print(is_present)

### Plot genes of interest against the PDCD1 × CD274 ligand–receptor (LR) product

In [None]:
# Scatter the PDCD1 x CD274 product (the "LR" axis) against PRDM1 across the
# T-cell pseudo-bulk samples, one colour per timepoint.
# pandas, matplotlib and numpy are already imported in the notebook's first cell.
target_gene = 'PRDM1'
on_scale = 0.2  # factor applied to the LR product of 'on' samples

pdc1_expression = adata_T_pseudo[:, 'PDCD1'].X
cd274_expression = adata_T_pseudo[:, 'CD274'].X
target_expression = adata_T_pseudo[:, target_gene].X
timepoints = adata_T_pseudo.obs['timepoint']

# Per-sample product (assumes .X is a dense ndarray — TODO confirm); rows of
# 'on' samples are then scaled. A mask with no True entry leaves the array
# untouched, so no separate "is 'on' present" check is needed.
pdc1_cd274_product = pdc1_expression * cd274_expression
pdc1_cd274_product[(timepoints == 'on').to_numpy()] *= on_scale

# Long-form table for plotting: one row per pseudo-bulk sample.
plot_data = pd.DataFrame({
    'PDCD1_CD274': np.ravel(pdc1_cd274_product),
    target_gene: np.ravel(target_expression),
    'timepoint': timepoints
})

plt.figure(figsize=(8, 5))
for timepoint in ['pre', 'on']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    plt.scatter(subset['PDCD1_CD274'], subset[target_gene], label=timepoint, alpha=1.0)

plt.xlabel(f'PDCD1 * CD274 (scaled by {on_scale} for "on" timepoint)')
plt.ylabel(target_gene)
plt.title(f'LR vs {target_gene}')
plt.legend()
plt.show()


In [None]:
# Scatter the PDCD1 x CD274 product (the "LR" axis) against CXCR4 across the
# T-cell pseudo-bulk samples, one colour per timepoint.
# pandas, matplotlib and numpy are already imported in the notebook's first cell.
target_gene = 'CXCR4'
on_scale = 0.2  # factor applied to the LR product of 'on' samples

pdc1_expression = adata_T_pseudo[:, 'PDCD1'].X
cd274_expression = adata_T_pseudo[:, 'CD274'].X
target_expression = adata_T_pseudo[:, target_gene].X
timepoints = adata_T_pseudo.obs['timepoint']

# Per-sample product (assumes .X is a dense ndarray — TODO confirm); rows of
# 'on' samples are then scaled. A mask with no True entry leaves the array
# untouched, so no separate "is 'on' present" check is needed.
pdc1_cd274_product = pdc1_expression * cd274_expression
pdc1_cd274_product[(timepoints == 'on').to_numpy()] *= on_scale

# Long-form table for plotting: one row per pseudo-bulk sample.
plot_data = pd.DataFrame({
    'PDCD1_CD274': np.ravel(pdc1_cd274_product),
    target_gene: np.ravel(target_expression),
    'timepoint': timepoints
})

plt.figure(figsize=(8, 5))
for timepoint in ['pre', 'on']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    plt.scatter(subset['PDCD1_CD274'], subset[target_gene], label=timepoint, alpha=1.0)

plt.xlabel(f'PDCD1 * CD274 (scaled by {on_scale} for "on" timepoint)')
plt.ylabel(target_gene)
plt.title(f'LR vs {target_gene}')
plt.legend()
plt.show()


In [None]:
# Scatter the PDCD1 x CD274 product (the "LR" axis) against CXCL13 across the
# T-cell pseudo-bulk samples, one colour per timepoint.
# pandas, matplotlib and numpy are already imported in the notebook's first cell.
target_gene = 'CXCL13'
on_scale = 0.2  # factor applied to the LR product of 'on' samples

pdc1_expression = adata_T_pseudo[:, 'PDCD1'].X
cd274_expression = adata_T_pseudo[:, 'CD274'].X
target_expression = adata_T_pseudo[:, target_gene].X
timepoints = adata_T_pseudo.obs['timepoint']

# Per-sample product (assumes .X is a dense ndarray — TODO confirm); rows of
# 'on' samples are then scaled. A mask with no True entry leaves the array
# untouched, so no separate "is 'on' present" check is needed.
pdc1_cd274_product = pdc1_expression * cd274_expression
pdc1_cd274_product[(timepoints == 'on').to_numpy()] *= on_scale

# Long-form table for plotting: one row per pseudo-bulk sample.
plot_data = pd.DataFrame({
    'PDCD1_CD274': np.ravel(pdc1_cd274_product),
    target_gene: np.ravel(target_expression),
    'timepoint': timepoints
})

plt.figure(figsize=(8, 5))
for timepoint in ['pre', 'on']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    plt.scatter(subset['PDCD1_CD274'], subset[target_gene], label=timepoint, alpha=1.0)

plt.xlabel(f'PDCD1 * CD274 (scaled by {on_scale} for "on" timepoint)')
plt.ylabel(target_gene)
plt.title(f'LR vs {target_gene}')
plt.legend()
plt.show()


In [None]:
# Scatter the PDCD1 x CD274 product (the "LR" axis) against CTLA4 across the
# T-cell pseudo-bulk samples, one colour per timepoint.
# pandas, matplotlib and numpy are already imported in the notebook's first cell.
target_gene = 'CTLA4'
on_scale = 0.2  # factor applied to the LR product of 'on' samples

pdc1_expression = adata_T_pseudo[:, 'PDCD1'].X
cd274_expression = adata_T_pseudo[:, 'CD274'].X
target_expression = adata_T_pseudo[:, target_gene].X
timepoints = adata_T_pseudo.obs['timepoint']

# Per-sample product (assumes .X is a dense ndarray — TODO confirm); rows of
# 'on' samples are then scaled. A mask with no True entry leaves the array
# untouched, so no separate "is 'on' present" check is needed.
pdc1_cd274_product = pdc1_expression * cd274_expression
pdc1_cd274_product[(timepoints == 'on').to_numpy()] *= on_scale

# Long-form table for plotting: one row per pseudo-bulk sample.
plot_data = pd.DataFrame({
    'PDCD1_CD274': np.ravel(pdc1_cd274_product),
    target_gene: np.ravel(target_expression),
    'timepoint': timepoints
})

plt.figure(figsize=(8, 5))
for timepoint in ['pre', 'on']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    plt.scatter(subset['PDCD1_CD274'], subset[target_gene], label=timepoint, alpha=1.0)

plt.xlabel(f'PDCD1 * CD274 (scaled by {on_scale} for "on" timepoint)')
plt.ylabel(target_gene)
plt.title(f'LR vs {target_gene}')
plt.legend()
plt.show()


### draw the violin plot of scRNA data

In [None]:
# Cell-level violins (no strip-plot dots) of each gene of interest, grouped by
# timepoint. Note that genes_of_interest is redefined here with a different
# panel than the one used for the UMAPs earlier.
genes_of_interest = ['PDCD1', 'CD274', 'CD8A', 'HAVCR2', 'PRDM1', 'CXCR4', 'CTLA4','CXCL13','TIGIT']
cell_violin_opts = dict(groupby='timepoint', stripplot=False, jitter=True, multi_panel=True)
for gene in genes_of_interest:
    sc.pl.violin(adata_T, gene, **cell_violin_opts)


In [None]:
# PDCD1 on its own, with the same settings as the per-gene violin loop.
sc.pl.violin(adata_T, 'PDCD1', groupby='timepoint', stripplot=False, jitter=True, multi_panel=True)

In [None]:
# Keep only the cells whose PDCD1 value in .X is strictly positive, as an
# independent copy; print the filtered and the full shape side by side.
pdcd1_positive = adata_T[:, 'PDCD1'].X > 0
adata_T_filtered = adata_T[pdcd1_positive, :].copy()
print(adata_T_filtered.shape, adata_T.shape)


In [None]:
# plot the violin plot of scRNA data of genes of interest of cells expressing PDCD1
# (adata_T_filtered is the PDCD1 > 0 subset built in the previous cell)
for gene in genes_of_interest:
    sc.pl.violin(adata_T_filtered, gene, groupby='timepoint', stripplot=False, jitter=True, multi_panel=True)

## correlation of top 10 DEGs

### top 10 DEGs T & B
T:
PRDM1
SLA
TSC22D3
IRF1
TXNIP
DDIT4
NFKBIA
ZFP36L1
FKBP5
RGS1
B:
TNFSF8
AC025164.1
SOCS1
CEMIP2
KLHL14
SLC15A4
SESN1
CCL4
TLE1
CD200

In [None]:
# Load the B-cell object saved alongside the other per-cell-type files.
adata_B = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_B_cells.h5ad')
# generate a pseudo-bulk AnnData object of the B cells
adata_B_pseudo = scRNA2PseudoBulkAnnData(adata_B, sample_id_col='sample_id')
print(adata_B_pseudo.shape) 

In [None]:
# Exclude every pseudo-bulk sample of patient BIOKEY_22 from both the T-cell
# and the B-cell objects, keeping independent copies of what remains.
excluded_patient = 'BIOKEY_22'

condition = adata_T_pseudo.obs['patient_id'] != excluded_patient
adata_T_pseudo_filtered = adata_T_pseudo[condition].copy()
print(adata_T_pseudo_filtered.shape)

condition = adata_B_pseudo.obs['patient_id'] != excluded_patient
adata_B_pseudo_filtered = adata_B_pseudo[condition].copy()
print(adata_B_pseudo_filtered.shape)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

# Gene panels: rows of the matrices are T-cell genes, columns are B-cell genes.
top_10_DEGs_T = ['PRDM1', 'SLA', 'TSC22D3', 'IRF1', 'TXNIP', 'DDIT4', 'NFKBIA', 'ZFP36L1', 'FKBP5', 'RGS1']
top_10_DEGs_B = ['TNFSF8', 'AC025164.1', 'SOCS1', 'CEMIP2', 'KLHL14', 'SLC15A4', 'SESN1', 'CCL4', 'TLE1', 'CD200']

# corr_matrix[i, j] holds Pearson r for (T gene i, B gene j);
# pval_matrix[i, j] holds -log10 of the matching p-value.
matrix_shape = (len(top_10_DEGs_T), len(top_10_DEGs_B))
corr_matrix = np.zeros(matrix_shape)
pval_matrix = np.zeros(matrix_shape)

# NOTE(review): the two vectors are paired by position, which assumes both
# filtered pseudo-bulk objects list their samples in the same order — verify.
for i, gene_T in enumerate(top_10_DEGs_T):
    for j, gene_B in enumerate(top_10_DEGs_B):
        t_gene_mask = adata_T_pseudo_filtered.var_names == gene_T
        b_gene_mask = adata_B_pseudo_filtered.var_names == gene_B
        gene_T_expression = adata_T_pseudo_filtered[:, t_gene_mask].X.flatten()
        gene_B_expression = adata_B_pseudo_filtered[:, b_gene_mask].X.flatten()

        corr, pval = stats.pearsonr(gene_T_expression, gene_B_expression)
        corr_matrix[i, j] = corr
        pval_matrix[i, j] = -np.log10(pval)


# Heatmap of the coefficients on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_B,
    yticklabels=top_10_DEGs_T,
)
plt.title('Correlation between Top 10 DEGs of T and B Cells')
plt.xlabel('B Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()



In [None]:

# Same heatmap as above, with each cell labelled by hand: the correlation on
# the first line and the pval_matrix entry (-log10 p) in parentheses below it.
plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_B,
    yticklabels=top_10_DEGs_T,
    annot=False,  # seaborn's own annotation stays off; text is added below
)

# Heatmap cell (i, j) is centred at x = j + 0.5, y = i + 0.5.
for i, j in np.ndindex(*corr_matrix.shape):
    cell_label = f'{corr_matrix[i, j]:.2f}\n({pval_matrix[i, j]:.2f})'
    plt.text(j + 0.5, i + 0.5, cell_label,
             horizontalalignment='center', verticalalignment='center', fontsize=8)

plt.title('Correlation between Top 10 DEGs of T and B Cells\n(Correlation / -log(p-value))')
plt.xlabel('B Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()


In [None]:
# transform correlation matrix to dataframe
# (rows labelled with the T-cell genes, columns with the B-cell genes)
corr_df = pd.DataFrame(corr_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_B)
corr_df


In [None]:
# transform pvalue matrix to dataframe
# (for the T-vs-B comparison pval_matrix stores -log10(p), not raw p-values)
pval_df = pd.DataFrame(pval_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_B)
pval_df

In [None]:
import os

# Per-pair scatter plots of T-cell vs B-cell gene values; every figure is
# shown, but only pairs with correlation above 0.5 are written to disk.
output_dir = 'DEGs_T_vs_B'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# The BIOKEY_22-filtered objects are used so both sides cover the same patients.
timepoint_col = adata_T_pseudo_filtered.obs['timepoint']

for gene_T in top_10_DEGs_T:
    for gene_B in top_10_DEGs_B:
        pair_corr = corr_matrix[top_10_DEGs_T.index(gene_T), top_10_DEGs_B.index(gene_B)]

        plt.figure(figsize=(8, 6))

        # One scatter series per timepoint category. The mask comes from the
        # T-cell object and is reused on the B-cell object.
        # NOTE(review): this assumes both objects share the same sample order.
        for timepoint in timepoint_col.cat.categories:
            indices = timepoint_col == timepoint
            t_gene_mask = adata_T_pseudo_filtered.var_names == gene_T
            b_gene_mask = adata_B_pseudo_filtered.var_names == gene_B
            gene_T_expression = adata_T_pseudo_filtered[indices, t_gene_mask].X.flatten()
            gene_B_expression = adata_B_pseudo_filtered[indices, b_gene_mask].X.flatten()
            plt.scatter(gene_T_expression, gene_B_expression, label=timepoint)

        plt.xlabel(gene_T)
        plt.ylabel(gene_B)
        plt.title(f'{gene_T} vs {gene_B} (Corr: {pair_corr:.2f})')
        plt.legend(title='Timepoint')

        # Save before showing, and only for the pairs above the threshold.
        if pair_corr > 0.5:
            plt.savefig(f'{output_dir}/{gene_T}_vs_{gene_B}.png')

        plt.show()


### top 10 DEGs T & Endo

In [None]:
# Load the Endo-cell object saved alongside the other per-cell-type files.
adata_Endo = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Endo_cells.h5ad')
# generate a pseudo-bulk AnnData object of the Endo cells 
adata_Endo_pseudo = scRNA2PseudoBulkAnnData(adata_Endo, sample_id_col='sample_id')
print(adata_Endo_pseudo.shape) 

In [None]:
# show all the sample_ids in the Endo cells
adata_Endo_pseudo.obs['sample_id'].unique()
# Observation recorded by the author: there is no sample_id BIOKEY_28_On in the
# Endo cells, which is why patient BIOKEY_28 is removed in the next cells.

In [None]:
# Exclude every Endo pseudo-bulk sample of patient BIOKEY_28 and keep an
# independent copy of the remainder.
keep_endo = adata_Endo_pseudo.obs['patient_id'] != 'BIOKEY_28'
condition = keep_endo
adata_Endo_pseudo_filtered = adata_Endo_pseudo[condition].copy()
print(adata_Endo_pseudo_filtered.shape)


In [None]:
# Apply the same BIOKEY_28 exclusion to the T-cell pseudo-bulk data.
# This overwrites adata_T_pseudo_filtered, which previously held the
# BIOKEY_22-filtered object used for the T-vs-B comparison.
condition = adata_T_pseudo.obs['patient_id'] != 'BIOKEY_28'
adata_T_pseudo_filtered = adata_T_pseudo[condition].copy()
print(adata_T_pseudo_filtered.shape)

In [None]:

# Gene panels for the T-vs-Endo comparison (the Endo list holds nine genes).
top_10_DEGs_T = ['PRDM1', 'SLA', 'TSC22D3', 'IRF1', 'TXNIP', 'DDIT4', 'NFKBIA', 'ZFP36L1', 'FKBP5', 'CXCR4']
top_10_DEGs_Endo = ['SPRY1', 'DDIT4', 'PER1', 'APOLD1', 'NEDD9', 'TSC22D3', 'ITPRIP', 'DUSP1', 'ID1']

# corr_matrix[i, j]: Pearson r for (T gene i, Endo gene j).
# pval_matrix[i, j]: the raw p-value (no -log10 transform in this cell).
# Pairs skipped below keep their initial 0.
matrix_shape = (len(top_10_DEGs_T), len(top_10_DEGs_Endo))
corr_matrix = np.zeros(matrix_shape)
pval_matrix = np.zeros(matrix_shape)

for i, gene_T in enumerate(top_10_DEGs_T):
    for j, gene_Endo in enumerate(top_10_DEGs_Endo):
        t_gene_mask = adata_T_pseudo_filtered.var_names == gene_T
        endo_gene_mask = adata_Endo_pseudo_filtered.var_names == gene_Endo
        gene_T_expression = adata_T_pseudo_filtered[:, t_gene_mask].X.flatten()
        gene_Endo_expression = adata_Endo_pseudo_filtered[:, endo_gene_mask].X.flatten()

        # Vectors of different length cannot be correlated; report the pair
        # and move on.
        if len(gene_T_expression) != len(gene_Endo_expression):
            print(gene_Endo,len(gene_T_expression), len(gene_Endo_expression))
            continue

        corr, pval = stats.pearsonr(gene_T_expression, gene_Endo_expression)
        corr_matrix[i, j] = corr
        pval_matrix[i, j] = pval


# Heatmap of the coefficients on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_Endo,
    yticklabels=top_10_DEGs_T,
)
plt.title('Correlation between Top 10 DEGs of T and Endo Cells')
plt.xlabel('Endo Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()



In [None]:
# Scatter every T-cell gene against every Endo gene across the filtered
# pseudo-bulk samples, one colour per timepoint.
# The categorical conversion is applied to adata_T_pseudo_filtered because
# that is the object whose .cat.categories are iterated below.
adata_T_pseudo_filtered.obs['timepoint'] = adata_T_pseudo_filtered.obs['timepoint'].astype('category')

for gene_T in top_10_DEGs_T:
    for gene_Endo in top_10_DEGs_Endo:
        # Create a new figure
        plt.figure(figsize=(8, 6))

        # Iterate over each timepoint
        for timepoint in adata_T_pseudo_filtered.obs['timepoint'].cat.categories:
            # Boolean mask built from the T-cell object and reused on the Endo object.
            # NOTE(review): this assumes both objects share the same sample order.
            indices = adata_T_pseudo_filtered.obs['timepoint'] == timepoint

            # Get the expression data for the current timepoint
            gene_T_expression = adata_T_pseudo_filtered[indices, adata_T_pseudo_filtered.var_names == gene_T].X.flatten()
            gene_Endo_expression = adata_Endo_pseudo_filtered[indices, adata_Endo_pseudo_filtered.var_names == gene_Endo].X.flatten()

            # Plot the data for the current timepoint
            plt.scatter(gene_T_expression, gene_Endo_expression, label=timepoint)

        # Add labels, title (with the correlation computed for this pair), and legend
        plt.xlabel(gene_T)
        plt.ylabel(gene_Endo)
        plt.title(f'{gene_T} vs {gene_Endo} (Corr: {corr_matrix[top_10_DEGs_T.index(gene_T), top_10_DEGs_Endo.index(gene_Endo)]:.2f})')
        plt.legend(title='Timepoint')

        # Now show the plot
        plt.show()

In [None]:
# check if KLF9 is in the variable names
# (membership test against the filtered Endo pseudo-bulk object)
is_present = "KLF9" in adata_Endo_pseudo_filtered.var_names
print(is_present)


### top 10 DEGs T & Fibro 
PER1
TSC22D3
DDIT4
DEPP1
DUSP1
MYC
NFKBIA
ZFP36
MT1X
KLF6

In [None]:
# Load the Fibro-cell object saved alongside the other per-cell-type files.
adata_Fibro = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Fibro_cells.h5ad')
# generate a pseudo-bulk AnnData object of the Fibro cells
adata_Fibro_pseudo = scRNA2PseudoBulkAnnData(adata_Fibro, sample_id_col='sample_id')
print(adata_Fibro_pseudo.shape) 

In [None]:
# Reload the T cells from disk and regenerate the pseudo-bulk object.
# NOTE(review): unlike the earlier T-cell section, adata_T is not rebuilt from
# .raw before the pseudo-bulk step here — confirm this difference is intended.
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
adata_T_pseudo = scRNA2PseudoBulkAnnData(adata_T, sample_id_col='sample_id')

In [None]:
# generate the correlation matrix of the top 10 DEGs of T and Fibro cells
top_10_DEGs_Fibro = ['PER1', 'TSC22D3', 'DDIT4', 'DEPP1', 'DUSP1', 'MYC', 'NFKBIA', 'ZFP36', 'MT1X', 'KLF6']
top_10_DEGs_T = ['PRDM1', 'SLA', 'TSC22D3', 'IRF1', 'TXNIP', 'DDIT4', 'NFKBIA', 'ZFP36L1', 'FKBP5', 'CXCR4']

# corr_matrix[i, j]: Pearson r for (T gene i, Fibro gene j).
# pval_matrix[i, j]: the raw p-value. Pairs skipped below keep their initial 0.
corr_matrix = np.zeros((len(top_10_DEGs_T), len(top_10_DEGs_Fibro)))
pval_matrix = np.zeros((len(top_10_DEGs_T), len(top_10_DEGs_Fibro)))

# Calculate the correlation between the top 10 DEGs of T and Fibro cells
for i, gene_T in enumerate(top_10_DEGs_T):
    for j, gene_Fibro in enumerate(top_10_DEGs_Fibro):
        # Get the expression data of the two genes and flatten them to 1D arrays
        gene_T_expression = adata_T_pseudo[:, adata_T_pseudo.var_names == gene_T].X.flatten()
        gene_Fibro_expression = adata_Fibro_pseudo[:, adata_Fibro_pseudo.var_names == gene_Fibro].X.flatten()

        # Vectors of different length cannot be correlated. Report the Fibro
        # gene of this pair (not the stale loop variable of the Endo section)
        # and skip it.
        if len(gene_T_expression) != len(gene_Fibro_expression):
            print(gene_Fibro,len(gene_T_expression), len(gene_Fibro_expression))
            continue

        # Calculate the correlation
        corr, pval = stats.pearsonr(gene_T_expression, gene_Fibro_expression)

        # Store the correlation value in the matrix
        corr_matrix[i, j] = corr
        # store the pval in another matrix
        pval_matrix[i, j] = pval


# Plot the correlation matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1,
            xticklabels=top_10_DEGs_Fibro, yticklabels=top_10_DEGs_T)
plt.title('Correlation between Top 10 DEGs of T and Fibro Cells')
plt.xlabel('Fibro Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()



In [None]:
# transform correlation matrix to dataframe
# (rows labelled with the T-cell genes, columns with the Fibro genes)
corr_df = pd.DataFrame(corr_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_Fibro)
corr_df

In [None]:
# Per-pair scatter plots of T-cell vs Fibro gene values; every figure is both
# written to the output directory and shown.
output_dir = 'DEGs_T_vs_Fibro'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Make sure the timepoint column is categorical before its categories are read.
adata_T_pseudo.obs['timepoint'] = adata_T_pseudo.obs['timepoint'].astype('category')
timepoint_col = adata_T_pseudo.obs['timepoint']

for gene_T in top_10_DEGs_T:
    for gene_Fibro in top_10_DEGs_Fibro:
        pair_corr = corr_matrix[top_10_DEGs_T.index(gene_T), top_10_DEGs_Fibro.index(gene_Fibro)]

        plt.figure(figsize=(8, 6))

        # One scatter series per timepoint category. The mask comes from the
        # T-cell object and is reused on the Fibro object.
        # NOTE(review): this assumes both objects share the same sample order.
        for timepoint in timepoint_col.cat.categories:
            indices = timepoint_col == timepoint
            t_gene_mask = adata_T_pseudo.var_names == gene_T
            fibro_gene_mask = adata_Fibro_pseudo.var_names == gene_Fibro
            gene_T_expression = adata_T_pseudo[indices, t_gene_mask].X.flatten()
            gene_Fibro_expression = adata_Fibro_pseudo[indices, fibro_gene_mask].X.flatten()
            plt.scatter(gene_T_expression, gene_Fibro_expression, label=timepoint)

        plt.xlabel(gene_T)
        plt.ylabel(gene_Fibro)
        plt.title(f'{gene_T} vs {gene_Fibro} (Corr: {pair_corr:.2f})')
        plt.legend(title='Timepoint')

        # Save before showing.
        plt.savefig(f'{output_dir}/{gene_T}_vs_{gene_Fibro}.png')
        plt.show()

### top 10 DEGs T & M
TSC22D3
DDIT4
FKBP5
AC084871.2
SLC1A3
AREG
ACSL1
RGS1
CH25H
SMIM3

In [None]:
# Load the M-cell object saved alongside the other per-cell-type files.
adata_M = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_M_cells.h5ad')
# generate a pseudo-bulk AnnData object of the M cells
adata_M_pseudo = scRNA2PseudoBulkAnnData(adata_M, sample_id_col='sample_id')
print(adata_M_pseudo.shape) 

In [None]:

# Gene panels for the T-vs-M comparison (the T list holds eight genes here).
top_10_DEGs_T = ['PRDM1', 'SLA', 'TSC22D3', 'TXNIP', 'DDIT4', 'NFKBIA', 'FKBP5', 'RGS1']
top_10_DEGs_M = ['TSC22D3', 'DDIT4', 'FKBP5', 'AC084871.2', 'SLC1A3', 'AREG', 'ACSL1', 'RGS1', 'CH25H', 'SMIM3']

# corr_matrix[i, j]: Pearson r for (T gene i, M gene j).
# pval_matrix[i, j]: -log10 of the matching p-value.
# Pairs skipped below keep their initial 0.
matrix_shape = (len(top_10_DEGs_T), len(top_10_DEGs_M))
corr_matrix = np.zeros(matrix_shape)
pval_matrix = np.zeros(matrix_shape)

for i, gene_T in enumerate(top_10_DEGs_T):
    for j, gene_M in enumerate(top_10_DEGs_M):
        t_gene_mask = adata_T_pseudo.var_names == gene_T
        m_gene_mask = adata_M_pseudo.var_names == gene_M
        gene_T_expression = adata_T_pseudo[:, t_gene_mask].X.flatten()
        gene_M_expression = adata_M_pseudo[:, m_gene_mask].X.flatten()

        # Vectors of different length cannot be correlated; report the pair
        # and move on.
        if len(gene_T_expression) != len(gene_M_expression):
            print(gene_M,len(gene_T_expression), len(gene_M_expression))
            continue

        corr, pval = stats.pearsonr(gene_T_expression, gene_M_expression)
        corr_matrix[i, j] = corr
        pval_matrix[i, j] = -np.log10(pval)


# Heatmap of the coefficients on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_M,
    yticklabels=top_10_DEGs_T,
)
plt.title('Correlation between Top DEGs of T and M Cells')
plt.xlabel('M Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()


In [None]:
# transform correlation matrix to dataframe
# (rows labelled with the T-cell genes, columns with the M-cell genes)
corr_df = pd.DataFrame(corr_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_M)
corr_df

In [None]:
# transform the pval matrix to dataframe
# (for the T-vs-M comparison pval_matrix stores -log10(p), not raw p-values)
pval_df = pd.DataFrame(pval_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_M)
pval_df

In [None]:

# T-vs-M heatmap with each cell labelled by hand: the correlation on the
# first line and the pval_matrix entry (-log10 p) in parentheses below it.
plt.figure(figsize=(12, 6))
heatmap = sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_M,
    yticklabels=top_10_DEGs_T,
    annot=False,  # seaborn's own annotation stays off; text is added below
)

# Heatmap cell (i, j) is centred at x = j + 0.5, y = i + 0.5.
for i, j in np.ndindex(*corr_matrix.shape):
    cell_label = f'{corr_matrix[i, j]:.2f}\n({pval_matrix[i, j]:.2f})'
    plt.text(j + 0.5, i + 0.5, cell_label,
             horizontalalignment='center', verticalalignment='center', fontsize=8)

plt.title('Correlation between Top  DEGs of T and M Cells\n(Correlation / -log(p-value))')
plt.xlabel('M Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()


In [None]:
# Per-pair scatter plots of T-cell vs M-cell gene values; every figure is
# shown, but only pairs with correlation above 0.5 are written to disk.
output_dir = 'DEGs_T_vs_M'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Make sure the timepoint column is categorical before its categories are read.
adata_T_pseudo.obs['timepoint'] = adata_T_pseudo.obs['timepoint'].astype('category')
timepoint_col = adata_T_pseudo.obs['timepoint']

for gene_T in top_10_DEGs_T:
    for gene_M in top_10_DEGs_M:
        pair_corr = corr_matrix[top_10_DEGs_T.index(gene_T), top_10_DEGs_M.index(gene_M)]

        plt.figure(figsize=(8, 6))

        # One scatter series per timepoint category. The mask comes from the
        # T-cell object and is reused on the M-cell object.
        # NOTE(review): this assumes both objects share the same sample order.
        for timepoint in timepoint_col.cat.categories:
            indices = timepoint_col == timepoint
            t_gene_mask = adata_T_pseudo.var_names == gene_T
            m_gene_mask = adata_M_pseudo.var_names == gene_M
            gene_T_expression = adata_T_pseudo[indices, t_gene_mask].X.flatten()
            gene_M_expression = adata_M_pseudo[indices, m_gene_mask].X.flatten()
            plt.scatter(gene_T_expression, gene_M_expression, label=timepoint)

        plt.xlabel(gene_T)
        plt.ylabel(gene_M)
        plt.title(f'{gene_T} vs {gene_M} (Corr: {pair_corr:.2f})')
        plt.legend(title='Timepoint')

        # Save before showing, and only for the pairs above the threshold.
        if pair_corr > 0.5:
            plt.savefig(f'{output_dir}/{gene_T}_vs_{gene_M}.png')

        plt.show()


### top 10 DEGs T & Epi
DDIT4
RNPC3
TSC22D3
ACOT7
NFAT5
C1orf198
PPP1R16A
TOP3B
GNG11
MIR29B2CHG

In [None]:
# Load the Epi-cell object saved alongside the other per-cell-type files.
adata_Epi = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_Epi_cells.h5ad')
# generate a pseudo-bulk AnnData object of the Epi cells
adata_Epi_pseudo = scRNA2PseudoBulkAnnData(adata_Epi, sample_id_col='sample_id')
print(adata_Epi_pseudo.shape) 

In [None]:
# Epi gene panel. top_10_DEGs_T is not redefined in this cell: whatever list
# an earlier cell left in that variable is reused.
top_10_DEGs_Epi = ['DDIT4', 'RNPC3', 'TSC22D3', 'ACOT7', 'NFAT5', 'C1orf198', 'PPP1R16A', 'TOP3B', 'GNG11', 'MIR29B2CHG']

# corr_matrix[i, j]: Pearson r for (T gene i, Epi gene j).
# pval_matrix[i, j]: -log10 of the matching p-value.
# Pairs skipped below keep their initial 0.
matrix_shape = (len(top_10_DEGs_T), len(top_10_DEGs_Epi))
corr_matrix = np.zeros(matrix_shape)
pval_matrix = np.zeros(matrix_shape)

for i, gene_T in enumerate(top_10_DEGs_T):
    for j, gene_Epi in enumerate(top_10_DEGs_Epi):
        t_gene_mask = adata_T_pseudo.var_names == gene_T
        epi_gene_mask = adata_Epi_pseudo.var_names == gene_Epi
        gene_T_expression = adata_T_pseudo[:, t_gene_mask].X.flatten()
        gene_Epi_expression = adata_Epi_pseudo[:, epi_gene_mask].X.flatten()

        # Vectors of different length cannot be correlated; report the pair
        # and move on.
        if len(gene_T_expression) != len(gene_Epi_expression):
            print(gene_Epi,len(gene_T_expression), len(gene_Epi_expression))
            continue

        corr, pval = stats.pearsonr(gene_T_expression, gene_Epi_expression)
        corr_matrix[i, j] = corr
        pval_matrix[i, j] = -np.log10(pval)


# Heatmap of the coefficients on a fixed [-1, 1] colour scale.
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_Epi,
    yticklabels=top_10_DEGs_T,
)
plt.title('Correlation between Top 10 DEGs of T and Epi Cells')
plt.xlabel('Epi Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()


In [None]:
# transform correlation matrix to dataframe
# (rows labelled with the T-cell genes, columns with the Epi genes)
corr_df = pd.DataFrame(corr_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_Epi)
corr_df

In [None]:
# transform the pval matrix to dataframe
# (for the T-vs-Epi comparison pval_matrix stores -log10(p), not raw p-values)
pval_df = pd.DataFrame(pval_matrix, index=top_10_DEGs_T, columns=top_10_DEGs_Epi)
pval_df

In [None]:

# T-vs-Epi heatmap with each cell labelled by hand: the correlation on the
# first line and the pval_matrix entry (-log10 p) in parentheses below it.
plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    xticklabels=top_10_DEGs_Epi,
    yticklabels=top_10_DEGs_T,
    annot=False,  # seaborn's own annotation stays off; text is added below
)

# Heatmap cell (i, j) is centred at x = j + 0.5, y = i + 0.5.
for i, j in np.ndindex(*corr_matrix.shape):
    cell_label = f'{corr_matrix[i, j]:.2f}\n({pval_matrix[i, j]:.2f})'
    plt.text(j + 0.5, i + 0.5, cell_label,
             horizontalalignment='center', verticalalignment='center', fontsize=8)

plt.title('Correlation between Top 10 DEGs of T and Epi Cells\n(Correlation / -log(p-value))')
plt.xlabel('Epi Cell Genes')
plt.ylabel('T Cell Genes')
plt.show()


In [None]:
# Per-pair scatter plots of T-cell vs Epi gene values; every figure is shown,
# but only pairs whose absolute correlation exceeds 0.4 are written to disk.
output_dir = 'DEGs_T_vs_Epi'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Make sure the timepoint column is categorical before its categories are read.
adata_T_pseudo.obs['timepoint'] = adata_T_pseudo.obs['timepoint'].astype('category')
timepoint_col = adata_T_pseudo.obs['timepoint']

for gene_T in top_10_DEGs_T:
    for gene_Epi in top_10_DEGs_Epi:
        pair_corr = corr_matrix[top_10_DEGs_T.index(gene_T), top_10_DEGs_Epi.index(gene_Epi)]

        plt.figure(figsize=(8, 6))

        # One scatter series per timepoint category. The mask comes from the
        # T-cell object and is reused on the Epi object.
        # NOTE(review): this assumes both objects share the same sample order.
        for timepoint in timepoint_col.cat.categories:
            indices = timepoint_col == timepoint
            t_gene_mask = adata_T_pseudo.var_names == gene_T
            epi_gene_mask = adata_Epi_pseudo.var_names == gene_Epi
            gene_T_expression = adata_T_pseudo[indices, t_gene_mask].X.flatten()
            gene_Epi_expression = adata_Epi_pseudo[indices, epi_gene_mask].X.flatten()
            plt.scatter(gene_T_expression, gene_Epi_expression, label=timepoint)

        plt.xlabel(gene_T)
        plt.ylabel(gene_Epi)
        plt.title(f'{gene_T} vs {gene_Epi} (Corr: {pair_corr:.2f})')
        plt.legend(title='Timepoint')

        # Save before showing, for strong correlations of either sign.
        if abs(pair_corr) > 0.4:
            plt.savefig(f'{output_dir}/{gene_T}_vs_{gene_Epi}.png')

        plt.show()


# plot UMAP of genes of interest

In [None]:
# Reload the T-cell object from disk for the UMAP panels and show its shape.
adata_T = sc.read(data_dir_NHDP + '1863-counts_cells_cohort1_T_cells.h5ad')
adata_T.shape

In [None]:
# UMAP panels for all T cells: cluster labels, timepoint, then the marker genes.
sc.pl.umap(adata_T, color=['leiden', 'timepoint' ,'PDCD1','HAVCR2','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

In [None]:
# List the distinct timepoint labels; the next cell filters on 'on' and 'pre'.
adata_T.obs['timepoint'].unique()

In [None]:
# Subset the T cells by timepoint label. No .copy() is taken, so both results
# are views of adata_T.
timepoint_col = adata_T.obs['timepoint']
adata_T_on = adata_T[timepoint_col == 'on']
adata_T_pre = adata_T[timepoint_col == 'pre']


In [None]:
# UMAP panels restricted to the 'on' timepoint cells.
sc.pl.umap(adata_T_on, color=['leiden', 'CD8A' ,'HAVCR2','PDCD1','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

In [None]:
# UMAP panels restricted to the 'pre' timepoint cells.
sc.pl.umap(adata_T_pre, color=['leiden', 'CD8A' ,'HAVCR2','PDCD1','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

In [None]:
# Keep only the cells whose PDCD1 value in .X is strictly positive, as an
# independent copy.
pdcd1_positive = adata_T[:, 'PDCD1'].X > 0
adata_T_filtered = adata_T[pdcd1_positive, :].copy()

In [None]:
# UMAP panels for the PDCD1 > 0 subset of T cells.
sc.pl.umap(adata_T_filtered, color=['leiden', 'CD8A' ,'HAVCR2','PDCD1','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

In [None]:
# Subset the PDCD1 > 0 cells by timepoint label. No .copy() is taken, so both
# results are views of adata_T_filtered.
filtered_timepoint_col = adata_T_filtered.obs['timepoint']
adata_T_filtered_on = adata_T_filtered[filtered_timepoint_col == 'on']
adata_T_filtered_pre = adata_T_filtered[filtered_timepoint_col == 'pre']

In [None]:
# UMAP panels for the PDCD1 > 0 cells at the 'pre' timepoint.
sc.pl.umap(adata_T_filtered_pre, color=['leiden', 'CD8A' ,'HAVCR2','PDCD1','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

In [None]:
# UMAP panels for the PDCD1 > 0 cells at the 'on' timepoint.
sc.pl.umap(adata_T_filtered_on, color=['leiden', 'CD8A' ,'HAVCR2','PDCD1','PRDM1','CXCR4', 'CXCL13', 'CTLA4' ])

# use the whole scRNA data to generate pseudo bulk data

## basic filter

In [None]:
# Read in data in H5AD format
# data_dir is an absolute path on the author's machine and is reused by the
# metadata cell below.
data_dir = "/home/data/ICI_exprs/EGAS00001004809/"
data_file =  data_dir + '1863-counts_cells_cohort1.h5ad'
adata = sc.read_h5ad(data_file)
# Report the (n_obs, n_vars) shape of the freshly loaded object.
print ("Read in dataset with dimension: " + str(adata.shape))

In [None]:
# read in the csv meta-data
# The first row is used as the header and the first column as the index.
meta_file_pathname = data_dir + "1872-BIOKEY_metaData_cohort1_web.csv"
cohort1_meta = pd.read_csv(meta_file_pathname, header = 0, index_col = 0)

In [None]:
# make sure the index of cohort1_meta agree with the obs of adata
# reindex() reorders the metadata rows to adata's cell order; cells missing
# from the CSV would get NaN rows.
cohort1_meta = cohort1_meta.reindex(adata.obs.index)
# Replace the whole .obs table with the aligned metadata.
adata.obs = cohort1_meta
print(adata.obs.columns)
print(adata.obs['patient_id'].unique())

In [None]:
# change "timepoint" values to lowercase
# (later cells compare against the lowercase labels 'pre' and 'on')
adata.obs["timepoint"] = adata.obs["timepoint"].str.lower()
adata.obs["timepoint"].unique()

In [None]:
# the field "cohort" is equivalent to "treatment", rename it
# (in-place rename of the .obs column)
adata.obs.rename(columns={'cohort': 'treatment'}, inplace=True)

In [None]:
# extract sample_id from index, re-join first three columns of index after splitting by "_"
# i.e. an index of the form a_b_c_<rest> becomes sample_id "a_b_c".
adata.obs['sample_id'] = adata.obs.index.str.split("_").str[0:3].str.join("_")

In [None]:
# Keep only the metadata columns used downstream, in a fixed order.
# Selecting these columns already discards nCount_RNA and nFeature_RNA, so no
# separate in-place drop is needed. Unlike that drop, the selection can be
# re-run on the same object without raising a KeyError.
obs_columns = ["patient_id", "sample_id", "timepoint", "treatment", 'expansion', 'BC_type', 'cellType']
adata.obs = adata.obs[obs_columns]

In [None]:
# Global scanpy settings: logging level, version header, and figure defaults.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# removing cells containing <400 || >25000 UMIs
# Two separate calls: each sc.pp.filter_cells call here is given a single threshold.
sc.pp.filter_cells(adata, min_counts = 400)
sc.pp.filter_cells(adata, max_counts = 25000)

In [None]:
# label genes as mt
# (boolean flag: gene name starts with 'MT-')
adata.var['mt'] = adata.var_names.str.startswith('MT-')  

# annotate cells with the percentage of counts that come from genes flagged as mt
# (writes pct_counts_mt into adata.obs; must run BEFORE the mt genes are removed)
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# remove mitochondrial genes from analysis
# NOTE: re-running this cell after this step recomputes pct_counts_mt on data
# that no longer contains mt genes.
adata = adata[:, ~adata.var['mt'].values]

# Here we keep cells with < 20% mito ratio
adata = adata[adata.obs['pct_counts_mt'] < 20, :]
adata.shape

In [None]:
# Store the current (QC-filtered) state as .raw before the object is saved.
adata.raw = adata

In [None]:
# Output directory (same value as set in the first cell of the notebook).
data_dir_NHDP = "/home/qiuaodon/Desktop/project_data_new/"
# Persist the filtered whole-cell object; earlier sections read this file back.
adata.write_h5ad(data_dir_NHDP + '1863-counts_cells_cohort1_whole_cells.h5ad')

## draw the plot of genes vs LR based on whole tumor scRNA data

In [None]:
# Re-assign .raw from the current object (same statement as in the basic-filter
# section; a no-op if adata has not changed since).
adata.raw = adata

In [None]:
# Pseudo-bulk object for the whole (all cell types) filtered data set.
adata_pseudo = scRNA2PseudoBulkAnnData(adata, sample_id_col='sample_id')
print(adata_pseudo.shape) 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of the PDCD1 x CD274 product ("LR" in the title) against HAVCR2,
# one point per row of the pseudo-bulk data, with one series per timepoint.
# NOTE(review): `adata_T_pseudo` is only created later in this notebook (the cell
# that calls scRNA2PseudoBulkAnnData on adata_T); that cell must be run first.
# NOTE(review): values from `adata_T_pseudo` and `adata_pseudo` are combined by
# position, which assumes both hold the same samples in the same order - TODO confirm.
on_scale = 0.2  # factor applied to the product for samples at the "on" timepoint

# Pull each gene as a plain 1-D NumPy array (densifying if .X is sparse) so that
# `*` below is element-wise and the scaling works on a standalone array.
pdcd1_expression, cd274_expression, havcr2_expression = (
    np.asarray(x.toarray() if hasattr(x, 'toarray') else x).ravel()
    for x in (
        adata_T_pseudo[:, 'PDCD1'].X,
        adata_pseudo[:, 'CD274'].X,
        adata_T_pseudo[:, 'HAVCR2'].X,
    )
)

# timepoint is a column in .obs
timepoints = adata_pseudo.obs['timepoint']
is_on = (timepoints == 'on').to_numpy()

# Element-wise product, scaled down for the "on" samples (no-op when there are none)
pdcd1_cd274_product = pdcd1_expression * cd274_expression
pdcd1_cd274_product = np.where(is_on, pdcd1_cd274_product * on_scale, pdcd1_cd274_product)

# Create DataFrame for plotting
plot_data = pd.DataFrame({
    'PDCD1_CD274': pdcd1_cd274_product,
    'HAVCR2': havcr2_expression,
    'timepoint': timepoints
})

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
for timepoint in ['on', 'pre']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    ax.scatter(subset['PDCD1_CD274'], subset['HAVCR2'], label=timepoint, alpha=1.0)

ax.set_xlabel('PDCD1 * CD274 (scaled by 0.2 for "on" timepoint)')
ax.set_ylabel('HAVCR2')
ax.set_title('LR vs HAVCR2')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of the PDCD1 x CD274 product ("LR" in the title) against PRDM1,
# one point per row of the pseudo-bulk data, with one series per timepoint.
# NOTE(review): `adata_T_pseudo` is only created later in this notebook (the cell
# that calls scRNA2PseudoBulkAnnData on adata_T); that cell must be run first.
# NOTE(review): values from `adata_T_pseudo` and `adata_pseudo` are combined by
# position, which assumes both hold the same samples in the same order - TODO confirm.
on_scale = 0.2  # factor applied to the product for samples at the "on" timepoint

# Pull each gene as a plain 1-D NumPy array (densifying if .X is sparse) so that
# `*` below is element-wise and the scaling works on a standalone array.
pdcd1_expression, cd274_expression, prdm1_expression = (
    np.asarray(x.toarray() if hasattr(x, 'toarray') else x).ravel()
    for x in (
        adata_T_pseudo[:, 'PDCD1'].X,
        adata_pseudo[:, 'CD274'].X,
        adata_T_pseudo[:, 'PRDM1'].X,
    )
)

# timepoint is a column in .obs
timepoints = adata_pseudo.obs['timepoint']
is_on = (timepoints == 'on').to_numpy()

# Element-wise product, scaled down for the "on" samples (no-op when there are none)
pdcd1_cd274_product = pdcd1_expression * cd274_expression
pdcd1_cd274_product = np.where(is_on, pdcd1_cd274_product * on_scale, pdcd1_cd274_product)

# Create DataFrame for plotting
plot_data = pd.DataFrame({
    'PDCD1_CD274': pdcd1_cd274_product,
    'PRDM1': prdm1_expression,
    'timepoint': timepoints
})

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
for timepoint in ['on', 'pre']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    ax.scatter(subset['PDCD1_CD274'], subset['PRDM1'], label=timepoint, alpha=1.0)

ax.set_xlabel('PDCD1 * CD274 (scaled by 0.2 for "on" timepoint)')
ax.set_ylabel('PRDM1')
ax.set_title('LR vs PRDM1')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of the PDCD1 x CD274 product ("LR" in the title) against CXCR4,
# one point per row of the pseudo-bulk data, with one series per timepoint.
# NOTE(review): `adata_T_pseudo` is only created later in this notebook (the cell
# that calls scRNA2PseudoBulkAnnData on adata_T); that cell must be run first.
# NOTE(review): values from `adata_T_pseudo` and `adata_pseudo` are combined by
# position, which assumes both hold the same samples in the same order - TODO confirm.
on_scale = 0.2  # factor applied to the product for samples at the "on" timepoint

# Pull each gene as a plain 1-D NumPy array (densifying if .X is sparse) so that
# `*` below is element-wise and the scaling works on a standalone array.
pdcd1_expression, cd274_expression, cxcr4_expression = (
    np.asarray(x.toarray() if hasattr(x, 'toarray') else x).ravel()
    for x in (
        adata_T_pseudo[:, 'PDCD1'].X,
        adata_pseudo[:, 'CD274'].X,
        adata_T_pseudo[:, 'CXCR4'].X,
    )
)

# timepoint is a column in .obs
timepoints = adata_pseudo.obs['timepoint']
is_on = (timepoints == 'on').to_numpy()

# Element-wise product, scaled down for the "on" samples (no-op when there are none)
pdcd1_cd274_product = pdcd1_expression * cd274_expression
pdcd1_cd274_product = np.where(is_on, pdcd1_cd274_product * on_scale, pdcd1_cd274_product)

# Create DataFrame for plotting
plot_data = pd.DataFrame({
    'PDCD1_CD274': pdcd1_cd274_product,
    'CXCR4': cxcr4_expression,
    'timepoint': timepoints
})

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
for timepoint in ['on', 'pre']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    ax.scatter(subset['PDCD1_CD274'], subset['CXCR4'], label=timepoint, alpha=1.0)

ax.set_xlabel('PDCD1 * CD274 (scaled by 0.2 for "on" timepoint)')
ax.set_ylabel('CXCR4')
ax.set_title('LR vs CXCR4')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of the PDCD1 x CD274 product ("LR" in the title) against CXCL13,
# one point per row of the pseudo-bulk data, with one series per timepoint.
# NOTE(review): `adata_T_pseudo` is only created later in this notebook (the cell
# that calls scRNA2PseudoBulkAnnData on adata_T); that cell must be run first.
# NOTE(review): values from `adata_T_pseudo` and `adata_pseudo` are combined by
# position, which assumes both hold the same samples in the same order - TODO confirm.
on_scale = 0.2  # factor applied to the product for samples at the "on" timepoint

# Pull each gene as a plain 1-D NumPy array (densifying if .X is sparse) so that
# `*` below is element-wise and the scaling works on a standalone array.
pdcd1_expression, cd274_expression, cxcl13_expression = (
    np.asarray(x.toarray() if hasattr(x, 'toarray') else x).ravel()
    for x in (
        adata_T_pseudo[:, 'PDCD1'].X,
        adata_pseudo[:, 'CD274'].X,
        adata_T_pseudo[:, 'CXCL13'].X,
    )
)

# timepoint is a column in .obs
timepoints = adata_pseudo.obs['timepoint']
is_on = (timepoints == 'on').to_numpy()

# Element-wise product, scaled down for the "on" samples (no-op when there are none)
pdcd1_cd274_product = pdcd1_expression * cd274_expression
pdcd1_cd274_product = np.where(is_on, pdcd1_cd274_product * on_scale, pdcd1_cd274_product)

# Create DataFrame for plotting
plot_data = pd.DataFrame({
    'PDCD1_CD274': pdcd1_cd274_product,
    'CXCL13': cxcl13_expression,
    'timepoint': timepoints
})

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
for timepoint in ['on', 'pre']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    ax.scatter(subset['PDCD1_CD274'], subset['CXCL13'], label=timepoint, alpha=1.0)

ax.set_xlabel('PDCD1 * CD274 (scaled by 0.2 for "on" timepoint)')
ax.set_ylabel('CXCL13')
ax.set_title('LR vs CXCL13')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot of the PDCD1 x CD274 product ("LR" in the title) against CTLA4,
# one point per row of the pseudo-bulk data, with one series per timepoint.
# NOTE(review): `adata_T_pseudo` is only created later in this notebook (the cell
# that calls scRNA2PseudoBulkAnnData on adata_T); that cell must be run first.
# NOTE(review): values from `adata_T_pseudo` and `adata_pseudo` are combined by
# position, which assumes both hold the same samples in the same order - TODO confirm.
on_scale = 0.2  # factor applied to the product for samples at the "on" timepoint

# Pull each gene as a plain 1-D NumPy array (densifying if .X is sparse) so that
# `*` below is element-wise and the scaling works on a standalone array.
pdcd1_expression, cd274_expression, ctla4_expression = (
    np.asarray(x.toarray() if hasattr(x, 'toarray') else x).ravel()
    for x in (
        adata_T_pseudo[:, 'PDCD1'].X,
        adata_pseudo[:, 'CD274'].X,
        adata_T_pseudo[:, 'CTLA4'].X,
    )
)

# timepoint is a column in .obs
timepoints = adata_pseudo.obs['timepoint']
is_on = (timepoints == 'on').to_numpy()

# Element-wise product, scaled down for the "on" samples (no-op when there are none)
pdcd1_cd274_product = pdcd1_expression * cd274_expression
pdcd1_cd274_product = np.where(is_on, pdcd1_cd274_product * on_scale, pdcd1_cd274_product)

# Create DataFrame for plotting
plot_data = pd.DataFrame({
    'PDCD1_CD274': pdcd1_cd274_product,
    'CTLA4': ctla4_expression,
    'timepoint': timepoints
})

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
for timepoint in ['on', 'pre']:
    subset = plot_data[plot_data['timepoint'] == timepoint]
    ax.scatter(subset['PDCD1_CD274'], subset['CTLA4'], label=timepoint, alpha=1.0)

ax.set_xlabel('PDCD1 * CD274 (scaled by 0.2 for "on" timepoint)')
ax.set_ylabel('CTLA4')
ax.set_title('LR vs CTLA4')
ax.legend()
plt.show()


# Get the pseudo-bulk data with PD-L1 (CD274) added, for Boyang

In [None]:
# Load the T-cell AnnData from the project data directory
t_cells_path = os.path.join(data_dir_NHDP, '1863-counts_cells_cohort1_T_cells.h5ad')
adata_T = sc.read(t_cells_path)
# Build the pseudo-bulk AnnData of the T cells, grouped by the 'sample_id' column
adata_T_pseudo = scRNA2PseudoBulkAnnData(
    adata_T,
    sample_id_col='sample_id',
)
print(adata_T_pseudo.shape)

In [None]:
# CD274 values for every cell of adata_T, read from its .raw attribute
cd274_data = adata_T.raw[:, 'CD274'].X
# .X may be a SciPy sparse matrix, which pd.DataFrame cannot turn into a numeric
# column directly, so densify it first (dense input passes through unchanged)
cd274_dense = cd274_data.toarray() if hasattr(cd274_data, 'toarray') else np.asarray(cd274_data)
cd274_df = pd.DataFrame(cd274_dense.reshape(-1, 1), columns=['CD274'])


In [None]:
import anndata as ad
import scipy.sparse

# Append CD274 (taken from adata_T.raw in `cd274_data`) to adata_T as one extra gene.
cd274_values = cd274_data.reshape(-1, 1)

# Store the new column in the same format (sparse / dense) as adata_T.X
if scipy.sparse.issparse(adata_T.X):
    cd274_values = scipy.sparse.csr_matrix(cd274_values)
elif scipy.sparse.issparse(cd274_values):
    cd274_values = cd274_values.toarray()

# The single-gene object must carry adata_T's cell names: concatenating along
# axis=1 inner-joins the observations on obs_names, so the default names
# ('0', '1', ...) would leave no cells in common.
cd274_adata = ad.AnnData(
    X=cd274_values,
    obs=pd.DataFrame(index=adata_T.obs_names),
    var=pd.DataFrame(index=['CD274']),
)

# merge='first' keeps the per-cell annotations of adata_T; with the default
# (merge=None) elements not aligned to the concatenation axis are dropped.
combined_adata = ad.concat([adata_T, cd274_adata], axis=1, merge='first')


In [None]:
# Extract CD274 data
cd274_data = adata_T.raw[:, 'CD274'].X
