In [1]:
import os
import torch
import numpy as np
import pandas as pd

import scanpy as sc
from anndata import AnnData

# NOTE(review): this flag is not referenced in the visible cells of the
# notebook -- confirm it is still needed.
only_not_interact_gene=True

# Directory holding the per-sample "edges_<sample>.pth" files that the cells
# below read with torch.load.
result_dir = "../edges/"

In [2]:
# Locations, relative to the notebook's working directory.
to_save_dir = "../edges/"
data_dir = "../../data/BC/processed1/"

# genes.pth lives next to the processed-data directory: stripping the last two
# "/"-separated pieces of data_dir (the trailing empty piece and "processed1")
# leaves "../../data/BC".
genes = torch.load(data_dir.rsplit("/", 2)[0] + "/genes.pth")

# Cell-type labels; their order fixes the row order of every table below.
cell_types = [
    "B_Cells",
    "CD4+_T_Cells",
    "CD8+_T_Cells",
    "DCIS_1",
    "DCIS_2",
    "Endothelial",
    "IRF7+_DCs",
    "Invasive_Tumor",
    "LAMP3+_DCs",
    "Macrophages_1",
    "Macrophages_2",
    "Mast_Cells",
    "Myoepi_ACTA2+",
    "Myoepi_KRT15+",
    "Perivascular-Like",
    "Prolif_Invasive_Tumor",
    "Stromal",
    "Stromal_&_T_Cell_Hybrid",
    "T_Cell_&_Tumor_Hybrid",
    "Unlabeled",
]

# Samples processed by every analysis cell.
samples = ["sample1_rep1", "sample1_rep2"]

In [3]:
# Every ordered pair of cell types encoded as "<first>__<second>", with the
# first type varying slowest. Downstream code splits each entry on "__" into
# (from_type, to_type).
cell_type_pair_sequence = [
    first_type + "__" + second_type
    for first_type in cell_types
    for second_type in cell_types
]

def reshape_z_value(result_dict):
    """Lay sparse per-gene pair values out as a dense (pair, gene) matrix.

    ``result_dict`` must have one entry for every name in the module-level
    ``genes`` plus an ``"all"`` entry. Each entry is indexable: element 0 is a
    sequence of pair names (members of ``cell_type_pair_sequence``) and
    element 1 holds the matching values. Pairs that are not listed stay 0.

    Returns an array of shape
    ``(len(cell_type_pair_sequence), len(genes) + 1)``; the last column is the
    ``"all"`` entry.
    """
    n_pairs = len(cell_type_pair_sequence)
    columns = []
    for gene_key in genes + ["all"]:
        entry = result_dict[gene_key]
        dense = np.zeros(n_pairs)
        for position, pair_name in enumerate(entry[0]):
            dense[cell_type_pair_sequence.index(pair_name)] = entry[1][position]
        columns.append(dense)
    # Stacked as (genes + 1, pairs); transpose to (pairs, genes + 1).
    return np.stack(columns, axis=0).T

In [4]:
# Output directory for the per-sample cell-type count tables.
z_dir="./counts/"
# Create it through the standard library instead of shelling out to `mkdir`:
# portable, raises on real failures, and a no-op when the directory exists.
os.makedirs(z_dir, exist_ok=True)

def get_counts(sample):
    """Count target cells per cell type for one sample and write the table as CSV.

    Reads ``result_dir + "edges_" + sample + ".pth"``, takes element 0 of every
    ``"cell_type_name"`` entry, and counts how often each label occurs. The
    table has one row per entry of the module-level ``cell_types`` (0 for
    labels absent from the sample, which are also reported on stdout) and is
    written to ``z_dir + sample + ".csv"`` without an index column.
    """
    edge_data = torch.load(result_dir + "edges_" + sample + ".pth")
    name_rows = edge_data["cell_type_name"]
    target_labels = [name_rows[row][0] for row in range(len(name_rows))]

    unique_labels, frequencies = np.unique(target_labels, return_counts=True)
    types = unique_labels.tolist()
    count_by_label = dict(zip(types, frequencies.tolist()))

    counts1 = []
    for known_type in cell_types:
        if known_type in count_by_label:
            counts1.append(count_by_label[known_type])
        else:
            print(known_type,"not in",sample,"with cell types:",types)
            counts1.append(0)

    count_table = pd.DataFrame({"cell_type":cell_types,"counts":counts1})
    count_table.to_csv(z_dir+sample+".csv",index=False)

# Write one count table per sample.
for samplei in samples:
    get_counts(samplei)
    print("finish counting:",samplei)

Unlabeled not in sample1_rep1 with cell types: ['B_Cells', 'CD4+_T_Cells', 'CD8+_T_Cells', 'DCIS_1', 'DCIS_2', 'Endothelial', 'IRF7+_DCs', 'Invasive_Tumor', 'LAMP3+_DCs', 'Macrophages_1', 'Macrophages_2', 'Mast_Cells', 'Myoepi_ACTA2+', 'Myoepi_KRT15+', 'Perivascular-Like', 'Prolif_Invasive_Tumor', 'Stromal', 'Stromal_&_T_Cell_Hybrid', 'T_Cell_&_Tumor_Hybrid']
finish counting: sample1_rep1
Unlabeled not in sample1_rep2 with cell types: ['B_Cells', 'CD4+_T_Cells', 'CD8+_T_Cells', 'DCIS_1', 'DCIS_2', 'Endothelial', 'IRF7+_DCs', 'Invasive_Tumor', 'LAMP3+_DCs', 'Macrophages_1', 'Macrophages_2', 'Mast_Cells', 'Myoepi_ACTA2+', 'Myoepi_KRT15+', 'Perivascular-Like', 'Prolif_Invasive_Tumor', 'Stromal', 'Stromal_&_T_Cell_Hybrid', 'T_Cell_&_Tumor_Hybrid']
finish counting: sample1_rep2


In [5]:
# Statistics: regression

In [6]:
import pandas as pd

def read_regression(sample, proportion_threshold=0.035, min_cells=10):
    """Per-gene association between neighbor-type attention share and expression.

    Loads ``result_dir + "edges_" + sample + ".pth"``. For every
    ``"<from>__<to>"`` entry of ``cell_type_pair_sequence`` and every gene it
    takes the target cells of type ``<to>``, correlates (Pearson) the share of
    their aggregated attention that falls on neighbors of type ``<from>`` with
    their ``"y"`` values, and converts the coefficient ``r`` over ``n`` cells
    to ``r * sqrt(n - 2) / sqrt(1 - r**2)``.

    Reads the module-level ``result_dir``, ``cell_types`` and
    ``cell_type_pair_sequence``.

    Args:
        sample: sample name used to build the edge-file name.
        proportion_threshold: attention entries whose share of the summed
            absolute attention (over axis 1) is below this value are zeroed.
        min_cells: pairs with at most this many target cells get 0 for every
            gene.

    Returns:
        numpy array with one row per processed pair and one column per gene.
        Entries are 0 when there are too few cells, when either vector is
        constant, or when the statistic is NaN/inf.
    """
    # NOTE: torch.load unpickles the file -- only load edge files you trust.
    results = torch.load(result_dir + "edges_" + sample + ".pth")

    attention_scores = results["attention_score"]  # noted upstream as (B, 49, C)
    proportion = torch.abs(attention_scores)
    proportion = proportion / torch.sum(proportion, dim=1, keepdim=True)
    attention_scores[proportion < proportion_threshold] = 0

    cell_type_names = np.array(results["cell_type_name"])  # column 0: target type, 1..: neighbor types
    true_expression = results["y"]

    # Take the neighbor count from the tensor instead of assuming 49.
    B, n_neighbors, C = attention_scores.shape
    t = len(cell_types)
    aggregated_interactions = torch.zeros((B, t, C))
    cell_type_to_index = {ct: idx for idx, ct in enumerate(cell_types)}

    # Sum each target cell's attention per neighbor cell type; name column n
    # pairs with attention row n - 1.
    for b in range(B):
        for n in range(1, n_neighbors + 1):
            neighbor_type = cell_type_names[b][n]
            if neighbor_type in cell_type_to_index:
                aggregated_interactions[b, cell_type_to_index[neighbor_type]] += attention_scores[b, n - 1]

    # Convert to per-cell shares across neighbor types.
    # NOTE(review): a cell/gene whose attention was entirely zeroed divides 0 by 0
    # here; the resulting NaN makes r NaN for the whole pair/gene, which is then
    # reported as 0 below. Left unchanged so the numbers stay comparable.
    abs_interactions = torch.abs(aggregated_interactions)
    aggregated_interactions = abs_interactions / torch.sum(abs_interactions, dim=1, keepdim=True)

    results_matrix = []
    for pair in cell_type_pair_sequence:
        from_type, to_type = pair.split("__")
        if from_type not in cell_type_to_index:
            continue

        mask = (cell_type_names[:, 0] == to_type)
        n_cells = int(np.sum(mask))
        # No (or too few) target cells of this type: nothing to correlate.
        if n_cells == 0 or n_cells <= min_cells:
            results_matrix.append([0] * C)
            continue

        filtered_interactions = aggregated_interactions[mask, cell_type_to_index[from_type]]
        filtered_expressions = true_expression[mask]

        corr_coeffs = []
        for i in range(C):
            gene_interactions = filtered_interactions[:, i]
            gene_expressions = filtered_expressions[:, i]
            # A constant vector has no defined correlation.
            if (gene_interactions == gene_interactions[0]).all() or (gene_expressions == gene_expressions[0]).all():
                corr_coeffs.append(0)
                continue
            r = torch.corrcoef(torch.stack((gene_interactions, gene_expressions)))[0, 1]
            n = gene_interactions.numel()
            z_value = r * ((n-2)**0.5) / (1 - r**2)**0.5
            if torch.isnan(z_value) or torch.isinf(z_value) or r==1:
                print(from_type, to_type, np.sum(mask))
                print(r,z_value,gene_interactions,gene_expressions)
                z_value=0
            corr_coeffs.append(float(z_value))
        results_matrix.append(corr_coeffs)

    # np.nan_to_num returns a new array, so its result must be the value returned.
    results_tensor = np.nan_to_num(np.array(results_matrix))
    return results_tensor

# Output directory for the regression statistics.
z_dir="./z_regressionp/"
# Standard-library directory creation instead of shelling out to `mkdir`.
os.makedirs(z_dir, exist_ok=True)

results=[]
cnt=0
for cnt, samplei in enumerate(samples, start=1):
    print(cnt,len(samples))
    tmp=read_regression(samplei)
    print(tmp.shape,np.max(tmp),np.min(tmp),np.mean(tmp),np.median(tmp))
    # One CSV per sample: rows are cell-type pairs, columns are genes.
    df=pd.DataFrame(data=tmp,columns=genes,index=cell_type_pair_sequence)
    df.to_csv(z_dir+samplei+".csv")
    results.append(tmp)
    print("regression:",samplei)

# All samples stacked along a new leading axis.
np.save(z_dir+"z_values.npy",np.stack(results,axis=0))

1 2
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.4688,  ..., 0.0000, 0.3662, 0.0000]) tensor([-0.2651, -0.2651, -0.2651,  ..., -0.2651, -0.2651,  0.4280])
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.8752,  ..., 0.0000, 0.8142, 0.0000]) tensor([-0.0227, -0.0227, -0.0227,  ..., -0.0227, -0.0227, -0.0227])
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.4310,  ..., 0.0000, 0.6524, 0.0000]) tensor([-0.1348, -0.1348, -0.1348,  ..., -0.1348, -0.1348, -0.1348])
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.3959,  ..., 0.0000, 0.0393, 0.0000]) tensor([-0.0105, -0.0105, -0.0105,  ..., -0.0105, -0.0105, -0.0105])
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.4718,  ..., 0.0000, 0.6831, 0.0000]) tensor([-0.0040, -0.0040, -0.0040,  ..., -0.0040, -0.0040, -0.0040])
B_Cells CD4+_T_Cells 8453
tensor(nan) tensor(nan) tensor([0.0000, 0.0000, 0.6586,  ...

In [7]:
import pandas as pd

def read_regression_adapt(sample, proportion_threshold=0.35, min_cells=10, z_cap=10):
    """Capped per-gene association between neighbor-type attention share and expression.

    Loads ``result_dir + "edges_" + sample + ".pth"``. For every
    ``"<from>__<to>"`` entry of ``cell_type_pair_sequence`` and every gene it
    takes the target cells of type ``<to>``, correlates (Pearson) the share of
    their aggregated attention that falls on neighbors of type ``<from>`` with
    their ``"y"`` values, and converts the coefficient ``r`` over ``n`` cells
    to ``r * sqrt(n - 2) / sqrt(1 - r**2)``, limited to ``[-z_cap, z_cap]``.
    Cells whose attention was entirely zeroed contribute a share of 0.

    Reads the module-level ``result_dir``, ``cell_types`` and
    ``cell_type_pair_sequence``.

    Args:
        sample: sample name used to build the edge-file name.
        proportion_threshold: attention entries whose share of the summed
            absolute attention (over axis 1) is below this value are zeroed.
        min_cells: pairs with at most this many target cells get 0 for every
            gene.
        z_cap: magnitude limit for the statistic, also used (with the sign of
            ``r``) when the statistic is NaN or infinite.

    Returns:
        numpy array with one row per processed pair and one column per gene.
    """
    # NOTE: torch.load unpickles the file -- only load edge files you trust.
    results = torch.load(result_dir + "edges_" + sample + ".pth")

    attention_scores = results["attention_score"]  # noted upstream as (B, 49, C)
    proportion = torch.abs(attention_scores)
    proportion = proportion / torch.sum(proportion, dim=1, keepdim=True)
    attention_scores[proportion < proportion_threshold] = 0

    cell_type_names = np.array(results["cell_type_name"])  # column 0: target type, 1..: neighbor types
    true_expression = results["y"]

    # Take the neighbor count from the tensor instead of assuming 49.
    B, n_neighbors, C = attention_scores.shape
    t = len(cell_types)
    aggregated_interactions = torch.zeros((B, t, C))
    cell_type_to_index = {ct: idx for idx, ct in enumerate(cell_types)}

    # Sum each target cell's attention per neighbor cell type; name column n
    # pairs with attention row n - 1.
    for b in range(B):
        for n in range(1, n_neighbors + 1):
            neighbor_type = cell_type_names[b][n]
            if neighbor_type in cell_type_to_index:
                aggregated_interactions[b, cell_type_to_index[neighbor_type]] += attention_scores[b, n - 1]

    # Per-cell shares across neighbor types; an all-zero cell/gene stays 0
    # instead of becoming 0/0.
    abs_interactions = torch.abs(aggregated_interactions)
    totals = torch.sum(abs_interactions, dim=1, keepdim=True)
    aggregated_interactions = torch.where(
        totals == 0, torch.zeros_like(abs_interactions), abs_interactions / totals
    )

    results_matrix = []
    for pair in cell_type_pair_sequence:
        from_type, to_type = pair.split("__")
        if from_type not in cell_type_to_index:
            continue

        mask = (cell_type_names[:, 0] == to_type)
        n_cells = int(np.sum(mask))
        # No (or too few) target cells of this type: nothing to correlate.
        if n_cells == 0 or n_cells <= min_cells:
            results_matrix.append([0] * C)
            continue

        filtered_interactions = aggregated_interactions[mask, cell_type_to_index[from_type]]
        filtered_expressions = true_expression[mask]

        corr_coeffs = []
        for i in range(C):
            gene_interactions = filtered_interactions[:, i]
            gene_expressions = filtered_expressions[:, i]
            # A constant vector has no defined correlation.
            if (gene_interactions == gene_interactions[0]).all() or (gene_expressions == gene_expressions[0]).all():
                corr_coeffs.append(0)
                continue
            r = torch.corrcoef(torch.stack((gene_interactions, gene_expressions)))[0, 1]
            n = gene_interactions.numel()
            z_value = r * ((n-2)**0.5) / (1 - r**2)**0.5
            if torch.isnan(z_value) or torch.isinf(z_value) or r==1:
                print(from_type, to_type, np.sum(mask))
                print(r,z_value,gene_interactions,gene_expressions)
                # Degenerate |r| ~ 1: saturate at the cap, keeping the sign of r
                # so a perfect negative correlation is not reported as +z_cap.
                # NOTE(review): a NaN r still maps to +z_cap -- confirm intent.
                z_value = -z_cap if r < 0 else z_cap
            corr_coeffs.append(float(z_value))
        results_matrix.append(corr_coeffs)

    # np.nan_to_num returns a new array, so its result must be the value that is
    # clipped and returned.
    results_tensor = np.nan_to_num(np.array(results_matrix))
    results_tensor = np.clip(results_tensor, -z_cap, z_cap)
    return results_tensor

# Output directory for the capped regression statistics.
z_dir="./z_regressionp_adapt/"
# Standard-library directory creation instead of shelling out to `mkdir`.
os.makedirs(z_dir, exist_ok=True)

results=[]
cnt=0
for cnt, samplei in enumerate(samples, start=1):
    print(cnt,len(samples))
    tmp=read_regression_adapt(samplei)
    print(tmp.shape,np.max(tmp),np.min(tmp),np.mean(tmp),np.median(tmp))
    # One CSV per sample: rows are cell-type pairs, columns are genes.
    df=pd.DataFrame(data=tmp,columns=genes,index=cell_type_pair_sequence)
    df.to_csv(z_dir+samplei+".csv")
    results.append(tmp)
    print("regression:",samplei)

# All samples stacked along a new leading axis.
np.save(z_dir+"z_values.npy",np.stack(results,axis=0))

1 2
(400, 321) 10.0 -10.0 -0.013360465559265483 0.0
regression: sample1_rep1
2 2
(400, 321) 10.0 -10.0 -0.023821230408652454 0.0
regression: sample1_rep2


In [8]:
import pandas as pd

def calcualte_z_neighbor(x,y,avg_cnti):
    """Mean excess of ``x`` over the expected proportion ``mean(y / avg_cnti)``.

    Args:
        x: 1-D tensor of observed values, one per cell.
        y: 1-D tensor of expected counts, one per cell.
        avg_cnti: scalar the expected counts are divided by.

    Returns:
        ``float(mean(x - p))`` with ``p = mean(y / avg_cnti)``, or ``0.0`` when
        ``p`` is not finite (``avg_cnti == 0`` or empty ``y``) or when
        ``p * (1 - p)`` is zero.
    """
    p=torch.mean(y/avg_cnti)
    # A zero divisor or empty input makes p NaN/inf; the variance guard below
    # would not catch that and NaN/inf would be returned.
    if not torch.isfinite(p):
        return 0.0
    # Only used as a degeneracy check; the constant factor does not affect it.
    var=y.shape[0]*p*(1-p)/30
    if var==0:
        return 0.0
    return float(torch.mean(x-p))

def calculate_strength_spatial_neighbor_adapt(sample, proportion_threshold=0.35, min_cells=20):
    """Per-gene interaction strength for every ``"<from>__<to>"`` cell-type pair.

    Loads ``result_dir + "edges_" + sample + ".pth"`` and builds, per target
    cell, neighbor type and gene:

    * the share of aggregated absolute attention on that neighbor type,
      rescaled so that within each target cell type and gene the shares sum to
      the number of target cells;
    * the count of neighbors of that type whose attention survived
      thresholding.

    For each pair and gene the strength is
    ``calcualte_z_neighbor(shares, counts, avg_cnt)``, where ``avg_cnt`` is the
    mean over the target cells of their total surviving-neighbor count.

    Reads the module-level ``result_dir``, ``cell_types`` and
    ``cell_type_pair_sequence``.

    Args:
        sample: sample name used to build the edge-file name.
        proportion_threshold: attention entries whose share of the summed
            absolute attention (over axis 1) is below this value are zeroed.
        min_cells: pairs with at most this many target cells get 0 for every
            gene.

    Returns:
        numpy array with one row per processed pair and one column per gene,
        with NaN/inf replaced by ``np.nan_to_num``.
    """
    # NOTE: torch.load unpickles the file -- only load edge files you trust.
    results = torch.load(result_dir + "edges_" + sample + ".pth")

    attention_scores = results["attention_score"]  # noted upstream as (B, 49, C)
    proportion = torch.abs(attention_scores)
    proportion = proportion / torch.sum(proportion, dim=1, keepdim=True)
    attention_scores[proportion < proportion_threshold] = 0

    # 1 where an attention entry survived thresholding, 0 elsewhere.
    expect_cnt_attention_scores = torch.where(
        attention_scores != 0, torch.ones_like(attention_scores), torch.zeros_like(attention_scores)
    )

    cell_type_names = np.array(results["cell_type_name"])  # column 0: target type, 1..: neighbor types
    true_expression = results["y"]

    # Take the neighbor count from the tensor instead of assuming 49.
    B, n_neighbors, C = attention_scores.shape
    t = len(cell_types)
    aggregated_interactions = torch.zeros((B, t, C))
    expected_interactions = torch.zeros((B, t, C))
    cell_type_to_index = {ct: idx for idx, ct in enumerate(cell_types)}

    # Accumulate attention and surviving-neighbor counts per neighbor type;
    # name column n pairs with attention row n - 1.
    for b in range(B):
        for n in range(1, n_neighbors + 1):
            neighbor_type = cell_type_names[b][n]
            if neighbor_type in cell_type_to_index:
                idx = cell_type_to_index[neighbor_type]
                aggregated_interactions[b, idx] += attention_scores[b, n - 1]
                expected_interactions[b, idx] += expect_cnt_attention_scores[b, n - 1]

    # Per-cell shares across neighbor types; an all-zero cell/gene stays 0.
    abs_interactions = torch.abs(aggregated_interactions)
    totals = torch.sum(abs_interactions, dim=1, keepdim=True)
    aggregated_interactions = torch.where(
        totals == 0, torch.zeros_like(abs_interactions), abs_interactions / totals
    )

    # Rescale so that, per target cell type and gene, shares sum to the cell count.
    # A type/gene with no attention at all divides 0 by 0 here; the NaNs end up
    # as 0 through the nan_to_num call at the end.
    for cell_typei in cell_types:
        mask = (cell_type_names[:, 0] == cell_typei)
        n_type_cells = int(np.sum(mask))
        if n_type_cells == 0:
            continue
        for genei in range(C):
            block = aggregated_interactions[mask, :, genei]
            aggregated_interactions[mask, :, genei] = block / torch.sum(torch.abs(block)) * n_type_cells

    results_matrix = []
    for pair in cell_type_pair_sequence:
        from_type, to_type = pair.split("__")
        if from_type not in cell_type_to_index:
            continue

        mask = (cell_type_names[:, 0] == to_type)
        n_cells = int(np.sum(mask))
        # Decide before any reduction so no mean is taken over an empty selection.
        if n_cells == 0 or n_cells <= min_cells:
            results_matrix.append([0] * C)
            continue

        from_idx = cell_type_to_index[from_type]
        filtered_interactions = aggregated_interactions[mask, from_idx]
        filtered_expected_interactions = expected_interactions[mask, from_idx]
        filtered_expressions = true_expression[mask]
        # Mean total surviving-neighbor count of the target cells, one value per gene.
        avg_cnt = torch.mean(torch.sum(expected_interactions[mask], dim=1), dim=0)

        corr_coeffs = []
        for i in range(C):
            gene_interactions = filtered_interactions[:, i]
            gene_expressions = filtered_expressions[:, i]
            # Constant shares or constant expression carry no signal.
            if (gene_interactions == gene_interactions[0]).all() or (gene_expressions == gene_expressions[0]).all():
                corr_coeffs.append(0)
                continue
            strength = calcualte_z_neighbor(gene_interactions, filtered_expected_interactions[:, i], avg_cnt[i])
            corr_coeffs.append(float(strength))
        results_matrix.append(corr_coeffs)

    # np.nan_to_num returns a new array, so its result must be the value returned.
    results_tensor = np.nan_to_num(np.array(results_matrix))
    return results_tensor

# Output directory for the spatial-neighbor strength tables.
z_dir="./z_strength_spatial_neighbor_adapt/"
# Standard-library directory creation instead of shelling out to `mkdir`.
os.makedirs(z_dir, exist_ok=True)

results=[]
cnt=0
for cnt, samplei in enumerate(samples, start=1):
    print(cnt,len(samples))
    tmp=calculate_strength_spatial_neighbor_adapt(samplei)
    print(tmp.shape,tmp)
    # One CSV per sample: rows are cell-type pairs, columns are genes.
    df=pd.DataFrame(data=tmp,columns=genes,index=cell_type_pair_sequence)
    df.to_csv(z_dir+samplei+".csv")
    results.append(tmp)
    print("spatial neighbor strength:",samplei)

# All samples stacked along a new leading axis.
np.save(z_dir+"z_values.npy",np.stack(results,axis=0))

1 2
(400, 321) [[ 5.73696202e-10  5.35449773e-09 -2.56250967e-08 ...  7.17120252e-10
   2.15351201e-06 -6.50189014e-09]
 [ 6.31796526e-09  4.73847415e-08  2.14359530e-09 ...  0.00000000e+00
  -9.81541071e-09  1.28615723e-08]
 [-2.33608977e-09  5.49668200e-10  0.00000000e+00 ...  5.49668200e-10
   4.12251150e-10  2.49004923e-04]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ... -9.42676426e-09
  -1.09978915e-08  0.00000000e+00]
 [-2.42871212e-09  0.00000000e+00  0.00000000e+00 ... -3.80498228e-08
   0.00000000e+00 -3.23828298e-09]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
spatial neighbor strength: sample1_rep1
2 2
(400, 321) [[ 3.01795677e-09 -1.05628484e-08 -4.22513935e-08 ...  0.00000000e+00
   7.24473102e-06  5.78441695e-09]
 [ 3.21102456e-09  3.33946559e-08  1.92661478e-08 ...  0.00000000e+00
   4.49543469e-09 -1.15596883e-08]
 [ 2.12013296e-09  0.00000000e+00  0.00000000e+00 ... -9.63696767e-09
  -1.07934035e