In [None]:
import anndata as ad
import scanpy as sc
import gc
import sys
import cellanova as cnova
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
import os

from metrics1 import calculate_metrics

In [None]:
# Load the AnnData object used throughout the notebook.
adata_raw=sc.read_h5ad("./data/AD.h5ad")
# Discard the .raw slot; only the main matrix is used below.
adata_raw.raw = None
# One UMAP figure per obs column.
# NOTE(review): presumably the file already stores a UMAP embedding — confirm.
for colori in ["Subclass","donor_id","disease"]:
    sc.pl.umap(adata_raw,color=colori)

In [None]:
# Subclass and disease panels in a single figure, stacked in one column.
sc.pl.umap(adata_raw,color=["Subclass","disease"],ncols=1)

In [None]:
import pandas as pd
# Per-cell (disease, donor_id) pairs copied out of obs.
df = pd.DataFrame(adata_raw.obs[["disease",'donor_id']].copy())

# Remove duplicated rows so each distinct (disease, donor_id) pair appears once
unique_df = df.drop_duplicates()

# Map each donor_id to its disease label. If a donor appears with more than
# one label, dict() keeps the last pair produced by zip.
mapping_dict = dict(zip(unique_df['donor_id'], unique_df["disease"]))

print("Unique DataFrame:")
print(unique_df)
print("\nMapping Dictionary:")
print(mapping_dict)

In [None]:
# Number of distinct (disease, donor_id) rows whose disease is "normal".
# Depends on the current binding of unique_df, which a later cell rebinds.
print(len(unique_df.loc[unique_df["disease"]=="normal"]))

In [None]:
# List the available obs columns, then repeat the donor summary for ADNC.
print(adata_raw.obs.columns)
df = pd.DataFrame(adata_raw.obs[["ADNC",'donor_id']].copy())

# Remove duplicated rows so each distinct (ADNC, donor_id) pair appears once
unique_df = df.drop_duplicates()

# Map each donor_id to its ADNC label (rebinds mapping_dict and unique_df,
# replacing the disease-based versions built earlier).
mapping_dict = dict(zip(unique_df['donor_id'], unique_df["ADNC"]))

print("Unique DataFrame:")
print(unique_df)
print("\nMapping Dictionary:")
print(mapping_dict)

In [None]:
# Number of distinct (ADNC, donor_id) rows whose ADNC is 'Not AD'.
# Depends on unique_df having been rebuilt from the ADNC column.
print(len(unique_df.loc[unique_df["ADNC"]=='Not AD']))

In [None]:
# Boolean per-cell flag: True where Subclass equals 'Lamp5'.
flag=adata_raw.obs["Subclass"]=='Lamp5'
# Stored as a new obs column so it can be used as a UMAP colour key.
adata_raw.obs["WhereIsThisCellType"]=flag
sc.pl.umap(adata_raw,color="WhereIsThisCellType")

In [None]:
# NOTE(review): var_names are only converted to gene symbols in a later cell,
# so at this point "APOE" presumably resolves to an obs column — confirm.
sc.pl.umap(adata_raw,color="APOE",ncols=1)

In [None]:
# UMAP coloured by the ADNC obs column.
sc.pl.umap(adata_raw,color="ADNC")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

def plot_distribution_with_binary_zscore(vector, gene_name):
    """
    Histogram a vector and title the figure with a sign-based z-score.

    Every value > 0 counts as one success and every exact zero as half a
    success. The resulting proportion is compared with the null value 0.5
    using the normal-approximation standard error sqrt(0.5 * 0.5 / n).

    Args:
    - vector (numpy.ndarray or list): Values to summarise and plot.
    - gene_name (str): Label placed at the start of the plot title.

    Returns:
    - None: The figure is displayed with plt.show().
    """
    values = np.array(vector)
    n_obs = len(values)

    # Exact zeros are split evenly between the two signs.
    n_success = np.sum(values > 0) + 0.5 * np.sum(values == 0)
    frac_success = n_success / n_obs

    # z-score of the observed proportion under the null proportion 0.5.
    null_frac = 0.5
    se_null = np.sqrt(null_frac * (1 - null_frac) / n_obs)
    z = (frac_success - null_frac) / se_null

    heading = f"{gene_name} | Z-score: {z:.2f} (Proportion > 0: {frac_success:.2f})"

    plt.figure(figsize=(8, 6))
    plt.hist(values, bins=30, alpha=0.7, edgecolor='k', color='blue')
    # Dashed vertical line marking the reference value 0.
    plt.axvline(0, color='red', linestyle='--', label='Reference: 0')
    plt.title(heading, fontsize=14)
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_distribution(vector, bins=30, kde=True, title="Distribution Plot"):
    """
    Show a density-normalised histogram of a vector, optionally with a KDE curve.

    Parameters:
    - vector (array-like): Data to plot.
    - bins (int): Histogram bin count. Default is 30.
    - kde (bool): Forwarded to seaborn's histplot to overlay a KDE line. Default is True.
    - title (str): Figure title. Default is "Distribution Plot".
    """
    plt.figure(figsize=(8, 6))

    # Histogram (and KDE when requested) drawn on the figure just created.
    hist_options = dict(bins=bins, kde=kde, color='blue', stat='density', edgecolor='black')
    sns.histplot(vector, **hist_options)

    # Axis decoration.
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.ylabel("Density", fontsize=12)
    plt.xlabel("Value", fontsize=12)
    plt.title(title, fontsize=14)

    plt.tight_layout()
    plt.show()

def plot_distribution1(vector, bins=30, kde=True, title="Distribution Plot"):
    """
    Show a density histogram of a vector with a fitted normal curve on top.

    The normal curve uses the sample mean and np.std (default ddof) of the
    vector and is evaluated on 1000 evenly spaced points between its minimum
    and maximum.

    Parameters:
    - vector (array-like): Data to plot.
    - bins (int): Histogram bin count. Default is 30.
    - kde (bool): Forwarded to seaborn's histplot to overlay a KDE line. Default is True.
    - title (str): Figure title. Default is "Distribution Plot".
    """
    plt.figure(figsize=(8, 6))

    hist_options = dict(bins=bins, kde=kde, color='blue', stat='density', edgecolor='black')
    sns.histplot(vector, **hist_options)

    # Normal density with the vector's own mean and standard deviation.
    mu = np.mean(vector)
    sigma = np.std(vector)
    grid = np.linspace(min(vector), max(vector), 1000)
    curve_label = f'Normal Dist. (μ={mu:.2f}, σ={sigma:.2f})'
    plt.plot(grid, norm.pdf(grid, mu, sigma), color='red', label=curve_label)

    # Axis decoration and legend.
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.ylabel("Density", fontsize=12)
    plt.xlabel("Value", fontsize=12)
    plt.title(title, fontsize=14)
    plt.legend(fontsize=12)

    plt.tight_layout()
    plt.show()

from scipy.stats import norm
from statsmodels.stats.multitest import multipletests

def zscores_to_adjusted_pvalues(z_scores, adjustment_method='fdr_bh'):
    """
    Convert z-scores to two-sided p-values and adjust them for multiple comparisons.

    Parameters:
        z_scores (array-like): Vector of z-scores.
        adjustment_method (str): Method passed to statsmodels' multipletests,
                                 e.g. 'bonferroni' or 'fdr_bh'
                                 (default is 'fdr_bh').

    Returns:
        numpy.ndarray: The adjusted p-values, in the same order as `z_scores`.
    """
    abs_z = np.abs(np.asarray(z_scores, dtype=float))

    # Two-sided tail probability. The survival function is used instead of
    # 1 - cdf so that large |z| keep a small non-zero p-value rather than
    # rounding to exactly 0.
    p_values = 2.0 * norm.sf(abs_z)

    # A NaN z-score (e.g. 0/0 from a zero-variance column) carries no evidence
    # against the null, so it is given p = 1 rather than p = 0; otherwise it
    # would be reported as maximally significant and distort the correction
    # applied to every other test.
    p_values = np.nan_to_num(p_values, nan=1.0)

    # multipletests returns (reject, pvals_corrected, ...); keep the corrected p-values.
    adjusted_p_values = multipletests(p_values, method=adjustment_method)[1]

    return adjusted_p_values

import pandas as pd
def _cate_column_z_scores(ite_matrix, parametric):
    """Return one z-score per column of `ite_matrix` for the requested test."""
    n = ite_matrix.shape[0]
    z_scores = []
    for col in range(ite_matrix.shape[1]):
        column = ite_matrix[:, col]
        if parametric:
            # Mean divided by its standard error (sample std with ddof=1).
            z = np.mean(column) / np.std(column, ddof=1) * np.sqrt(n)
        else:
            # Sign test: values > 0 are successes, exact zeros count as half;
            # normal approximation to Binomial(n, 0.5).
            successes = np.sum(column > 0) + 0.5 * np.sum(column == 0)
            z = (successes - n * 0.5) / np.sqrt(n * 0.25)
        z_scores.append(z)
    return np.array(z_scores)


def Plot_and_Estimate_CATE_adata(
    adata,
    indices,
    up_regulate=True,
    genes=None,
    topk=5,
    plot=True,
    parametric=False
):
    """
    Rank genes by a per-gene z-score of the ITE values in a subset of cells.

    Parameters:
        adata: AnnData-like object; `adata.X` holds ITE values (rows are
               cells, columns are genes) and `adata.var_names` the gene names.
               `adata.X[indices, :]` must return a dense 2-D array.
        indices: Row selector applied to `adata.X` (boolean mask or positions).
        up_regulate (bool): If True the z-scores are negated before the
               ascending sort, so genes with the largest original z-score rank
               first. If False, genes with the most negative z-score rank first.
        genes (str or None): If given, only this gene is handled: its z-score
               is printed, its ITE distribution is plotted and None is returned.
        topk (int): Number of top-ranked genes to tabulate and plot.
        plot (bool): Whether to plot the `topk` genes.
        parametric (bool): If True use mean / standard error as the z-score,
               otherwise use the sign-test z-score.

    Returns:
        list or None: Up to 300 names that are not None, taken in rank order
        from the 1000 best-ranked genes; None when `genes` is given.

    Raises:
        ValueError: If `genes` is given but is not present in `adata.var_names`.

    Notes:
        - The "Z score" column of the printed table holds the ranking statistic,
          i.e. the negated z-score when `up_regulate` is True.
        - NOTE(review): the histograms are drawn from the negated ITE values, so
          the z-score shown in each plot title has the opposite sign to the
          sign-test statistic computed here — confirm this is intended.
        - The UMAP panels use the notebook-level `adata_raw`, not `adata`.
    """
    # Subset the ITE matrix using the selected indices
    ITE_subset = adata.X[indices, :]

    # Only the requested statistic is computed.
    z_scores = _cate_column_z_scores(ITE_subset, parametric)

    if genes is not None:
        matches = np.where(adata.var_names == genes)[0]
        if len(matches) == 0:
            raise ValueError(f"{genes!r} is not in adata.var_names")
        # First column carrying this name.
        gene_idx = matches[0]
        print(z_scores[gene_idx])
        plot_distribution_with_binary_zscore(vector=-ITE_subset[:, gene_idx], gene_name=genes)
        return

    # Adjusted p-values are two-sided, so they do not depend on the sign flip below.
    p_adjs = zscores_to_adjusted_pvalues(z_scores)

    if up_regulate:
        print("Calculating up-regulation")
        z_scores = -z_scores

    # Ascending order of the (possibly negated) z-scores; computed once and reused.
    order = np.argsort(z_scores)
    topk_indices = order[:topk]
    genes = adata.var_names[topk_indices].tolist()

    print(adata.var_names[order[:200]].tolist())

    # Print the selected most significant genes. Values are taken by column
    # position, not by name, so duplicated or missing gene names still report
    # the statistics of the column that was actually ranked.
    print("Selected most significant genes:")
    df = pd.DataFrame({
        "Gene": genes,
        "Z score": z_scores[topk_indices],
        "Adjusted_p": p_adjs[topk_indices],
    })
    print(df)

    if plot:
        # Each helper call opens its own figure, so none is created here.
        for gene_idx, gene in zip(topk_indices, genes):
            if gene is None:
                continue
            print("Plotting:", gene)
            print("adjusted p-value:", p_adjs[gene_idx])

            plot_distribution_with_binary_zscore(vector=-ITE_subset[:, gene_idx], gene_name=gene)
            sc.pl.umap(adata_raw, color=gene, ncols=1)

    top1000 = adata.var_names[order[:1000]].tolist()
    return [x for x in top1000 if x is not None][:300]

In [None]:
def read_ITE():
    """
    Load every ``*ITE.h5ad`` file in ``./AD`` and concatenate them.

    Files are read in sorted name order so the row order of the result is
    reproducible across runs and machines.

    Returns:
        The AnnData object produced by ``ad.concat(..., merge="same",
        uns_merge="same")`` over all matching files.

    Raises:
        FileNotFoundError: If ``./AD`` contains no file ending in ``ITE.h5ad``.
    """
    print("adata preprocessing...")

    import warnings
    # Process-wide: silences every warning category for the rest of the session.
    warnings.filterwarnings("ignore")

    ite_dir = "./AD"
    # Match on the file-name suffix so a file named exactly "ITE.h5ad" is
    # included and names that merely contain the string are not.
    file_names = sorted(
        name for name in os.listdir(ite_dir) if name.endswith("ITE.h5ad")
    )
    if not file_names:
        raise FileNotFoundError(f"No '*ITE.h5ad' files found in {ite_dir!r}")

    parts = [sc.read_h5ad(os.path.join(ite_dir, name)) for name in file_names]
    ITE = ad.concat(parts, merge="same", uns_merge="same")
    print("Finish")
    return ITE

In [None]:
# Load the concatenated ITE AnnData once; later cells reuse this global.
ITE=read_ITE()

In [None]:
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector

# Load the required R packages through rpy2. Nothing is installed here; the
# packages must already be present in the R library.
utils = importr("utils")
utils.chooseCRANmirror(ind=1)  # Select a CRAN mirror
importr("org.Hs.eg.db")
importr("AnnotationDbi")

# Define an R helper in the R global environment. It maps Ensembl gene IDs to
# gene symbols with mapIds (multiVals = "first") and returns a data frame with
# the columns ENSG_ID and Gene_Name.
ro.r('''
library(org.Hs.eg.db)
library(AnnotationDbi)

convert_ensg_to_gene_name_local <- function(ensg_vector) {
  gene_names <- mapIds(
    org.Hs.eg.db,
    keys = ensg_vector,
    column = "SYMBOL",
    keytype = "ENSEMBL",
    multiVals = "first"
  )
  result <- data.frame(ENSG_ID = ensg_vector, Gene_Name = gene_names, stringsAsFactors = FALSE)
  return(result)
}
''')

# Python function to call the R function
def convert_ensg_to_gene_name(ensg_list):
    """
    Look up gene symbols for Ensembl gene IDs via the R helper
    `convert_ensg_to_gene_name_local` stored in the R global environment.

    Parameters:
        ensg_list (list of str): Ensembl gene IDs, passed to R as a StrVector.

    Returns:
        list: Entries of the "Gene_Name" column of the R result, read by
              position 0 .. len(r_result) - 1.
    """
    r_convert_func = ro.globalenv['convert_ensg_to_gene_name_local']
    r_result = r_convert_func(StrVector(ensg_list))
    # Prints an empty line.
    print()
    # Collect the Gene_Name column into a Python list.
    # NOTE(review): what r_result is (rpy2 data frame vs. converted pandas
    # DataFrame) depends on the active rpy2 conversion rules; for an rpy2 data
    # frame len() may be the number of columns rather than rows — confirm the
    # loop covers every input ID.
    # NOTE(review): IDs without a symbol presumably yield a missing value here — confirm.
    result = [r_result["Gene_Name"][i] for i in range(len(r_result))]
    return result

# Example usage
# Smoke test of the converter on three Ensembl gene IDs.
# Note: this binds the global `gene_names`, which a later cell rebinds.
ensg_ids = ["ENSG00000139618", "ENSG00000227232", "ENSG00000157764"]
gene_names = convert_ensg_to_gene_name(ensg_ids)
print(gene_names)

In [None]:
# Replace ITE.var_names with whatever the converter returns for them.
# NOTE(review): IDs without a symbol, or several IDs sharing one symbol, would
# give missing or duplicated var_names — confirm this is acceptable downstream.
gene_names=np.array(convert_ensg_to_gene_name(ITE.var_names.tolist()))
print(gene_names)
ITE.var_names=gene_names

In [None]:
# Same renaming for adata_raw, so gene-coloured UMAPs can use the same names
# as ITE.
# NOTE(review): missing or duplicated symbols would carry over into var_names — confirm.
gene_names1=np.array(convert_ensg_to_gene_name(adata_raw.var_names.tolist()))
print(gene_names1)
adata_raw.var_names=gene_names1

# Common

In [None]:
# Single-gene mode over all cells (every row of ITE): prints the z-score and
# plots the ITE distribution for each listed gene.
for genei in ["ATP6","ND4","CYTB","FTH1"]:
    Plot_and_Estimate_CATE_adata(ITE,range(ITE.shape[0]),genes=genei)

In [None]:
# Single-gene mode over all cells for COX2.
Plot_and_Estimate_CATE_adata(ITE,range(ITE.shape[0]),genes="COX2")

# Oligo

In [None]:
# Boolean mask of Oligodendrocyte cells; `indices` stays bound for later cells.
indices=ITE.obs["Subclass"]=='Oligodendrocyte'
# Default ranking (up_regulate=True): sign-test z-score first, then the
# parametric z-score.
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Opposite ranking direction (up_regulate=False) for whatever mask `indices`
# currently holds — the Oligodendrocyte mask when cells are run in order.
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# Boolean mask of Astrocyte cells; default ranking with both statistics.
indices=ITE.obs["Subclass"]=='Astrocyte'
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Single-gene mode: APOE within Astrocyte cells.
indices=ITE.obs["Subclass"]=='Astrocyte'
Plot_and_Estimate_CATE_adata(ITE,indices,genes="APOE")

In [None]:
# Opposite ranking direction for the current `indices` mask — the Astrocyte
# mask when cells are run in order.
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# Boolean mask of Microglia-PVM cells; default ranking with both statistics.
indices=ITE.obs["Subclass"]=='Microglia-PVM'
#Plot_and_Estimate_CATE_adata(ITE,indices,genes="SPP1")
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# UMAP restricted to Microglia-PVM cells, coloured by ADNC and disease.
sc.pl.umap(adata_raw[adata_raw.obs["Subclass"]=='Microglia-PVM'],color=['ADNC','disease'],ncols=1)

In [None]:
# One UMAP of the Microglia-PVM subset per listed gene. The names are gene
# symbols, so adata_raw.var_names must already have been converted.
for colori in ['CCL8', 'VCAN-AS1', 'CCL3', 'IFITM1', 'CH25H', 'SIGLEC1', 'H2BC7', 'FOS']:
    sc.pl.umap(adata_raw[adata_raw.obs["Subclass"]=='Microglia-PVM'],color=colori,ncols=1)

In [None]:
# Opposite ranking direction for the current `indices` mask — the
# Microglia-PVM mask when cells are run in order.
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# Mask built from the broader "Class" column: all GABAergic neurons.
indices=ITE.obs["Class"]=='Neuronal: GABAergic'
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Opposite ranking direction for the current `indices` mask — the GABAergic
# mask when cells are run in order.
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# One UMAP over all cells of adata_raw per listed gene symbol.
for colori in ['P2RY14','IPO9-AS1','ANGPT2','PDE8A','EGFR','ERBB4']:
    sc.pl.umap(adata_raw,color=colori,ncols=1)

In [None]:
# Mask built from the "Class" column: all Glutamatergic neurons.
indices=ITE.obs["Class"]=='Neuronal: Glutamatergic'
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Single-gene mode for TIMP3 and HS3ST2 within Glutamatergic neurons.
indices=ITE.obs["Class"]=='Neuronal: Glutamatergic'
Plot_and_Estimate_CATE_adata(ITE,indices,genes="TIMP3")
Plot_and_Estimate_CATE_adata(ITE,indices,genes="HS3ST2")

In [None]:
# Opposite ranking direction for the current `indices` mask — the
# Glutamatergic mask when cells are run in order.
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# One UMAP over all cells of adata_raw per listed gene symbol.
for colori in ['TIMP3','HS3ST2','CHST9', "CNTN5",'ABCB1', 'SHOC1']:
    sc.pl.umap(adata_raw,color=colori,ncols=1)

In [None]:
# Boolean mask of 'L2/3 IT' cells; default ranking with both statistics.
indices=ITE.obs["Subclass"]=='L2/3 IT'
#Plot_and_Estimate_CATE_adata(ITE,indices,genes="SPP1")
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Boolean mask of Lamp5 cells; ranking with up_regulate=False, both statistics.
indices=ITE.obs["Subclass"]=='Lamp5'
#Plot_and_Estimate_CATE_adata(ITE,indices,genes="SPP1")
Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)

In [None]:
# Lamp5 cells again, this time with the default ranking (up_regulate=True).
indices=ITE.obs["Subclass"]=='Lamp5'
#Plot_and_Estimate_CATE_adata(ITE,indices,genes="SPP1")
Plot_and_Estimate_CATE_adata(ITE,indices)
print("="*20)
Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)

In [None]:
# Single-gene mode for SPHKAP and KIT within Lamp5 cells.
indices=ITE.obs["Subclass"]=='Lamp5'
#Plot_and_Estimate_CATE_adata(ITE,indices,genes="SPP1")
for genei in ['SPHKAP',"KIT"]:
    Plot_and_Estimate_CATE_adata(ITE,indices,genes=genei)

In [None]:
# One UMAP over all cells per listed gene; the trailing comment keeps an
# earlier gene list for reference.
for colori in ['RHOH', 'PECAM1']:#['SPHKAP','KIT','FTL','PCDH11Y','CCK']:
    sc.pl.umap(adata_raw,color=colori,ncols=1)

In [None]:
# Subclass labels to analyse; each becomes one column of the output tables.
cell_types = ['Astrocyte', 'Microglia-PVM', 'L2/3 IT', 'Oligodendrocyte', 'Chandelier', 'Endothelial', 'L4 IT', 'L5 ET', 'L5 IT',
              'L5/6 NP', 'L6 CT', 'L6 IT', 'L6 IT Car3', 'L6b', 'Lamp5', 'Lamp5 Lhx6',
              'OPC', 'Pax6', 'Pvalb', 'Sncg', 'Sst',
              'Sst Chodl', 'VLMC', 'Vip']

# Ranked gene lists keyed by cell type, one dict per (statistic, direction).
significants_unparametric_up={}
significants_parametric_up={}
significants_unparametric_down={}
significants_parametric_down={}

# Create the output directory up front so the first checkpoint write cannot
# fail after the first cell type has already been computed.
os.makedirs("./AD_stat", exist_ok=True)

for cell_typei in cell_types:
    indices=ITE.obs["Subclass"]==cell_typei
    # Despite the top100_* names, each call returns up to 300 gene names.
    top100_1=Plot_and_Estimate_CATE_adata(ITE,indices)
    significants_unparametric_up[cell_typei]=top100_1
    print("="*20)
    top100_2=Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True)
    significants_parametric_up[cell_typei]=top100_2
    print("="*20)
    top100_3=Plot_and_Estimate_CATE_adata(ITE,indices,up_regulate=False)
    significants_unparametric_down[cell_typei]=top100_3
    print("="*20)
    top100_4=Plot_and_Estimate_CATE_adata(ITE,indices,parametric=True,up_regulate=False)
    significants_parametric_down[cell_typei]=top100_4

    # Visual separator between cell types: ten identical rule lines.
    for _sep in range(10):
        print("---"*20)

    # Checkpoint: rewrite all four tables after every cell type so partial
    # results survive an interruption. Wrapping each list in a Series lets
    # pandas pad shorter columns with NaN instead of raising when the lists
    # differ in length.
    df1=pd.DataFrame({k: pd.Series(v) for k, v in significants_unparametric_up.items()})
    df2=pd.DataFrame({k: pd.Series(v) for k, v in significants_parametric_up.items()})
    df1.to_csv("./AD_stat/up_significant_unparametric.csv")
    df2.to_csv("./AD_stat/up_significant_parametric.csv")

    df3=pd.DataFrame({k: pd.Series(v) for k, v in significants_unparametric_down.items()})
    df4=pd.DataFrame({k: pd.Series(v) for k, v in significants_parametric_down.items()})
    df3.to_csv("./AD_stat/down_significant_unparametric.csv")
    df4.to_csv("./AD_stat/down_significant_parametric.csv")