In [1]:
import os
import torch
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import scipy
from statsmodels.stats.multitest import multipletests

In [2]:
# Global configuration shared by the cells below.
# Directory that is scanned for per-sample "edge*.pth" result files.
data_dir = "../edges/"
# Labels used as the column names of every DataFrame built in this notebook.
# NOTE(review): torch.load deserializes arbitrary pickled objects — only point this at trusted files.
genes = torch.load("../../data/AD/genes.pth")

def compute_pearson_pcc(y, y_pred):
    """Return the Pearson correlation of each column (gene) of `y` with `y_pred`.

    Column ``i`` of ``y`` is paired with column ``i`` of ``y_pred`` and passed
    to ``scipy.stats.pearsonr``; only the correlation coefficient is kept, the
    accompanying p-value is discarded.

    Args:
        y: 2-D array-like indexed as ``y[:, i]``; ``y.shape[1]`` sets the
            number of columns processed.
        y_pred: 2-D array-like with at least as many columns as ``y``.

    Returns:
        1-D ``numpy.ndarray`` holding one coefficient per column of ``y``.
    """
    n_columns = y.shape[1]
    coefficients = []
    for col in range(n_columns):
        # pearsonr yields (coefficient, p-value); the p-value is not needed here.
        r_value, _p_value = pearsonr(y[:, col], y_pred[:, col])
        coefficients.append(r_value)
    return np.array(coefficients)

def calculate_z_values(pccs, n):
    """Turn correlation coefficients into test statistics.

    Computes ``r * sqrt((n - 2) / (1 - r**2))`` element-wise. The expression
    broadcasts, so ``n`` may be a scalar or an array compatible with ``pccs``.

    NOTE(review): this expression has the form of the Student-t statistic for
    a Pearson correlation (n - 2 degrees of freedom), although the function
    and its callers name the result a z-value — confirm the normal
    approximation is intended.

    Args:
        pccs: Correlation coefficient(s) ``r``.
        n: Number of observations behind each coefficient.

    Returns:
        The statistic(s), with the broadcast shape of ``pccs`` and ``n``.
    """
    degrees_of_freedom = n - 2
    unexplained_fraction = 1 - pccs**2
    return pccs * np.sqrt(degrees_of_freedom / unexplained_fraction)

def main():
    """Compute per-sample, per-gene correlation statistics and write them to CSV.

    For every ``edge*.pth`` file in ``data_dir`` this loads the ``"y"`` and
    ``"y_pred"`` entries, computes the per-gene Pearson correlation between
    them, converts the correlations to test statistics using that sample's own
    row count, derives one-sided (upper-tail) standard-normal p-values, and
    applies Benjamini-Hochberg FDR correction jointly over all sample/gene
    pairs.

    Side effects:
        Prints each processed file name and writes ``p.csv`` (adjusted
        p-values), ``z.csv`` (statistics) and ``pcc.csv`` (correlations) to
        the working directory. Each file has a leading ``Sample`` column
        followed by one column per entry of ``genes``.

    Raises:
        FileNotFoundError: if ``data_dir`` contains no matching files.
    """
    # List the directory exactly once (sorted) so that matrix rows and sample
    # names come from the same sequence and the output order is reproducible.
    edge_files = sorted(
        file for file in os.listdir(data_dir)
        if file.startswith("edge") and file.endswith(".pth")
    )
    if not edge_files:
        raise FileNotFoundError(f"No 'edge*.pth' files found in {data_dir!r}")

    pcc_matrix = []
    sample_names = []
    sample_sizes = []

    # Process each file in the directory
    for file in edge_files:
        data = torch.load(os.path.join(data_dir, file))
        y = data["y"]
        y_pred = data["y_pred"]

        # Compute PCCs and add to matrix
        pcc_matrix.append(compute_pearson_pcc(y, y_pred))
        # Sample name = file name without its first 6 characters and ".pth".
        sample_names.append(file[6:-4])
        # Remember this sample's own observation count for the statistic.
        sample_sizes.append(y.shape[0])
        print(file)

    # Convert to numpy array and calculate z-values and p-values
    pcc_matrix = np.array(pcc_matrix)
    # Column vector so each row (sample) is scaled by its own n, rather than
    # by the row count of whichever file happened to be loaded last.
    n_per_sample = np.array(sample_sizes).reshape(-1, 1)
    z_matrix = calculate_z_values(pcc_matrix, n_per_sample)
    # Survival function == 1 - cdf, but keeps precision for large statistics
    # where 1 - cdf would round to exactly 0.
    p_values = scipy.stats.norm.sf(z_matrix)

    # Flatten the matrix of p-values and adjust. Non-finite entries (e.g. from
    # an undefined correlation) are excluded from the correction and stay NaN,
    # so they neither count as tests nor affect the other adjusted values.
    flat_p_values = p_values.flatten()
    adjusted_flat_p_values = np.full(flat_p_values.shape, np.nan)
    testable = np.isfinite(flat_p_values)
    if testable.any():
        adjusted_flat_p_values[testable] = multipletests(
            flat_p_values[testable], method='fdr_bh'
        )[1]

    # Reshape the adjusted p-values back to the original matrix shape
    adjusted_p_values = adjusted_flat_p_values.reshape(p_values.shape)

    # Prepare dataframes
    df_p = pd.DataFrame(data=adjusted_p_values, index=sample_names, columns=genes)
    df_z = pd.DataFrame(data=z_matrix, index=sample_names, columns=genes)
    df_pcc = pd.DataFrame(data=pcc_matrix, index=sample_names, columns=genes)

    # Save to CSV with the sample name as the first column
    for frame, csv_path in ((df_p, 'p.csv'), (df_z, 'z.csv'), (df_pcc, 'pcc.csv')):
        frame.insert(0, 'Sample', sample_names)
        frame.to_csv(csv_path, index=False)

In [3]:
# Run the analysis; as a side effect this writes p.csv, z.csv and pcc.csv.
main()

edges_H21.33.021.Cx26.MTG.02.007.1.04.pth
edges_H20.33.001.Cx28.MTG.02.007.1.01.03.pth
edges_H21.33.040.Cx22.MTG.02.007.3.03.04.pth
edges_H21.33.038.Cx20.MTG.02.007.3.01.02.pth
edges_H21.33.005.Cx18.MTG.02.007.02.03.pth
edges_H20.33.012.Cx24.MTG.02.007.1.03.03.pth
edges_H21.33.005.Cx18.MTG.02.007.02.04.pth
edges_H20.33.004.Cx26.MTG.02.007.1.02.02.pth
edges_H20.33.025.Cx28.MTG.02.007.1.01.04.pth
edges_H21.33.032.CX24.MTG.02.007.1.01.04.pth
edges_H20.33.004.Cx26.MTG.02.007.1.02.04.pth
edges_H21.33.022.Cx26.MTG.02.007.2.M.04.pth
edges_H21.33.014.CX26.MTG.02.007.1.02.02.pth
edges_H21.33.040.Cx22.MTG.02.007.3.03.01.pth
edges_H21.33.012.Cx26.MTG.02.007.1.01.06.pth
edges_H21.33.023.Cx26.MTG.02.007.1.03.01.pth
edges_H21.33.015.Cx26.MTG.02.007.1.2.pth
edges_H20.33.012.Cx24.MTG.02.007.1.03.02.pth
edges_H20.33.040.Cx25.MTG.02.007.1.01.03.pth
edges_H21.33.001.Cx22.MTG.02.007.1.01.04.pth
edges_H21.33.016.Cx26.MTG.02.007.3.01.01.pth
edges_H20.33.015.Cx24.MTG.02.007.1.03.03.pth
edges_H21.33.015.Cx26.

In [4]:
# Collect every sample's targets and predictions row-wise, tagging each row
# with the sample it came from, then export both tables to CSV.
ys, y_preds = [], []
row_labels = []

edge_files = [
    fname for fname in os.listdir(data_dir)
    if fname.startswith("edge") and fname.endswith(".pth")
]
for fname in edge_files:
    payload = torch.load(os.path.join(data_dir, fname))
    ys.append(payload["y"].numpy())
    y_preds.append(payload["y_pred"].numpy())
    # One label per row of this sample: file name minus its first 6
    # characters and the ".pth" suffix.
    n_rows = payload["y"].shape[0]
    row_labels.append(np.array([fname[6:-4]] * n_rows))

sample_names = np.concatenate(row_labels, axis=0)


def _stack_with_sample_column(blocks):
    """Stack per-sample arrays into one gene-column frame plus a 'sample' column."""
    frame = pd.DataFrame(data=np.concatenate(blocks, axis=0), columns=genes)
    frame["sample"] = sample_names
    return frame


df_prediction = _stack_with_sample_column(y_preds)
df_prediction.to_csv("./predictions.csv", index=False)

df_y = _stack_with_sample_column(ys)
df_y.to_csv("./ys.csv", index=False)