# Description

It reads the previously generated p-values and adjusts them for multiple testing using the false discovery rate (FDR).

# Modules loading

In [27]:
import numpy as np
import pandas as pd
import os

from pathlib import Path
from statsmodels.stats.multitest import multipletests

from ccc import conf

# Settings

In [28]:
# Dataset selection
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# Configuration constants: data root and the directories derived from it
TOP_N_GENES = "all"
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
GENE_SELECTION_DIR = DATA_DIR.joinpath("gene_selection", TOP_N_GENES)
SIMILARITY_MATRICES_DIR = DATA_DIR.joinpath("similarity_matrices", TOP_N_GENES)

# Paths

In [29]:
# Directory holding the p-value results; it must already exist on disk
OUTPUT_DIR = Path("/mnt/data/projs/manuscripts/ccc-gpu/results/", "pvalues")
assert OUTPUT_DIR.exists()

In [30]:
OUTPUT_DIR

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/pvalues')

In [31]:
# Pickle file with the gene-pair p-values to adjust; fail early if it is missing
INPUT_PVALUES_FILE = OUTPUT_DIR.joinpath("gene_pair-samples-pvalues.pkl")
display(INPUT_PVALUES_FILE)
assert INPUT_PVALUES_FILE.exists()

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/pvalues/gene_pair-samples-pvalues.pkl')

# Load pvalues

In [32]:
# Read the pickled p-values table and order its rows by index.
# NOTE(review): loading a pickle can execute arbitrary code, so this should
# only be pointed at files produced by this project.
pvalues = pd.read_pickle(INPUT_PVALUES_FILE).sort_index()

In [33]:
pvalues.shape

(17008, 9)

In [34]:
pvalues.head()

Unnamed: 0,gene0,gene1,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
0,ENSG00000255555.1,ENSG00000230840.1,all_high-top_ccc,1.0,0.001998,1.0,0.0,1.0,0.0
1,ENSG00000255945.1,ENSG00000257296.1,all_high-top_ccc,1.0,0.003996,1.0,0.0,1.0,0.0
2,ENSG00000284356.1,ENSG00000248928.1,all_high-top_ccc,1.0,0.003996,1.0,0.0,1.0,0.0
3,ENSG00000256281.1,ENSG00000267687.1,all_high-top_ccc,1.0,0.000999,1.0,0.0,1.0,0.0
4,ENSG00000255945.1,ENSG00000232604.1,all_high-top_ccc,1.0,0.002997,1.0,0.0,1.0,0.0


# Get set of unique gene pairs

In [35]:
# Index the rows by gene pair so that repeated pairs can be found through the index.
# Note: this cell is not idempotent; running it a second time fails because
# gene0/gene1 are no longer columns.
pvalues = pvalues.set_index(["gene0", "gene1"])

In [36]:
# Sanity check: repeated gene pairs are expected here (the same pair can be
# listed under more than one group), which is why they are deduplicated below.
assert not pvalues.index.is_unique

In [37]:
pvalues.loc[pvalues.index.duplicated(keep=False)].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_pvalue,pearson,pearson_pvalue,spearman,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000001167.14,ENSG00000123700.4,ccc_high_and_spearman_low-top_ccc,0.100209,0.000999,-0.032490,3.726696e-01,-0.023504,0.519030
ENSG00000001167.14,ENSG00000123700.4,ccc_high_and_spearman_low-top_spearman,0.100209,0.000999,-0.032490,3.726696e-01,-0.023504,0.519030
ENSG00000005007.12,ENSG00000231721.6,ccc_high_and_spearman_pearson_low-top_ccc,0.083668,0.000999,0.003094,9.323651e-01,-0.023397,0.520925
ENSG00000005007.12,ENSG00000231721.6,ccc_high_and_spearman_pearson_low-top_spearman,0.083668,0.000999,0.003094,9.323651e-01,-0.023397,0.520925
ENSG00000007171.16,ENSG00000269621.1,pearson_high_and_ccc_low-top_ccc,0.000138,0.757243,0.908452,1.745591e-287,-0.036033,0.322773
...,...,...,...,...,...,...,...,...
ENSG00000283680.1,ENSG00000278497.1,all_high-top_pearson,1.000000,0.000999,1.000000,0.000000e+00,1.000000,0.000000
ENSG00000284508.1,ENSG00000221325.1,all_high-top_pearson,1.000000,0.002997,1.000000,0.000000e+00,1.000000,0.000000
ENSG00000284508.1,ENSG00000221325.1,all_high-top_spearman,1.000000,0.002997,1.000000,0.000000e+00,1.000000,0.000000
ENSG00000284508.1,ENSG00000271315.1,all_high-top_pearson,1.000000,0.001998,1.000000,0.000000e+00,1.000000,0.000000


In [38]:
# Keep a single row (the first occurrence) for each gene pair
_first_occurrence = ~pvalues.index.duplicated(keep="first")
pvalues_nodup = pvalues.loc[_first_occurrence, :]

In [39]:
pvalues_nodup.shape

(16005, 7)

# Adjust p-values for all methods

In [44]:
# Remove gene pairs with a missing p-value for any of the three methods.
# Every p-value column is passed to multipletests further down, and that call
# has no NaN handling of its own, so the filter covers all of them instead of
# only the Pearson column.
pvalues_nodup = pvalues_nodup.dropna(
    subset=["ccc_pvalue", "pearson_pvalue", "spearman_pvalue"]
)
pvalues_nodup.shape

(15954, 7)

In [45]:
pvalues_nodup.isna().sum()

group              0
ccc                0
ccc_pvalue         0
pearson            0
pearson_pvalue     0
spearman           0
spearman_pvalue    0
dtype: int64

In [46]:
# Exploratory run on the Pearson p-values only, to inspect what multipletests
# returns: a tuple whose second element (index 1) is the array of adjusted
# p-values, one per input p-value.
adj_pvals = multipletests(pvalues_nodup["pearson_pvalue"], alpha=0.05, method="fdr_bh")

In [47]:
adj_pvals[1].shape

(15954,)

In [48]:
adj_pvals

(array([ True,  True,  True, ...,  True, False, False]),
 array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.14369259e-83, 3.29834078e-01, 9.79228840e-01]),
 3.2150690693066863e-06,
 3.134010279553717e-06)

In [49]:
# Adjust the p-values of each coefficient independently and store the result
# in a new "<coef>_fdr" column
for coef in ("ccc", "pearson", "spearman"):
    pval_col_name, fdr_col_name = f"{coef}_pvalue", f"{coef}_fdr"
    print(f"{pval_col_name} - {fdr_col_name}")

    adj_pvals = multipletests(
        pvalues_nodup[pval_col_name],
        alpha=0.05,
        method="fdr_bh",
    )
    # element 1 of the returned tuple holds the adjusted p-values
    pvalues_nodup = pvalues_nodup.assign(**{fdr_col_name: adj_pvals[1]})

ccc_pvalue - ccc_fdr
pearson_pvalue - pearson_fdr
spearman_pvalue - spearman_fdr


In [50]:
pvalues_nodup.shape

(15954, 10)

In [51]:
# Reorder columns alphabetically while keeping "group" first: it is renamed to
# "agroup" for the sort and renamed back afterwards
_tmp = pvalues_nodup.rename(columns={"group": "agroup"})
_tmp = _tmp.sort_index(axis=1)
_tmp = _tmp.rename(columns={"agroup": "group"})
display(_tmp.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000255555.1,ENSG00000230840.1,all_high-top_ccc,1.0,0.004304,0.001998,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000257296.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000284356.1,ENSG00000248928.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000256281.1,ENSG00000267687.1,all_high-top_ccc,1.0,0.00228,0.000999,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000232604.1,all_high-top_ccc,1.0,0.006217,0.002997,1.0,0.0,0.0,1.0,0.0,0.0


In [52]:
# Adopt the column-reordered table as pvalues_nodup, dropping any row that has
# no adjusted Pearson p-value
pvalues_nodup = _tmp.dropna(subset=["pearson_fdr"])

In [53]:
pvalues_nodup.shape

(15954, 10)

In [54]:
pvalues_nodup.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000255555.1,ENSG00000230840.1,all_high-top_ccc,1.0,0.004304,0.001998,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000257296.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000284356.1,ENSG00000248928.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000256281.1,ENSG00000267687.1,all_high-top_ccc,1.0,0.00228,0.000999,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000232604.1,all_high-top_ccc,1.0,0.006217,0.002997,1.0,0.0,0.0,1.0,0.0,0.0


# Reassign adjusted pvalues to original file

In [55]:
pvalues.shape

(17008, 7)

In [56]:
# Copy the adjusted p-values back to the full table, which still contains the
# repeated gene pairs. The values are aligned on the (gene0, gene1) index, so
# every copy of a pair receives the same adjusted value.
_fdr_columns = [f"{coef}_fdr" for coef in ("ccc", "pearson", "spearman")]
pvalues = pvalues.assign(**{name: pvalues_nodup[name] for name in _fdr_columns})

# use the same column order as the deduplicated table
pvalues = pvalues[pvalues_nodup.columns]

In [57]:
pvalues.shape

(17008, 10)

In [58]:
pvalues.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000255555.1,ENSG00000230840.1,all_high-top_ccc,1.0,0.004304,0.001998,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000257296.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000284356.1,ENSG00000248928.1,all_high-top_ccc,1.0,0.008145,0.003996,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000256281.1,ENSG00000267687.1,all_high-top_ccc,1.0,0.00228,0.000999,1.0,0.0,0.0,1.0,0.0,0.0
ENSG00000255945.1,ENSG00000232604.1,all_high-top_ccc,1.0,0.006217,0.002997,1.0,0.0,0.0,1.0,0.0,0.0


In [59]:
# Make sure duplicated gene pairs have the same pvalues/values
pvalues.loc[pvalues.index.duplicated(keep=False)].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,group,ccc,ccc_fdr,ccc_pvalue,pearson,pearson_fdr,pearson_pvalue,spearman,spearman_fdr,spearman_pvalue
gene0,gene1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENSG00000001167.14,ENSG00000123700.4,ccc_high_and_spearman_low-top_ccc,0.100209,0.002280,0.000999,-0.032490,7.595262e-01,3.726696e-01,-0.023504,0.866054,0.519030
ENSG00000001167.14,ENSG00000123700.4,ccc_high_and_spearman_low-top_spearman,0.100209,0.002280,0.000999,-0.032490,7.595262e-01,3.726696e-01,-0.023504,0.866054,0.519030
ENSG00000005007.12,ENSG00000231721.6,ccc_high_and_spearman_pearson_low-top_ccc,0.083668,0.002280,0.000999,0.003094,9.939301e-01,9.323651e-01,-0.023397,0.866054,0.520925
ENSG00000005007.12,ENSG00000231721.6,ccc_high_and_spearman_pearson_low-top_spearman,0.083668,0.002280,0.000999,0.003094,9.939301e-01,9.323651e-01,-0.023397,0.866054,0.520925
ENSG00000007171.16,ENSG00000269621.1,pearson_high_and_ccc_low-top_ccc,0.000138,1.000000,0.757243,0.908452,1.514364e-286,1.745591e-287,-0.036033,0.726205,0.322773
...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000283680.1,ENSG00000278497.1,all_high-top_pearson,1.000000,0.002280,0.000999,1.000000,0.000000e+00,0.000000e+00,1.000000,0.000000,0.000000
ENSG00000284508.1,ENSG00000221325.1,all_high-top_pearson,1.000000,0.006217,0.002997,1.000000,0.000000e+00,0.000000e+00,1.000000,0.000000,0.000000
ENSG00000284508.1,ENSG00000221325.1,all_high-top_spearman,1.000000,0.006217,0.002997,1.000000,0.000000e+00,0.000000e+00,1.000000,0.000000,0.000000
ENSG00000284508.1,ENSG00000271315.1,all_high-top_pearson,1.000000,0.004304,0.001998,1.000000,0.000000e+00,0.000000e+00,1.000000,0.000000,0.000000


In [60]:
def _assert_same_values(x):
    for coef in ("ccc", "pearson", "spearman"):
        assert x[f"{coef}"].unique().shape[0] == 1
        assert x[f"{coef}_fdr"].unique().shape[0] == 1

        # for CCC, the pvalue column is computed via permutations, so we don't expect to be all the same
        if coef == "ccc":
            assert x[f"{coef}_pvalue"].unique().shape[0] >= 1, x
        else:
            assert x[f"{coef}_pvalue"].unique().shape[0] == 1, x

In [61]:
# Run the consistency checks on every gene pair that appears more than once
_repeated_mask = pvalues.index.duplicated(keep=False)
pvalues[_repeated_mask].groupby(["gene0", "gene1"]).apply(_assert_same_values)
print("values seem correct")

values seem correct


# Save

In [62]:
INPUT_PVALUES_FILE.parent

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/pvalues')

In [63]:
INPUT_PVALUES_FILE.stem

'gene_pair-samples-pvalues'

In [64]:
INPUT_PVALUES_FILE.suffix

'.pkl'

In [65]:
# Same directory and extension as the input file, with "-fdr" appended to the stem
output_file = INPUT_PVALUES_FILE.with_name(
    f"{INPUT_PVALUES_FILE.stem}-fdr{INPUT_PVALUES_FILE.suffix}"
)
display(output_file)

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/pvalues/gene_pair-samples-pvalues-fdr.pkl')

In [66]:
# Write the full table (repeated gene pairs included), now carrying the
# *_fdr columns, to the "-fdr" pickle next to the input file.
pvalues.to_pickle(output_file)