# Description

It samples gene pairs from the categories in Figure 3b.

# Modules loading

In [89]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd
from pathlib import Path

from ccc import conf

# Settings

In [90]:
# Root locations of the GTEx v8 inputs read by this notebook
TOP_N_GENES = "all"
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
# Both sub-directories are keyed by the gene-selection size (TOP_N_GENES)
GENE_SELECTION_DIR = DATA_DIR.joinpath("gene_selection", TOP_N_GENES)
SIMILARITY_MATRICES_DIR = DATA_DIR.joinpath("similarity_matrices", TOP_N_GENES)

In [91]:
# Tissue and gene-selection strategy; both are interpolated into input file names below
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"
# Upper bound on the number of gene pairs kept for each sampled subcategory
N_MAX_SAMPLES_PER_CATEGORY = 500

# Seeded generator shared by the sampling calls below. Every call that receives
# it advances its state, so results are reproducible only when cells run in order.
RANDOM_STATE = np.random.RandomState(0)

# Paths

In [92]:
# Path of the gene expression file for the configured tissue / selection strategy
INPUT_GENE_EXPR_FILE = GENE_SELECTION_DIR.joinpath(
    f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# Stop here if the input is missing
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/gene_selection/all/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [93]:
# Gene-pair intersection table for the configured tissue and selection strategy.
# The file name is derived from GTEX_TISSUE / GENE_SEL_STRATEGY (instead of
# being spelled out) so that changing those settings cannot silently load the
# results of a different run.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    Path("/mnt/data/projs/manuscripts/ccc-gpu/results")
    / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# Stop here if the input is missing
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [94]:
# Outputs of this notebook go into the "pvalues" folder of the results directory
OUTPUT_DIR = Path("/mnt/data/projs/manuscripts/ccc-gpu/results/").joinpath("pvalues")
# Create the folder (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [95]:
# Show the resolved output directory
OUTPUT_DIR

PosixPath('/mnt/data/projs/manuscripts/ccc-gpu/results/pvalues')

# Load gene pairs intersection

In [96]:
# Load the gene-pair intersection table.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_plot = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [97]:
# (rows, columns) of the table as loaded
df_plot.shape

(276440000, 9)

In [98]:
# Peek at the first rows of the table
df_plot.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),CCC (high),CCC (low),ccc,pearson,spearman
ENSG00000000419.12,ENSG00000000419.12,False,False,False,False,False,False,,,
ENSG00000000419.12,ENSG00000000938.12,False,False,False,False,False,False,,,
ENSG00000000419.12,ENSG00000001167.14,False,False,False,False,False,False,,,
ENSG00000000419.12,ENSG00000001561.6,False,False,False,False,False,False,,,
ENSG00000000419.12,ENSG00000002549.12,False,False,False,False,False,False,,,


In [99]:
# Keep only gene pairs that have a Spearman value; this rebinds df_plot to the filtered frame
df_plot = df_plot.dropna(subset=["spearman"])

In [100]:
# Shape after dropping pairs without a Spearman value.
# (The filtered frame is bound to `df_plot`; no `df_cleaned` variable exists.)
df_plot.shape

(130408456, 9)

In [101]:
# Pairs flagged Spearman-high and Pearson-low whose CCC flag is neither high nor low
df_plot.loc[
    df_plot["Pearson (low)"]
    & df_plot["Spearman (high)"]
    & ~(df_plot["CCC (high)"] | df_plot["CCC (low)"])
]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),CCC (high),CCC (low),ccc,pearson,spearman
ENSG00000000938.12,ENSG00000137965.10,False,True,True,False,False,False,0.023258,0.006621,0.216907
ENSG00000000938.12,ENSG00000268836.1,False,True,True,False,False,False,0.032612,0.014857,0.213642
ENSG00000001167.14,ENSG00000134339.8,False,True,True,False,False,False,0.027904,0.004786,0.215145
ENSG00000001167.14,ENSG00000173432.10,False,True,True,False,False,False,0.026785,0.006571,0.234618
ENSG00000001167.14,ENSG00000243063.1,False,True,True,False,False,False,0.036472,0.000864,0.258927
...,...,...,...,...,...,...,...,...,...,...
ENSG00000268350.7,ENSG00000157423.17,False,True,True,False,False,False,0.030687,0.014044,0.243887
ENSG00000268686.1,ENSG00000039139.9,False,True,True,False,False,False,0.027179,0.009811,0.194752
ENSG00000272259.5,ENSG00000075035.9,False,True,True,False,False,False,0.025143,0.004198,0.221999
ENSG00000279010.3,ENSG00000124092.12,False,True,True,False,False,False,0.019237,0.001117,0.193529


# Select gene pairs from each category in Figure 3b

In [102]:
# Maps a category name to the subset of df_plot rows belonging to that category
gene_pair_cats = {}

In [103]:
# Pairs carrying the "high" flag of all three coefficients
gene_pair_cats["all_high"] = df_plot.loc[
    df_plot["Pearson (high)"] & df_plot["Spearman (high)"] & df_plot["CCC (high)"]
]
display(gene_pair_cats["all_high"].shape)

(29892823, 9)

In [104]:
# Pairs carrying the "low" flag of all three coefficients
gene_pair_cats["all_low"] = df_plot.loc[
    df_plot["Pearson (low)"] & df_plot["Spearman (low)"] & df_plot["CCC (low)"]
]
display(gene_pair_cats["all_low"].shape)

(23625260, 9)

In [105]:
# CCC and Spearman flagged high while Pearson is flagged low
gene_pair_cats["ccc_spearman_high_and_pearson_low"] = df_plot.loc[
    df_plot["Pearson (low)"] & df_plot["Spearman (high)"] & df_plot["CCC (high)"]
]
display(gene_pair_cats["ccc_spearman_high_and_pearson_low"].shape)

(115638, 9)

In [106]:
# CCC flagged high, Pearson flagged low, Spearman flagged neither high nor low
gene_pair_cats["ccc_high_and_pearson_low"] = df_plot.loc[
    df_plot["CCC (high)"]
    & df_plot["Pearson (low)"]
    & ~(df_plot["Spearman (high)"] | df_plot["Spearman (low)"])
]
display(gene_pair_cats["ccc_high_and_pearson_low"].shape)

(1062901, 9)

In [107]:
# CCC flagged high, Spearman flagged low, Pearson flagged neither low nor high
gene_pair_cats["ccc_high_and_spearman_low"] = df_plot.loc[
    df_plot["CCC (high)"]
    & df_plot["Spearman (low)"]
    & ~(df_plot["Pearson (low)"] | df_plot["Pearson (high)"])
]
display(gene_pair_cats["ccc_high_and_spearman_low"].shape)

(38304, 9)

In [108]:
# CCC flagged high while both Spearman and Pearson are flagged low
gene_pair_cats["ccc_high_and_spearman_pearson_low"] = df_plot.loc[
    df_plot["Pearson (low)"] & df_plot["Spearman (low)"] & df_plot["CCC (high)"]
]
display(gene_pair_cats["ccc_high_and_spearman_pearson_low"].shape)

(5378, 9)

In [109]:
# Pearson flagged high, CCC flagged low, Spearman flagged neither low nor high
gene_pair_cats["pearson_high_and_ccc_low"] = df_plot.loc[
    df_plot["Pearson (high)"]
    & df_plot["CCC (low)"]
    & ~(df_plot["Spearman (low)"] | df_plot["Spearman (high)"])
]
display(gene_pair_cats["pearson_high_and_ccc_low"].shape)

(33473, 9)

In [110]:
# Pearson flagged high while both CCC and Spearman are flagged low
gene_pair_cats["pearson_high_and_ccc_spearman_low"] = df_plot.loc[
    df_plot["Pearson (high)"] & df_plot["Spearman (low)"] & df_plot["CCC (low)"]
]
display(gene_pair_cats["pearson_high_and_ccc_spearman_low"].shape)

(75674, 9)

In [111]:
# Sanity check: exactly the eight categories defined above were registered
assert len(gene_pair_cats) == 8

# Sample gene pairs

Here I take all the categories defined above (the keys of `gene_pair_cats`) and create four subcategories for each: the top gene pairs ranked by each of the three coefficients, plus a random sample of gene pairs that were not already selected.

In [112]:
# prepare weights for sampling, where I will put zeros on already sampled gene pairs
gene_pairs_weights = (
    df_plot.drop(columns=df_plot.columns[:-1])
    .rename(columns={df_plot.columns[-1]: "weight"})
    .assign(weight=1.0)
    .squeeze()
    .sort_index()
)

In [113]:
# Inspect the weights Series (index: gene pairs, values: sampling weight)
gene_pairs_weights

ENSG00000000003.14  ENSG00000003987.13    1.0
                    ENSG00000019549.9     1.0
                    ENSG00000060656.19    1.0
                    ENSG00000061337.15    1.0
                    ENSG00000061676.14    1.0
                                         ... 
ENSG00000284600.1   ENSG00000284471.1     1.0
                    ENSG00000284488.1     1.0
                    ENSG00000284518.1     1.0
                    ENSG00000284519.1     1.0
                    ENSG00000284522.1     1.0
Name: weight, Length: 130408456, dtype: float64

In [114]:
# Smoke test: weighted sampling works with the weights Series built above.
# A fixed integer seed makes the displayed rows reproducible; it is
# deliberately not RANDOM_STATE, so this check does not advance the shared
# generator used for the real sampling below.
_tmp = df_plot.sample(n=10, replace=False, weights=gene_pairs_weights, random_state=0)
assert _tmp.shape[0] == 10

display(_tmp)

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),CCC (high),CCC (low),ccc,pearson,spearman
ENSG00000244756.1,ENSG00000250310.1,False,False,False,False,False,True,0.0,0.02001,0.026727
ENSG00000241539.1,ENSG00000232223.1,False,True,False,True,False,True,0.0,0.014177,0.01656
ENSG00000258557.1,ENSG00000207095.1,False,True,False,True,False,True,0.0,0.009926,0.011207
ENSG00000260378.1,ENSG00000234776.4,False,False,False,False,False,False,0.027693,0.110594,0.080923
ENSG00000271959.1,ENSG00000167332.7,False,False,False,False,False,False,0.012219,0.079028,0.117659
ENSG00000166401.14,ENSG00000163399.15,False,False,True,False,True,False,0.056554,0.151035,0.217069
ENSG00000229817.1,ENSG00000254467.1,False,False,False,False,False,True,0.0,0.02964,0.041419
ENSG00000235885.7,ENSG00000259734.1,False,True,False,True,False,True,0.0,0.012179,0.014005
ENSG00000256098.1,ENSG00000230583.6,False,True,False,False,False,False,0.018606,0.007308,0.063136
ENSG00000271299.1,ENSG00000233947.1,False,False,False,True,False,True,0.0,0.025113,0.02244


In [115]:
# For every category build four subcategories:
#   "<category>-top_<coef>" : the n pairs with the largest value of each coefficient
#   "<category>-random"     : n pairs drawn at random among those not selected so far
# gene_pairs_weights is updated in place: every selected pair gets weight 0.0.
gene_pair_samples = {}

for k, df in gene_pair_cats.items():
    # keep at most N_MAX_SAMPLES_PER_CATEGORY gene pairs per subcategory
    n = min(N_MAX_SAMPLES_PER_CATEGORY, df.shape[0])

    for coef in ("ccc", "pearson", "spearman", "random"):
        if coef == "random":
            new_k = f"{k}-{coef}"

            # weights restricted to this category (0.0 marks pairs already selected)
            df_weights = gene_pairs_weights.loc[df.index]

            # skip the random subcategory when fewer than n unselected pairs
            # remain, or when the category is empty (sampling with weights
            # that sum to zero raises an error)
            if n == 0 or (df_weights > 0).sum() < n:
                display(f"  WARNING: {new_k}: none selected")
                continue

            # the per-category weights hold the same values pandas would obtain
            # by aligning the full weights Series to df's index
            sample_n = df.sample(
                n=n,
                replace=False,
                random_state=RANDOM_STATE,
                weights=df_weights,
            )
        else:
            new_k = f"{k}-top_{coef}"

            # when taking the top gene pairs by a coefficient, pairs already
            # selected by another coefficient are not removed
            sample_n = df.sort_values(coef, ascending=False).head(n)

        # do not sample these gene pairs again
        gene_pairs_weights.loc[sample_n.index] = 0.0

        gene_pair_samples[new_k] = sample_n

        display(f"{new_k}: {gene_pair_samples[new_k].shape}")

'all_high-top_ccc: (500, 9)'

'all_high-top_pearson: (500, 9)'

'all_high-top_spearman: (500, 9)'

'all_high-random: (500, 9)'

'all_low-top_ccc: (500, 9)'

'all_low-top_pearson: (500, 9)'

'all_low-top_spearman: (500, 9)'

'all_low-random: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_ccc: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_pearson: (500, 9)'

'ccc_spearman_high_and_pearson_low-top_spearman: (500, 9)'

'ccc_spearman_high_and_pearson_low-random: (500, 9)'

'ccc_high_and_pearson_low-top_ccc: (500, 9)'

'ccc_high_and_pearson_low-top_pearson: (500, 9)'

'ccc_high_and_pearson_low-top_spearman: (500, 9)'

'ccc_high_and_pearson_low-random: (500, 9)'

'ccc_high_and_spearman_low-top_ccc: (500, 9)'

'ccc_high_and_spearman_low-top_pearson: (500, 9)'

'ccc_high_and_spearman_low-top_spearman: (500, 9)'

'ccc_high_and_spearman_low-random: (500, 9)'

'ccc_high_and_spearman_pearson_low-top_ccc: (500, 9)'

'ccc_high_and_spearman_pearson_low-top_pearson: (500, 9)'

'ccc_high_and_spearman_pearson_low-top_spearman: (500, 9)'

'ccc_high_and_spearman_pearson_low-random: (500, 9)'

'pearson_high_and_ccc_low-top_ccc: (500, 9)'

'pearson_high_and_ccc_low-top_pearson: (500, 9)'

'pearson_high_and_ccc_low-top_spearman: (500, 9)'

'pearson_high_and_ccc_low-random: (500, 9)'

'pearson_high_and_ccc_spearman_low-top_ccc: (500, 9)'

'pearson_high_and_ccc_spearman_low-top_pearson: (500, 9)'

'pearson_high_and_ccc_spearman_low-top_spearman: (500, 9)'

'pearson_high_and_ccc_spearman_low-random: (500, 9)'

# Include gene pairs mentioned in the paper

In [116]:
# Gene pairs listed by Ensembl ID; the gene symbols are given in the trailing comments
selected_gene_pairs = [
    ("ENSG00000135094.10", "ENSG00000111537.4"),  # ('SDS', 'IFNG')
    ("ENSG00000130208.9", "ENSG00000177606.6"),  # ('APOC1', 'JUN')
    ("ENSG00000275385.1", "ENSG00000160446.18"),  # ('CCL18', 'ZDHHC12')
    ("ENSG00000147050.14", "ENSG00000183878.15"),  # ('KDM6A', 'UTY')
    ("ENSG00000115165.9", "ENSG00000101265.15"),  # ('CYTIP', 'RASSF2')
    ("ENSG00000162413.16", "ENSG00000235027.1"),  # ('KLHL21', 'AC068580.6')
    ("ENSG00000130598.15", "ENSG00000177791.11"),  # ('TNNI2', 'MYOZ1')
    ("ENSG00000198467.13", "ENSG00000068976.13"),  # ('TPM2', 'PYGM')
]

In [117]:
# Look up the listed gene pairs in df_plot by their (gene, gene) index labels
gene_pair_samples["selected_in_manuscript"] = df_plot.loc[selected_gene_pairs]
display(gene_pair_samples["selected_in_manuscript"].shape)

(8, 9)

In [118]:
# Show flags and coefficient values for the selected pairs
gene_pair_samples["selected_in_manuscript"]

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),CCC (high),CCC (low),ccc,pearson,spearman
ENSG00000135094.10,ENSG00000111537.4,False,False,True,False,True,False,0.706993,0.090451,0.765177
ENSG00000130208.9,ENSG00000177606.6,False,False,True,False,True,False,0.612233,0.084623,0.748265
ENSG00000275385.1,ENSG00000160446.18,False,False,True,False,True,False,0.446659,0.099853,0.560171
ENSG00000147050.14,ENSG00000183878.15,True,False,False,False,True,False,0.294391,0.23987,0.100621
ENSG00000115165.9,ENSG00000101265.15,False,False,False,False,True,False,0.201962,0.15606,0.107882
ENSG00000162413.16,ENSG00000235027.1,False,False,False,False,True,False,0.184217,0.062875,0.186421
ENSG00000130598.15,ENSG00000177791.11,True,False,True,False,False,False,0.034594,0.967834,0.284206
ENSG00000198467.13,ENSG00000068976.13,True,False,False,False,False,False,0.034912,0.94443,0.029852


# Include a random sample across the entire dataset

This includes all possible gene pairs from the genes initially selected (`TOP_N_GENES = "all"` in this run), not the filtered list derived from the intersections.

## Load all correlations

In [119]:
# Path pattern of the similarity-matrix files; the {placeholders} are filled in with str.format
INPUT_CORR_FILE_TEMPLATE = SIMILARITY_MATRICES_DIR.joinpath(
    "gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl"
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/all/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [120]:
# Fill in the template to get the file holding all correlation methods.
# The template already includes SIMILARITY_MATRICES_DIR, so it is not joined
# onto that directory a second time: doing so only gave the right path because
# pathlib discards the left operand when the right one is absolute, and would
# duplicate the directory for a relative base path.
INPUT_CORR_FILE = Path(
    str(INPUT_CORR_FILE_TEMPLATE).format(
        tissue=GTEX_TISSUE,
        gene_sel_strategy=GENE_SEL_STRATEGY,
        corr_method="all",
    )
)
display(INPUT_CORR_FILE)

# Same existence check as for the other input files
assert INPUT_CORR_FILE.exists()

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/all/gtex_v8_data_whole_blood-var_pc_log2-all.pkl')

In [121]:
# Load the correlation values of all gene pairs; this rebinds the name `df`.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df = pd.read_pickle(INPUT_CORR_FILE)

In [122]:
# (rows, columns) of the full correlation table
df.shape

(276440000, 3)

In [123]:
# Peek at the first rows of the full correlation table
df.head()

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman
ENSG00000000419.12,ENSG00000000419.12,,,
ENSG00000000419.12,ENSG00000000938.12,,,
ENSG00000000419.12,ENSG00000001167.14,,,
ENSG00000000419.12,ENSG00000001561.6,,,
ENSG00000000419.12,ENSG00000002549.12,,,


## Select 2n here (double)

In [124]:
# Draw twice the per-category sample size from the full correlation table.
# The size comes from the configuration constant, not from a variable left
# over from the sampling loop, so this cell does not depend on which category
# happened to be processed last.
# Rows are sampled as-is: pairs whose coefficients are NaN are not filtered out.
sample_n = df.sample(
    n=2 * N_MAX_SAMPLES_PER_CATEGORY,
    replace=False,
    random_state=RANDOM_STATE,
)

new_k = "entire_dataset-random"
gene_pair_samples[new_k] = sample_n

In [125]:
# Shape of the sample just stored under new_k
gene_pair_samples[new_k].shape

(1000, 3)

In [126]:
# Show the sampled gene pairs and their coefficient values
gene_pair_samples[new_k]

Unnamed: 0,Unnamed: 1,ccc,pearson,spearman
ENSG00000254053.1,ENSG00000278007.1,,,
ENSG00000196352.14,ENSG00000182054.9,,,
ENSG00000272870.1,ENSG00000116819.6,0.038725,0.352643,0.340834
ENSG00000279744.1,ENSG00000159618.15,0.139359,0.374609,0.545365
ENSG00000188092.14,ENSG00000186204.14,0.035889,0.275522,0.332118
...,...,...,...,...
ENSG00000275426.1,ENSG00000175548.8,,,
ENSG00000250560.2,ENSG00000251747.1,,,
ENSG00000251998.1,ENSG00000224865.7,0.009948,,
ENSG00000196604.12,ENSG00000224237.1,,,


# Save

In [127]:
# Destination of the pickled dictionary of samples
output_file = OUTPUT_DIR / "gene_pair-samples.pkl"

In [128]:
# Write the whole gene_pair_samples dictionary (subcategory name -> DataFrame) as one pickle
pd.to_pickle(gene_pair_samples, output_file)