# Description

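This notebook computes the coefficient values expected by chance for each tissue: for every tissue, the expression data is shuffled (keeping the tissue's sample size) and the coefficients are computed on the resulting random data.
Percentiles of these null values are used as thresholds to define "high" and "low" coefficient values.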

# Modules

In [1]:
import os
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy.spatial.distance import pdist

# Settings

In [2]:
# Number of genes randomly drawn from each tissue.
N_GENES = 500
# Number of parallel jobs handed to the CCC implementation.
N_JOBS = 10
# Quantile levels 0.00, 0.01, ..., 1.00 at which the null values are summarized.
PERCENTILES = np.linspace(start=0.0, stop=1.0, num=101)
# Shared, seeded random generator used when shuffling the data.
RNG = np.random.RandomState(seed=42)

In [3]:
# Display the quantile levels (0.00 to 1.00) at which the null values are summarized.
PERCENTILES

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [4]:
# N_GENES * (N_GENES - 1) / 2 is the number of unordered gene pairs, i.e. presumably
# the number of null coefficient values obtained per tissue and per method.
# True division ("/") makes the printed value a float.
print(f"Number of permutations is approximately: {N_GENES * (N_GENES - 1) / 2}")

Number of permutations is approximately: 124750.0


# Paths

In [5]:
# Root folder of the GTEx inputs. It can be overridden with the CCC_GTEX_BASE_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is unset, the previously hard-coded location is used.
BASE_DIR = Path(
    os.environ.get(
        "CCC_GTEX_BASE_DIR",
        "/home/miltondp/projects/ccc/ccc-gpu/ccc-gpu/base/gtex",
    )
)
# Folder holding one pickled expression matrix per tissue.
DATA_BY_TISSUE_PATH = BASE_DIR / "data_by_tissue"
# Fail early, with the offending path in the message, if the inputs are missing.
assert DATA_BY_TISSUE_PATH.exists(), f"Tissue data folder not found: {DATA_BY_TISSUE_PATH}"

# Functions

In [6]:
def pearson(data: pd.DataFrame) -> npt.NDArray:
    """
    Pairwise Pearson correlation between the rows of `data`.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        One-dimensional array with one coefficient per pair of rows, in the
        condensed order produced by scipy's `pdist`.
    """
    expression = data.to_numpy()
    # the "correlation" distance is 1 - r, so subtracting it from 1 recovers r
    correlation_distance = pdist(expression, metric="correlation")
    return 1 - correlation_distance


def spearman(data: pd.DataFrame) -> npt.NDArray:
    """
    Pairwise Spearman correlation between the rows of `data`.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        One-dimensional array with one coefficient per pair of rows, in the
        condensed order produced by scipy's `pdist`.
    """
    # Spearman is Pearson computed on ranks: rank every row across its columns
    ranked = data.rank(axis=1)

    rank_distance = pdist(ranked.to_numpy(), metric="correlation")
    return 1 - rank_distance


def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=1) -> npt.NDArray:
    """
    Compute the Clustermatch Correlation Coefficient (CCC).

    Args:
        data: genes in rows, samples in columns.
        internal_n_clusters: forwarded unchanged to `ccc.coef.ccc`.
        n_jobs: forwarded unchanged to `ccc.coef.ccc`.

    Returns:
        The value returned by `ccc.coef.ccc` for the given matrix.
    """
    # Imported under an alias because this wrapper has the same name as the
    # library function it delegates to.
    from ccc.coef import ccc as ccc_coef

    # Honor the caller's arguments instead of silently replacing them with
    # hard-coded values.
    return ccc_coef(
        data.to_numpy(),
        internal_n_clusters=internal_n_clusters,
        n_jobs=n_jobs,
    )

# Compute

In [7]:
# FIXME: testing, replace by real tissue data
# Load every pickled matrix in the tissue folder as (tissue_name, dataframe) pairs;
# the tissue name is the part of the file name after "gtex_v8_data_" and before
# the first dot.
# The paths are sorted because glob() yields files in arbitrary order, and the
# shared RNG is consumed tissue by tissue in the loop over this list, so the
# iteration order would otherwise change the results between runs/machines.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
SOMETHING = [
    (
        f.name.split(".")[0].split("gtex_v8_data_")[1],
        pd.read_pickle(f),
    )
    for f in sorted(DATA_BY_TISSUE_PATH.glob("*.pkl"))
]

In [8]:
# Name of the first loaded tissue (first element of the first pair).
SOMETHING[0][0]

'whole_blood'

In [9]:
# Expression matrix of the first loaded tissue (second element of the first pair).
SOMETHING[0][1]

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.5,0.02171,0.03015,0.0203,0.1675,0.02104,0.02537,0.02492,0.06809,0.0000,0.02443,...,0.0000,0.04841,0.00000,0.000,0.04858,0.11450,0.02417,0.0000,0.02128,0.0000
ENSG00000227232.5,1.55500,2.74700,4.4410,10.4700,1.28700,3.13800,4.21700,2.09800,0.3579,4.26200,...,0.8979,3.06700,2.46400,1.504,3.19400,2.90600,3.43000,4.1690,2.74300,6.0720
ENSG00000278267.1,0.00000,0.00000,0.0000,0.0000,0.00000,0.00000,0.00000,0.00000,0.0000,0.62120,...,0.0000,0.00000,0.65930,0.000,0.00000,0.00000,0.00000,0.0000,0.00000,0.6112
ENSG00000243485.5,0.00000,0.00000,0.0000,0.0000,0.00000,0.00000,0.00000,0.04532,0.0000,0.00000,...,0.0000,0.00000,0.00000,0.000,0.09698,0.11430,0.00000,0.0000,0.00000,0.0000
ENSG00000237613.2,0.00000,0.00000,0.0576,0.0000,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,...,0.0000,0.00000,0.03678,0.000,0.00000,0.04058,0.00000,0.0000,0.00000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198695.2,1972.00000,1148.00000,2575.0000,1333.0000,700.00000,1904.00000,3412.00000,1809.00000,36.1600,2005.00000,...,158.9000,1580.00000,873.70000,1162.000,1065.00000,3747.00000,2355.00000,1593.0000,997.60000,1464.0000
ENSG00000210194.1,6.52900,3.77700,18.3200,9.2320,2.63500,1.90700,9.36600,7.96200,0.0000,3.67300,...,0.0000,4.85200,2.59900,4.725,1.82600,9.32100,9.69000,2.8150,5.33200,7.2280
ENSG00000198727.2,3069.00000,2477.00000,2089.0000,1859.0000,1744.00000,4737.00000,8042.00000,3144.00000,229.7000,6881.00000,...,785.0000,2299.00000,2216.00000,2760.000,4229.00000,4499.00000,5323.00000,4977.0000,1840.00000,4281.0000
ENSG00000210195.2,0.00000,0.00000,0.0000,0.0000,0.00000,0.66470,0.00000,0.00000,0.0000,0.64000,...,0.0000,0.00000,0.00000,1.235,0.63630,2.24900,0.00000,0.4906,1.11500,0.0000


In [10]:
# Maps each tissue name to a table of null-distribution percentiles
# (rows: quantile levels in PERCENTILES, columns: one per coefficient).
results = {}

for tissue_name, tissue_data in SOMETHING:
    # Draw a random subset of genes (rows). The shared RNG is passed so that the
    # selection is reproducible, like the shuffling below.
    tissue_data = tissue_data.sample(
        n=N_GENES, replace=False, axis=0, random_state=RNG
    )
    n_genes, n_samples = tissue_data.shape

    print(tissue_name, n_genes, n_samples)

    # Build the random dataset: with axis=0 the function receives one column at a
    # time, so the values of each column are permuted across the selected rows.
    # NOTE(review): this permutes genes within each sample rather than samples
    # within each gene; confirm this is the intended null model.
    data_shuffled = tissue_data.apply(
        lambda x: x.sample(frac=1, replace=False, random_state=RNG).to_numpy(),
        axis=0,
    )

    # Absolute coefficient values for every pair of rows in the shuffled data.
    tissue_null = pd.DataFrame(
        {
            "ccc": np.abs(ccc(data_shuffled, n_jobs=N_JOBS)),
            "pearson": np.abs(pearson(data_shuffled)),
            "spearman": np.abs(spearman(data_shuffled)),
        }
    )

    # Summarize each coefficient's null values at the configured quantile levels.
    tissue_null_perc = tissue_null.quantile(PERCENTILES)

    results[tissue_name] = tissue_null_perc

whole_blood 500 755


In [11]:
# Display the per-tissue tables of null percentiles (one column per coefficient).
results

{'whole_blood':            ccc       pearson      spearman
 0.00  0.000000  2.792049e-08  3.715811e-07
 0.01  0.000000  2.942975e-04  4.572201e-04
 0.02  0.000000  5.995973e-04  9.205445e-04
 0.03  0.000000  8.780155e-04  1.381920e-03
 0.04  0.000000  1.177830e-03  1.848560e-03
 ...        ...           ...           ...
 0.96  0.030931  6.857707e-02  7.528763e-02
 0.97  0.033204  8.453252e-02  7.952328e-02
 0.98  0.036521  1.108411e-01  8.530303e-02
 0.99  0.041587  1.617704e-01  9.494232e-02
 1.00  0.083127  7.574201e-01  1.605603e-01
 
 [101 rows x 3 columns]}

In [12]:
# Last 51 rows of the whole-blood table, i.e. quantile levels 0.50 through 1.00.
results["whole_blood"].tail(51)

Unnamed: 0,ccc,pearson,spearman
0.5,0.00459,0.012089,0.02469
0.51,0.004817,0.012304,0.025253
0.52,0.005079,0.012532,0.025841
0.53,0.005326,0.012755,0.026419
0.54,0.005594,0.012982,0.026994
0.55,0.005836,0.01321,0.027606
0.56,0.006111,0.01344,0.028203
0.57,0.006404,0.01368,0.028821
0.58,0.006691,0.013919,0.02945
0.59,0.00699,0.014165,0.030052
