# Description

This notebook computes the expected (null) coefficient values for each tissue, matching each tissue's sample size, using randomly shuffled data.
Percentiles of these null distributions are saved and used as thresholds to define "high" and "low" coefficient values.

# Modules

In [1]:
import os
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy.spatial.distance import pdist

# Settings

In [2]:
# Number of genes randomly drawn per tissue when building the null distribution.
N_GENES = 10_000
# Degree of parallelism handed to the CCC implementation.
N_JOBS = 10
# Quantile levels 0.00, 0.01, ..., 1.00 (101 evenly spaced points).
PERCENTILES = np.linspace(start=0, stop=1, num=101)
# Shared, seeded random generator used when shuffling the data.
RNG = np.random.RandomState(seed=42)

In [3]:
# Display the quantile grid to confirm it spans 0 to 1 in 101 evenly spaced steps.
PERCENTILES

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [4]:
# Count of unordered gene pairs, i.e. "N_GENES choose 2".
n_gene_pairs = N_GENES * (N_GENES - 1) / 2
print(f"Number of permutations is approximately: {n_gene_pairs}")

Number of permutations is approximately: 49995000.0


In [5]:
# FIXME: hardcoded
# Tissue identifiers, one per input file named gtex_v8_data_<tissue>.pkl.
TISSUE_NAMES = [
    "adipose_subcutaneous",
    "adipose_visceral_omentum",
    "adrenal_gland",
    "artery_aorta",
    "artery_coronary",
    "artery_tibial",
    "bladder",
    "brain_amygdala",
    "brain_anterior_cingulate_cortex_ba24",
    "brain_caudate_basal_ganglia",
    "brain_cerebellar_hemisphere",
    "brain_cerebellum",
    "brain_cortex",
    "brain_frontal_cortex_ba9",
    "brain_hippocampus",
    "brain_hypothalamus",
    "brain_nucleus_accumbens_basal_ganglia",
    "brain_putamen_basal_ganglia",
    "brain_spinal_cord_cervical_c1",
    "brain_substantia_nigra",
    "breast_mammary_tissue",
    "cells_cultured_fibroblasts",
    "cells_ebvtransformed_lymphocytes",
    "cervix_ectocervix",
    "cervix_endocervix",
    "colon_sigmoid",
    "colon_transverse",
    "esophagus_gastroesophageal_junction",
    "esophagus_mucosa",
    "esophagus_muscularis",
    "fallopian_tube",
    "heart_atrial_appendage",
    "heart_left_ventricle",
    "kidney_cortex",
    "kidney_medulla",
    "liver",
    "lung",
    "minor_salivary_gland",
    "muscle_skeletal",
    "nerve_tibial",
    "ovary",
    "pancreas",
    "pituitary",
    "prostate",
    "skin_not_sun_exposed_suprapubic",
    "skin_sun_exposed_lower_leg",
    "small_intestine_terminal_ileum",
    "spleen",
    "stomach",
    "testis",
    "thyroid",
    "uterus",
    "vagina",
    "whole_blood",
]

In [6]:
# Sanity checks on the hardcoded tissue list: the expected count, and no
# repeated names (a repeated name would make the compute loop write the same
# output file twice).
assert len(TISSUE_NAMES) == 54, f"Expected 54 tissues, found {len(TISSUE_NAMES)}"
assert len(set(TISSUE_NAMES)) == len(TISSUE_NAMES), "Duplicated tissue names"

# Paths

In [7]:
# Root directory for the GTEx inputs and outputs of this notebook. It can be
# overridden with the CCC_GTEX_BASE_DIR environment variable so the notebook is
# not tied to one machine; the original location remains the default.
BASE_DIR = Path(
    os.environ.get(
        "CCC_GTEX_BASE_DIR",
        "/home/miltondp/projects/ccc/ccc-gpu/ccc-gpu/base/gtex",
    )
)
# Directory with the per-tissue expression pickles read by the compute loop.
DATA_BY_TISSUE_PATH = BASE_DIR / "data_by_tissue"
assert DATA_BY_TISSUE_PATH.exists(), f"Input directory not found: {DATA_BY_TISSUE_PATH}"

In [8]:
# Folder where the per-tissue percentile tables are written; it is created on
# demand together with any missing parent folders.
OUTPUT_DIR = BASE_DIR.joinpath("tissue_thresholds")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [9]:
# File name pattern for a tissue's saved percentile table; "{tissue}" is filled
# in with the tissue name through str.format.
OUTPUT_FILE_TEMPLATE = "{tissue}-null_coefs_percentiles.pkl"

# Functions

In [10]:
def pearson(data: pd.DataFrame) -> npt.NDArray:
    """
    Pearson correlation coefficient for every pair of rows.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per row pair, in the order
        produced by scipy's pdist.
    """
    gene_matrix = data.to_numpy()
    # scipy's "correlation" metric is a distance (1 - r), so convert it back to r
    correlation_distance = pdist(gene_matrix, metric="correlation")
    return 1 - correlation_distance


def spearman(data: pd.DataFrame) -> npt.NDArray:
    """
    Spearman correlation coefficient for every pair of rows.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per row pair, in the order
        produced by scipy's pdist.
    """
    # Spearman is Pearson computed on the ranks: rank each gene across its samples
    ranked = data.rank(axis=1)

    # scipy's "correlation" metric is a distance (1 - r), so convert it back to r
    rank_distance = pdist(ranked.to_numpy(), metric="correlation")
    return 1 - rank_distance


def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=None) -> npt.NDArray:
    """
    Compute the Clustermatch Correlation Coefficient (CCC) using the GPU
    implementation.

    Args:
        data: genes in rows, samples in columns.
        internal_n_clusters: forwarded unchanged to the GPU implementation.
        n_jobs: forwarded to the GPU implementation. When omitted (None), the
            notebook-level N_JOBS setting is used, which is the value this
            wrapper previously always passed regardless of the argument.

    Returns:
        The value returned by the GPU implementation for the given matrix.
    """
    # Imported lazily, and under an alias so it is not confused with this
    # wrapper, which has the same name.
    # (The previous revision noted `from ccc.coef import ccc` as an alternative.)
    from ccc.coef.impl_gpu import ccc as _ccc_gpu

    if n_jobs is None:
        n_jobs = N_JOBS

    return _ccc_gpu(
        data.to_numpy(),
        internal_n_clusters=internal_n_clusters,
        n_jobs=n_jobs,
    )

# Compute

In [None]:
for tissue_name in TISSUE_NAMES:
    # Load this tissue's expression matrix (genes in rows, samples in columns).
    tissue_data = pd.read_pickle(
        DATA_BY_TISSUE_PATH / f"gtex_v8_data_{tissue_name}.pkl"
    )

    # Draw a random subset of genes (not the "top" ones). The shared seeded RNG
    # makes the draw reproducible, and the subset size is capped so tissues with
    # fewer than N_GENES genes do not raise.
    tissue_data = tissue_data.sample(
        n=min(N_GENES, tissue_data.shape[0]),
        replace=False,
        axis=0,
        random_state=RNG,
    )
    n_genes, n_samples = tissue_data.shape

    print(tissue_name, n_genes, n_samples, flush=True)

    # Build the null data by independently permuting the values inside every
    # column. Returning a plain array keeps the shuffled order; a returned
    # Series would be re-aligned to the original index.
    data_shuffled = tissue_data.apply(
        lambda x: x.sample(frac=1, replace=False, random_state=RNG).to_numpy(),
        axis=0,
    )

    # Absolute coefficient values on the shuffled data, one column per method.
    tissue_null = pd.DataFrame(
        {
            "ccc": np.abs(ccc(data_shuffled)),
            "pearson": np.abs(pearson(data_shuffled)),
            "spearman": np.abs(spearman(data_shuffled)),
        }
    )

    # Summarize each null distribution at the configured quantile levels.
    tissue_null_perc = tissue_null.quantile(PERCENTILES)

    # save
    output_filename = OUTPUT_FILE_TEMPLATE.format(tissue=tissue_name)
    print(f"  {output_filename}", flush=True)
    tissue_null_perc.to_pickle(OUTPUT_DIR / output_filename)

adipose_subcutaneous 10000 663
[2025-07-23 14:44:30.464] [debug] CUDA Device Info:
[2025-07-23 14:44:30.580] [debug] Device 0: "NVIDIA GeForce RTX 4090"
[2025-07-23 14:44:30.580] [debug]   CUDA Driver Version / Runtime Version          12.7 / 12.5
[2025-07-23 14:44:30.580] [debug]   CUDA Capability Major/Minor version number:    8.9
[2025-07-23 14:44:30.580] [debug]   Total amount of global memory:                 23.52 GBytes (25251414016 bytes)
[2025-07-23 14:44:30.580] [debug]   GPU Clock rate:                                2535 MHz (2.54 GHz)
[2025-07-23 14:44:30.580] [debug]   Memory Clock rate:                             10501 Mhz
[2025-07-23 14:44:30.580] [debug]   Memory Bus Width:                              384-bit
[2025-07-23 14:44:30.580] [debug]   Shared Memory per Block:                       48.00 KB
[2025-07-23 14:44:30.580] [debug]   Shared Memory per Multiprocessor:              100.00 KB
[2025-07-23 14:44:30.580] [debug]   Number of Multiprocessors:               

# Testing

In [None]:
# Load the percentile tables written by the compute loop; `results` maps each
# tissue name to its table. Tissues without an output file are skipped.
results = {}
for tissue_name in TISSUE_NAMES:
    tissue_file = OUTPUT_DIR / OUTPUT_FILE_TEMPLATE.format(tissue=tissue_name)
    if tissue_file.exists():
        results[tissue_name] = pd.read_pickle(tissue_file)

assert "whole_blood" in results, "No saved percentiles found for whole_blood"

# Last 51 rows of the table, i.e. the upper half of the 101-point quantile grid.
results["whole_blood"].tail(51)