# Description

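This notebook computes, for each GTEx tissue, the coefficient values (CCC, Pearson and Spearman, in absolute value) that are expected by chance given the tissue's sample size. The null distribution is obtained by randomly selecting a subset of genes, independently shuffling the samples of each gene, and computing the coefficients across all gene pairs.
The percentiles of these null distributions can be used as thresholds to define "high" and "low" coefficient values.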

# Modules

In [1]:
import os
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy.spatial.distance import pdist

# Settings

In [2]:
# Number of genes randomly drawn (without replacement) from each tissue.
N_GENES = 10_000

# Genes are kept only if their variance is strictly greater than this value;
# a negative value therefore disables the variance filter.
VAR_THRESHOLD = -0.10

# Number of jobs handed to the CCC implementation.
N_JOBS = 10

# Single random state shared by gene sampling and sample shuffling.
RNG = np.random.RandomState(seed=42)

In [3]:
# Evenly spaced percentiles 0.00, 0.01, ..., 0.99 (1.0 is dropped here).
GENERAL_PERCENTILES = np.round(np.linspace(0, 1, num=101)[:-1], decimals=2)
display(GENERAL_PERCENTILES)

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99])

In [4]:
# Extreme upper-tail percentiles: 1 - 10^-3, 1 - 10^-4, ..., 1 - 10^-10, and
# finally 1.0 (the maximum). The displayed array rounds the last few to "1.".
TOP_PERCENTILES = 1.0 - np.array(
    [
        *(
            np.round(10 ** (-exponent), decimals=exponent)
            for exponent in range(3, 11)
        ),
        0.0,
    ]
)
display(TOP_PERCENTILES)

array([0.999     , 0.9999    , 0.99999   , 0.999999  , 0.9999999 ,
       0.99999999, 1.        , 1.        , 1.        ])

In [5]:
# Full list of percentiles to report: the regular grid followed by the tail.
PERCENTILES = np.concatenate((GENERAL_PERCENTILES, TOP_PERCENTILES))
display(PERCENTILES)

array([0.        , 0.01      , 0.02      , 0.03      , 0.04      ,
       0.05      , 0.06      , 0.07      , 0.08      , 0.09      ,
       0.1       , 0.11      , 0.12      , 0.13      , 0.14      ,
       0.15      , 0.16      , 0.17      , 0.18      , 0.19      ,
       0.2       , 0.21      , 0.22      , 0.23      , 0.24      ,
       0.25      , 0.26      , 0.27      , 0.28      , 0.29      ,
       0.3       , 0.31      , 0.32      , 0.33      , 0.34      ,
       0.35      , 0.36      , 0.37      , 0.38      , 0.39      ,
       0.4       , 0.41      , 0.42      , 0.43      , 0.44      ,
       0.45      , 0.46      , 0.47      , 0.48      , 0.49      ,
       0.5       , 0.51      , 0.52      , 0.53      , 0.54      ,
       0.55      , 0.56      , 0.57      , 0.58      , 0.59      ,
       0.6       , 0.61      , 0.62      , 0.63      , 0.64      ,
       0.65      , 0.66      , 0.67      , 0.68      , 0.69      ,
       0.7       , 0.71      , 0.72      , 0.73      , 0.74   

In [6]:
# N_GENES * (N_GENES - 1) / 2 is the number of distinct gene pairs.
print("Number of permutations is approximately: " + str(N_GENES * (N_GENES - 1) / 2))

Number of permutations is approximately: 49995000.0


In [7]:
# FIXME: hardcoded
# Tissue identifiers; each one is used to build the input and output file
# names for that tissue.
TISSUE_NAMES = [
    "adipose_subcutaneous",
    "adipose_visceral_omentum",
    "adrenal_gland",
    "artery_aorta",
    "artery_coronary",
    "artery_tibial",
    "bladder",
    "brain_amygdala",
    "brain_anterior_cingulate_cortex_ba24",
    "brain_caudate_basal_ganglia",
    "brain_cerebellar_hemisphere",
    "brain_cerebellum",
    "brain_cortex",
    "brain_frontal_cortex_ba9",
    "brain_hippocampus",
    "brain_hypothalamus",
    "brain_nucleus_accumbens_basal_ganglia",
    "brain_putamen_basal_ganglia",
    "brain_spinal_cord_cervical_c1",
    "brain_substantia_nigra",
    "breast_mammary_tissue",
    "cells_cultured_fibroblasts",
    "cells_ebvtransformed_lymphocytes",
    "cervix_ectocervix",
    "cervix_endocervix",
    "colon_sigmoid",
    "colon_transverse",
    "esophagus_gastroesophageal_junction",
    "esophagus_mucosa",
    "esophagus_muscularis",
    "fallopian_tube",
    "heart_atrial_appendage",
    "heart_left_ventricle",
    "kidney_cortex",
    "kidney_medulla",
    "liver",
    "lung",
    "minor_salivary_gland",
    "muscle_skeletal",
    "nerve_tibial",
    "ovary",
    "pancreas",
    "pituitary",
    "prostate",
    "skin_not_sun_exposed_suprapubic",
    "skin_sun_exposed_lower_leg",
    "small_intestine_terminal_ileum",
    "spleen",
    "stomach",
    "testis",
    "thyroid",
    "uterus",
    "vagina",
    "whole_blood",
]

In [8]:
# Sanity checks on the hardcoded tissue list: the expected number of entries
# and no accidental duplicates (a duplicate would silently overwrite the
# output files of the same tissue).
assert len(TISSUE_NAMES) == 54, f"Expected 54 tissues, found {len(TISSUE_NAMES)}"
assert len(set(TISSUE_NAMES)) == len(TISSUE_NAMES), "Duplicated tissue names found"

# Paths

In [9]:
# Root folder of the project data. The default is the original hardcoded
# location; set the CCC_GPU_BASE_DIR environment variable to run the notebook
# on another machine without editing this cell.
BASE_DIR = Path(
    os.environ.get(
        "CCC_GPU_BASE_DIR",
        "/home/miltondp/projects/ccc/ccc-gpu/ccc-gpu/base/",
    )
)

# Folder holding the GTEx inputs and outputs used below.
GTEX_DIR = BASE_DIR / "data/gtex"

In [10]:
# Folder with the per-tissue expression pickles read in the compute loop;
# fail early if it is missing.
DATA_BY_TISSUE_PATH = GTEX_DIR / "gene_selection" / "all"
assert DATA_BY_TISSUE_PATH.exists()

In [11]:
# Folder where the per-tissue percentile tables are written; created on demand.
OUTPUT_DIR = GTEX_DIR.joinpath("tissue_thresholds")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [12]:
# File name pattern for the per-tissue outputs: `tissue` is the tissue name
# and `ext` the file extension ("pkl" and "tsv" are used in this notebook).
OUTPUT_FILE_TEMPLATE = "{tissue}-null_coefs_percentiles.{ext}"

# Functions

In [13]:
def pearson(data: pd.DataFrame) -> npt.NDArray:
    """
    Pearson correlation coefficient between every pair of rows.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per pair of rows, in the
        pair order produced by scipy's pdist.
    """
    # pdist's "correlation" metric is the distance 1 - r, so invert it to
    # recover the coefficient itself
    correlation_distances = pdist(data.to_numpy(), metric="correlation")
    return 1 - correlation_distances


def spearman(data: pd.DataFrame) -> npt.NDArray:
    """
    Spearman correlation coefficient between every pair of rows.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per pair of rows, in the
        pair order produced by scipy's pdist.
    """
    # Spearman is Pearson applied to the ranks of the values within each row
    ranked_values = data.rank(axis=1).to_numpy()
    return 1 - pdist(ranked_values, metric="correlation")


def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=None) -> npt.NDArray:
    """
    Compute the Clustermatch Correlation Coefficient (CCC).

    Args:
        data: genes in rows, samples in columns.
        internal_n_clusters: forwarded unchanged to the CCC implementation
            (None leaves the choice to that implementation).
        n_jobs: number of jobs forwarded to the CCC implementation. When None
            (default), the notebook-level N_JOBS setting is used, which is
            what this function always did before its arguments were honored.

    Returns:
        Whatever the underlying CCC implementation returns for the given
        matrix (used in this notebook as one coefficient per pair of rows).
    """
    # The import is kept local to the function, as in the original code, and
    # aliased so it does not shadow this wrapper's own name.
    # from ccc.coef import ccc
    from ccc.coef.impl_gpu import ccc as ccc_gpu

    return ccc_gpu(
        data.to_numpy(),
        internal_n_clusters=internal_n_clusters,
        n_jobs=N_JOBS if n_jobs is None else n_jobs,
    )

# Compute

In [14]:
# For every tissue: build a null dataset by shuffling each gene's samples,
# compute the three coefficients across all gene pairs, and save the
# percentiles of their absolute values.
for tissue_name in TISSUE_NAMES:
    # get tissue data (genes in rows, samples in columns)
    tissue_data = pd.read_pickle(
        DATA_BY_TISSUE_PATH / f"gtex_v8_data_{tissue_name}-var_pc_log2.pkl"
    )

    # keep only genes whose variance across samples exceeds VAR_THRESHOLD
    # (computed along axis=1 to avoid transposing the whole matrix)
    gene_variances = tissue_data.var(axis=1)
    tissue_data = tissue_data.loc[gene_variances[gene_variances > VAR_THRESHOLD].index]

    # select random set of genes
    sampled = tissue_data.sample(n=N_GENES, replace=False, axis=0, random_state=RNG)

    # Take an explicit copy: the rows are shuffled in place below, which must
    # not write through to `sampled` and requires a writable array (with
    # pandas Copy-on-Write, to_numpy() without copy can be read-only).
    values = sampled.to_numpy(copy=True)
    n_genes, n_samples = values.shape

    # flush so the progress line shows up before the long computation starts
    print(tissue_name, n_genes, n_samples, flush=True)

    # shuffle samples independently within each gene, which breaks any
    # relationship between genes while keeping each gene's own values
    for i in range(n_genes):
        RNG.shuffle(values[i, :])

    data_shuffled = pd.DataFrame(
        values,
        index=sampled.index,
        columns=sampled.columns,
    )

    # absolute coefficient values across all gene pairs of the shuffled data
    tissue_null = pd.DataFrame(
        {
            "ccc": np.abs(ccc(data_shuffled, n_jobs=N_JOBS)),
            "pearson": np.abs(pearson(data_shuffled)),
            "spearman": np.abs(spearman(data_shuffled)),
        }
    )

    # one row per requested percentile, one column per coefficient
    tissue_null_perc = tissue_null.quantile(PERCENTILES).rename_axis("percentile")

    # save
    output_filename = OUTPUT_FILE_TEMPLATE.format(tissue=tissue_name, ext="pkl")
    print(f"  {output_filename}", flush=True)
    tissue_null_perc.to_pickle(OUTPUT_DIR / output_filename)

    output_filename = OUTPUT_FILE_TEMPLATE.format(tissue=tissue_name, ext="tsv")
    print(f"  {output_filename}", flush=True)
    tissue_null_perc.to_csv(OUTPUT_DIR / output_filename, sep="\t")

adipose_subcutaneous 10000 663


  adipose_subcutaneous-null_coefs_percentiles.pkl


  adipose_subcutaneous-null_coefs_percentiles.tsv


adipose_visceral_omentum 10000 541


  adipose_visceral_omentum-null_coefs_percentiles.pkl


  adipose_visceral_omentum-null_coefs_percentiles.tsv


adrenal_gland 10000 258


  adrenal_gland-null_coefs_percentiles.pkl


  adrenal_gland-null_coefs_percentiles.tsv


artery_aorta 10000 432


  artery_aorta-null_coefs_percentiles.pkl


  artery_aorta-null_coefs_percentiles.tsv


artery_coronary 10000 240


  artery_coronary-null_coefs_percentiles.pkl


  artery_coronary-null_coefs_percentiles.tsv


artery_tibial 10000 663


  artery_tibial-null_coefs_percentiles.pkl


  artery_tibial-null_coefs_percentiles.tsv


bladder 10000 21


  bladder-null_coefs_percentiles.pkl


  bladder-null_coefs_percentiles.tsv


brain_amygdala 10000 152


  brain_amygdala-null_coefs_percentiles.pkl


  brain_amygdala-null_coefs_percentiles.tsv


brain_anterior_cingulate_cortex_ba24 10000 176


  brain_anterior_cingulate_cortex_ba24-null_coefs_percentiles.pkl


  brain_anterior_cingulate_cortex_ba24-null_coefs_percentiles.tsv


brain_caudate_basal_ganglia 10000 246


  brain_caudate_basal_ganglia-null_coefs_percentiles.pkl


  brain_caudate_basal_ganglia-null_coefs_percentiles.tsv


brain_cerebellar_hemisphere 10000 215


  brain_cerebellar_hemisphere-null_coefs_percentiles.pkl


  brain_cerebellar_hemisphere-null_coefs_percentiles.tsv


brain_cerebellum 10000 241


  brain_cerebellum-null_coefs_percentiles.pkl


  brain_cerebellum-null_coefs_percentiles.tsv


brain_cortex 10000 255


  brain_cortex-null_coefs_percentiles.pkl


  brain_cortex-null_coefs_percentiles.tsv


brain_frontal_cortex_ba9 10000 209


  brain_frontal_cortex_ba9-null_coefs_percentiles.pkl


  brain_frontal_cortex_ba9-null_coefs_percentiles.tsv


brain_hippocampus 10000 197


  brain_hippocampus-null_coefs_percentiles.pkl


  brain_hippocampus-null_coefs_percentiles.tsv


brain_hypothalamus 10000 202


  brain_hypothalamus-null_coefs_percentiles.pkl


  brain_hypothalamus-null_coefs_percentiles.tsv


brain_nucleus_accumbens_basal_ganglia 10000 246


  brain_nucleus_accumbens_basal_ganglia-null_coefs_percentiles.pkl


  brain_nucleus_accumbens_basal_ganglia-null_coefs_percentiles.tsv


brain_putamen_basal_ganglia 10000 205


  brain_putamen_basal_ganglia-null_coefs_percentiles.pkl


  brain_putamen_basal_ganglia-null_coefs_percentiles.tsv


brain_spinal_cord_cervical_c1 10000 159


  brain_spinal_cord_cervical_c1-null_coefs_percentiles.pkl


  brain_spinal_cord_cervical_c1-null_coefs_percentiles.tsv


brain_substantia_nigra 10000 139


  brain_substantia_nigra-null_coefs_percentiles.pkl


  brain_substantia_nigra-null_coefs_percentiles.tsv


breast_mammary_tissue 10000 459


  breast_mammary_tissue-null_coefs_percentiles.pkl


  breast_mammary_tissue-null_coefs_percentiles.tsv


cells_cultured_fibroblasts 10000 504


  cells_cultured_fibroblasts-null_coefs_percentiles.pkl


  cells_cultured_fibroblasts-null_coefs_percentiles.tsv


cells_ebvtransformed_lymphocytes 10000 174


  cells_ebvtransformed_lymphocytes-null_coefs_percentiles.pkl


  cells_ebvtransformed_lymphocytes-null_coefs_percentiles.tsv


cervix_ectocervix 10000 9


  cervix_ectocervix-null_coefs_percentiles.pkl


  cervix_ectocervix-null_coefs_percentiles.tsv


cervix_endocervix 10000 10


  cervix_endocervix-null_coefs_percentiles.pkl


  cervix_endocervix-null_coefs_percentiles.tsv


colon_sigmoid 10000 373


  colon_sigmoid-null_coefs_percentiles.pkl


  colon_sigmoid-null_coefs_percentiles.tsv


colon_transverse 10000 406


  colon_transverse-null_coefs_percentiles.pkl


  colon_transverse-null_coefs_percentiles.tsv


esophagus_gastroesophageal_junction 10000 375


  esophagus_gastroesophageal_junction-null_coefs_percentiles.pkl


  esophagus_gastroesophageal_junction-null_coefs_percentiles.tsv


esophagus_mucosa 10000 555


  esophagus_mucosa-null_coefs_percentiles.pkl


  esophagus_mucosa-null_coefs_percentiles.tsv


esophagus_muscularis 10000 515


  esophagus_muscularis-null_coefs_percentiles.pkl


  esophagus_muscularis-null_coefs_percentiles.tsv


fallopian_tube 10000 9


  fallopian_tube-null_coefs_percentiles.pkl


  fallopian_tube-null_coefs_percentiles.tsv


heart_atrial_appendage 10000 429


  heart_atrial_appendage-null_coefs_percentiles.pkl


  heart_atrial_appendage-null_coefs_percentiles.tsv


heart_left_ventricle 10000 432


  heart_left_ventricle-null_coefs_percentiles.pkl


  heart_left_ventricle-null_coefs_percentiles.tsv


kidney_cortex 10000 85


  kidney_cortex-null_coefs_percentiles.pkl


  kidney_cortex-null_coefs_percentiles.tsv


kidney_medulla 10000 4


  kidney_medulla-null_coefs_percentiles.pkl


  kidney_medulla-null_coefs_percentiles.tsv


liver 10000 226


  liver-null_coefs_percentiles.pkl


  liver-null_coefs_percentiles.tsv


lung 10000 578


  lung-null_coefs_percentiles.pkl


  lung-null_coefs_percentiles.tsv


minor_salivary_gland 10000 162


  minor_salivary_gland-null_coefs_percentiles.pkl


  minor_salivary_gland-null_coefs_percentiles.tsv


muscle_skeletal 10000 803


  muscle_skeletal-null_coefs_percentiles.pkl


  muscle_skeletal-null_coefs_percentiles.tsv


nerve_tibial 10000 619


  nerve_tibial-null_coefs_percentiles.pkl


  nerve_tibial-null_coefs_percentiles.tsv


ovary 10000 180


  ovary-null_coefs_percentiles.pkl


  ovary-null_coefs_percentiles.tsv


pancreas 10000 328


  pancreas-null_coefs_percentiles.pkl


  pancreas-null_coefs_percentiles.tsv


pituitary 10000 283


  pituitary-null_coefs_percentiles.pkl


  pituitary-null_coefs_percentiles.tsv


prostate 10000 245


  prostate-null_coefs_percentiles.pkl


  prostate-null_coefs_percentiles.tsv


skin_not_sun_exposed_suprapubic 10000 604


  skin_not_sun_exposed_suprapubic-null_coefs_percentiles.pkl


  skin_not_sun_exposed_suprapubic-null_coefs_percentiles.tsv


skin_sun_exposed_lower_leg 10000 701


  skin_sun_exposed_lower_leg-null_coefs_percentiles.pkl


  skin_sun_exposed_lower_leg-null_coefs_percentiles.tsv


small_intestine_terminal_ileum 10000 187


  small_intestine_terminal_ileum-null_coefs_percentiles.pkl


  small_intestine_terminal_ileum-null_coefs_percentiles.tsv


spleen 10000 241


  spleen-null_coefs_percentiles.pkl


  spleen-null_coefs_percentiles.tsv


stomach 10000 359


  stomach-null_coefs_percentiles.pkl


  stomach-null_coefs_percentiles.tsv


testis 10000 361


  testis-null_coefs_percentiles.pkl


  testis-null_coefs_percentiles.tsv


thyroid 10000 653


  thyroid-null_coefs_percentiles.pkl


  thyroid-null_coefs_percentiles.tsv


uterus 10000 142


  uterus-null_coefs_percentiles.pkl


  uterus-null_coefs_percentiles.tsv


vagina 10000 156


  vagina-null_coefs_percentiles.pkl


  vagina-null_coefs_percentiles.tsv


whole_blood 10000 755


  whole_blood-null_coefs_percentiles.pkl


  whole_blood-null_coefs_percentiles.tsv


# Testing

In [15]:
# Pick one of the files written by the compute loop to inspect it.
INPUT_FILE = OUTPUT_DIR.joinpath(
    OUTPUT_FILE_TEMPLATE.format(ext="pkl", tissue="adipose_subcutaneous")
)
display(INPUT_FILE)

PosixPath('/home/miltondp/projects/ccc/ccc-gpu/ccc-gpu/base/data/gtex/tissue_thresholds/adipose_subcutaneous-null_coefs_percentiles.pkl')

In [16]:
# Load the saved percentile table back from disk to check what was written.
results = pd.read_pickle(INPUT_FILE)

In [17]:
# One row per requested percentile and one column per coefficient; assert it
# so a broken output fails the notebook instead of relying on visual checks.
assert results.shape == (len(PERCENTILES), 3)
assert list(results.columns) == ["ccc", "pearson", "spearman"]
results.shape

(109, 3)

In [18]:
# Show the percentile table (the rendered output is truncated in the middle).
results

Unnamed: 0_level_0,ccc,pearson,spearman
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.00,0.0,8.835763e-10,0.000000
0.01,0.0,4.991361e-04,0.000533
0.02,0.0,9.996197e-04,0.001066
0.03,0.0,1.499664e-03,0.001573
0.04,0.0,1.964099e-03,0.002107
...,...,...,...
1.00,1.0,1.000000e+00,1.000000
1.00,1.0,1.000000e+00,1.000000
1.00,1.0,1.000000e+00,1.000000
1.00,1.0,1.000000e+00,1.000000


In [19]:
# Summary statistics computed over the percentile rows of each coefficient.
results.describe()

Unnamed: 0,ccc,pearson,spearman
count,109.0,109.0,109.0
mean,0.061268,0.09331235,0.088231
std,0.220498,0.2341272,0.218019
min,0.0,8.835763e-10,0.0
25%,3.6e-05,0.01179992,0.013128
50%,0.001374,0.02525586,0.028087
75%,0.004749,0.04663152,0.050299
max,1.0,1.0,1.0
