# Description

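This notebook computes, for every GTEx tissue, the values each correlation coefficient (CCC, Pearson and Spearman) is expected to take by chance. For each tissue it uses that tissue's own data (and therefore its sample size): a random subset of genes is selected and their expression values are shuffled to obtain random data.
The percentiles of the resulting null coefficient values are saved per tissue; these thresholds are used to define "high" and "low" coefficient values.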

# Modules

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import numpy.typing as npt
from scipy.spatial.distance import pdist

# Settings

In [None]:
# number of genes randomly drawn from each tissue
N_GENES = 10_000
# number of jobs handed to the CCC implementation
N_JOBS = 10
# quantile levels 0.00, 0.01, ..., 1.00 used to summarize the null coefficient values
PERCENTILES = np.linspace(0.0, 1.0, num=101)
# single seeded random state shared by the gene sampling and the shuffling steps
RNG = np.random.RandomState(seed=42)

In [None]:
# display the quantile levels that will be computed for each coefficient
PERCENTILES

In [None]:
# number of unordered pairs that can be formed with N_GENES genes
n_gene_pairs = N_GENES * (N_GENES - 1) / 2
print(f"Number of permutations is approximately: {n_gene_pairs}")

In [None]:
# FIXME: hardcoded; each name selects one "gtex_v8_data_<name>.pkl" input file
TISSUE_NAMES = [
    "adipose_subcutaneous",
    "adipose_visceral_omentum",
    "adrenal_gland",
    "artery_aorta",
    "artery_coronary",
    "artery_tibial",
    "bladder",
    "brain_amygdala",
    "brain_anterior_cingulate_cortex_ba24",
    "brain_caudate_basal_ganglia",
    "brain_cerebellar_hemisphere",
    "brain_cerebellum",
    "brain_cortex",
    "brain_frontal_cortex_ba9",
    "brain_hippocampus",
    "brain_hypothalamus",
    "brain_nucleus_accumbens_basal_ganglia",
    "brain_putamen_basal_ganglia",
    "brain_spinal_cord_cervical_c1",
    "brain_substantia_nigra",
    "breast_mammary_tissue",
    "cells_cultured_fibroblasts",
    "cells_ebvtransformed_lymphocytes",
    "cervix_ectocervix",
    "cervix_endocervix",
    "colon_sigmoid",
    "colon_transverse",
    "esophagus_gastroesophageal_junction",
    "esophagus_mucosa",
    "esophagus_muscularis",
    "fallopian_tube",
    "heart_atrial_appendage",
    "heart_left_ventricle",
    "kidney_cortex",
    "kidney_medulla",
    "liver",
    "lung",
    "minor_salivary_gland",
    "muscle_skeletal",
    "nerve_tibial",
    "ovary",
    "pancreas",
    "pituitary",
    "prostate",
    "skin_not_sun_exposed_suprapubic",
    "skin_sun_exposed_lower_leg",
    "small_intestine_terminal_ileum",
    "spleen",
    "stomach",
    "testis",
    "thyroid",
    "uterus",
    "vagina",
    "whole_blood",
]

In [None]:
# sanity check: the hardcoded list must contain exactly 54 tissue names
assert len(TISSUE_NAMES) == 54

# Paths

In [None]:
# root folder under which this notebook reads its inputs and writes its outputs
BASE_DIR = Path("/home/miltondp/projects/ccc/ccc-gpu/ccc-gpu/base/gtex")

# the per-tissue input pickles are read from here; stop early if the folder is missing
DATA_BY_TISSUE_PATH = BASE_DIR.joinpath("data_by_tissue")
assert DATA_BY_TISSUE_PATH.exists()

In [None]:
# folder receiving one percentile file per tissue; created (with parents) if absent
OUTPUT_DIR = BASE_DIR.joinpath("tissue_thresholds")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# file name pattern of each tissue's output; "{tissue}" is filled in with the tissue name
OUTPUT_FILE_TEMPLATE = "{tissue}-null_coefs_percentiles.pkl"

# Functions

In [None]:
def pearson(data: pd.DataFrame) -> npt.NDArray:
    """
    Pearson correlation coefficient between every pair of genes.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        1-D condensed array with one coefficient per pair of rows, in the
        order produced by scipy's pdist.
    """
    # pdist's "correlation" metric is a distance (1 - r), so convert it back to r
    correlation_distances = pdist(data.to_numpy(), metric="correlation")
    return 1 - correlation_distances


def spearman(data: pd.DataFrame) -> npt.NDArray:
    """
    Spearman correlation coefficient between every pair of genes.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        1-D condensed array with one coefficient per pair of rows, in the
        order produced by scipy's pdist.
    """
    # Spearman is Pearson on ranks: rank each gene (row) across its samples...
    ranked = data.rank(axis=1)

    # ...then turn the correlation distance (1 - r) of the ranks back into r
    rank_distances = pdist(ranked.to_numpy(), metric="correlation")
    return 1 - rank_distances


def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=None) -> npt.NDArray:
    """
    Compute the Clustermatch Correlation Coefficient (CCC) using the GPU
    implementation (ccc.coef.impl_gpu).

    Both optional arguments are forwarded to the implementation. Previously
    they were accepted but silently ignored: the body always passed
    internal_n_clusters=None and n_jobs=N_JOBS.

    Args:
        data: genes in rows, samples in columns.
        internal_n_clusters: forwarded unchanged to the CCC implementation.
            The default (None) is the value this wrapper used to hard-code.
        n_jobs: forwarded to the CCC implementation. When None (the default),
            the notebook-level N_JOBS setting is used, so calls that do not
            pass n_jobs behave exactly as before.

    Returns:
        The result of the GPU CCC implementation for the given matrix
        (presumably one value per pair of rows, like pearson/spearman —
        TODO confirm against the ccc package documentation).
    """
    # alternative import kept from the original for reference:
    # from ccc.coef import ccc
    # imported here under an alias so it does not shadow this wrapper's own name
    from ccc.coef.impl_gpu import ccc as ccc_gpu

    if n_jobs is None:
        n_jobs = N_JOBS

    return ccc_gpu(
        data.to_numpy(),
        internal_n_clusters=internal_n_clusters,
        n_jobs=n_jobs,
    )

# Compute

In [None]:
# For each tissue: draw a random set of genes, shuffle their values to break
# any real gene-gene relationship, compute the three coefficients on all gene
# pairs and save the percentiles of their absolute values.
for tissue_name in TISSUE_NAMES:
    # load the tissue data and keep a random set of N_GENES genes (rows)
    tissue_data_path = DATA_BY_TISSUE_PATH / f"gtex_v8_data_{tissue_name}.pkl"
    tissue_data = pd.read_pickle(tissue_data_path).sample(
        n=N_GENES, replace=False, axis=0, random_state=RNG
    )
    n_genes, n_samples = tissue_data.shape

    print(tissue_name, n_genes, n_samples)

    # permute each column (sample) independently across genes; converting to a
    # plain array drops the index so the shuffled order is kept when apply()
    # reassembles the columns
    data_shuffled = tissue_data.apply(
        lambda column: column.sample(
            frac=1, replace=False, random_state=RNG
        ).to_numpy(),
        axis=0,
    )

    # absolute null coefficient values, one column per coefficient
    tissue_null = pd.DataFrame(
        {
            coef_name: np.abs(coef_function(data_shuffled))
            for coef_name, coef_function in (
                ("ccc", ccc),
                ("pearson", pearson),
                ("spearman", spearman),
            )
        }
    )

    tissue_null_perc = tissue_null.quantile(q=PERCENTILES)

    # write the percentiles of this tissue to its own file
    output_filename = OUTPUT_FILE_TEMPLATE.format(tissue=tissue_name)
    print(f"  {output_filename}", flush=True)
    tissue_null_perc.to_pickle(OUTPUT_DIR.joinpath(output_filename))

# Testing

In [None]:
# pick the output of one tissue to inspect the saved results
INPUT_FILE = OUTPUT_DIR.joinpath(
    OUTPUT_FILE_TEMPLATE.format(tissue="adipose_subcutaneous")
)
display(INPUT_FILE)

In [None]:
# load the percentiles saved in INPUT_FILE
results = pd.read_pickle(INPUT_FILE)

In [None]:
# should be (len(PERCENTILES), 3): one row per quantile level, one column per coefficient
results.shape

In [None]:
# full table of null percentiles (rows: quantile levels; columns: ccc, pearson, spearman)
results

In [None]:
# summary statistics of the percentile table, per coefficient
results.describe()