# Description

This notebook computes expected coefficient values for all tissues (given their sample sizes) using random data.
Percentiles of these expected values are used as thresholds to define "high" and "low" coefficient values.

# Modules

In [21]:
import pandas as pd
import numpy as np
import numpy.typing as npt
from scipy.spatial.distance import pdist

# Settings

In [30]:
# Seeded generator used when shuffling the data, so the shuffles are repeatable.
RNG = np.random.RandomState(seed=42)

# Quantile levels from 0 to 1 in steps of 0.05 (21 points).
PERCENTILES = np.linspace(start=0.0, stop=1.0, num=21)

# Number of parallel jobs intended for the CCC computation.
N_JOBS = 10

# Gene count used for the pair-count estimate printed below.
N_GENES = 150

In [31]:
# Display the quantile levels to confirm the grid.
PERCENTILES

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])

In [23]:
# N * (N - 1) / 2 is the exact count of unordered gene pairs among N_GENES genes,
# i.e. the length of one condensed pairwise-coefficient vector.
# NOTE(review): the placeholder data further down has 200 rows, not N_GENES -- confirm which is intended.
print(f"Number of permutations is approximately: {N_GENES * (N_GENES - 1) / 2}")

Number of permutations is approximately: 11175.0


# Functions

In [24]:
def pearson(data: pd.DataFrame) -> npt.NDArray:
    """
    Pairwise Pearson correlation between every pair of rows in `data`.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per unordered pair of
        rows, in the order produced by `scipy.spatial.distance.pdist`.
    """
    gene_matrix = data.to_numpy()

    # pdist's "correlation" metric is a distance (1 - r); turn it back into r
    correlation_distance = pdist(gene_matrix, metric="correlation")
    return 1 - correlation_distance


def spearman(data: pd.DataFrame) -> npt.NDArray:
    """
    Pairwise Spearman correlation between every pair of rows in `data`.

    Args:
        data: genes in rows, samples in columns.

    Returns:
        Condensed 1-D array with one coefficient per unordered pair of
        rows, in the order produced by `scipy.spatial.distance.pdist`.
    """
    # Spearman is Pearson on ranks: rank each gene's values across its samples
    ranked_matrix = data.rank(axis=1).to_numpy()

    # pdist's "correlation" metric is a distance (1 - r); turn it back into r
    rank_distance = pdist(ranked_matrix, metric="correlation")
    return 1 - rank_distance


def ccc(data: pd.DataFrame, internal_n_clusters=None, n_jobs=1) -> npt.NDArray:
    """
    Compute the Clustermatch Correlation Coefficient (CCC).

    Thin wrapper around `ccc.coef.ccc` that accepts a data frame and forwards
    the tuning arguments it receives.

    Args:
        data: genes in rows, samples in columns.
        internal_n_clusters: forwarded unchanged to `ccc.coef.ccc`.
        n_jobs: forwarded unchanged to `ccc.coef.ccc`. Callers that want
            parallelism must pass it explicitly.

    Returns:
        Whatever `ccc.coef.ccc` returns for the underlying matrix.
        NOTE(review): expected to be one value per pair of rows -- confirm
        against the ccc package documentation.
    """
    # Imported lazily under an alias so it does not shadow this wrapper's name.
    from ccc.coef import ccc as _ccc_impl

    return _ccc_impl(
        data.to_numpy(),
        internal_n_clusters=internal_n_clusters,
        n_jobs=n_jobs,
    )

# Compute

In [25]:
# FIXME: testing placeholder -- random data standing in for the tissue matrices.
# Values come from the seeded RNG (not the global numpy state) so that a fresh
# run of the notebook reproduces the same numbers.
# NOTE(review): 200 rows are generated although N_GENES is 150 -- confirm which is intended.
SOMETHING = [
    (tissue_name, pd.DataFrame(RNG.normal(size=(200, 1000))))
    for tissue_name in ("whole_blood", "brain")
]

In [26]:
# Inspect the placeholder (tissue name, data frame) pairs defined above.
SOMETHING

[('whole_blood',
            0         1         2         3         4         5         6    \
  0   -0.578649  0.082898  1.488034 -0.696137  1.956245  0.869817  1.285580   
  1    0.706191 -0.246629  1.712848  2.306778  0.887925 -0.231925 -0.012749   
  2   -1.072080 -0.265796 -0.994249 -0.814531 -2.327694  0.404744  0.999878   
  3   -1.116453 -0.155793 -0.895061  0.320258 -0.259040  0.244020  0.503191   
  4   -0.884433 -0.348042  0.384952  0.844420 -1.243051  0.154400  0.966728   
  ..        ...       ...       ...       ...       ...       ...       ...   
  195  0.924760  0.862272  0.174875  1.660168  0.303428 -1.567732  1.845656   
  196  0.218720 -0.668614  0.482700 -0.894147  0.490225 -0.694640 -0.414794   
  197 -0.597261  0.422734 -0.551724  1.753918  0.659534  0.005275 -0.748196   
  198 -0.321003  0.914852  0.589672 -0.671371  0.396613 -0.080316 -0.873837   
  199 -0.777234  0.873600 -0.556678  0.947655  1.200473  0.380407  0.652032   
  
            7         8         

In [32]:
# Null coefficient values for every tissue, keyed by tissue name. Without this
# only the last tissue's result would survive the loop.
tissue_nulls = {}

for tissue_name, tissue_data in SOMETHING:
    print(tissue_name)

    # Shuffle the values of each column independently (without replacement),
    # keeping the frame's shape.
    data_shuffled = tissue_data.apply(
        lambda sample_col: sample_col.sample(
            frac=1, replace=False, random_state=RNG
        ).to_numpy(),
        axis=0,
    )

    # Absolute coefficient values computed on the shuffled data.
    tissue_null = pd.DataFrame(
        {
            "ccc": np.abs(ccc(data_shuffled, n_jobs=N_JOBS)),
            "pearson": np.abs(pearson(data_shuffled)),
            "spearman": np.abs(spearman(data_shuffled)),
        }
    )
    tissue_nulls[tissue_name] = tissue_null

# `tissue_null` is still bound to the last tissue processed, which the cells
# below display.

whole_blood
brain


In [35]:
# Show the values computed in the loop above; after the loop `tissue_null`
# holds only the last tissue processed.
tissue_null

Unnamed: 0,pearson,spearman,ccc
0,0.031848,0.040816,0.002474
1,0.007902,0.011093,0.002299
2,0.028645,0.036300,0.002465
3,0.036494,0.046088,0.002049
4,0.012900,0.026650,0.005104
...,...,...,...
19895,0.052842,0.048576,0.000740
19896,0.002688,0.008368,0.001786
19897,0.001810,0.013608,0.001088
19898,0.048875,0.050628,0.005203


In [36]:
# Quantiles of each coefficient's absolute values at the levels in PERCENTILES
# (for the last tissue processed in the loop only).
tissue_null.quantile(PERCENTILES)

Unnamed: 0,pearson,spearman,ccc
0.0,7.099476e-07,1.080001e-07,0.0
0.05,0.00212645,0.001944657,0.0009
0.1,0.004090131,0.003922667,0.001233
0.15,0.006135762,0.005938403,0.001481
0.2,0.008135215,0.007979046,0.001687
0.25,0.01023442,0.0101137,0.001868
0.3,0.01231917,0.01228551,0.002036
0.35,0.01443696,0.01446614,0.00219
0.4,0.01666944,0.01672382,0.002359
0.45,0.0189395,0.01897765,0.002529
