In [37]:
import re
import pandas as pd

from ccc.coef import ccc
from pathlib import Path

## Metadata

In [19]:
# Manuscript directory; used further down to locate the gene id/symbol mapping file.
MANUSCRIPT_DIR = Path("/mnt/data/projs/manuscripts/ccc-gpu/")
# Fail fast if the hardcoded location is not available on this machine.
assert MANUSCRIPT_DIR.exists()

# Directory and pickle file holding the GTEx v8 sample metadata.
METADATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gtex_metadata")
METADATA_FILE = METADATA_DIR / "gtex_v8-sample_metadata.pkl"
assert METADATA_FILE.exists()

In [53]:
# Load the sample metadata from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
gtex_metadata = pd.read_pickle(METADATA_FILE)
gtex_metadata.head()

In [52]:
# Dimensions of the loaded metadata table.
gtex_metadata.shape

## Whole Blood Expression Data

In [7]:
# Directory containing the GTEx v8 gene expression pickle files; fail fast if it is missing.
EXPR_DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/gene_selection/all")
assert EXPR_DATA_DIR.exists()

In [49]:
# All pickle files available in the expression data directory.
# Path.glob returns a one-shot generator, so the matches are materialized into a
# sorted list: the constant can then be iterated more than once, inspected, and
# has a deterministic order.
EXPR_DATA_DIR_FILES = sorted(EXPR_DATA_DIR.glob("*.pkl"))

In [11]:
# Expression file for the whole blood tissue (per its file name); stop here if it is not on disk.
EXPR_FILE = EXPR_DATA_DIR / "gtex_v8_data_whole_blood-var_pc_log2.pkl"
assert EXPR_FILE.exists()

In [12]:
# Load the whole blood expression data from its pickle file.
expr_data = pd.read_pickle(EXPR_FILE)

In [13]:
# Preview the first rows of the expression data.
expr_data.head()

In [31]:
# Dimensions of the expression data.
expr_data.shape

In [18]:
# Get column names from expr file; these labels are used below as row labels
# to select the matching entries in gtex_metadata.
sample_ids = expr_data.columns
# Number of samples (one per expression column).
sample_ids.shape

In [17]:
# Select the rows of gtex_metadata whose index labels are the expression sample ids.
# Label-based selection with .loc raises a KeyError if an id is absent from the index.
gtex_metadata.loc[sample_ids]

## Gene Mapping

In [20]:
# Load the gene id <-> gene symbol mapping table stored under the manuscript directory.
gene_map = pd.read_pickle(MANUSCRIPT_DIR / "data" / "gtex_gene_id_symbol_mappings.pkl")

In [21]:
# Preview the first rows of the gene mapping table.
gene_map.head()

In [27]:
gene_symbol = "RASSF2"
# Look up the gene id(s) whose symbol matches gene_symbol.
matching_gene_ids = gene_map.loc[gene_map["gene_symbol"] == gene_symbol, "gene_ens_id"]
# Fail with an explicit message when the symbol is unknown, instead of the
# opaque IndexError that indexing an empty result with [0] would raise.
assert not matching_gene_ids.empty, f"Gene symbol {gene_symbol!r} not found in gene_map"
# If several ids share the symbol, the first match is used.
gene_id = matching_gene_ids.iloc[0]
gene_id

## Compute Correlation

In [40]:
# Select expression row for gene: label-based lookup of gene_id in the row index of expr_data.
# NOTE(review): presumably the result holds one value per sample (expr_data columns) - confirm.
gene_expr_row = expr_data.loc[gene_id]
gene_expr_row

In [44]:
# COHORT value of every expression sample, in the same order as sample_ids.
# Rows and column are selected in a single .loc call, which avoids building an
# intermediate frame with all metadata columns (chained indexing).
metadata_vector = gtex_metadata.loc[sample_ids, "COHORT"]
metadata_vector

In [50]:
# Compute ccc between the gene's expression values and the COHORT labels.
# NOTE(review): pvalue_n_perms presumably sets the number of permutations used
# for the p-value, and n_jobs the number of parallel workers - confirm against ccc's docs.
PVALUE_N_PERMS = 1_000_000

ccc_val, ccc_pval = ccc(
    gene_expr_row,
    metadata_vector,
    pvalue_n_perms=PVALUE_N_PERMS,
    n_jobs=24,
)

In [51]:
# Second value returned by ccc (named as the p-value).
ccc_pval

In [54]:
# First value returned by ccc (named as the coefficient value).
ccc_val