# Description

This notebook checks that the CPU (`ccc`) and GPU (`ccc_gpu`) implementations produce the same similarity matrix on the GTEx v8 whole blood data. According to the settings specified below, it:
 1. reads the two precomputed similarity matrices for the selected gene subset (`TOP_N_GENES`),
 2. shows the shape and the first rows of each matrix, and
 3. asserts that both matrices are equal within a small numerical tolerance.

# Modules

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths

In [6]:
# Tag of the gene subset to compare; it names the input subdirectory and is part
# of the result file names read below.
TOP_N_GENES = "top_10k"

In [7]:
# Absolute location of the GTEx v8 data used by this notebook.
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
# The similarity matrices are stored in one subdirectory per gene subset.
INPUT_DIR = DATA_DIR / "similarity_matrices/" / TOP_N_GENES
display(INPUT_DIR)

# Fail early and name the offending path if the input directory is missing.
# is_dir() (rather than exists()) also rejects a regular file with that name.
assert INPUT_DIR.is_dir(), f"Input directory not found: {INPUT_DIR}"

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/top_10k')

# Data loading

In [8]:
# NOTE: read_pickle can execute arbitrary code while loading; only point it at
# files produced by this project.

# Result file tagged "ccc" in its name, loaded as the CPU result.
cpu_res_path = INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc-{TOP_N_GENES}.pkl"
cpu_res = pd.read_pickle(cpu_res_path)

# Result file tagged "ccc_gpu" in its name, loaded as the GPU result.
gpu_res_path = INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc_gpu-{TOP_N_GENES}.pkl"
gpu_res = pd.read_pickle(gpu_res_path)

In [9]:
# (rows, columns) of cpu_res.
cpu_res.shape

(10000, 10000)

In [10]:
# Peek at the first rows of cpu_res; rows and columns are both labelled by gene_ens_id.
cpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000170325.14,ENSG00000058091.16,ENSG00000172766.18,ENSG00000173409.13,ENSG00000243607.3,ENSG00000161981.10,ENSG00000126247.10,ENSG00000261353.1,ENSG00000196456.11,ENSG00000018510.13
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.048844,0.075404,0.058951,0.145388,0.35915,0.0143,0.00938,0.253667,0.269983,0.040992
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.039712,0.084468,0.055451,0.170768,0.385064,0.043171,0.018437,0.31002,0.292524,0.052466
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.010468,0.061421,0.011191,0.122032,0.217565,0.012853,0.005208,0.162083,0.212632,0.029745
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.025929,0.072495,0.049304,0.141355,0.34653,0.034167,0.018151,0.25905,0.248341,0.038559
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.006586,0.024391,0.005483,0.037671,0.107585,0.029908,0.008124,0.064105,0.104114,0.014695


In [11]:
# (rows, columns) of gpu_res; expected to match cpu_res.shape for the comparison below.
gpu_res.shape

(10000, 10000)

In [12]:
# Peek at the first rows of gpu_res for a visual comparison with cpu_res.head().
gpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000170325.14,ENSG00000058091.16,ENSG00000172766.18,ENSG00000173409.13,ENSG00000243607.3,ENSG00000161981.10,ENSG00000126247.10,ENSG00000261353.1,ENSG00000196456.11,ENSG00000018510.13
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.048844,0.075404,0.058951,0.145388,0.35915,0.0143,0.00938,0.253667,0.269983,0.040992
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.039712,0.084468,0.055451,0.170768,0.385064,0.043171,0.018437,0.31002,0.292524,0.052466
ENSG00000163631.16,0.401717,0.429092,1.0,0.408477,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.010468,0.061421,0.011191,0.122032,0.217565,0.012853,0.005208,0.162083,0.212632,0.029745
ENSG00000277632.1,0.789583,0.780181,0.408477,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.025929,0.072495,0.049304,0.141355,0.34653,0.034167,0.018151,0.25905,0.248341,0.038559
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.006586,0.024391,0.005483,0.037671,0.107585,0.029908,0.008124,0.064105,0.104114,0.014695


In [13]:
# Assert the results are close
# assert_frame_equal also compares dtypes by default (check_dtype=True), so
# gpu_res is cast to float64 first -- presumably the dtype of cpu_res; verify
# that if the assertion reports a dtype mismatch.
gpu_res = gpu_res.astype(np.float64)

# Both tolerances are spelled out on purpose: rtol is kept at the pandas default
# (1e-5) and still applies alongside atol, so atol=1e-7 (pandas default: 1e-8)
# is not the only bound on the accepted difference. Pass rtol=0 for a purely
# absolute comparison.
pd.testing.assert_frame_equal(cpu_res, gpu_res, rtol=1e-5, atol=1e-7)