# Description

According to the settings specified below, this notebook:
 1. reads two precomputed gene-by-gene similarity matrices for one data source (GTEx v8, whole blood) and one gene subset (`TOP_N_GENES`): one stored under the `ccc` name and one under the `ccc_gpu` name,
 2. shows the shape and the first rows of each matrix, and
 3. asserts that the two matrices are equal within a small absolute tolerance.

# Modules

In [14]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths

In [3]:
# Label of the gene subset to compare; it is used as the subdirectory name
# under "similarity_matrices/" when INPUT_DIR is built.
TOP_N_GENES = "top_5k"

In [4]:
# Root folder of the GTEx v8 gene expression data.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another host.
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")

# Folder with the similarity matrices for the selected gene subset.
INPUT_DIR = DATA_DIR.joinpath("similarity_matrices/", TOP_N_GENES)
display(INPUT_DIR)

# Stop early if the input folder is not there.
assert INPUT_DIR.exists()

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/top_5k')

# Data loading

In [7]:
# Load the two precomputed similarity matrices that are compared below: the
# one saved under the "ccc" name (cpu_res) and the one saved under the
# "ccc_gpu" name (gpu_res).
#
# The gene-subset suffix of each file name comes from TOP_N_GENES so that the
# files always match the folder INPUT_DIR points to; with TOP_N_GENES set to
# "top_5k" the names are the same as before.
# NOTE(review): assumes the files of other gene subsets follow the same
# naming pattern -- confirm against the pipeline that writes them.
#
# NOTE(review): unpickling can run arbitrary code; only load pickle files
# produced by a trusted pipeline run.
cpu_res = pd.read_pickle(
    INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc-{TOP_N_GENES}.pkl"
)
gpu_res = pd.read_pickle(
    INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc_gpu-{TOP_N_GENES}.pkl"
)

In [12]:
# Dimensions (rows, columns) of the matrix loaded into cpu_res.
cpu_res.shape

(5000, 5000)

In [10]:
# Preview the first rows of cpu_res.
cpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.212632,0.037166,0.012264,0.016958,0.053866,0.133457,0.081476,0.212632,0.112897,0.022719
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.237858,0.051243,0.042777,0.035816,0.094566,0.145388,0.075009,0.217565,0.099542,0.01966
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.153623,0.041971,0.008775,0.009087,0.053706,0.090792,0.021106,0.153623,0.047663,0.009211
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.222554,0.036365,0.023933,0.017412,0.044225,0.129593,0.065463,0.188812,0.147426,0.011364
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.048843,0.012149,0.018039,0.005696,0.021103,0.022719,0.003596,0.100699,0.018589,0.010845


In [9]:
# Dimensions (rows, columns) of the matrix loaded into gpu_res.
gpu_res.shape

(5000, 5000)

In [11]:
# Preview the first rows of gpu_res.
gpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000122033.14,ENSG00000145779.7,ENSG00000196396.9,ENSG00000216490.3,ENSG00000135521.8,ENSG00000198478.7,ENSG00000168137.15,ENSG00000182197.10,ENSG00000111641.11,ENSG00000168528.11
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.212632,0.037166,0.012264,0.016958,0.053866,0.133457,0.081476,0.212632,0.112897,0.022719
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.237858,0.051243,0.042777,0.035816,0.094566,0.145388,0.075009,0.217565,0.099542,0.01966
ENSG00000163631.16,0.401717,0.429092,1.0,0.408477,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.153623,0.041971,0.008775,0.009087,0.053706,0.090792,0.021106,0.153623,0.047663,0.009211
ENSG00000277632.1,0.789583,0.780181,0.408477,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.222554,0.036365,0.023933,0.017412,0.044225,0.129593,0.065463,0.188812,0.147426,0.011364
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.048843,0.012149,0.018039,0.005696,0.021103,0.022719,0.003596,0.100699,0.018589,0.010845


In [18]:
# Check that the two result matrices agree; an AssertionError is raised
# otherwise.
#
# gpu_res is cast to float64 before the comparison.
# NOTE(review): presumably this matches the dtype of cpu_res, because
# assert_frame_equal also compares dtypes by default -- confirm.
gpu_res = gpu_res.astype(np.float64)

# The absolute tolerance is loosened from the pandas default of 1e-8 to 1e-7.
# rtol is not passed, so the pandas default relative tolerance still applies
# on top of atol.
pd.testing.assert_frame_equal(
    left=cpu_res,
    right=gpu_res,
    atol=1e-7,
)