# Description

According to the settings specified below, this notebook:
 1. reads the two gene-by-gene similarity matrices previously computed on the GTEx v8 whole blood data, one stored under the `ccc` name (loaded as `cpu_res`) and one under the `ccc_gpu` name (loaded as `gpu_res`), for the gene set selected by `TOP_N_GENES`,
 2. inspects their shapes and a few values side by side, and
 3. asserts that both matrices are equal within a small numerical tolerance.

# Modules

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths

In [2]:
# Gene-subset label: it names the input subdirectory and is the suffix of the
# pickle file names loaded below. "all" presumably means the full gene set.
TOP_N_GENES = "all"

In [3]:
# Root of the GTEx v8 data for this project; the similarity matrices compared
# in this notebook are read from a subdirectory named after TOP_N_GENES.
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
INPUT_DIR = DATA_DIR / "similarity_matrices/" / TOP_N_GENES
display(INPUT_DIR)

# Fail early, naming the offending path, if the input directory is missing.
# is_dir() (rather than exists()) also rejects a regular file at that path.
assert INPUT_DIR.is_dir(), f"Input directory not found: {INPUT_DIR}"

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/all')

# Data loading

In [4]:
# Locate the two pickled similarity matrices for the same input data: the
# "ccc" file and the "ccc_gpu" file (presumably the CPU and GPU runs).
cpu_res_path = INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc-{TOP_N_GENES}.pkl"
gpu_res_path = INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc_gpu-{TOP_N_GENES}.pkl"

# NOTE: read_pickle can execute arbitrary code while unpickling, so only load
# files produced by this project's own pipeline.
cpu_res = pd.read_pickle(cpu_res_path)
gpu_res = pd.read_pickle(gpu_res_path)

In [5]:
# Dimensions of the CPU result; should match the GPU result shown further below.
cpu_res.shape

(56200, 56200)

In [6]:
# Preview the first rows of the CPU result (genes on both axes).
cpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000199224.1,ENSG00000271798.1,ENSG00000229686.1,ENSG00000234273.1,ENSG00000271368.1,ENSG00000278424.1,ENSG00000199024.1,ENSG00000278281.1,ENSG00000232661.1,ENSG00000206780.1
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,,,,,,,,,,
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,,,,,,,,,,
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,,,,,,,,,,
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,,,,,,,,,,
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,,,,,,,,,,


In [7]:
# Dimensions of the GPU result; should match the CPU result shown above.
gpu_res.shape

(56200, 56200)

In [8]:
# Preview the first rows of the GPU result for a visual comparison with the CPU one.
gpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000199224.1,ENSG00000271798.1,ENSG00000229686.1,ENSG00000234273.1,ENSG00000271368.1,ENSG00000278424.1,ENSG00000199024.1,ENSG00000278281.1,ENSG00000232661.1,ENSG00000206780.1
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,,,,,,,,,,
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,,,,,,,,,,
ENSG00000163631.16,0.401717,0.429092,1.0,0.408477,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,,,,,,,,,,
ENSG00000277632.1,0.789583,0.780181,0.408477,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,,,,,,,,,,
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,,,,,,,,,,


In [9]:
# Spot-check one gene's column in the GPU result (float32 as loaded).
gpu_res["ENSG00000234741.7"]

gene_ens_id
ENSG00000169429.10    0.135411
ENSG00000135245.9     0.114192
ENSG00000163631.16    0.079873
ENSG00000277632.1     0.123901
ENSG00000239839.6     0.040029
                        ...   
ENSG00000278424.1          NaN
ENSG00000199024.1          NaN
ENSG00000278281.1          NaN
ENSG00000232661.1          NaN
ENSG00000206780.1          NaN
Name: ENSG00000234741.7, Length: 56200, dtype: float32

In [10]:
# Same gene's column in the CPU result (float64), for a side-by-side comparison.
cpu_res["ENSG00000234741.7"]

gene_ens_id
ENSG00000169429.10    0.135411
ENSG00000135245.9     0.114192
ENSG00000163631.16    0.079873
ENSG00000277632.1     0.123901
ENSG00000239839.6     0.040029
                        ...   
ENSG00000278424.1          NaN
ENSG00000199024.1          NaN
ENSG00000278281.1          NaN
ENSG00000232661.1          NaN
ENSG00000206780.1          NaN
Name: ENSG00000234741.7, Length: 56200, dtype: float64

In [11]:
# Verify that the GPU result matches the CPU reference within tolerance.
# The GPU matrix is loaded as float32 and the CPU one as float64, so upcast the
# GPU matrix first; assert_frame_equal checks dtypes by default.
gpu_res = gpu_res.astype(np.float64)
pd.testing.assert_frame_equal(
    left=cpu_res,
    right=gpu_res,
    atol=1e-7,  # slightly looser than the pandas default of 1e-8
)