# Description

According to the settings specified below, this notebook:
 1. reads all the data from one source (GTEx, recount2, etc) according to the gene selection method (`GENE_SELECTION_STRATEGY`),
 2. runs a quick performance test using the correlation coefficient specified (`CORRELATION_METHOD`), and
 3. computes the correlation matrix across all the genes using the correlation coefficient specified.

# Modules

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths

In [2]:
TOP_N_GENES = "all"

In [None]:
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
INPUT_DIR = DATA_DIR / "similarity_matrices/" / TOP_N_GENES
display(INPUT_DIR)

assert INPUT_DIR.exists()

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/similarity_matrices/all')

# Data loading

In [4]:
cpu_res = pd.read_pickle(
    INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc-{TOP_N_GENES}.pkl"
)
gpu_res = pd.read_pickle(
    INPUT_DIR / f"gtex_v8_data_whole_blood-var_pc_log2-ccc_gpu-{TOP_N_GENES}.pkl"
)

In [5]:
cpu_res.shape

(56200, 56200)

In [None]:
cpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000199224.1,ENSG00000271798.1,ENSG00000229686.1,ENSG00000234273.1,ENSG00000271368.1,ENSG00000278424.1,ENSG00000199024.1,ENSG00000278281.1,ENSG00000232661.1,ENSG00000206780.1
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,,,,,,,,,,
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,,,,,,,,,,
ENSG00000163631.16,0.401717,0.429092,1.0,0.408476,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,,,,,,,,,,
ENSG00000277632.1,0.789583,0.780181,0.408476,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,,,,,,,,,,
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,,,,,,,,,,


In [7]:
gpu_res.shape

(56200, 56200)

In [8]:
gpu_res.head()

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16,ENSG00000277632.1,ENSG00000239839.6,ENSG00000186652.9,ENSG00000129824.15,ENSG00000152463.14,ENSG00000123689.5,ENSG00000012223.12,...,ENSG00000199224.1,ENSG00000271798.1,ENSG00000229686.1,ENSG00000234273.1,ENSG00000271368.1,ENSG00000278424.1,ENSG00000199024.1,ENSG00000278281.1,ENSG00000232661.1,ENSG00000206780.1
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,1.0,0.780181,0.401717,0.789583,0.139359,0.524274,0.048667,0.365544,0.799041,0.343411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000135245.9,0.780181,1.0,0.429092,0.780181,0.159947,0.516615,0.042631,0.334136,0.752314,0.355974,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000163631.16,0.401717,0.429092,1.0,0.408477,0.127681,0.401717,0.020316,0.2327,0.381776,0.261762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000277632.1,0.789583,0.780181,0.408477,1.0,0.151543,0.547588,0.055712,0.34653,0.761547,0.381776,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000239839.6,0.139359,0.159947,0.127681,0.151543,1.0,0.272752,0.008065,0.100699,0.135411,0.318958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
gpu_res["ENSG00000234741.7"]

gene_ens_id
ENSG00000169429.10    0.135411
ENSG00000135245.9     0.114192
ENSG00000163631.16    0.079873
ENSG00000277632.1     0.123901
ENSG00000239839.6     0.040029
                        ...   
ENSG00000278424.1     0.000000
ENSG00000199024.1     0.000000
ENSG00000278281.1     0.000000
ENSG00000232661.1     0.000000
ENSG00000206780.1     0.000000
Name: ENSG00000234741.7, Length: 56200, dtype: float32

In [10]:
cpu_res["ENSG00000234741.7"]

gene_ens_id
ENSG00000169429.10    0.135411
ENSG00000135245.9     0.114192
ENSG00000163631.16    0.079873
ENSG00000277632.1     0.123901
ENSG00000239839.6     0.040029
                        ...   
ENSG00000278424.1          NaN
ENSG00000199024.1          NaN
ENSG00000278281.1          NaN
ENSG00000232661.1          NaN
ENSG00000206780.1          NaN
Name: ENSG00000234741.7, Length: 56200, dtype: float64

In [11]:
# Assert the results are close
gpu_res = gpu_res.astype(np.float64)  # convert gpu_res to float64
pd.testing.assert_frame_equal(cpu_res, gpu_res, atol=1e-7)  # default atol is 1e-8

AssertionError: DataFrame.iloc[:, 0] (column name="ENSG00000169429.10") are different

DataFrame.iloc[:, 0] (column name="ENSG00000169429.10") values are different (5.16548 %)
[index]: [ENSG00000169429.10, ENSG00000135245.9, ENSG00000163631.16, ENSG00000277632.1, ENSG00000239839.6, ENSG00000186652.9, ENSG00000129824.15, ENSG00000152463.14, ENSG00000123689.5, ENSG00000012223.12, ENSG00000177606.6, ENSG00000158050.4, ENSG00000100448.3, ENSG00000196415.9, ENSG00000156575.2, ENSG00000163958.13, ENSG00000204388.6, ENSG00000172232.9, ENSG00000249138.1, ENSG00000197561.6, ENSG00000164821.4, ENSG00000115590.13, ENSG00000275302.1, ENSG00000203814.6, ENSG00000204936.9, ENSG00000005381.7, ENSG00000102837.6, ENSG00000090104.11, ENSG00000121053.5, ENSG00000168209.4, ENSG00000181631.6, ENSG00000220267.1, ENSG00000197582.5, ENSG00000147872.9, ENSG00000110245.11, ENSG00000240247.7, ENSG00000211896.7, ENSG00000158874.11, ENSG00000148346.11, ENSG00000146122.16, ENSG00000155659.14, ENSG00000164120.13, ENSG00000124469.11, ENSG00000183019.7, ENSG00000124102.4, ENSG00000109321.10, ENSG00000177575.12, ENSG00000128342.4, ENSG00000211893.3, ENSG00000110848.8, ENSG00000100450.12, ENSG00000123358.19, ENSG00000122861.15, ENSG00000180644.7, ENSG00000224114.1, ENSG00000276070.4, ENSG00000115523.16, ENSG00000143632.14, ENSG00000101916.11, ENSG00000100453.12, ENSG00000187554.11, ENSG00000243273.1, ENSG00000105205.6, ENSG00000274536.6, ENSG00000110203.8, ENSG00000104879.4, ENSG00000143878.9, ENSG00000134184.12, ENSG00000162551.13, ENSG00000132002.7, ENSG00000229598.1, ENSG00000118113.11, ENSG00000196565.13, ENSG00000123700.4, ENSG00000180739.13, ENSG00000225972.1, ENSG00000211895.4, ENSG00000121858.10, ENSG00000112715.21, ENSG00000277089.4, ENSG00000106565.17, ENSG00000012817.15, ENSG00000027869.11, ENSG00000120694.19, ENSG00000213058.3, ENSG00000169403.11, ENSG00000138061.11, ENSG00000149257.13, ENSG00000206047.2, ENSG00000211897.8, ENSG00000130208.9, ENSG00000211890.3, ENSG00000100385.13, ENSG00000225851.1, ENSG00000204933.3, ENSG00000106211.8, ENSG00000173597.8, ENSG00000270640.1, ENSG00000166523.7, ENSG00000183762.12, ...]
[left]:  [1.0, 0.7801812505004676, 0.4017169401327455, 0.7895827812136236, 0.13935942610597327, 0.5242741140481926, 0.0486672316250555, 0.3655442345188984, 0.799040524367516, 0.34341058597895696, 0.7801812505004676, 0.706992652661728, 0.28114125475325513, 0.28396592990025715, 0.49397560849129496, 0.20293569657879545, 0.7340167835457326, 0.3070692430429008, 0.7249525274769947, 0.25635156838852075, 0.19581076971546194, 0.2868046581574433, 0.6804744337443505, 0.706992652661728, 0.15782521288786724, 0.31895817425864076, 0.06823663546429484, 0.7615468263963645, 0.5475882238435975, 0.7708359322280478, 0.650175928187453, 0.0949449722650473, 0.03581144868948557, 0.7431372520552069, 0.33107195523732613, 0.1515434726355791, 0.21018710143378577, 0.3950136065749347, 0.2005466678475002, 0.28114125475325513, 0.1494776654385179, 0.30413214301442604, 0.2672286756710062, 0.43257757009700665, 0.024863255726547486, 0.5166151689978636, 0.17519485707540222, 0.7801812505004676, 0.16857584217869592, 0.6289557318094774, 0.4290923987713524, 0.7249525274769947, 0.6717474523200307, 0.5554720186568719, 0.052509081784462376, 0.6374016510301153, 0.43607679453284504, 0.01801234244919523, 0.5204376149679361, 0.5319892715392578, 0.5242741140481926, 0.41187733879584243, 0.15571724636025377, 0.608086863186104, 0.46820220441367427, 0.01889539885992161, 0.7431372520552069, 0.009305088958086634, 0.40508968657692707, 0.6717474523200307, 0.7615468263963645, 0.06546817275802916, 0.013218709093583327, 0.5515230946951426, 0.5319892715392578, 0.007401714868840781, 0.1515434726355791, 0.616392251304901, 0.7895827812136236, 0.7615468263963645, 0.013925811165296386, 0.044662053231224824, 0.6122325306904105, 0.5714082456056296, 0.021373799062681026, 0.667405041273147, 0.41872120345549413, 0.6892576276094067, 0.15994723252566478, 0.27833063271643715, 0.5554720186568719, 0.04542802741646708, 0.4865415132039114, 0.02799446364240703, 0.11833597327057493, 0.5875693223173329, 0.5281246662386331, 0.8085544799621448, 0.4754957685992169, 0.5754274351182793, ...]
[right]: [1.0, 0.7801812887191772, 0.401716947555542, 0.7895828485488892, 0.13935942947864532, 0.5242741703987122, 0.04866723343729973, 0.36554425954818726, 0.7990405559539795, 0.3434106111526489, 0.7801812887191772, 0.7069926857948303, 0.2811412811279297, 0.28396591544151306, 0.4939756393432617, 0.20293569564819336, 0.7340168356895447, 0.3070692718029022, 0.7249525785446167, 0.2563515603542328, 0.19581077992916107, 0.2868046760559082, 0.6804744601249695, 0.7069926857948303, 0.15782521665096283, 0.3189581632614136, 0.06823664158582687, 0.7615468502044678, 0.5475882291793823, 0.7708359360694885, 0.650175929069519, 0.09494496881961823, 0.03581145405769348, 0.7431373000144958, 0.33107197284698486, 0.15154346823692322, 0.21018710732460022, 0.39501360058784485, 0.20054668188095093, 0.2811412811279297, 0.14947767555713654, 0.3041321635246277, 0.2672286629676819, 0.43257761001586914, 0.02486325427889824, 0.5166151523590088, 0.1751948595046997, 0.7801812887191772, 0.16857583820819855, 0.6289557814598083, 0.4290923774242401, 0.7249525785446167, 0.6717475056648254, 0.5554720759391785, 0.05250908061861992, 0.6374017000198364, 0.436076819896698, 0.01801234483718872, 0.5204376578330994, 0.5319892764091492, 0.5242741703987122, 0.4118773639202118, 0.15571725368499756, 0.608086884021759, 0.46820223331451416, 0.01889539323747158, 0.7431373000144958, 0.00930508691817522, 0.40508967638015747, 0.6717475056648254, 0.7615468502044678, 0.06546817719936371, 0.01321870181709528, 0.5515231490135193, 0.5319892764091492, 0.007401714567095041, 0.15154346823692322, 0.6163922548294067, 0.7895828485488892, 0.7615468502044678, 0.013925805687904358, 0.04466204717755318, 0.6122325658798218, 0.5714082717895508, 0.021373802796006203, 0.6674050688743591, 0.41872119903564453, 0.6892576813697815, 0.1599472314119339, 0.27833065390586853, 0.5554720759391785, 0.04542802274227142, 0.4865415394306183, 0.027994466945528984, 0.11833597719669342, 0.5875693559646606, 0.5281246900558472, 0.8085545301437378, 0.4754957854747772, 0.5754274129867554, ...]
At positional index 53297, first diff: nan != 0.0