# Description

This notebook analyzes different strategies for selecting the genes with the highest variability from GTEx data. Variability is measured with different statistics: variance (`var`), coefficient of variation (`cv`) and mean absolute deviation (`mad`), each applied to two different versions of the data: 1) the raw TPM-normalized gene expression data (here referred to as `raw`), and 2) the log2-transformed version of the raw data (here referred to as `log2` and `pc_log2`).

# Modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from tqdm import tqdm
from pathlib import Path

# Settings

# Paths

In [2]:
# Root folder holding the GTEx v8 gene expression data
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")

# Folder the per-tissue input files are read from
INPUT_DIR = DATA_DIR.joinpath("data_by_tissue")
display(INPUT_DIR)

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue')

In [3]:
# Destination folder for the files written by this notebook; it is created
# (together with any missing parent folders) if it does not exist yet
OUTPUT_DIR = DATA_DIR.joinpath("gene_selection", "all")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
display(OUTPUT_DIR)

PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/gene_selection/all')

# Select top genes for each tissue data file

Based on the previous findings, I select genes with both strategies `var_raw` and `var_pc_log2`.

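Then I save, for both, the raw data (note that I only use the strategies to select genes, not to log-transform the data). Also note that no top-N cutoff is applied here: all genes are kept in each saved file, sorted by decreasing variance under the corresponding strategy.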

In [7]:
# Number of per-tissue pickle files expected in INPUT_DIR
N_EXPECTED_INPUT_FILES = 54

# Path.glob yields paths in arbitrary order; sorting gives a deterministic
# processing order (sorted() accepts the generator directly)
input_files = sorted(INPUT_DIR.glob("*.pkl"))
assert len(input_files) == N_EXPECTED_INPUT_FILES, (
    f"Expected {N_EXPECTED_INPUT_FILES} .pkl files in {INPUT_DIR}, "
    f"found {len(input_files)}"
)

display(input_files[:5])

[PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue/gtex_v8_data_adipose_subcutaneous.pkl'),
 PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue/gtex_v8_data_adipose_visceral_omentum.pkl'),
 PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue/gtex_v8_data_adrenal_gland.pkl'),
 PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue/gtex_v8_data_artery_aorta.pkl'),
 PosixPath('/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8/data_by_tissue/gtex_v8_data_artery_coronary.pkl')]

## Run

In [8]:
def _rank_rows_by_variance(data):
    """Return the row labels of `data` ordered by decreasing row variance.

    The variance is computed across columns (``axis=1``), giving one value
    per row; rows are then sorted from the most to the least variable.
    """
    return data.var(axis=1).sort_values(ascending=False).index


pbar = tqdm(input_files, ncols=100)

for tissue_data_file in pbar:
    pbar.set_description(tissue_data_file.stem)

    tissue_data = pd.read_pickle(tissue_data_file)

    # Each strategy maps to the version of the data used to RANK the genes:
    #   var_raw     -> variance of the raw values
    #   var_pc_log2 -> variance of log2(x + 1) (pseudocount of 1)
    ranking_inputs = {
        "var_raw": tissue_data,
        "var_pc_log2": np.log2(tissue_data + 1),
    }

    for strategy, ranking_data in ranking_inputs.items():
        gene_order = _rank_rows_by_variance(ranking_data)

        # The strategy only decides the row order: the values saved are always
        # the raw (non log-transformed) data, and every row is kept (no top-N
        # cutoff is applied).
        tissue_data.loc[gene_order].to_pickle(
            path=OUTPUT_DIR / f"{tissue_data_file.stem}-{strategy}.pkl"
        )

gtex_v8_data_whole_blood: 100%|█████████████████████████████████████| 54/54 [00:40<00:00,  1.33it/s]


## Check results

In [9]:
# Reload one of the files written by the loop above (whole blood, genes
# ordered with the var_pc_log2 strategy) to inspect its contents
whole_blood_file = OUTPUT_DIR / "gtex_v8_data_whole_blood-var_pc_log2.pkl"
genes = pd.read_pickle(whole_blood_file)

In [10]:
# (number of rows, number of columns) of the reloaded frame; the preview of
# this frame shows rows indexed by `gene_ens_id` and one column per GTEx sample ID
genes.shape

(56200, 755)

In [11]:
# Preview the first rows: the values are the raw (non log-transformed) data,
# with rows ordered by decreasing variance of log2(x + 1) when the file was written
genes.head()

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,0.5623,0.8067,116.9,4.047,211.0,58.11,68.38,249.5,5.095,295.9,...,39.96,0.1393,0.2238,245.0,513.6,1626.0,0.5633,515.7,1.194,1163.0
ENSG00000135245.9,0.6529,1.385,199.2,2.266,116.7,192.3,161.5,263.5,23.54,251.9,...,114.3,1.833,0.4115,149.0,935.3,233.6,0.8882,134.0,1.12,295.7
ENSG00000163631.16,1.848,0.2503,0.08429,1.251,1348.0,9.971,101.3,95.09,1.264,119.3,...,2.092,2.11,0.03588,171.8,107.1,71.25,1.772,309.6,0.07361,17.75
ENSG00000277632.1,1.696,1.345,235.1,11.77,141.7,199.1,525.5,659.9,10.91,209.3,...,61.34,2.25,0.7231,261.2,400.0,288.5,2.696,287.5,3.323,618.9
ENSG00000239839.6,185.2,1.779,694.3,23.84,297.3,3122.0,2521.0,1504.0,80.06,652.0,...,1010.0,253.8,94.52,6083.0,2768.0,52.06,34.57,17.36,352.3,63.85
