# Description

This notebook analyzes different strategies to select the genes with the highest variability from GTEx data. Variability is measured in different ways: variance (`var`), coefficient of variation (`cv`) and mean absolute variation (`mad`), each applied to two different versions of the data: 1) the raw TPM-normalized gene expression data (here referred to as `raw`), and 2) the log2-transformed version of the raw data (here referred to as `log2` and `pc_log2`).

# Modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from tqdm import tqdm
from pathlib import Path

# Settings

# Paths

In [None]:
# Root folder of the GTEx v8 gene expression data; the other paths in this
# notebook are derived from it.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the notebook elsewhere.
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")

# Folder that is scanned further down for the per-tissue `*.pkl` input files.
INPUT_DIR = DATA_DIR.joinpath("data_by_tissue")

display(INPUT_DIR)

In [None]:
# Folder that receives the output pickle files; it is created (together with
# any missing parents) if it does not exist yet, and reused otherwise.
OUTPUT_DIR = DATA_DIR.joinpath("gene_selection", "all")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(OUTPUT_DIR)

# Select top genes for each tissue data file

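Based on the previous findings, I select genes with two strategies: `var_raw` and `var_pc_log2`. Note that no top-N cutoff is applied here: all genes are kept, sorted by decreasing variance under each strategy.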

Then I save, for both, the raw data (note that I only use the strategies to select genes, not to log-transform the data).

In [None]:
# Collect every pickle file found directly inside INPUT_DIR, in sorted path order.
input_files = sorted(INPUT_DIR.glob("*.pkl"))

# Exactly 54 input files are expected; on a mismatch the assertion reports the
# number of files that were actually found.
n_input_files = len(input_files)
assert n_input_files == 54, n_input_files

display(input_files[:5])

## Run

In [None]:
def _sort_genes_by_variance(data, reference):
    """Reorder the rows of `data` by the decreasing row variance of `reference`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose rows are reordered and returned; its values are not
        transformed.
    reference : pd.DataFrame
        Frame used only to compute the ranking. Variance is taken along
        `axis=1`, i.e. one value per row (presumably one row per gene --
        confirm against the input files). It must share row labels with `data`.

    Returns
    -------
    pd.DataFrame
        All rows of `data`, ordered from the highest to the lowest variance
        observed in `reference`. No top-N cutoff is applied.
    """
    gene_order = reference.var(axis=1).sort_values(ascending=False).index
    return data.loc[gene_order]


pbar = tqdm(input_files, ncols=100)

for tissue_data_file in pbar:
    # show the file currently being processed in the progress bar
    pbar.set_description(tissue_data_file.stem)

    tissue_data = pd.read_pickle(tissue_data_file)

    # log2(x + 1) version of the data, used only for ranking
    # (`pc` in `var_pc_log2` presumably stands for "pseudocount")
    log2_tissue_data = np.log2(tissue_data + 1)

    # Each strategy only decides the row order; the values written to disk are
    # always the untransformed ones from `tissue_data`.
    for strategy, reference_data in (
        ("var_raw", tissue_data),
        ("var_pc_log2", log2_tissue_data),
    ):
        selected_tissue_data = _sort_genes_by_variance(tissue_data, reference_data)

        output_filename = f"{tissue_data_file.stem}-{strategy}.pkl"
        selected_tissue_data.to_pickle(path=OUTPUT_DIR / output_filename)

## Check results

In [9]:
# Sanity check: reload one of the output files (whole blood, `var_pc_log2`
# ordering), which is expected to have been written by the run loop above.
whole_blood_file = OUTPUT_DIR.joinpath("gtex_v8_data_whole_blood-var_pc_log2.pkl")
genes = pd.read_pickle(whole_blood_file)

In [None]:
# Dimensions of the reloaded frame as (rows, columns); rows are the axis that
# was ranked by variance in the run loop.
genes.shape