# Description
This notebook helps you download and preprocess the GTEx v8 data.

Please follow the instructions in the [README](../../README.md), section "Quick Install with pip" to install CCC-GPU with a conda environment `ccc-gpu-env`.

Then activate the environment and start the jupyter notebook server in order to run this notebook.

```bash
conda activate ccc-gpu-env
pip install notebook
jupyter notebook
```

In [22]:
import os
import re
import pandas as pd
import urllib.request
from tqdm import tqdm
from pathlib import Path

from ccc.utils import simplify_string
from ccc import conf

In [39]:
# Set this path to the directory where you want to save the intermediate data and results
ANALYSIS_DIR = Path("/mnt/data/proj_data/ccc-gpu/data/tutorial")

## Data Fetching and Preprocessing
This section downloads:
1. the public GTEx v8 gene TPMs data (https://www.gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression)
2. the GTEx sample attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)
3. the GTEx subject attributes file (https://www.gtexportal.org/home/downloads/adult-gtex/metadata)

and perform preprocessing to prepare the data for the analysis.

### Download GTEx v8 gene expression data and split by tissue

In [24]:
# Create analysis directory if it doesn't exist
os.makedirs(ANALYSIS_DIR, exist_ok=True)

# Define files to download
files_to_download = {
    "gtex_all_sample_ids_with_expr_data": "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
    "gtex_sample_attrs": "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    "gtex_subject_attrs": "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
}

# Dictionary to store file paths
file_paths = {}

# Download files
for var_name, url in files_to_download.items():
    filename = Path(url).name
    file_path = Path(ANALYSIS_DIR) / filename
    file_paths[var_name] = file_path
    
    if not file_path.exists():
        print(f"Downloading {var_name} to {file_path}")
        urllib.request.urlretrieve(url, file_path)
        print("Download completed!")
    else:
        print(f"{var_name} already exists at {file_path}")

gtex_all_sample_ids_with_expr_data already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
gtex_sample_attrs already exists at /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
Downloading gtex_subject_attrs to /mnt/data/proj_data/ccc-gpu/data/tutorial/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt
Download completed!


In [25]:
gtex_sample_attrs = pd.read_csv(file_paths["gtex_sample_attrs"], sep="\t")
print(f"GTEx sample attributes shape: {gtex_sample_attrs.shape}")
print(f"GTEx sample attributes columns: {gtex_sample_attrs.columns}")

GTEx sample attributes shape: (22951, 63)
GTEx sample attributes columns: Index(['SAMPID', 'SMATSSCR', 'SMCENTER', 'SMPTHNTS', 'SMRIN', 'SMTS', 'SMTSD',
       'SMUBRID', 'SMTSISCH', 'SMTSPAX', 'SMNABTCH', 'SMNABTCHT', 'SMNABTCHD',
       'SMGEBTCH', 'SMGEBTCHD', 'SMGEBTCHT', 'SMAFRZE', 'SMGTC', 'SME2MPRT',
       'SMCHMPRS', 'SMNTRART', 'SMNUMGPS', 'SMMAPRT', 'SMEXNCRT', 'SM550NRM',
       'SMGNSDTC', 'SMUNMPRT', 'SM350NRM', 'SMRDLGTH', 'SMMNCPB', 'SME1MMRT',
       'SMSFLGTH', 'SMESTLBS', 'SMMPPD', 'SMNTERRT', 'SMRRNANM', 'SMRDTTL',
       'SMVQCFL', 'SMMNCV', 'SMTRSCPT', 'SMMPPDPR', 'SMCGLGTH', 'SMGAPPCT',
       'SMUNPDRD', 'SMNTRNRT', 'SMMPUNRT', 'SMEXPEFF', 'SMMPPDUN', 'SME2MMRT',
       'SME2ANTI', 'SMALTALG', 'SME2SNSE', 'SMMFLGTH', 'SME1ANTI', 'SMSPLTRD',
       'SMBSMMRT', 'SME1SNSE', 'SME1PCTS', 'SMRRNART', 'SME1MPRT', 'SMNUM5CD',
       'SMDPMPRT', 'SME2PCTS'],
      dtype='object')


In [26]:
# Get tissue names
gtex_tissues = gtex_sample_attrs["SMTSD"].unique()
print(len(gtex_tissues))
print(gtex_tissues)

55
['Whole Blood' 'Brain - Frontal Cortex (BA9)' 'Adipose - Subcutaneous'
 'Muscle - Skeletal' 'Artery - Tibial' 'Artery - Coronary'
 'Heart - Atrial Appendage' 'Adipose - Visceral (Omentum)' 'Ovary'
 'Uterus' 'Vagina' 'Breast - Mammary Tissue'
 'Skin - Not Sun Exposed (Suprapubic)' 'Minor Salivary Gland'
 'Brain - Cortex' 'Adrenal Gland' 'Thyroid' 'Lung' 'Spleen' 'Pancreas'
 'Esophagus - Muscularis' 'Esophagus - Mucosa'
 'Esophagus - Gastroesophageal Junction' 'Stomach' 'Colon - Sigmoid'
 'Small Intestine - Terminal Ileum' 'Colon - Transverse' 'Prostate'
 'Testis' 'Skin - Sun Exposed (Lower leg)' 'Nerve - Tibial'
 'Heart - Left Ventricle' 'Pituitary' 'Brain - Cerebellum'
 'Cells - Cultured fibroblasts' 'Artery - Aorta'
 'Cells - EBV-transformed lymphocytes' 'Brain - Cerebellar Hemisphere'
 'Brain - Caudate (basal ganglia)'
 'Brain - Nucleus accumbens (basal ganglia)'
 'Brain - Putamen (basal ganglia)' 'Brain - Hypothalamus'
 'Brain - Spinal cord (cervical c-1)' 'Liver' 'Brain - Hippoc

#### Get sample IDs for each tissue

In [27]:
# first, get all sample IDs with expression data
gtex_all_sample_ids_with_expr_data = set(
    pd.read_csv(
        file_paths["gtex_all_sample_ids_with_expr_data"],
        sep="\t",
        skiprows=2,
        nrows=1,
        usecols=lambda x: x not in ("Name", "Description"),
    ).columns
)

print(f"Number of samples with expression data: {len(gtex_all_sample_ids_with_expr_data)}")
print(f"Sample IDs with expression data: {list(gtex_all_sample_ids_with_expr_data)[:10]}")

Number of samples with expression data: 17382
Sample IDs with expression data: ['GTEX-1HFI7-2426-SM-B2LXV', 'GTEX-11TTK-0226-SM-5N9EC', 'GTEX-11UD2-1226-SM-5EQMI', 'GTEX-X4EO-0006-SM-3P5ZF', 'GTEX-13O21-0326-SM-5J1N9', 'GTEX-XBED-1526-SM-4AT5W', 'GTEX-13NZ8-0011-R8b-SM-5KM48', 'GTEX-1H3O1-0005-SM-ACKV8', 'GTEX-13JVG-0011-R5a-SM-5MR4O', 'GTEX-1F88F-1126-SM-7MKHL']


In [28]:
# get sample IDs by tissue
sample_ids_by_tissue = {
    tissue_name: sorted(
        list(
            gtex_all_sample_ids_with_expr_data.intersection(
                set(
                    gtex_sample_attrs[gtex_sample_attrs["SMTSD"] == tissue_name][
                        "SAMPID"
                    ].tolist()
                )
            )
        )
    )
    for tissue_name in gtex_tissues
}

assert len(gtex_tissues) == len(sample_ids_by_tissue)

In [29]:
sample_ids_by_tissue["Whole Blood"][:10]

['GTEX-111YS-0006-SM-5NQBE',
 'GTEX-1122O-0005-SM-5O99J',
 'GTEX-1128S-0005-SM-5P9HI',
 'GTEX-113IC-0006-SM-5NQ9C',
 'GTEX-113JC-0006-SM-5O997',
 'GTEX-117XS-0005-SM-5PNU6',
 'GTEX-117YW-0005-SM-5NQ8Z',
 'GTEX-1192W-0005-SM-5NQBQ',
 'GTEX-1192X-0005-SM-5NQC3',
 'GTEX-11DXW-0006-SM-5NQ7Y']

In [30]:
# Ensure all IDs are unique
assert all(
    [
        len(sample_ids_by_tissue[tissue_name])
        == len(set(sample_ids_by_tissue[tissue_name]))
        for tissue_name in sample_ids_by_tissue.keys()
    ]
)

#### Show sample size by tissue

In [31]:
tissue_sample_size = pd.DataFrame(
    [{"tissue": k, "sample_size": len(v)} for k, v in sample_ids_by_tissue.items()]
)

tissue_sample_size = tissue_sample_size.sort_values("sample_size", ascending=False)
display(tissue_sample_size)

Unnamed: 0,tissue,sample_size
3,Muscle - Skeletal,803
0,Whole Blood,755
29,Skin - Sun Exposed (Lower leg),701
4,Artery - Tibial,663
2,Adipose - Subcutaneous,663
16,Thyroid,653
30,Nerve - Tibial,619
12,Skin - Not Sun Exposed (Suprapubic),604
17,Lung,578
21,Esophagus - Mucosa,555


In [32]:
# Simple validations
_tmp = tissue_sample_size.set_index("tissue").squeeze()
assert _tmp.loc["Muscle - Skeletal"] == 803
assert _tmp.loc["Whole Blood"] == 755
assert _tmp.loc["Skin - Not Sun Exposed (Suprapubic)"] == 604
assert _tmp.loc["Kidney - Medulla"] == 4

These numbers match those you can find here: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue

### Split expression data by tissue

In [33]:
TISSUE_DATA_DIR = ANALYSIS_DIR / "data_by_tissue"
TISSUE_DATA_DIR.mkdir(parents=True, exist_ok=True)

pbar = tqdm(tissue_sample_size["tissue"])

gene_id_symbol_map_tuples = set()

for tissue_name in pbar:
    pbar.set_description(tissue_name)

    tissue_ids = sample_ids_by_tissue[tissue_name]
    if len(tissue_ids) == 0:
        continue

    # Generate output filename
    tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))
    output_file = TISSUE_DATA_DIR / f"gtex_v8_data_{tissue_name_simple}.pkl"
    output_gene_mappings = ANALYSIS_DIR / "gtex_gene_id_symbol_mappings.pkl"
    
    # Skip if file already exists
    if output_file.exists() and output_gene_mappings.exists():
        print(f"Skipping {tissue_name} - file already exists")
        continue

    try:
        tissue_data = pd.read_csv(
            file_paths["gtex_all_sample_ids_with_expr_data"],
            sep="\t",
            skiprows=2,
            usecols=["Name", "Description"] + tissue_ids,
        )

        tissue_data = tissue_data.rename(
            columns={
                "Name": "gene_ens_id",
                "Description": "gene_symbol",
            }
        )

        # Validate data before processing
        if tissue_data.empty:
            print(f"Warning: No data found for {tissue_name}")
            continue

        # add gene id / gene symbol to mapping variable
        gene_id_symbol_map_tuples.update(
            tissue_data[["gene_ens_id", "gene_symbol"]].itertuples(index=False)
        )

        tissue_data = tissue_data.drop(columns=["gene_symbol"]).set_index("gene_ens_id")

        # Data quality checks
        assert not tissue_data.isna().any().any(), f"NaN values found in {tissue_name}"
        assert tissue_data.index.is_unique, f"Non-unique gene IDs in {tissue_name}"
        assert tissue_data.columns.is_unique, f"Non-unique sample IDs in {tissue_name}"

        # save
        tissue_data.to_pickle(path=output_file)
        
    except Exception as e:
        print(f"Error processing {tissue_name}: {str(e)}")
        continue

Cells - Leukemia cell line (CML): 100%|█████████████████████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 4357.51it/s]

Skipping Muscle - Skeletal - file already exists
Skipping Whole Blood - file already exists
Skipping Skin - Sun Exposed (Lower leg) - file already exists
Skipping Artery - Tibial - file already exists
Skipping Adipose - Subcutaneous - file already exists
Skipping Thyroid - file already exists
Skipping Nerve - Tibial - file already exists
Skipping Skin - Not Sun Exposed (Suprapubic) - file already exists
Skipping Lung - file already exists
Skipping Esophagus - Mucosa - file already exists
Skipping Adipose - Visceral (Omentum) - file already exists
Skipping Esophagus - Muscularis - file already exists
Skipping Cells - Cultured fibroblasts - file already exists
Skipping Breast - Mammary Tissue - file already exists
Skipping Heart - Left Ventricle - file already exists
Skipping Artery - Aorta - file already exists
Skipping Heart - Atrial Appendage - file already exists
Skipping Colon - Transverse - file already exists
Skipping Esophagus - Gastroesophageal Junction - file already exists
Ski




In [34]:
# Simple validations
_tmp = pd.read_pickle(TISSUE_DATA_DIR / "gtex_v8_data_brain_cerebellar_hemisphere.pkl")

assert "GTEX-11DXY-0011-R11a-SM-DNZZN" in _tmp.columns
assert "GTEX-WL46-0011-R11A-SM-3MJFT" in _tmp.columns
assert "GTEX-ZF28-0011-R11a-SM-4WWEI" in _tmp.columns

_v = _tmp.loc["ENSG00000223972.5", "GTEX-11DXY-0011-R11a-SM-DNZZN"]
assert _v == 0.04045, _v
_v = _tmp.loc["ENSG00000278267.1", "GTEX-11DXY-0011-R11a-SM-DNZZN"]
assert _v == 0.0, _v

_v = _tmp.loc["ENSG00000233327.10", "GTEX-WL46-0011-R11A-SM-3MJFT"]
assert _v == 146.4000, _v
_v = _tmp.loc["ENSG00000237118.2", "GTEX-WL46-0011-R11A-SM-3MJFT"]
assert _v == 0.3357, _v

_v = _tmp.loc["ENSG00000233327.10", "GTEX-ZF28-0011-R11a-SM-4WWEI"]
assert _v == 30.7200, _v
_v = _tmp.loc["ENSG00000186907.7", "GTEX-ZF28-0011-R11a-SM-4WWEI"]
assert _v == 0.94720, _v

### Save gene mappings

In [37]:
output_gene_mappings = ANALYSIS_DIR / "gtex_gene_id_symbol_mappings.pkl"

if output_gene_mappings.exists():
    gene_mappings = pd.read_pickle(output_gene_mappings)
    print(f"Loaded existing gene mappings from {output_gene_mappings}")
else:
    gene_mappings = pd.DataFrame(gene_id_symbol_map_tuples)
    gene_mappings.to_pickle(output_gene_mappings)
    print(f"Created and saved gene mappings to {output_gene_mappings}")

print(f"gene_mappings.shape: {gene_mappings.shape}")
print(gene_mappings.head())

Loaded existing gene mappings from /mnt/data/proj_data/ccc-gpu/data/tutorial/gtex_gene_id_symbol_mappings.pkl
gene_mappings.shape: (56200, 2)
          gene_ens_id  gene_symbol
0  ENSG00000144278.14      GALNT13
1   ENSG00000260976.1    LINC01633
2  ENSG00000186660.14        ZFP91
3  ENSG00000123560.13         PLP1
4   ENSG00000227371.1  RP11-3L10.2


In [43]:
# Simple validations
# no null
assert gene_mappings.dropna(how="any").shape == gene_mappings.shape
# no duplicates
assert gene_mappings.drop_duplicates().shape == gene_mappings.shape

_tmp = gene_mappings.set_index("gene_ens_id").squeeze()
assert _tmp.loc["ENSG00000223972.5"] == "DDX11L1"
assert _tmp.loc["ENSG00000243485.5"] == "MIR1302-2HG"
assert _tmp.loc["ENSG00000274059.1"] == "5S_rRNA"  # repeated gene
assert _tmp.loc["ENSG00000275305.1"] == "5S_rRNA"  # repeated gene