# Description

This notebook reads the GTEx v8 sample attributes and subject phenotypes, merges them into a single per-sample metadata table, and saves that table as a pickle file.

# Modules

In [3]:
import re
import pandas as pd

from ccc import conf
from pathlib import Path

# Settings

# Paths

In [None]:
# Root folder of the GTEx v8 expression data and its per-tissue subfolder;
# stop right away if the per-tissue folder is not there.
DATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gene_expr/data/gtex_v8")
TISSUE_DIR = DATA_DIR.joinpath("data_by_tissue")
assert TISSUE_DIR.exists()

In [None]:
# Folder with the raw GTEx metadata files; it must already exist.
RAW_METADATA_DIR = Path("/mnt/data/proj_data/ccc-gpu/gtex_metadata")
assert RAW_METADATA_DIR.exists()

# The output of this notebook is written into the same folder as the raw metadata.
OUTPUT_DIR = RAW_METADATA_DIR

In [16]:
# Files
# Sample attributes (one row per sample).
SAMPLE_ATTRS_FILE = RAW_METADATA_DIR.joinpath(
    "GTEx_Analysis_2017-06-05_v8_Annotations_SampleAttributesDS.txt"
)
assert SAMPLE_ATTRS_FILE.exists()

# Subject phenotypes.
SUBJ_ATTRS_FILE = RAW_METADATA_DIR.joinpath(
    "GTEx_Analysis_2017-06-05_v8_Annotations_SubjectPhenotypesDS.txt"
)
assert SUBJ_ATTRS_FILE.exists()

# Data

## GTEx samples info

In [None]:
# Load the tab-separated sample attributes, indexed by sample ID.
# `pd.read_table` uses a tab as its default separator.
gtex_samples = pd.read_table(SAMPLE_ATTRS_FILE, index_col="SAMPID")

In [None]:
# Report the table size and make sure no sample ID appears more than once.
display(gtex_samples.shape)
assert not gtex_samples.index.has_duplicates

In [None]:
# Preview the first rows of the sample attributes table.
gtex_samples.head()

## GTEx subject phenotypes

In [17]:
# Load the tab-separated subject phenotypes (default integer index is kept).
# `pd.read_table` uses a tab as its default separator.
gtex_phenotypes = pd.read_table(SUBJ_ATTRS_FILE)

In [None]:
# Size (rows, columns) of the subject phenotypes table.
gtex_phenotypes.shape

In [None]:
# Preview the first rows of the subject phenotypes table.
gtex_phenotypes.head()

## GTEx gene expression sample

In [None]:
# List the per-tissue expression files. `Path.glob` returns a lazy generator, so
# displaying it directly only shows the generator object; materialise it (sorted,
# for a stable order) so the cell actually shows the file names.
sorted(TISSUE_DIR.glob("*.pkl"))

In [None]:
# Preview one per-tissue expression file. The glob order depends on the
# filesystem, so take the alphabetically first file to keep the preview
# reproducible, and fail with a clear message if the folder has no pickle files.
_tissue_files = sorted(TISSUE_DIR.glob("*.pkl"))
assert _tissue_files, f"No .pkl files found in {TISSUE_DIR}"
pd.read_pickle(_tissue_files[0]).head()

# Get GTEx sample metadata

In [None]:
# Collect every sample ID from the table index as a plain list and show a few.
gtex_samples_ids = gtex_samples.index.tolist()
display(gtex_samples_ids[0:5])

In [26]:
# Turn the sample IDs into a Series named "SAMPID" (the name becomes the column
# name when the Series is later placed in a DataFrame).
gtex_samples_ids = pd.Series(gtex_samples_ids, name="SAMPID")

In [None]:
# Show the sample IDs.
gtex_samples_ids

In [28]:
# Derive the subject ID from each sample ID: it is the first two dash-separated
# tokens at the start of the sample ID.
# - The pattern is anchored with `^` so a malformed ID yields NaN instead of
#   silently matching some later part of the string.
# - `\w` already covers digits and both letter cases, so no extra character
#   class or case-insensitive flag is needed.
# - With a single capture group, `expand=False` returns a Series directly.
gtex_subjects_ids = gtex_samples_ids.str.extract(
    r"^(\w+-\w+)", expand=False
).rename("SUBJID")

In [None]:
# Show the subject IDs extracted from the sample IDs.
gtex_subjects_ids

In [30]:
# Put sample IDs and subject IDs side by side; each Series name becomes a column name.
gtex_metadata = pd.concat(
    (gtex_samples_ids, gtex_subjects_ids),
    axis="columns",
)

In [None]:
# Show the sample ID / subject ID table.
gtex_metadata

In [None]:
# Show the subject phenotypes table.
gtex_phenotypes

In [33]:
# Attach the subject phenotypes to every sample, then index the result by sample ID.
# - The join key is spelled out instead of relying on "all shared column names",
#   so an unexpected extra common column cannot silently change the join.
# - `how="inner"` (the pandas default, made explicit) drops samples whose
#   subject has no row in the phenotypes table.
# - `validate="many_to_one"` raises a MergeError if a subject appears more than
#   once in the phenotypes table, which would otherwise duplicate sample rows.
gtex_metadata = gtex_metadata.merge(
    gtex_phenotypes,
    on="SUBJID",
    how="inner",
    validate="many_to_one",
).set_index("SAMPID")

In [None]:
# Show the table after adding the subject phenotypes (now indexed by SAMPID).
gtex_metadata

In [35]:
# Add the sample attributes by matching on the sample ID index of both tables.
gtex_metadata = gtex_metadata.merge(
    gtex_samples,
    left_index=True,
    right_index=True,
)

In [36]:
# Recode the numeric SEX values as readable labels; only the SEX column is affected.
_sex_labels = {1: "Male", 2: "Female"}
gtex_metadata = gtex_metadata.replace({"SEX": _sex_labels})

In [37]:
# Order the rows by sample ID (the index), ascending.
gtex_metadata = gtex_metadata.sort_index(axis=0, ascending=True)

In [None]:
# Preview the first rows of the final metadata table.
gtex_metadata.head()

# Testing

In [None]:
# Size (rows, columns) of the final metadata table.
gtex_metadata.shape

In [40]:
# Every sample must have a subject ID.
assert gtex_metadata["SUBJID"].notna().all()

In [41]:
# Neither the SMTS nor the SMTSD column may contain missing values.
assert gtex_metadata[["SMTS", "SMTSD"]].notna().all().all()

In [42]:
# SEX must be fully populated and contain exactly the two recoded labels.
_sex_column = gtex_metadata["SEX"]
assert _sex_column.notna().all()
assert len(_sex_column.unique()) == 2
assert set(_sex_column.unique()) == {"Female", "Male"}

# Save

In [None]:
# Destination of the merged metadata table; shown so the location is visible in the notebook.
output_filename = OUTPUT_DIR.joinpath("gtex_v8-sample_metadata.pkl")
display(output_filename)

In [44]:
# Write the merged metadata table to disk as a pickle file.
pd.to_pickle(gtex_metadata, output_filename)