# GTEx model building with NMF

## Libraries

In [6]:
import csv
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from pyprojroot.here import here
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score
import cupy as cp
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF


## Input

In [7]:
# Gzip-compressed GCT file holding the GTEx v8 gene-level TPM matrix.
gct_path = here('output/gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz')

# `header=2` takes the column names from the third line of the file and
# ignores the two lines that precede it.
gct_read_options = {
    'sep': '\t',
    'compression': 'gzip',
    'header': 2,
    'low_memory': False,
}
gtex_tpm = pd.read_csv(gct_path, **gct_read_options)

# Report the (rows, columns) of the raw table, then preview its first rows.
print(gtex_tpm.shape)

gtex_tpm.head()

(56200, 17384)


Unnamed: 0,Name,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENSG00000223972.5,DDX11L1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03629,0.0,0.0,0.0,0.0,0.0,0.0,0.01965,0.02522
1,ENSG00000227232.5,WASH7P,8.764,3.861,7.349,11.07,3.306,5.389,11.99,16.95,...,1.606,2.268,5.386,2.31,2.456,4.023,1.922,2.857,0.8696,2.167
2,ENSG00000278267.1,MIR6859-1,0.0,0.0,1.004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSG00000243485.5,MIR1302-2HG,0.07187,0.0,0.0,0.06761,0.0,0.0,0.0,0.0,...,0.0,0.0,0.06073,0.0,0.08464,0.1435,0.0,0.05216,0.0,0.0
4,ENSG00000237613.2,FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03904,...,0.02429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# aggregate by Description
aggregated_gtex = gtex_tpm.groupby('Description', as_index=False).sum(numeric_only=True)

# samples are all columns except Description
samples = [c for c in aggregated_gtex.columns if c != 'Description']

# create a pandas DataFrame (genes as index, samples as columns) with float dtype
gtex_data = aggregated_gtex.set_index('Description')[samples].astype(float)

# set index name to 'Gene'
gtex_data.index.name = 'Gene'

genes = gtex_data.index.tolist()

print("aggregated_gtex.shape:", aggregated_gtex.shape)
print("data_mat.shape:", gtex_data.shape)
print("n_genes:", len(genes), "n_samples:", len(samples))

gtex_data.head()

aggregated_gtex.shape: (54592, 17383)
data_mat.shape: (54592, 17382)
n_genes: 54592 n_samples: 17382


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5S_rRNA,0.9801,0.0,0.5735,0.0,0.6586,0.4858,1.26,0.0,1.1293,0.0,...,0.5126,0.247,0.8914,0.0,1.7536,2.225,1.3008,0.3828,0.0,0.0
5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK,0.0,0.0,0.229,0.0,0.0,0.0,0.5369,0.3194,0.0,0.1821,...,0.1185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1BG,5.445,0.662,10.15,11.18,5.513,5.445,13.84,4.0,5.214,4.788,...,4.99,2.005,2.741,1.123,1.464,2.599,0.8323,7.989,1.305,2.455
A1BG-AS1,1.632,0.2,0.919,5.934,1.377,2.664,7.767,1.99,2.875,1.73,...,1.805,1.467,2.87,0.8808,0.8052,2.025,0.7385,3.425,0.1685,0.989


In [9]:
# Tab-separated GTEx v8 sample attribute table.
path = here('data/gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt')

# Read every field verbatim as text: quote characters are treated as ordinary
# data, nothing is interpreted as a comment, empty fields are not converted to
# NaN, and malformed lines produce a warning instead of aborting the read.
metadata_read_options = {
    'sep': '\t',
    'header': 0,
    'dtype': str,
    'quoting': csv.QUOTE_NONE,
    'engine': 'python',
    'comment': None,
    'keep_default_na': False,
    'on_bad_lines': 'warn',
}
gtex_meta = pd.read_csv(path, **metadata_read_options)

print(gtex_meta.shape)
gtex_meta.head()

(22951, 63)


Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188,,...,,,,,,,,,,
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193,,...,,,,,,,,,,
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193,,...,,,,,,,,,,


In [10]:
# Align the expression matrix and the sample metadata on their shared sample IDs.
#
# Produces:
#   gtex_data_common - expression matrix restricted to samples that have metadata
#   meta_common      - metadata rows for exactly those samples, in the same order

# Work on a copy so the raw metadata table stays untouched, indexed by sample ID.
meta = gtex_meta.copy()
if meta.index.name != "SAMPID":
    meta = meta.set_index("SAMPID")

# Normalise the labels on the metadata itself, not only on the lookup keys:
# the IDs matched below must exist verbatim in the frame they later index.
meta.index = pd.Index(meta.index.astype(str).str.strip(), name="SAMPID")

sample_ids_expr = pd.Index(gtex_data.columns.astype(str).str.strip(), name="SAMPID")
sample_ids_meta = meta.index
common_ids = sample_ids_expr.intersection(sample_ids_meta)

if len(common_ids) == 0:
    raise ValueError("No overlapping sample IDs between expression and metadata.")

# Select expression columns with a positional mask so a label that only matches
# after stripping is still found, then give the result the normalised labels.
has_metadata = sample_ids_expr.isin(common_ids)
gtex_data_common = gtex_data.loc[:, has_metadata]
gtex_data_common.columns = sample_ids_expr[has_metadata]

meta_common = meta.loc[gtex_data_common.columns]

# Every expression column must pair with exactly one metadata row, in order;
# duplicated sample IDs in either table would break this.
assert meta_common.index.equals(gtex_data_common.columns), \
    "Expression columns and metadata rows are not aligned one-to-one."

In [11]:
# Standardise each gene across the samples that have metadata.

# Transpose to samples-as-rows / genes-as-columns before scaling, so that
# StandardScaler standardises each gene (column) across samples.
gtex_data_t = gtex_data_common.T

# Take the labels from the matrix that is actually scaled. The unfiltered
# `gtex_data` can contain samples that were dropped during alignment, and
# using its columns as row labels downstream would then fail on a length
# mismatch.
samples = gtex_data_t.index
genes = gtex_data_t.columns

scaler = StandardScaler()

gtex_data_scaled = pd.DataFrame(
    scaler.fit_transform(gtex_data_t),
    index=samples,
    columns=genes,
)

# PCA

In [12]:
# Principal component analysis of the standardised samples-by-genes matrix.
#
# Produces:
#   gtex_pca_scores   - samples x PCs
#   gtex_pca_loadings - PCs x genes
#   gtex_pca_B        - PCs x samples (transpose of the scores)

# Never request more components than the matrix can support.
# NOTE(review): the target of 412 components is not explained in this notebook.
n_components = min(412, *gtex_data_scaled.shape)

pca = PCA(n_components=n_components, svd_solver="randomized", random_state=0)

W = pca.fit_transform(gtex_data_scaled)  # sample scores
H = pca.components_                      # gene loadings, one row per component

pc_names = [f"PC{i+1}" for i in range(W.shape[1])]

# Label the results with the axes of the matrix that was fitted, so the labels
# cannot drift from the data if `samples` / `genes` are redefined elsewhere.
gtex_pca_scores = pd.DataFrame(W, index=gtex_data_scaled.index, columns=pc_names)
gtex_pca_loadings = pd.DataFrame(H, index=pc_names, columns=gtex_data_scaled.columns)

gtex_pca_B = gtex_pca_scores.T
gtex_pca_B.index.name = "PC"

# NMF  

In [None]:
# Non-negative matrix factorisation of the samples-by-genes matrix.
#
# Produces:
#   gtex_nmf_scores   - samples x LVs
#   gtex_nmf_loadings - LVs x genes
#   gtex_nmf_B        - LVs x samples (transpose of the scores)

# `X` was never defined in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): the standardised matrix is assumed here because the shift below
# only matters for data with negative values and it mirrors the PCA input;
# switch to `gtex_data_common.T.to_numpy()` if raw TPM was intended.
X = gtex_data_scaled.to_numpy()

# NMF requires non-negative input: shift the whole matrix by its global minimum
# (plus a small epsilon) when any entry is negative. The minimum is computed
# once because a full scan of the matrix is expensive.
x_min = X.min()
if x_min < 0:
    X_nonneg = X - x_min + 1e-9
else:
    X_nonneg = X

n_components = min(412, X_nonneg.shape[0], X_nonneg.shape[1])

nmf = NMF(
    n_components=n_components,
    init="nndsvd",
    random_state=0,
    max_iter=1000,
    tol=1e-4
)

W = nmf.fit_transform(X_nonneg)  # sample scores
H = nmf.components_              # gene loadings, one row per component

comp_names = [f"LV{i+1}" for i in range(W.shape[1])]

# Label the results with the axes of the matrix that was factorised.
gtex_nmf_scores = pd.DataFrame(W, index=gtex_data_scaled.index, columns=comp_names)
gtex_nmf_loadings = pd.DataFrame(H, index=comp_names, columns=gtex_data_scaled.columns)
gtex_nmf_B = gtex_nmf_scores.T
gtex_nmf_B.index.name = "LV"
