# GTEx Data Exploration

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.auto import tqdm
from cmapPy.pandasGEXpress.parse_gct import parse
from cmapPy.pandasGEXpress.write_gct import write
from sklearn.model_selection import train_test_split

In [2]:
# data_path = '../../data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz'
# data_path = '../../data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct'
data_path = '../../data/GTEx_v7_tpm_first5k_rand.gct'
data = parse(data_path)

In [3]:
# remove nan values from row_metadata (description column)
data.row_metadata_df.dropna(inplace=True)
# remove the entries of .data_df where nan values are in row_metadata
data.data_df = data.data_df.loc[data.row_metadata_df.index]

In [4]:
df = data.data_df
df

cid,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000204380.2,1.734000,1.05600,1.147000,1.672000,2.21300,1.4650,1.954000,1.161000,0.97310,2.858000,...,0.20320,0.048830,0.05889,0.071670,0.064340,0.163300,0.018230,0.01424,0.104600,0.127100
ENSG00000265514.1,0.000000,0.00000,0.000000,0.000000,0.00000,0.0000,0.000000,0.000000,0.00000,0.000000,...,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
ENSG00000183696.9,110.400002,52.16000,27.740000,86.870003,30.99000,31.9900,57.299999,21.780001,29.90000,33.119999,...,47.59000,243.600006,192.00000,69.959999,207.800003,69.269997,397.600006,55.66000,158.600006,141.399994
ENSG00000115297.9,0.185200,0.00000,0.013510,0.000000,0.00000,0.0000,0.000000,0.040060,0.01554,0.000000,...,0.02315,0.010430,0.00000,0.000000,0.015700,0.074370,0.000000,0.01217,0.041240,0.000000
ENSG00000242435.1,0.652000,0.00000,0.507400,0.561700,0.15910,0.2798,0.185400,0.376100,0.43760,1.034000,...,0.00000,0.391600,0.15740,0.287400,1.179000,0.872800,0.000000,0.11420,0.129100,1.020000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000165023.5,0.084150,0.07887,0.032740,0.516600,0.04108,0.0632,0.047870,0.048540,0.17890,0.077860,...,0.04208,0.056860,0.15240,0.000000,0.066600,0.157700,0.000000,0.05160,0.141600,0.075200
ENSG00000266052.1,0.000000,0.00000,0.000000,0.000000,0.00000,0.0000,0.000000,0.000000,0.00000,0.000000,...,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
ENSG00000205670.6,13.260000,13.01000,16.889999,10.920000,8.96900,10.2800,10.050000,20.690001,8.15800,12.730000,...,0.91690,5.391000,1.56700,2.083000,1.372000,1.851000,1.424000,1.91500,2.555000,2.505000
ENSG00000231801.2,0.000000,0.00000,0.395200,0.072910,0.00000,0.0000,0.000000,0.195300,0.00000,0.000000,...,0.00000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000


In [5]:
# df = df.T  # Transpose for samples as rows
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
tensor_data = torch.FloatTensor(scaled_data)

In [6]:
tensor_data.shape

torch.Size([5000, 11688])

In [7]:
final_df = pd.DataFrame(tensor_data.numpy(), index=df.index, columns=df.columns)
final_df.index.names = ['gene_id']

In [8]:
final_df

cid,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000204380.2,-0.131422,-0.133552,-0.114792,-0.113290,-0.090511,-0.106679,-0.112888,-0.118415,-0.100195,-0.098472,...,-0.100682,-0.107148,-0.101972,-0.087774,-0.089884,-0.110266,-0.096875,-0.093164,-0.100779,-0.108599
ENSG00000265514.1,-0.148965,-0.144431,-0.125452,-0.128385,-0.116012,-0.122463,-0.134258,-0.128100,-0.109036,-0.131323,...,-0.104830,-0.107551,-0.102612,-0.088569,-0.090349,-0.112196,-0.097005,-0.093312,-0.101751,-0.109546
ENSG00000183696.9,0.967971,0.392949,0.132345,0.655894,0.241092,0.222189,0.492420,0.053593,0.162617,0.249377,...,0.866793,1.904293,1.984438,0.687319,1.412027,0.706221,2.729956,0.488261,1.371876,0.943481
ENSG00000115297.9,-0.147092,-0.144431,-0.125326,-0.128385,-0.116012,-0.122463,-0.134258,-0.127766,-0.108895,-0.131323,...,-0.104358,-0.107465,-0.102612,-0.088569,-0.090235,-0.111317,-0.097005,-0.093185,-0.101368,-0.109546
ENSG00000242435.1,-0.142369,-0.144431,-0.120736,-0.123314,-0.114178,-0.119448,-0.132230,-0.124962,-0.105060,-0.119438,...,-0.104830,-0.104317,-0.100901,-0.085381,-0.081825,-0.101884,-0.097005,-0.092119,-0.100551,-0.101950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000165023.5,-0.148114,-0.143619,-0.125147,-0.123721,-0.115538,-0.121782,-0.133734,-0.127695,-0.107411,-0.130428,...,-0.103971,-0.107082,-0.100956,-0.088569,-0.089867,-0.110333,-0.097005,-0.092773,-0.100435,-0.108986
ENSG00000266052.1,-0.148965,-0.144431,-0.125452,-0.128385,-0.116012,-0.122463,-0.134258,-0.128100,-0.109036,-0.131323,...,-0.104830,-0.107551,-0.102612,-0.088569,-0.090349,-0.112196,-0.097005,-0.093312,-0.101751,-0.109546
ENSG00000205670.6,-0.014811,-0.010395,0.031513,-0.029797,-0.012660,-0.011709,-0.024343,0.044500,-0.034918,0.015003,...,-0.086111,-0.063028,-0.085579,-0.065467,-0.080429,-0.090326,-0.086880,-0.073303,-0.078011,-0.090891
ENSG00000231801.2,-0.148965,-0.144431,-0.121779,-0.127727,-0.116012,-0.122463,-0.134258,-0.126471,-0.109036,-0.131323,...,-0.104830,-0.107551,-0.102612,-0.088569,-0.090349,-0.112196,-0.097005,-0.093312,-0.101751,-0.109546


In [9]:
final_df.describe()

cid,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-2.479553e-09,-2.670288e-09,-3.433227e-09,4.005432e-09,-7.629394e-10,-4.196167e-09,-3.814697e-10,-2.670288e-09,-7.629394e-10,1.525879e-09,...,4.005432e-09,3.433227e-09,1.525879e-09,-2.098083e-09,-3.623962e-09,4.005432e-09,-4.196167e-09,-2.861023e-09,1.144409e-09,-1.525879e-09
std,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,...,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
min,-0.1489653,-0.1444314,-0.1254517,-0.1283849,-0.1160117,-0.1224628,-0.134258,-0.1281,-0.109036,-0.1313233,...,-0.1048305,-0.1075512,-0.1026121,-0.08856887,-0.09034884,-0.1121958,-0.0970051,-0.09331249,-0.101751,-0.109546
25%,-0.1489653,-0.1444314,-0.1254517,-0.1283849,-0.1160117,-0.1224628,-0.134258,-0.1281,-0.109036,-0.1313233,...,-0.1048305,-0.1075512,-0.1026121,-0.08856887,-0.09034884,-0.1121958,-0.0970051,-0.09331249,-0.101751,-0.109546
50%,-0.1481381,-0.1442203,-0.1250503,-0.1278681,-0.1160117,-0.1224628,-0.134258,-0.1278755,-0.1088008,-0.1308783,...,-0.1048305,-0.1075512,-0.1026121,-0.08856887,-0.09034884,-0.1121958,-0.0970051,-0.09331249,-0.101751,-0.109546
75%,-0.1161224,-0.1199835,-0.09960691,-0.1038169,-0.09265414,-0.1000588,-0.111772,-0.1010734,-0.09084256,-0.1013627,...,-0.09752594,-0.1009429,-0.09726784,-0.08252956,-0.08534989,-0.102455,-0.09328528,-0.08735516,-0.09476568,-0.1029141
max,31.83144,30.09354,35.94195,46.88143,52.14173,45.39662,31.75742,31.37201,40.89326,40.7549,...,35.62411,36.09904,48.07346,42.18822,38.31493,31.06729,38.90879,43.77028,43.26152,34.38565


In [10]:
train_df, test_df = train_test_split(final_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [11]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

3600
400
1000


In [12]:
class GTExDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        features = torch.tensor(self.dataframe.iloc[idx], dtype=torch.float32)

        if self.transform:
            features = self.transform(features)

        return features, features # for Autoencoder

In [24]:
# Set hyperparameters
input_size = len(df.columns)
encoding_size = 512
learning_rate = 1e-9
epochs = 10
batch_size = 256

In [25]:
train_dataset = GTExDataset(train_df)
val_dataset = GTExDataset(val_df)
test_dataset = GTExDataset(test_df)

# Define DataLoader for each set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [26]:
# Define Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, encoding_size)
        self.decoder = nn.Linear(encoding_size, input_size)
        self.nonlin = nn.ReLU()

    def forward(self, x):
        x = self.encoder(x)
        x = self.nonlin(x)
        x = self.decoder(x)
        x = self.nonlin(x)
        return x

In [27]:
len(df.columns)

11688

In [28]:
# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, encoding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Extract embeddings
with torch.no_grad():
    model.eval()
    for inputs, _ in train_loader:
        embeddings = model.encoder(inputs)

  features = torch.tensor(self.dataframe.iloc[idx], dtype=torch.float32)


Epoch [1/10], Loss: 9.6574
Epoch [2/10], Loss: 0.0186
Epoch [3/10], Loss: 0.2401
Epoch [4/10], Loss: 88.0050
Epoch [5/10], Loss: 0.3985
Epoch [6/10], Loss: 0.0200
Epoch [7/10], Loss: 0.3783
Epoch [8/10], Loss: 0.2084
Epoch [9/10], Loss: 0.9829
Epoch [10/10], Loss: 0.5966


In [29]:
embeddings

tensor([[ 0.0285,  0.0497,  0.0309,  ..., -0.0495,  0.1222,  0.0285],
        [ 0.0276,  0.0494,  0.0315,  ..., -0.0498,  0.1239,  0.0279],
        [ 0.0286,  0.0499,  0.0315,  ..., -0.0494,  0.1234,  0.0281],
        ...,
        [ 0.0284,  0.0495,  0.0311,  ..., -0.0502,  0.1231,  0.0284],
        [ 0.0269,  0.0504,  0.0304,  ..., -0.0499,  0.1195,  0.0273],
        [ 0.0274,  0.0499,  0.0318,  ..., -0.0501,  0.1239,  0.0287]])