# GTEx Data Exploration

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.auto import tqdm
from cmapPy.pandasGEXpress.parse_gct import parse
from cmapPy.pandasGEXpress.write_gct import write
from sklearn.model_selection import train_test_split

In [2]:
# data_path = '../../data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz'
# data_path = '../../data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct'
data_path = '../../data/GTEx_mini.gct'
data = parse(data_path)

In [3]:
# remove nan values from row_metadata (description column)
data.row_metadata_df.dropna(inplace=True)
# remove the entries of .data_df where nan values are in row_metadata
data.data_df = data.data_df.loc[data.row_metadata_df.index]

In [4]:
df = data.data_df
df

cid,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
rid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.4,0.10820,0.11580,0.02104,0.023290,0.00000,0.04641,0.03076,0.09358,0.12100,0.02859,...,0.09012,0.14620,0.10450,0.00000,0.6603,0.69500,0.12130,0.41690,0.23550,0.14500
ENSG00000227232.4,21.40000,11.03000,16.75000,8.172000,7.65800,9.37200,10.08000,13.56000,9.88900,9.12100,...,3.92600,13.13000,5.53700,5.78900,8.4390,7.84300,12.39000,12.53000,8.02700,12.76000
ENSG00000243485.2,0.16020,0.06433,0.04674,0.000000,0.05864,0.00000,0.13670,0.20790,0.05375,0.06351,...,0.08008,0.03607,0.00000,0.10590,0.0000,0.06432,0.05388,0.00000,0.04756,0.05367
ENSG00000237613.2,0.05045,0.00000,0.02945,0.032600,0.00000,0.00000,0.08610,0.13100,0.06773,0.00000,...,0.00000,0.06818,0.07309,0.03336,0.0000,0.08105,0.00000,0.05304,0.02996,0.03381
ENSG00000268020.2,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.11080,0.05619,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,0.08739,0.00000,0.00000,0.04353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000266580.1,0.00000,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.0000,0.00000,0.00000,0.00000,0.00000,0.00000
ENSG00000254545.1,0.15930,0.00000,0.00000,0.000000,0.00000,0.00000,0.00000,0.00000,0.10690,0.00000,...,0.00000,0.00000,0.11540,0.00000,0.0000,0.00000,0.00000,0.00000,0.00000,0.00000
ENSG00000134668.8,1.64200,0.72910,0.70630,36.099998,1.28700,3.64500,3.22100,0.33660,0.46410,1.93100,...,1.00100,4.01100,4.55000,0.84770,3.8900,0.54390,3.67400,0.52250,6.13400,0.22210
ENSG00000203620.2,0.08175,0.00000,0.01590,0.000000,0.03991,0.01754,0.02325,0.00000,0.00000,0.02161,...,0.02725,0.01227,0.00000,0.00000,0.0000,0.00000,0.00000,0.01432,0.00000,0.00000


In [5]:
# df = df.T  # Transpose for samples as rows
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)
tensor_data = torch.FloatTensor(scaled_data)

In [6]:
tensor_data.shape

torch.Size([1000, 11688])

In [7]:
final_df = pd.DataFrame(tensor_data.numpy())

In [8]:
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11678,11679,11680,11681,11682,11683,11684,11685,11686,11687
0,0.000076,0.000043,0.000007,0.000010,0.000000,0.000011,0.000011,0.000025,0.000027,0.000008,...,0.000140,0.000067,0.000063,0.000000,0.000316,0.000433,0.000045,0.000376,0.000125,0.000070
1,0.015070,0.004094,0.005463,0.003360,0.001689,0.002218,0.003457,0.003660,0.002191,0.002564,...,0.006090,0.006028,0.003336,0.007138,0.004036,0.004890,0.004599,0.011298,0.004249,0.006132
2,0.000113,0.000024,0.000015,0.000000,0.000013,0.000000,0.000047,0.000056,0.000012,0.000018,...,0.000124,0.000017,0.000000,0.000131,0.000000,0.000040,0.000020,0.000000,0.000025,0.000026
3,0.000036,0.000000,0.000010,0.000013,0.000000,0.000000,0.000030,0.000035,0.000015,0.000000,...,0.000000,0.000031,0.000044,0.000041,0.000000,0.000051,0.000000,0.000048,0.000016,0.000016
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000038,0.000015,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000032,0.000000,0.000000,0.000021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.000112,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000024,0.000000,...,0.000000,0.000000,0.000070,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.001156,0.000271,0.000230,0.014844,0.000284,0.000863,0.001105,0.000091,0.000103,0.000543,...,0.001553,0.001842,0.002741,0.001045,0.001860,0.000339,0.001364,0.000471,0.003247,0.000107
998,0.000058,0.000000,0.000005,0.000000,0.000009,0.000004,0.000008,0.000000,0.000000,0.000006,...,0.000042,0.000006,0.000000,0.000000,0.000000,0.000000,0.000000,0.000013,0.000000,0.000000


In [9]:
final_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11678,11679,11680,11681,11682,11683,11684,11685,11686,11687
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.013962,0.008359,0.005909,0.009186,0.00408,0.004321,0.006488,0.007252,0.004357,0.004797,...,0.009815,0.009967,0.008932,0.01086,0.008219,0.00809,0.008486,0.009772,0.008657,0.00959
std,0.062084,0.047849,0.037129,0.048666,0.035514,0.035181,0.041085,0.051286,0.035199,0.035299,...,0.058122,0.057237,0.055305,0.061358,0.055029,0.052435,0.057311,0.058408,0.050332,0.057225
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.000271,7.7e-05,7.1e-05,0.000123,3e-05,3.5e-05,6.4e-05,7.7e-05,4.6e-05,7.3e-05,...,7.9e-05,4.6e-05,3.8e-05,7.1e-05,4.4e-05,6.1e-05,2.2e-05,8e-05,4.8e-05,4.1e-05
75%,0.006473,0.002738,0.002352,0.003221,0.001119,0.001449,0.002097,0.002222,0.001268,0.00179,...,0.001593,0.001066,0.000964,0.0021,0.00104,0.001619,0.0006,0.001849,0.001068,0.001409
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
train_df, test_df = train_test_split(final_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [11]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

720
80
200


In [12]:
class GTExDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        features = torch.tensor(self.dataframe.iloc[idx], dtype=torch.float32)

        if self.transform:
            features = self.transform(features)

        return features, features # for Autoencoder

In [18]:
# Set hyperparameters
input_size = len(df.columns)
encoding_size = 500
learning_rate = 1e-9
epochs = 10
batch_size = 256

In [14]:
train_dataset = GTExDataset(train_df)
val_dataset = GTExDataset(val_df)
test_dataset = GTExDataset(test_df)

# Define DataLoader for each set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
# Define Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, encoding_size)
        self.decoder = nn.Linear(encoding_size, input_size)
        self.nonlin = nn.ReLU()

    def forward(self, x):
        x = self.encoder(x)
        x = self.nonlin(x)
        x = self.decoder(x)
        x = self.nonlin(x)
        return x

In [16]:
len(df.columns)

11688

In [19]:
# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, encoding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Extract embeddings
with torch.no_grad():
    model.eval()
    for inputs, _ in train_loader:
        embeddings = model.encoder(inputs)

Epoch [1/10], Loss: 0.0017
Epoch [2/10], Loss: 0.0008
Epoch [3/10], Loss: 0.0046
Epoch [4/10], Loss: 0.0010
Epoch [5/10], Loss: 0.0015
Epoch [6/10], Loss: 0.0012
Epoch [7/10], Loss: 0.0014
Epoch [8/10], Loss: 0.0045
Epoch [9/10], Loss: 0.0047
Epoch [10/10], Loss: 0.0018


In [20]:
embeddings

tensor([[ 0.0011,  0.0046,  0.0065,  ...,  0.0043,  0.0149, -0.0037],
        [-0.0003,  0.0040,  0.0040,  ...,  0.0012,  0.0077, -0.0050],
        [ 0.0184,  0.0366, -0.0071,  ..., -0.0292,  0.0294,  0.0263],
        ...,
        [-0.0008, -0.0002, -0.0005,  ...,  0.0011,  0.0073, -0.0039],
        [-0.0002,  0.0040,  0.0040,  ...,  0.0013,  0.0078, -0.0049],
        [ 0.0003,  0.0106, -0.0335,  ..., -0.0144,  0.0188,  0.0110]])