# GTEx Data Exploration

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm.auto import tqdm
from cmapPy.pandasGEXpress.parse_gct import parse
from sklearn.model_selection import train_test_split


In [None]:
# The full dataset is not tracked in the repo, so it has to be downloaded separately.
# NOTE: the second assignment overrides the first, so the mini file is the one that is
# actually used below; remove or comment out the second line to run on the full dataset.
full_data_path = '../../data/input/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct'
full_data_path = '../../data/input/GTEx/GTEx_mini.gct'

In [None]:
# Calculate the data split sizes from the file's own metadata.
# The first line of the file is skipped; the second line is tab-separated and its
# first two fields are read as the number of rows and the number of columns.
with open(full_data_path) as file:  # context manager closes the handle once the header is read
    file.readline()
    metadata = file.readline()
metadata = metadata.split('\t')
data_nr_rows = int(metadata[0])
data_nr_cols = int(metadata[1])
# 60 % / 20 % / 20 % split; each size is truncated to a whole number of rows.
train_chunks_size = int(data_nr_rows * 0.6)
val_chunks_size = int(data_nr_rows * 0.2)
test_chunks_size = int(data_nr_rows * 0.2)

In [None]:
def process_batch(batch):
    """Min-max scale the feature columns of one chunk.

    The first two columns (name and description) are discarded and the
    remaining columns are rescaled with a ``MinMaxScaler`` (default settings)
    that is fitted on this chunk only, so the scaling is relative to the
    chunk rather than to the whole file.

    Args:
        batch: DataFrame whose first two columns are non-feature columns.
            It is left unmodified.

    Returns:
        A new DataFrame holding the scaled values, with default integer row
        and column labels.
    """
    # Select by position instead of dropping in place so the caller's frame
    # is not mutated as a side effect.
    values = batch.iloc[:, 2:].to_numpy()
    scaled = MinMaxScaler().fit_transform(values)
    return pd.DataFrame(scaled)
    

In [None]:
class GTExDataset(Dataset):
    """Streams one split of the data file chunk by chunk for autoencoder training.

    Rows are read lazily through a chunked ``pd.read_csv`` reader and each
    chunk is passed through ``process_batch`` before rows are served from it.
    Sequential access (what a non-shuffling ``DataLoader`` does) reads the file
    once per pass; asking for a row in an earlier chunk re-opens the reader and
    reads forward again, so random access works but is slow.

    Args:
        nr_rows: number of rows this split exposes (returned by ``len``).
        type: ``'train'``, ``'val'`` or anything else for the test split.
        transform: optional callable applied to each feature tensor.
    """

    # Number of file rows loaded (and scaled together) per chunk.
    chunk_size = 256

    def __init__(self, nr_rows, type, transform=None):
        self.type = type
        self.chunks_reader = None
        self.nr_rows = nr_rows
        self.transform = transform
        self.current_chunk = None
        self.current_chunk_nr = 0

    def __len__(self):
        return self.nr_rows

    def __getitem__(self, idx):
        """Return ``(features, features)`` for row ``idx`` of this split.

        Raises:
            IndexError: if ``idx`` is outside ``[0, nr_rows)`` or the file
                holds fewer rows than ``nr_rows`` promises.
        """
        if not 0 <= idx < self.nr_rows:
            raise IndexError(f'index {idx} is out of range for a dataset of {self.nr_rows} rows')

        target_chunk_nr = idx // self.chunk_size
        # (Re)start from the first chunk when nothing is loaded yet or when the
        # requested row lies before the chunk currently held in memory.
        if self.current_chunk is None or target_chunk_nr < self.current_chunk_nr:
            self.refresh_reader()
            self._load_next_chunk()
            self.current_chunk_nr = 0
        # Read forward until the chunk that contains the requested row is loaded.
        while self.current_chunk_nr < target_chunk_nr:
            self._load_next_chunk()
            self.current_chunk_nr += 1

        row = self.current_chunk.iloc[idx % self.chunk_size]
        features = torch.tensor(row.to_numpy(), dtype=torch.float32)

        if self.transform:
            features = self.transform(features)

        return features, features # for Autoencoder

    def _load_next_chunk(self):
        """Read the next chunk from the reader and store its scaled version."""
        try:
            chunk = self.chunks_reader.get_chunk()
        except StopIteration as err:
            # Surface an exhausted reader as an ordinary indexing error.
            raise IndexError('requested row lies beyond the end of the data split') from err
        self.current_chunk = process_batch(chunk)

    def refresh_reader(self):
        """Open a fresh chunked reader positioned at the start of this split."""
        if self.chunks_reader is not None:
            # Release the file handle held by the previous reader.
            self.chunks_reader.close()

        train_rows = int(data_nr_rows * 0.6)
        eval_rows = int(data_nr_rows * 0.2)
        if self.type == 'train':
            offset, nrows = 0, train_rows
        elif self.type == 'val':
            offset, nrows = train_rows, eval_rows
        else:
            offset, nrows = int(data_nr_rows * 0.8), eval_rows

        # NOTE(review): for a non-zero offset the integer ``skiprows`` also skips
        # the real column header, so pandas uses the line just before the split
        # as column names. This looks harmless because ``process_batch`` works on
        # column positions and values only -- confirm if column names are ever needed.
        self.chunks_reader = pd.read_csv(
            full_data_path,
            skiprows=2 + offset,
            sep="\t",
            chunksize=self.chunk_size,
            nrows=nrows,
        )



In [None]:
# Set hyperparameters
input_size = data_nr_cols  # autoencoder input/output width, taken from the column count in the file header
encoding_size = 5000  # width of the encoder output (the embedding)
learning_rate = 1e-4  # learning rate handed to the Adam optimizer
epochs = 50  # number of passes over the training loader
# NOTE(review): the dataset reads the file in chunks of 256 rows; confirm that 265
# is intended here and not a typo for 256.
batch_size = 265

In [None]:
# One dataset/loader pair per split. Each loader must wrap the dataset of its own
# split; shuffling is left off because GTExDataset streams the file sequentially.
train_dataset = GTExDataset(nr_rows=train_chunks_size, type='train')
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = GTExDataset(nr_rows=val_chunks_size, type='val')
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = GTExDataset(nr_rows=test_chunks_size, type='test')
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Define Autoencoder model
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder.

    A linear encoder maps ``input_size`` features to ``encoding_size`` units and
    a linear decoder maps them back; a ReLU follows each of the two layers.
    """

    def __init__(self, input_size, encoding_size):
        super().__init__()
        self.encoder = nn.Linear(in_features=input_size, out_features=encoding_size)
        self.decoder = nn.Linear(in_features=encoding_size, out_features=input_size)
        self.nonlin = nn.ReLU()

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, ReLU, decode, ReLU)."""
        hidden = self.nonlin(self.encoder(x))
        reconstruction = self.nonlin(self.decoder(hidden))
        return reconstruction

In [None]:

# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, encoding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    # Switch back to training mode; the validation step below puts the model in eval mode.
    model.train()
    train_loss_sum, train_count = 0.0, 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so the reported value is the mean over
        # all samples of the epoch, not just the loss of the last batch.
        train_loss_sum += loss.item() * inputs.size(0)
        train_count += inputs.size(0)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss_sum / max(train_count, 1):.4f}')

    # Val step
    model.eval()
    val_loss_sum, val_count = 0.0, 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_loss_sum += criterion(outputs, targets).item() * inputs.size(0)
            val_count += inputs.size(0)
    print(f'Validation Loss: {val_loss_sum / max(val_count, 1):.4f}')

# Extract embeddings
# Collect the encoder output of every batch; assigning inside the loop would keep
# only the last batch.
model.eval()
embedding_batches = []
with torch.no_grad():
    for inputs, _ in train_loader:
        embedding_batches.append(model.encoder(inputs))
embeddings = torch.cat(embedding_batches) if embedding_batches else torch.empty(0, encoding_size)

In [None]:
# Display the `embeddings` tensor computed in the previous cell (notebook cell output).
embeddings