In [None]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import anndata as ad
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


In [None]:

# Location of the GTEx expression table, relative to this notebook.
path = '../../data/input/GTEx/GTEx_mini_random.gct'

# Parse the file as tab-separated text, skipping its first two lines.
gtex_df = pd.read_table(path, skiprows=2)


In [None]:

# Keep only the columns that hold at least one non-zero entry.
has_nonzero = gtex_df.ne(0).any(axis=0)
gtex_df = gtex_df.loc[:, has_nonzero]
gtex_df

In [None]:
# Drop the first two columns by position (presumably the non-numeric
# identifier columns of the table -- TODO confirm against the input file).
# Reassign instead of using inplace=True so we never mutate a frame that was
# itself produced by a selection.
# NOTE: the drop is positional, so re-running this cell removes two more
# columns each time; re-run the load cell first if you need to repeat it.
gtex_df = gtex_df.drop(columns=gtex_df.columns[[0, 1]])
gtex_df

In [None]:
# Split, then scale.
# The scaler is fitted on the training rows only, so the validation and test
# rows do not contribute to the mean/variance used for preprocessing.
features = gtex_df.to_numpy()
row_positions = np.arange(len(features))

# Hold out 20% of the rows for testing, then 10% of the remaining training
# rows for validation. Splitting row positions with the same random_state
# yields the same row assignment as splitting the frame itself.
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)
train_pos, val_pos = train_test_split(train_pos, test_size=0.1, random_state=42)

scaler = StandardScaler()
scaler.fit(features[train_pos])

# Apply the training-set statistics to every row. The rebuilt frame has a
# fresh RangeIndex and integer column labels.
df_scaled = scaler.transform(features)
gtex_df = pd.DataFrame(df_scaled)

train_df = gtex_df.iloc[train_pos]
val_df = gtex_df.iloc[val_pos]
test_df = gtex_df.iloc[test_pos]

In [None]:
# Report the number of rows in each split, in order: train, validation, test.
for split_df in (train_df, val_df, test_df):
    print(len(split_df))

In [None]:
class GTExDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame as float32 tensors.

    Every item is an ``(input, target)`` pair whose two elements are the same
    row, which is the form used to train an autoencoder.

    Args:
        df: DataFrame with one sample per row. All columns must be
            convertible to float32.
        transform: Optional callable applied to each row tensor before it is
            returned.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        # Convert the whole frame once, instead of building a pandas Series
        # and a new tensor from it on every __getitem__ call.
        self._features = torch.tensor(df.to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (samples)."""
        return self._features.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, features)`` for the row at position ``idx``."""
        # Clone so that an in-place transform cannot modify the cached matrix.
        features = self._features[idx].clone()

        if self.transform:
            features = self.transform(features)

        return features, features  # for Autoencoder

In [None]:
# Set hyperparameters
batch_size = 256
epochs = 10
learning_rate = 0.001
encoding_size = 5000
input_size = gtex_df.shape[1]  # number of columns in gtex_df

In [None]:
# Wrap each split in a GTExDataset.
train_dataset, val_dataset, test_dataset = (
    GTExDataset(split_df) for split_df in (train_df, val_df, test_df)
)

# Define DataLoader for each set: only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

In [None]:
# Define Autoencoder model
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder: Linear -> ReLU -> Linear.

    The reconstruction is linear by default. A ReLU on the output would clamp
    it to non-negative values, so standardized (zero-mean) targets could never
    be reproduced where they are negative.

    Args:
        input_size: Number of input (and reconstructed) features.
        encoding_size: Width of the hidden encoding layer.
        output_activation: Optional module applied to the decoder output.
            ``None`` (default) leaves the output linear; pass ``nn.ReLU()``
            to clamp reconstructions to non-negative values.
    """

    def __init__(self, input_size, encoding_size, output_activation=None):
        super().__init__()
        self.encoder = nn.Linear(input_size, encoding_size)
        self.decoder = nn.Linear(encoding_size, input_size)
        self.nonlin = nn.ReLU()
        # nn.Identity has no parameters, so the state_dict keys are unchanged.
        self.output_activation = (
            output_activation if output_activation is not None else nn.Identity()
        )

    def forward(self, x):
        """Encode ``x``, apply the hidden non-linearity, and decode it."""
        x = self.encoder(x)
        x = self.nonlin(x)
        x = self.decoder(x)
        return self.output_activation(x)

In [None]:
# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, encoding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# One mean loss per epoch for each split.
train_loss_values = []
val_loss_values = []

# Training loop
for epoch in range(epochs):
    train_epoch_losses = []
    val_epoch_losses = []

    # Train step
    # Switch back to training mode every epoch, because the validation pass
    # below puts the model in eval mode.
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        train_epoch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    train_loss = np.mean(train_epoch_losses)
    train_loss_values.append(train_loss)
    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}')

    # Val step
    # Evaluate on the held-out validation loader (not the training loader),
    # with gradient tracking disabled.
    model.eval()
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_epoch_losses.append(loss.item())
    val_loss = np.mean(val_epoch_losses)
    val_loss_values.append(val_loss)
    print(f'Validation Loss: {val_loss:.4f}')

In [None]:
# Plot the per-epoch mean losses. No torch.no_grad() is needed here: the
# values are plain Python/NumPy numbers, not tensors.
fig, ax = plt.subplots()
ax.plot(np.asarray(train_loss_values), 'r', label='Train')
ax.plot(np.asarray(val_loss_values), 'b', label='Validation')
ax.set_xlabel('Epoch (0-based)')
ax.set_ylabel('Mean MSE loss')
ax.set_title('Autoencoder reconstruction loss')
ax.legend();
