In [1]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import anndata as ad



In [2]:

path = '../../data/TS.h5ad'
ts_ann_data = ad.read_h5ad(path)


In [3]:

ts_df = ts_ann_data.to_df()
# Drop zero columns
ts_df = ts_df.loc[:, (ts_df != 0).any(axis=0)]
ts_df

ensemblid,ENSG00000227232,ENSG00000243485,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000233750,ENSG00000268903,ENSG00000269981,ENSG00000241860,ENSG00000222623,...,ENSG00000212907,ENSG00000198886,ENSG00000210176,ENSG00000210191,ENSG00000198786,ENSG00000198695,ENSG00000210194,ENSG00000198727,ENSG00000210195,ENSG00000210196
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGAGCAAGA_TSP12_Heart_Atria_10X_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.492649,3.470269,0.0,0.0,2.023797,2.026197,0.000000,3.764314,0.0,0.000000
AAACCCAAGATGGCGT_TSP12_Heart_Atria_10X_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.903640,3.251729,0.0,0.0,2.110188,1.450374,0.000000,3.375192,0.0,0.000000
AAACCCAAGGGTTAAT_TSP12_Heart_Atria_10X_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.971113,3.264437,0.0,0.0,1.785083,0.000000,0.000000,3.837567,0.0,0.000000
AAACCCAAGTATGCAA_TSP12_Heart_Atria_10X_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.249330,2.353345,0.0,0.0,0.793493,0.000000,0.000000,3.151370,0.0,0.000000
AAACCCAAGTCGTTAC_TSP12_Heart_Atria_10X_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.378214,3.787100,0.0,0.0,2.702034,0.000000,0.000000,3.961854,0.0,0.574412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O5_L004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.909840,0.0,0.0,0.093893,1.092484,0.000000,0.030039,0.0,0.000000
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O6_L004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.193500,1.511308,0.0,0.0,1.353644,1.499545,0.009120,2.161131,0.0,0.000000
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O8_L004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111410,0.517196,0.0,0.0,0.000000,0.000000,0.000000,2.462929,0.0,0.000000
TSP12_Heart_ventricle_SS2_B133716_B134037_LIve_O9_L004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.318307,1.186609,0.0,0.0,0.000000,0.322826,0.032304,3.788884,0.0,0.110261


In [4]:
train_df, test_df = train_test_split(ts_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [5]:
print(len(train_df))
print(len(val_df))
print(len(test_df))

8283
921
2301


In [6]:
class TSDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        features = torch.tensor(self.df.iloc[idx], dtype=torch.float32)

        if self.transform:
            features = self.transform(features)

        return features, features # for Autoencoder

In [7]:
# Set hyperparameters
input_size = len(ts_df.columns)
encoding_size = 5000
learning_rate = 1e-4
epochs = 50
batch_size = 265

In [8]:
train_dataset = TSDataset(train_df)
val_dataset = TSDataset(val_df)
test_dataset = TSDataset(test_df)

# Define DataLoader for each set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# Define Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, encoding_size)
        self.decoder = nn.Linear(encoding_size, input_size)
        self.nonlin = nn.ReLU()

    def forward(self, x):
        x = self.encoder(x)
        x = self.nonlin(x)
        x = self.decoder(x)
        x = self.nonlin(x)
        return x

In [10]:
# Initialize model, loss function, and optimizer
model = Autoencoder(input_size, encoding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    # Train step
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.4f}')

    # Val step
    with torch.no_grad():
        for inputs, targets in train_loader:
            model.eval()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
        print(f'Validation Loss: {loss.item():.4f}')

KeyError: 1351