In [None]:
!git clone https://git.wur.nl/bioinformatics/fte40306-advanced-machine-learning-project-data data

In [None]:
import torch
from torch import nn, optim
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset

# torchvision transform object that converts a tensor/ndarray into a PIL image.
# NOTE(review): no other cell visible in this notebook references `transform`
# (ShapemerDataset takes its own `transform` argument) — confirm it is still needed.
transform = transforms.ToPILImage()

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.pyplot import imshow
%matplotlib inline 

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the three header-less CSV files from the cloned data directory.
# The per-file notes restate the descriptions given in the original notebook;
# NOTE(review): confirm them against the actual file contents.
csv_options = {"header": None, "index_col": False}

# Embeddings: 1648 numerical descriptors per protein.
embedding_data = pd.read_csv("data/pretrain.embedding.edit", **csv_options)

# Labels: information on the proteins; the first column contains an
# identifier (can be ignored).
label_data = pd.read_csv("data/pretrain.labels.edit", **csv_options)

# GO annotations: first column is the protein identifier, followed by 1/0 flags
# for assignment to three functions: membrane [GO:0016020],
# ATP binding [GO:0005524], DNA binding [GO:0003677].
GO_data = pd.read_csv("data/pretrain.go", **csv_options)

# Show the (rows, columns) shape of each table, then keep the number of
# embedding columns; later cells use it as the autoencoder input size.
print(embedding_data.shape, label_data.shape, GO_data.shape)
nfeatures = len(embedding_data.columns)

## Step 1. Visualize and analyze the contents of the provided datasets.

In [None]:
class ShapemerDataset(Dataset):
  """Dataset that serves the entries of an indexable container one at a time.

  `data` must expose `.shape` and support `data[idx]` (this notebook passes a
  2-D float tensor). When `transform` is given, it is applied to every item
  before the item is returned.
  """

  def __init__(self, data, transform=None):
    # `data` and `X` are two names for the same object.
    self.X = self.data = data
    self.transform = transform

  def __len__(self):
    """Return the size of the first dimension of the stored data."""
    n_items = self.X.shape[0]
    return n_items

  def __getitem__(self, idx):
    """Return item `idx`, passed through `transform` when one was supplied."""
    sample = self.X[idx]
    if self.transform is None:
      return sample
    return self.transform(sample)

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    embedding_data, label_data, test_size=0.33, random_state=42
)

# Wrap the embedding rows as float32 tensors (the labels are not part of these datasets).
train_dataset = ShapemerDataset(torch.tensor(X_train.to_numpy()).float())
test_dataset = ShapemerDataset(torch.tensor(X_test.to_numpy()).float())

# Shuffled mini-batches of 128 for training; ordered batches of 32 for evaluation.
train_loader = DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=0, pin_memory=True
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

Add your code and analysis below.

## Step 2. Train autoencoders.

As an example, we use a simple autoencoder here in which both the encoder and the decoder consist of two linear layers. Make sure to experiment with variations on this design.

In [None]:
class AE(nn.Module):
    """Fully connected autoencoder with a two-layer encoder and a two-layer decoder.

    Keyword Args:
        input_shape (int): Number of input features (required).
        hidden_dim (int, optional): Width of the hidden layer in both the
            encoder and the decoder. Defaults to 64.
        code_dim (int, optional): Size of the latent code produced by the
            encoder. Defaults to 64.

    Raises:
        KeyError: If `input_shape` is not supplied.
    """

    def __init__(self, **kwargs):
        super().__init__()
        input_shape = kwargs["input_shape"]
        # Defaults reproduce the original fixed 64-unit architecture.
        hidden_dim = kwargs.get("hidden_dim", 64)
        code_dim = kwargs.get("code_dim", 64)

        # Layer attribute names are kept unchanged so existing state dicts still load.
        self.encoder_hidden_layer = nn.Linear(
            in_features=input_shape, out_features=hidden_dim
        )
        self.encoder_output_layer = nn.Linear(
            in_features=hidden_dim, out_features=code_dim
        )
        self.decoder_hidden_layer = nn.Linear(
            in_features=code_dim, out_features=hidden_dim
        )
        self.decoder_output_layer = nn.Linear(
            in_features=hidden_dim, out_features=input_shape
        )

    def encode(self, features):
        """Map input features to the latent code (no activation on the code)."""
        activation = torch.relu(self.encoder_hidden_layer(features))
        return self.encoder_output_layer(activation)

    def decode(self, code):
        """Map a latent code back to a reconstruction in input space."""
        activation = torch.relu(self.decoder_hidden_layer(code))
        activation = self.decoder_output_layer(activation)
        # The final ReLU clamps the output to be non-negative, so negative
        # input values cannot be reproduced by the reconstruction.
        return torch.relu(activation)

    def forward(self, features):
        """Return the reconstruction of `features` (encode, then decode)."""
        return self.decode(self.encode(features))

We now define an instance of the AE model, as well as the optimization approach and the loss function.

In [None]:
# Pick the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the autoencoder for `nfeatures` inputs and move its parameters to `device`.
model = AE(input_shape=nfeatures)
model = model.to(device)

# Reconstruction objective: mean-squared error.
criterion = nn.MSELoss()

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

We can then train an autoencoder as follows:

In [None]:
# Number of full passes over the training loader.
epochs = 10

for epoch in range(epochs):
    # Reconstruction loss of every mini-batch seen in this epoch.
    batch_losses = []

    for batch_features in train_loader:
        # Reshape to (-1, nfeatures) and move the batch to the active device.
        batch_features = batch_features.view(-1, nfeatures).to(device)

        # PyTorch accumulates gradients across backward passes, so clear them first.
        optimizer.zero_grad()

        # Forward pass, then the reconstruction error against the input itself.
        outputs = model(batch_features)
        train_loss = criterion(outputs, batch_features)

        # Backpropagate and apply one parameter update.
        train_loss.backward()
        optimizer.step()

        batch_losses.append(train_loss.item())

    # Epoch loss is the average of the per-batch losses.
    loss = sum(batch_losses) / len(train_loader)

    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))

## Step 3. Visualization using t-SNE or UMAP.

Add your code and analysis below.

## Step 4. Supervised prediction.

Add your code and analysis below.

## Step 5. Kernel PCA (optional).

Add your code and analysis below.