In [11]:
import torch
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math



In [12]:
class TmAntibodyClass(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Get the embedding as a torch tensor
        embedding = torch.tensor(self.data.iloc[idx]['embedding'], dtype=torch.float32)
        
        # Get the target (e.g., 'target' column)
        target = torch.tensor(self.data.iloc[idx]['target'], dtype=torch.float32)
        
        # Optionally, return 'split' or 'ID' if needed
        # ID = self.data.iloc[idx]['ID']
        # split = self.data.iloc[idx]['split']
        
        return embedding, target


In [13]:

# Define the PyTorch model
class AntibodyTmPredictor(nn.Module):
    def __init__(self, input_size):
        super(AntibodyTmPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 256)
        self.fc4 = nn.Linear(256, 256)
        self.fc5 = nn.Linear(256, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = self.fc5(x)  # Linear output for regression
        return x


In [14]:
df = pd.read_csv("tm_esm650M.csv")

# Convert 'embedding' column from string to a list of floats
# Assuming the embeddings were saved as strings of lists in the CSV
df['embedding'] = df['embedding'].apply(eval)

In [15]:
df

Unnamed: 0,ID,target,split,embedding
0,ADI-38502,66.0,train,"[-0.0682915598154068, -0.04206767678260803, -0..."
1,ADI-38501,64.5,train,"[-0.07431265711784363, -0.026158228516578674, ..."
2,ADI-47173,64.5,train,"[-0.0496402233839035, -0.011181279085576534, -..."
3,ADI-47054,71.0,train,"[-0.038109079003334045, -0.018030624836683273,..."
4,ADI-47278,71.5,train,"[-0.04723335802555084, -0.04840954393148422, -..."
...,...,...,...,...
478,urelumab,66.0,holdout,"[-0.07430675625801086, -0.030699612572789192, ..."
479,veltuzumab,70.0,holdout,"[-0.04518481343984604, -0.0554242841899395, -0..."
480,visilizumab,71.0,train,"[-0.05436510592699051, -0.05524484068155289, -..."
481,zalutumumab,72.5,train,"[-0.0461481511592865, -0.04504658654332161, -0..."


In [17]:
train_dataset = TmAntibodyClass(df[df.split=='train'])
valid_dataset = TmAntibodyClass(df[df.split=='test'])
test_dataset = TmAntibodyClass(df[df.split=='holdout'])


# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# Example usage in a training loop
for batch_embeddings, batch_targets in train_dataloader:
    # Your training code here
    print(batch_embeddings)
    pass


tensor([[-0.0637, -0.0283, -0.0520,  ..., -0.1338,  0.0049,  0.1105],
        [-0.0680, -0.0147, -0.0675,  ..., -0.1471,  0.0335,  0.1059],
        [-0.0664, -0.0329, -0.0691,  ..., -0.1478,  0.0109,  0.1355],
        ...,
        [-0.0427, -0.0586, -0.0866,  ..., -0.1709, -0.0011,  0.1476],
        [-0.0703, -0.0436, -0.0533,  ..., -0.1251, -0.0102,  0.1371],
        [-0.0717, -0.0313, -0.0603,  ..., -0.1409,  0.0460,  0.1359]])
tensor([[-0.0774, -0.0440, -0.0495,  ..., -0.1164,  0.0187,  0.1233],
        [-0.0709, -0.0102, -0.0447,  ..., -0.0868,  0.0055,  0.1335],
        [-0.0450, -0.0407, -0.0659,  ..., -0.1643, -0.0019,  0.1082],
        ...,
        [-0.0477, -0.0413, -0.0748,  ..., -0.1553, -0.0027,  0.0943],
        [-0.0394, -0.0398, -0.0554,  ..., -0.1494,  0.0069,  0.1190],
        [-0.0403, -0.0474, -0.0843,  ..., -0.1421,  0.0085,  0.1101]])
tensor([[-0.0394, -0.0425, -0.0648,  ..., -0.1587,  0.0130,  0.1033],
        [-0.0538, -0.0408, -0.0668,  ..., -0.1396,  0.0092,  0

In [None]:




# Initialize model, loss function, and optimizer
input_size = x_train.shape[1]  # Input size is the number of features in x_train
model = AntibodyPredictor(input_size)

# Loss function and optimizer
criterion = nn.L1Loss()  # Equivalent to 'mae'
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Convert training and test data to PyTorch tensors
x_train_tensor = torch.tensor(x_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

# Training loop
epochs = 1500
batch_size = 32
for epoch in range(epochs):
    model.train()
    permutation = torch.randperm(x_train_tensor.size()[0])
    
    for i in range(0, x_train_tensor.size()[0], batch_size):
        optimizer.zero_grad()
        
        indices = permutation[i:i + batch_size]
        batch_x, batch_y = x_train_tensor[indices], y_train_tensor[indices]

        outputs = model(batch_x).squeeze()  # Forward pass
        loss = criterion(outputs, batch_y)  # Calculate loss

        loss.backward()  # Backpropagation
        optimizer.step()  # Optimization step

    # Optionally print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")

# Evaluation
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    y_pred_test = model(x_test_tensor).cpu().numpy()

# Convert the tensors back to NumPy arrays
y_test_np = y_test_tensor.cpu().numpy()

# Calculate metrics
mae = mean_absolute_error(y_test_np, y_pred_test)
rmse = math.sqrt(mean_squared_error(y_test_np, y_pred_test))
r2 = r2_score(y_test_np, y_pred_test)

print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R^2: {r2:.4f}")