In [1]:
import pandas as pd

from epam.dnsm import train_model


In [2]:
# Load the PCP table from CSV into a DataFrame; it is handed to train_model
# in the next cell.
pcp_df = pd.read_csv(
    filepath_or_buffer="~/data/wyatt-10x-1p5m_pcp_2023-10-07.csv",
)

In [3]:
# Architecture hyperparameters forwarded by keyword to train_model.
nhead, dim_feedforward, layer_count = 4, 2048, 3

# Train on the PCP DataFrame; the remaining settings (batch size, epoch count,
# learning rate, checkpoint/log directories) are passed as literals.
model = train_model(
    pcp_df,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    layer_count=layer_count,
    batch_size=32,
    num_epochs=10,
    learning_rate=0.001,
    checkpoint_dir="./_checkpoints",
    log_dir="./_logs",
)

preparing data...
training model...


  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch [1/10], Training Loss: 0.34231334924697876, Validation Loss: 0.3454280197620392
Epoch [2/10], Training Loss: 0.3213242292404175, Validation Loss: 0.33188706636428833
Epoch [3/10], Training Loss: 0.3087613880634308, Validation Loss: 0.32142314314842224
Epoch [4/10], Training Loss: 0.30056893825531006, Validation Loss: 0.3096000552177429
Epoch [5/10], Training Loss: 0.30200135707855225, Validation Loss: 0.29866546392440796
Epoch [6/10], Training Loss: 0.264438271522522, Validation Loss: 0.2887866199016571
Epoch [7/10], Training Loss: 0.24836869537830353, Validation Loss: 0.27943721413612366
Epoch [8/10], Training Loss: 0.26803213357925415, Validation Loss: 0.27047407627105713
Epoch [9/10], Training Loss: 0.2465275675058365, Validation Loss: 0.26193925738334656
Epoch [10/10], Training Loss: 0.2605518102645874, Validation Loss: 0.25384682416915894


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np


In [5]:
# Build a synthetic dataset: random parent sequences plus a noisy per-site
# target that is ~1 when either neighbouring residue is polar and ~0 otherwise.

# Initialize random seed for reproducibility
torch.manual_seed(0)

# Hyperparameters and Data Preparation
n_sequences = 10000  # Number of sequences
sequence_length = 20  # Length of each sequence
n_aa = 20  # Number of amino acids

# Integer encoding of residues: index k <-> AA_STR_SORTED[k].
AA_STR_SORTED = "ACDEFGHIKLMNPQRSTVWY"
assert len(AA_STR_SORTED) == n_aa, "n_aa must match the amino-acid alphabet size"

# Polar residues (charged D/E/H/K/R plus uncharged-polar N/Q/S/T/Y).
# The indices are derived from the letters rather than hard-coded so they
# cannot drift out of sync with the encoding above.
POLAR_AA = "DEHKNQRSTY"
polar_aa_indices = [AA_STR_SORTED.index(aa) for aa in POLAR_AA]

# Randomly generate parent sequences as integer residue indices
parent_indices = torch.randint(0, n_aa, (n_sequences, sequence_length))

# Boolean lookup table: polar_lookup[k] is True iff residue index k is polar.
# Indexing it with the integer sequences classifies every site at once.
polar_lookup = torch.zeros(n_aa, dtype=torch.bool)
polar_lookup[polar_aa_indices] = True
is_polar = polar_lookup[parent_indices]  # (n_sequences, sequence_length)

# Create mutation masks based on polar amino acids: an interior site i is
# flagged when site i-1 or site i+1 is polar. The first and last sites have
# only one neighbour and are left at 0.
mutation_masks = torch.zeros((n_sequences, sequence_length))
mutation_masks[:, 1:-1] = (is_polar[:, :-2] | is_polar[:, 2:]).float()

# One-hot encode parent sequences -> (n_sequences, sequence_length, n_aa) float
parent_sequences = F.one_hot(parent_indices, num_classes=n_aa).float()

# Add Gaussian noise (std 0.1) to the targets, then clip to [0, 1] so they
# remain valid targets for a binary cross-entropy loss.
mutation_masks = torch.clamp(
    mutation_masks + 0.1 * torch.randn_like(mutation_masks), 0, 1
)


In [6]:


# Transformer Model
class MutationPredictor(nn.Module):
    """Transformer encoder with a per-position sigmoid output head.

    Args:
        d_model: Feature width of each input position; also the width the
            encoder layers and the output head operate on.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of encoder layers stacked in the encoder.
    """

    def __init__(self, d_model, nhead, num_layers):
        super().__init__()
        # NOTE(review): the template layer is kept as an attribute, so its
        # parameters are registered on the module alongside the encoder's own
        # layers. Confirm whether this attribute is needed.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        # Maps each position's d_model features to a single logit.
        self.linear = nn.Linear(in_features=d_model, out_features=1)

    def forward(self, src):
        """Return one value in [0, 1] per position.

        Args:
            src: Tensor whose first two axes are swapped before encoding and
                swapped back afterwards; the notebook passes
                (n_sequences, sequence_length, n_aa) one-hot floats.

        Returns:
            Tensor with the trailing singleton axis removed; for the input
            above this is (n_sequences, sequence_length).
        """
        # NOTE(review): src goes straight into the encoder with no positional
        # encoding added here. Confirm that is intended.
        seq_first = src.transpose(0, 1)
        encoded = self.encoder(seq_first)
        probabilities = torch.sigmoid(self.linear(encoded))
        return probabilities.transpose(0, 1).squeeze(-1)

# Model, optimizer, and loss setup.
# The encoder consumes the one-hot vectors directly, so its feature width is
# tied to the amino-acid alphabet size.
d_model = n_aa
nhead, num_layers = 4, 2
model = MutationPredictor(d_model=d_model, nhead=nhead, num_layers=num_layers)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# The model already applies a sigmoid, so plain BCE on probabilities is used.
criterion = nn.BCELoss()

# Training loop: each epoch is a single gradient step computed on the whole
# dataset in one forward pass (no mini-batching).
n_epochs = 20
for epoch in range(n_epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous step
    output = model(parent_sequences)
    loss = criterion(input=output, target=mutation_masks)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}")


