In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn

from epam.torch_common import pick_device
from netam import framework, models

In [2]:
# Read the full edge table so the effect of a length cap can be checked.
all_df = pd.read_csv(
    "/Users/matsen/data/shmoof_edges_11-Jan-2023_NoNode0_iqtree_K80+R_masked.csv"
)

# Fraction of rows whose "parent" string is longer than 410 characters.
# Left as the cell's final expression so the notebook displays the value.
is_long_parent = all_df["parent"].str.len() > 410
is_long_parent.sum() / len(all_df)

0.00403216242498992

In [3]:
# The loader returns a pair that is unpacked into training and validation frames.
# NOTE(review): the original call had `sample_count=5000` commented out --
# presumably an optional subsampling argument for quick runs; confirm in netam.
train_df, val_df = framework.load_shmoof_dataframes(
    "/Users/matsen/data/shmoof_edges_11-Jan-2023_NoNode0_iqtree_K80+R_masked.csv"
)


In [4]:
kmer_length, max_length = 5, 410

# Build the training and validation datasets with identical settings
# (training first, then validation, as before).
train_dataset, val_dataset = (
    framework.SHMoofDataset(split_df, kmer_length=kmer_length, max_length=max_length)
    for split_df in (train_df, val_df)
)

# Put both datasets on whichever device pick_device() selects.
device = pick_device()
train_dataset.to(device)
val_dataset.to(device)

print(f"we have {len(train_dataset)} training examples and {len(val_dataset)} validation examples")

we have 35830 training examples and 13186 validation examples


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    """Per-position rate model: k-mer embedding -> 1-D convolution -> linear head.

    Args:
        dataset: object exposing ``kmer_to_index``; its length sets the number
            of rows in the embedding table.
        embedding_dim: size of each k-mer embedding vector (also the number of
            input channels of the convolution).
        num_filters: number of output channels of the convolution.
        kernel_size: width of the convolution kernel along the sequence axis.
    """

    def __init__(self, dataset, embedding_dim, num_filters, kernel_size):
        super().__init__()
        self.kmer_count = len(dataset.kmer_to_index)

        # One learned vector per k-mer index.
        self.kmer_embedding = nn.Embedding(self.kmer_count, embedding_dim)

        # padding="same" keeps the output as long as the input sequence, so
        # every position still gets its own feature vector.
        self.conv = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
            padding="same",
        )

        # Collapses the num_filters features at each position to one log rate.
        self.linear = nn.Linear(in_features=num_filters, out_features=1)

    def forward(self, encoded_parents, masks):
        """Return one rate per sequence position.

        Args:
            encoded_parents: integer tensor of k-mer indices, (batch, seq_length).
            masks: accepted for interface compatibility; not used in this
                implementation.

        Returns:
            Tensor of shape (batch, seq_length) holding exp(log rate).
        """
        # Conv1d wants channels second: (batch, seq, emb) -> (batch, emb, seq).
        embedded = self.kmer_embedding(encoded_parents).permute(0, 2, 1)

        # Convolve, apply ReLU, then go back to (batch, seq, num_filters) so
        # the linear layer acts on each position independently.
        features = F.relu(self.conv(embedded)).permute(0, 2, 1)

        # Drop the trailing size-1 axis and exponentiate the log rates.
        return self.linear(features).squeeze(-1).exp()

# Small configuration: 10-dimensional embeddings, 10 filters, kernel width 5.
model = CNNModel(
    train_dataset,
    embedding_dim=10,
    num_filters=10,
    kernel_size=5,
)

# Move the parameters to the selected device. Kept as the final expression so
# the notebook still displays the module summary.
model.to(device)


In [12]:
# Bundle the datasets, the model and the optimisation settings into the
# training object from netam's framework module.
burrito = framework.Burrito(
    train_dataset,
    val_dataset,
    model,
    batch_size=1024,
    learning_rate=0.1,
    l2_regularization_coeff=1e-6,
)
print("starting training...")
# Keep whatever train() returns so it can be inspected in later cells.
losses = burrito.train(epochs=4)

Using Metal Performance Shaders
starting training...
Epoch [1/4]	 Loss: 0.058346011	 Val Loss: 0.066361873
Epoch [2/4]	 Loss: 0.058051389	 Val Loss: 0.066390379
Epoch [3/4]	 Loss: 0.058048603	 Val Loss: 0.066471299
Epoch [4/4]	 Loss: 0.058039054	 Val Loss: 0.066398375


In [None]:
# Persist the trained model. The output directory is created first so that a
# fresh checkout (where `_ignore/` does not exist yet) does not fail at the
# very last step, after training has already been paid for.
# NOTE: torch.save(model, ...) pickles the whole module object rather than a
# state_dict, so loading it later needs the CNNModel class definition to be
# available to the unpickler.
model_path = "_ignore/noof_model.pt"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model, model_path)