In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from netam import framework, models
from epam.torch_common import pick_device

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Absolute, machine-specific location of the SHMoof edge table used in this notebook.
shmoof_data_path = "/Users/matsen/data/shmoof_edges_11-Jan-2023_NoNode0_iqtree_K80+R_masked.csv"

# Load the training and validation dataframes; "51" is passed as the validation nickname.
# The optional `sample_count=5000` argument was left disabled by the author.
train_df, val_df = framework.load_shmoof_dataframes(
    shmoof_data_path,
    val_nickname="51",
)


In [3]:
# Settings shared by the training and validation datasets.
kmer_length = 3
max_length = 410

# Build both datasets with identical settings (training first, then validation).
train_dataset, val_dataset = (
    framework.SHMoofDataset(frame, kmer_length=kmer_length, max_length=max_length)
    for frame in (train_df, val_df)
)

# Report how many examples ended up in each dataset.
print(
    f"we have {len(train_dataset)} training examples and {len(val_dataset)} validation examples"
)

we have 26592 training examples and 22424 validation examples


In [None]:
class NoofModel(torch.nn.Module):
    """
    Transformer-encoder model that maps k-mer index sequences to per-site rates.

    Each k-mer index is embedded, passed through a positional encoder and a
    stack of transformer encoder layers, and then projected by a linear layer
    to a single log-rate per site, which is exponentiated.

    All torch building blocks are referenced through the ``torch`` module that
    the notebook already imports, so this cell does not depend on ``nn`` or the
    transformer classes having been imported in some earlier kernel state.

    Args:
        dataset: object providing ``kmer_to_index`` (its length sets the size of
            the embedding table) and ``max_length`` (stored as ``site_count``).
        embedding_dim: size of the k-mer embedding; also the transformer ``d_model``.
        nhead: number of attention heads in each encoder layer.
        dim_feedforward: width of the feed-forward sublayer in each encoder layer.
        layer_count: number of stacked encoder layers.
        dropout: dropout probability given to the positional encoder and to the
            encoder layers.
    """

    def __init__(
        self, dataset, embedding_dim, nhead, dim_feedforward, layer_count, dropout=0.5
    ):
        super().__init__()
        self.kmer_count = len(dataset.kmer_to_index)
        self.embedding_dim = embedding_dim
        self.site_count = dataset.max_length

        self.kmer_embedding = torch.nn.Embedding(self.kmer_count, self.embedding_dim)
        # NOTE(review): PositionalEncoding is neither defined nor imported in the
        # visible cells; it must be brought into scope before this class is
        # instantiated -- confirm which module provides it.
        self.pos_encoder = PositionalEncoding(self.embedding_dim, dropout)

        # The single layer is kept as an attribute (as in the original) so the
        # module's parameter names stay the same.
        self.encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=self.embedding_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = torch.nn.TransformerEncoder(self.encoder_layer, layer_count)
        self.linear = torch.nn.Linear(self.embedding_dim, 1)

        self.init_weights()

    def init_weights(self) -> None:
        """Zero the output layer's bias and draw its weights uniformly from [-0.1, 0.1]."""
        initrange = 0.1
        torch.nn.init.zeros_(self.linear.bias)
        torch.nn.init.uniform_(self.linear.weight, -initrange, initrange)

    def forward(self, encoded_parents, masks):
        """
        Compute a positive rate for every site of every sequence in the batch.

        Args:
            encoded_parents: expected to be an integer tensor of
                [batch_size, sequence_length] holding k-mer indices.
            masks: accepted for interface compatibility but not used by this
                implementation.

        Returns:
            Tensor of [batch_size, sequence_length] holding exp(log-rate) per site.
        """
        # NOTE(review): `masks` is ignored, so no padding mask is given to the
        # encoder and every position takes part in attention. If `masks` marks
        # valid sites, consider passing its complement as `src_key_padding_mask`
        # -- confirm the mask convention with the dataset before changing this.
        kmer_embeddings = self.kmer_embedding(encoded_parents)
        kmer_embeddings = self.pos_encoder(kmer_embeddings)

        # Pass through the transformer encoder.
        transformer_output = self.encoder(kmer_embeddings)

        # The linear layer yields [batch_size, sequence_length, 1]; squeezing the
        # last dimension leaves [batch_size, sequence_length].
        log_rates = self.linear(transformer_output).squeeze(-1)
        rates = torch.exp(log_rates)
        return rates



In [4]:
# Instantiate the transformer model, sized from the training dataset.
model = NoofModel(
    train_dataset,
    embedding_dim=10,
    nhead=2,
    dim_feedforward=256,
    layer_count=2,
    dropout=0.1,
)

# Hand the selected device to both datasets and to the model.
device = pick_device()
train_dataset.to(device)
val_dataset.to(device)
model.to(device)

# Wrap datasets and model in the training harness, then train.
burrito = framework.Burrito(
    train_dataset,
    val_dataset,
    model,
    batch_size=1024,
    learning_rate=0.1,
    l2_regularization_coeff=1e-6,
)
print("starting training...")
losses = burrito.train(epochs=100)
# Final bare expression so the notebook displays the returned losses.
losses

Using Metal Performance Shaders
starting training...


Epoch:  17%|█▋        | 17/100 [11:18<56:32, 40.87s/it, loss_diff=1.934e-06, lr=0.004] 