In this notebook I will try to assess if the dynamic batching improves the training speed

In [1]:
from encoded_protein_dataset import EncodedProteinDataset, collate_fn, get_embedding, dynamic_collate_fn
from pseudolikelihood import get_npll
import torch
import numpy as np
from potts_decoder import PottsDecoder
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.tensorboard import SummaryWriter
from functools import partial
from tqdm import tqdm
import os
#import pandas as pd
import csv
from torch.autograd import profiler 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_loss(decoder, inputs, eta, device):
    """eta is the multiplicative term in front of the penalized negative pseudo-log-likelihood"""
    msas, encodings, padding_mask  = [input.to(device) for input in inputs]
    B, M, N = msas.shape
    couplings, fields = decoder(encodings, padding_mask)

    # embed and reshape to (B, M, N*q)
    msas_embedded = embedding(msas).view(B, M, -1)

    # get npll
    npll = get_npll(msas_embedded, couplings, fields, N, q)


    padding_mask_inv = (~padding_mask)

    # multiply with the padding mask to filter non-existing residues (this is probably not necessary)       
    npll = npll * padding_mask_inv.unsqueeze(1)
    penalty = eta*(torch.sum(couplings**2) + torch.sum(fields**2))/B

    # the padding mask does not contain the msa dimension so we need to multiply by M
    npll_mean = torch.sum(npll) / (M * torch.sum(padding_mask_inv))
    loss_penalty = npll_mean  + penalty

    return loss_penalty, npll_mean.item()  ##we can just add the penalty since we have set already to 0 couplings and fields of padded elements



def get_loss_loader_dyn(decoder, loader, eta, device):
    decoder.eval()
    losses = []
    with torch.no_grad():
        for effective_batch_size, inputs_packed in loader:
            npll_full = 0
            for inputs in inputs_packed:
                mini_batch_size = inputs[0].shape[0]
                _, npll = get_loss(decoder, inputs, eta, device) 
                npll_full += npll*mini_batch_size/effective_batch_size
            losses.append(npll_full)
            #del inputs
    
    return np.mean(losses)

def train_dyn(decoder, inputs_packed, eta, device):
    effective_batch_size = inputs_packed[0]
    loss_penalty_full = 0
    train_loss_full = 0
    for inputs in inputs_packed[1]:
        mini_batch_size = inputs[0].shape[0]
        loss_penalty, train_batch_loss = get_loss(decoder, inputs, eta, device)    ## get the current loss for the batch
        loss_penalty = loss_penalty * mini_batch_size/effective_batch_size
        train_batch_loss = train_batch_loss * mini_batch_size/effective_batch_size
        loss_penalty.backward()                         ## Get gradients
        loss_penalty_full += loss_penalty.detach()
        train_loss_full += train_batch_loss
    
    optimizer.step()   
    optimizer.zero_grad()                           ## set previous gradients to 0

    return loss_penalty_full, train_loss_full

def get_loss(decoder, inputs, eta, device):
    """eta is the multiplicative term in front of the penalized negative pseudo-log-likelihood"""
    msas, encodings, padding_mask  = [input.to(device) for input in inputs]
    B, M, N = msas.shape
    #print(f"encodings' shape{encodings.shape}, padding mask:{padding_mask.shape}")
    couplings, fields = decoder(encodings, padding_mask)

    # embed and reshape to (B, M, N*q)
    msas_embedded = embedding(msas).view(B, M, -1)

    # get npll
    npll = get_npll(msas_embedded, couplings, fields, N, q)


    padding_mask_inv = (~padding_mask)

    # multiply with the padding mask to filter non-existing residues (this is probably not necessary)       
    npll = npll * padding_mask_inv.unsqueeze(1)
    penalty = eta*(torch.sum(couplings**2) + torch.sum(fields**2))/B

    # the padding mask does not contain the msa dimension so we need to multiply by M
    npll_mean = torch.sum(npll) / (M * torch.sum(padding_mask_inv))
    loss_penalty = npll_mean + penalty

    return loss_penalty, npll_mean.item()  ##we can just add the penalty since we have set already to 0 couplings and fields of padded elements



def get_loss_loader(decoder, loader, eta, device):

    decoder.eval()
    losses = []
    with torch.no_grad():
        for inputs in loader:
            _, npll = get_loss(decoder, inputs, eta, device) 
            losses.append(npll)
    
    return np.mean(losses)



In [3]:
## The max_msas optional parameter in the EncodedProteinDataset library allows to select just a subset of the folders of that size 
### Takes roughly 35 minutes... sometimes 4... makes no sense...
##lenght of training data should be 22559
train_dataset = EncodedProteinDataset('/Data/InverseFoldingData/msas/train', '/Data/InverseFoldingData/structure_encodings', max_msas=1000, noise=0.02)
sequence_test_dataset = EncodedProteinDataset('/Data/InverseFoldingData/msas/test/sequence', '/Data/InverseFoldingData/structure_encodings', max_msas=1000)
structure_test_dataset = EncodedProteinDataset('/Data/InverseFoldingData/msas/test/structure', '/Data/InverseFoldingData/structure_encodings', max_msas=1000)
superfamily_test_dataset = EncodedProteinDataset('/Data/InverseFoldingData/msas/test/superfamily', '/Data/InverseFoldingData/structure_encodings', max_msas=1000)


Counter is:3, Counter fail 1:0, Counter fail 2:0, length data:3

  encodings = torch.tensor(read_encodings(encoding_path, trim=False))


Counter is:1002, Counter fail 1:0, Counter fail 2:0, length data:999

In [10]:
batch_structure_size = 4   ### I think with empty GPU we can go up to 10
perc_subset_test = 0.1     ## During the training, for every dataset available we select a random 10% of its samples
batch_msa_size = 16
q = 21 ##isn't always 21

## Static Loader
collate_fn_stat = partial(collate_fn, q=q, batch_msa_size=batch_msa_size)
train_loader = DataLoader(train_dataset, batch_size=batch_structure_size, collate_fn=collate_fn_stat, shuffle=True)

sequence_test_loader = DataLoader(sequence_test_dataset, batch_size=batch_structure_size, collate_fn=collate_fn_stat, shuffle=False, 
sampler=RandomSampler(sequence_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(sequence_test_dataset))))

structure_test_loader = DataLoader(structure_test_dataset, batch_size=batch_structure_size, collate_fn=collate_fn_stat, shuffle=False, 
sampler=RandomSampler(structure_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(structure_test_dataset))))

superfamily_test_loader = DataLoader(superfamily_test_dataset, batch_size=batch_structure_size, collate_fn=collate_fn_stat, shuffle=False, 
sampler=RandomSampler(superfamily_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(superfamily_test_dataset))))


In [8]:
## Dynamic loader
q=21
perc_subset_test = 0.1     ## During the training, for every dataset available we select a random 10% of its samples
batch_msa_size = 16
batch_structure_size_dyn = 20   ### I think with empty GPU we can go up to 10
collate_fn_dyn = partial(dynamic_collate_fn, q=q, batch_size=batch_structure_size_dyn, batch_msa_size=batch_msa_size)

train_loader_dyn = DataLoader(train_dataset, batch_size=batch_structure_size_dyn, collate_fn=collate_fn_dyn, shuffle=True)

sequence_test_loader_dyn = DataLoader(sequence_test_dataset, batch_size=batch_structure_size_dyn, collate_fn=collate_fn_dyn, shuffle=False, 
sampler=RandomSampler(sequence_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(sequence_test_dataset))))

structure_test_loader_dyn = DataLoader(structure_test_dataset, batch_size=batch_structure_size_dyn, collate_fn=collate_fn_dyn, shuffle=False, 
sampler=RandomSampler(structure_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(structure_test_dataset))))

superfamily_test_loader_dyn = DataLoader(superfamily_test_dataset, batch_size=batch_structure_size_dyn, collate_fn=collate_fn_dyn, shuffle=False, 
sampler=RandomSampler(superfamily_test_dataset, replacement=True, num_samples=int(perc_subset_test*len(superfamily_test_dataset))))

In [9]:
decoder = None
embedding = None
torch.cuda.empty_cache()

seed = 0
torch.random.manual_seed(seed)
np.random.seed(seed)



update_steps = 200                 ##Usual values are update steps=10^5, test_steps=10^2
test_steps = 20
n_epochs = update_steps//(len(train_dataset)//batch_structure_size_dyn)   ## the other update steps will be used for "partial epochs", I want to save the last complet epoch

input_encoding_dim = train_dataset.encoding_dim
param_embed_dim = 512
n_param_heads = 4
d_model = 128
n_heads = 2
n_layers = 2
## Check before running which is the GPU which is free the most and put it as the running device
device = 3
eta = 1e-3
dropout = 0.0
#print(check_gpu_mem())


decoder = PottsDecoder(q, n_layers, d_model, input_encoding_dim, param_embed_dim, n_heads, n_param_heads, dropout=dropout)
decoder.to(device)
embedding = get_embedding(q)
embedding.to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)


with tqdm(total = update_steps) as pbar: ##This is used to have the nice loading bar while training
    train_loss = 0
    max_gpu = 0
    update_step = 0
    bk_iter = int(1e4)                             ## This tells us how ofter we save a model(default values is every ten-thousand updates)
    bk_dir = "./../IntermediateModels/"       ## Folder to where we save the intermediate models
    train_batch_losses = []
    epoch = 0.0
    while update_step < update_steps:
        for inputs_packed in train_loader_dyn:
            ##This packs a batch in a good way for memory reasons
            update_step += 1                                ## Increase update step (the update steps will count also different batches within the same epoch)
            epoch = update_step / len(train_loader_dyn)
            #for inputs in inputs_packed:
            loss_penalty, train_batch_loss = train_dyn(decoder, inputs_packed, eta, device)


            train_batch_losses.append(train_batch_loss) ## Here we append the lossess in the different batches within the same epoch
            
            ## We want to keep track of the test loss not at every batch, too costrly otherwise. Usually set to once every 100.
            if (update_step==1 or update_step % test_steps == 0) or update_step == update_steps:
                train_loss = np.mean(train_batch_losses)
                del loss_penalty
                del train_batch_losses
                
                ## Lossess for the different test sets, want to use a subset of this only. Also want to pass only a random subset of it if possible
                structure_test_loss = get_loss_loader_dyn(decoder, structure_test_loader_dyn, eta, device)
                sequence_test_loss = get_loss_loader_dyn(decoder, sequence_test_loader_dyn, eta, device)
                superfamily_test_loss = get_loss_loader_dyn(decoder, superfamily_test_loader_dyn, eta, device)

                
                train_batch_losses = []
            if update_step >= update_steps:
                break
            pbar.set_description(f'update_step: {update_step}, epoch: {epoch:.2f} train_batch: {train_batch_loss:.2f} train: {train_loss:.2f}, sequence: {sequence_test_loss:.2f}, structure: {structure_test_loss:.2f}, superfamily: {superfamily_test_loss:.2f}, max_gpu:{max_gpu}')#, GPU total memory: {check_gpu_mem().values[device, 0]}, GPU used: {check_gpu_mem().values[device, 1]}')
            pbar.update(1)


                

update_step: 199, epoch: 3.98 train_batch: 71.30 train: 67.57, sequence: 61.91, structure: 73.49, superfamily: 62.27, max_gpu:0: 100%|█████████▉| 199/200 [04:54<00:01,  1.48s/it]       


In [11]:
device

3

In [11]:
decoder = None
embedding = None
torch.cuda.empty_cache()

seed = 0
torch.random.manual_seed(seed)
np.random.seed(seed)



update_steps = 5*200                 ##Usual values are update steps=10^5, test_steps=10^2
test_steps = 5*20
n_epochs = update_steps//(len(train_dataset)//batch_structure_size_dyn)   ## the other update steps will be used for "partial epochs", I want to save the last complet epoch

input_encoding_dim = train_dataset.encoding_dim
param_embed_dim = 512
n_param_heads = 4
d_model = 128
n_heads = 2
n_layers = 2
## Check before running which is the GPU which is free the most and put it as the running device
device = 3
eta = 1e-3
dropout = 0.0
#print(check_gpu_mem())


decoder = PottsDecoder(q, n_layers, d_model, input_encoding_dim, param_embed_dim, n_heads, n_param_heads, dropout=dropout)
decoder.to(device)
embedding = get_embedding(q)
embedding.to(device)

optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)


with tqdm(total = update_steps) as pbar: ##This is used to have the nice loading bar while training
    train_loss = 0
    max_gpu = 0
    update_step = 0
    bk_iter = int(1e4)                             ## This tells us how ofter we save a model(default values is every ten-thousand updates)
    bk_dir = "./../IntermediateModels/"       ## Folder to where we save the intermediate models
    train_batch_losses = []
    epoch = 0.0
    while update_step < update_steps:
        for inputs in train_loader:
            ##This packs a batch in a good way for memory reasons
            update_step += 1                                ## Increase update step (the update steps will count also different batches within the same epoch)
            epoch = update_step / len(train_loader)
            #for inputs in inputs_packed:
            loss_penalty, train_batch_loss = get_loss(decoder, inputs, eta, device)    ## get the current loss for the batch
            optimizer.zero_grad()                           ## set previous gradients to 0
            loss_penalty.backward()    
            loss_penalty.detach()
            optimizer.step()     

            train_batch_losses.append(train_batch_loss) ## Here we append the lossess in the different batches within the same epoch
            
            ## We want to keep track of the test loss not at every batch, too costrly otherwise. Usually set to once every 100.
            if (update_step==1 or update_step % test_steps == 0) or update_step == update_steps:
                train_loss = np.mean(train_batch_losses)
                del loss_penalty
                del train_batch_losses
                
                ## Lossess for the different test sets, want to use a subset of this only. Also want to pass only a random subset of it if possible
                structure_test_loss = get_loss_loader(decoder, structure_test_loader, eta, device)
                sequence_test_loss = get_loss_loader(decoder, sequence_test_loader, eta, device)
                superfamily_test_loss = get_loss_loader(decoder, superfamily_test_loader, eta, device)

                
                train_batch_losses = []
            if update_step>update_steps:
                break
            pbar.set_description(f'update_step: {update_step}, epoch: {epoch:.2f} train_batch: {train_batch_loss:.2f} train: {train_loss:.2f}, sequence: {sequence_test_loss:.2f}, structure: {structure_test_loss:.2f}, superfamily: {superfamily_test_loss:.2f}, max_gpu:{max_gpu}')#, GPU total memory: {check_gpu_mem().values[device, 0]}, GPU used: {check_gpu_mem().values[device, 1]}')
            pbar.update(1)


                

update_step: 1000, epoch: 4.00 train_batch: 15.34 train: 10.42, sequence: 9.73, structure: 10.94, superfamily: 8.68, max_gpu:0: 100%|██████████| 1000/1000 [08:41<00:00,  1.92it/s]        
