In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
import numpy as np
import tqdm
import random
import torch.nn.functional as F

from utils import *
from train import *

# for tensorboard
from torch.utils.tensorboard import SummaryWriter
#writer = SummaryWriter()

# Experiment with cross attention
In this experiment, we sample data not only from a uniform distribution but from a variety of distributions. We then use this data as the encoder input and feed the encoder output through a cross-attention mechanism to the decoder. The decoder takes a sequence with missing tokens and tries to fill in the gaps.

### Architecture
- We use multi-head attention for the encoder and vanilla attention for the decoder.
- In the encoder, before splitting into heads, we "merge" the 5 sequences through a linear layer.

In [2]:
import os

# Show the working directory so the relative 'data/' and 'runs/' paths used
# below can be resolved by the reader.
current_dir = os.getcwd()
print(current_dir)

/Users/mariayuffa/semester-project


In [3]:
# Sequences for decoder
final_chains_train = np.load('data/final_chains_T=1_num_iters=400_train.npy')
print("Loaded train sequences of proteins sampled from Boltzmann distribution:", final_chains_train.shape)

final_chains_test = np.load('data/final_chains_T=1_num_iters=400_test.npy')
print("Loaded test sequences of proteins sampled from Boltzmann distribution:", final_chains_test.shape)


# Sequences for encoder: one slot per coupling value J, stacked along axis 1.
encoder_couplings = [10, 20, 40, 60, 80]
final_chains_encoder_train = np.zeros((1000, len(encoder_couplings), 200))
final_chains_encoder_test = np.zeros((1000, len(encoder_couplings), 200))
for k, i in enumerate(encoder_couplings):
    chains_prefix = 'data/final_chains_T=1_num_iters=400_J=' + str(i)
    # Every slot of the train tensor must come from a *_train.npy file.
    # The J=10 slot previously read the *_test.npy file, which put test
    # sequences into the training input.
    final_chains_encoder_train[:, k, :] = np.load(chains_prefix + '_train.npy')
    final_chains_encoder_test[:, k, :] = np.load(chains_prefix + '_test.npy')

print("Loaded train sequences of proteins for encoder distribution:", final_chains_encoder_train.shape)
print("Loaded test sequences of proteins for encoder distribution:", final_chains_encoder_test.shape)

# The model consumes float32 tensors.
tensor_samples_train = torch.tensor(final_chains_encoder_train, dtype=torch.float32)
tensor_samples_test = torch.tensor(final_chains_encoder_test, dtype=torch.float32)

Loaded train sequences of proteins sampled from Boltzmann distribution: (1000, 200)
Loaded test sequences of proteins sampled from Boltzmann distribution: (1000, 200)
Loaded train sequences of proteins for encoder distribution: (1000, 5, 200)
Loaded test sequences of proteins for encoder distribution: (1000, 5, 200)


In [4]:
# Parameters of the candidate sampling distributions, one dict per distribution.
distributions = [
    dict(type="normal", mean=0, std=1),
    dict(type="uniform", low=-1, high=1),
    dict(type="exponential", scale=1),
    dict(type="gamma", scale=1),
    dict(type="poisson", lam=1.0),
]

# Example usage (disabled: `tensor_samples_train` / `tensor_samples_test` are
# built from the .npy files loaded above instead).
# NOTE(review): the original comment described the last argument as "length of
# sequence 10" although 200 is passed -- confirm the argument order of
# `sample_data` before re-enabling.
#tensor_samples_train = sample_data(1000, distributions, 200)
#print(tensor_samples_train.shape)

#tensor_samples_test = sample_data(1000, distributions, 200)
#print(tensor_samples_test.shape)

In [5]:
# Random data

#data_train = torch.tensor([np.random.choice([-1, 1], size=20) for _ in range(1000)])
#data_train_dec = torch.tensor([np.random.choice([-1, 1], size=20) for _ in range(1000)])
#data_test = torch.tensor([np.random.choice([-1, 1], size=20) for _ in range(400)])
#data_test_dec = torch.tensor([np.random.choice([-1, 1], size=20) for _ in range(400)])

In [6]:
class Transformer(nn.Module):
    """Single-layer encoder/decoder model with linear input embeddings.

    The source and target inputs are embedded by two separate linear layers.
    The embedded source goes through one ``EncoderLayer``; the embedded target
    and the encoder output go through one ``DecoderLayer``; a final linear
    layer maps the decoder output back to ``num_spins`` scores.

    Args:
        embed_dim: Output width of both embedding layers and input width of
            the final projection.
        a: Forwarded unchanged to ``DecoderLayer``.
        max_seq_length: Forwarded unchanged to ``DecoderLayer``.
        num_spins: Size of the last input dimension and of the output scores.
        proj_layer_dim: Forwarded to both ``EncoderLayer`` and ``DecoderLayer``.
        dropout: Forwarded to both ``EncoderLayer`` and ``DecoderLayer``.
        num_distr: Forwarded to ``EncoderLayer`` (defaults to 5).
    """

    def __init__(self, embed_dim, a, max_seq_length, num_spins, proj_layer_dim, dropout, num_distr=5):
        super().__init__()
        # Linear (rather than lookup-table) embeddings: inputs arrive already
        # one-hot encoded with `num_spins` channels.
        self.encoder_embeddings = nn.Linear(in_features=num_spins, out_features=embed_dim)
        self.decoder_embeddings = nn.Linear(in_features=num_spins, out_features=embed_dim)

        # Exactly one encoder layer and one decoder layer.
        self.encoder_layer = EncoderLayer(embed_dim, proj_layer_dim, dropout, num_distr)
        self.decoder_layer = DecoderLayer(embed_dim, a, max_seq_length, num_spins, proj_layer_dim, dropout)
        self.fc = nn.Linear(in_features=embed_dim, out_features=num_spins)

    def forward(self, src, tgt):
        """Return the ``fc`` projection of the decoder output.

        Args:
            src: Encoder input whose last dimension is ``num_spins``.
            tgt: Decoder input whose last dimension is ``num_spins``.
        """
        memory = self.encoder_layer(self.encoder_embeddings(src))
        decoded = self.decoder_layer(self.decoder_embeddings(tgt), memory)
        return self.fc(decoded)

## Training and validation

In [8]:
# Model and training configuration shared by the cells below.
vocab = {-1: 0, 1: 1, 2: 2}   # spin value (or mask token 2) -> class index
vocab_size = len(vocab)
L = 200                       # passed to the models as max_seq_length
embedding_dim = 200
proj_layer_dim = 128
hidden_dim = 200
num_layers = 1  # have to adapt the model for 2 and 3 layers
dropout_rate = 0.0
lr = 1e-3
num_sequences = 1000

# Prefer Apple's Metal backend when it is available, otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

mps


In [9]:
# Build the encoder-decoder model from the configuration cell and train it.
# The sizes come from the configuration variables instead of repeated literals
# (previously num_spins=3 and proj_layer_dim=128 were hard-coded here), so
# editing the configuration cell cannot silently diverge from the model.
model = Transformer(embed_dim=embedding_dim, a=0, max_seq_length=L, num_spins=vocab_size,
                    proj_layer_dim=proj_layer_dim, dropout=dropout_rate)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# TensorBoard run directory handed to the training script.
path = 'runs/data_exp_distr_run_11_mask_pos=25'
# NOTE(review): `device=0` is passed rather than the `device` object from the
# configuration cell -- confirm against `training_script` in train.py.
training_script(path, model, data_train=tensor_samples_train, data_train_dec=final_chains_train,
                data_test=tensor_samples_test, data_test_dec=final_chains_test, vocab=vocab, optimizer=optimizer,
                criterion=criterion, one_hot_flag=True, num_epochs=15, device=0)

  input_encoder_one_hot = torch.tensor(input_encoder_one_hot, dtype=torch.float)
  return torch.tensor(one_hot, dtype=torch.float), torch.tensor(mask_positions, dtype=torch.long)
100%|██████████| 1000/1000 [00:12<00:00, 82.20it/s]


Epoch 1 | Train Loss: 104.7974


  input_encoder_one_hot = torch.tensor(input_encoder_one_hot, dtype=torch.float)
100%|██████████| 1000/1000 [00:06<00:00, 161.33it/s]
  7%|▋         | 1/15 [00:18<04:17, 18.37s/it]

Epoch 1 | Eval Loss: 0.5045


100%|██████████| 1000/1000 [00:10<00:00, 92.79it/s]


Epoch 2 | Train Loss: 102.3271


 59%|█████▊    | 587/1000 [00:03<00:02, 154.87it/s]
                                              

KeyboardInterrupt: 

In [None]:
# torch.save does not create missing parent directories, so make sure the
# output directory exists before writing (matters on a fresh checkout).
os.makedirs('model_decoder', exist_ok=True)

# To save only the decoder layer weights
torch.save(model.decoder_layer.state_dict(), 'model_decoder/decoder_weights.pth')

# If you need to load these weights later
decoder_weights = torch.load('model_decoder/decoder_weights.pth')
model.decoder_layer.load_state_dict(decoder_weights)

# Save the weights of the FC layer
torch.save(model.fc.state_dict(), 'model_decoder/transformer_fc_weights.pth')

# To load these weights back into the FC layer later
fc_weights = torch.load('model_decoder/transformer_fc_weights.pth')
model.fc.load_state_dict(fc_weights)

<All keys matched successfully>

# Ablation study
In this study, we remove the encoder when testing the model: the decoder attends to its own embedded input in place of the encoder output.

In [None]:
class TransformerAblated(nn.Module):
    """Decoder-only variant used for the ablation study (no encoder).

    The target is embedded by a linear layer and passed to ``DecoderLayer``
    twice: once as the decoder input and once in the position where the full
    model supplies the encoder output. A final linear layer maps the decoder
    output to ``num_spins`` scores.

    NOTE(review): the embedding is stored as ``word_embeddings``. Only the
    ``decoder_layer`` and ``fc`` weights are loaded or shared in this notebook,
    so this embedding appears to keep its random initialisation -- confirm
    whether it should reuse the trained decoder embedding.

    Args:
        embed_dim: Output width of the embedding and input width of ``fc``.
        a: Forwarded unchanged to ``DecoderLayer``.
        max_seq_length: Forwarded unchanged to ``DecoderLayer``.
        num_spins: Size of the last input dimension and of the output scores.
        proj_layer_dim: Forwarded unchanged to ``DecoderLayer``.
        dropout: Forwarded unchanged to ``DecoderLayer``.
    """

    def __init__(self, embed_dim, a, max_seq_length, num_spins, proj_layer_dim, dropout):
        super().__init__()
        self.word_embeddings = nn.Linear(in_features=num_spins, out_features=embed_dim)
        self.decoder_layer = DecoderLayer(embed_dim, a, max_seq_length, num_spins, proj_layer_dim, dropout)
        self.fc = nn.Linear(in_features=embed_dim, out_features=num_spins)

    def forward(self, tgt):
        """Return the ``fc`` projection of the decoder output for ``tgt``."""
        embedded = self.word_embeddings(tgt)
        # The embedded target stands in for the missing encoder output.
        decoded = self.decoder_layer(embedded, embedded)
        return self.fc(decoded)
    
# Create an instance of the ablated model. Sizes come from the configuration
# cell instead of repeated literals (previously num_spins=3 and
# proj_layer_dim=128 were hard-coded), so it always matches the trained model.
new_model = TransformerAblated(embed_dim=embedding_dim, a=0, max_seq_length=L, num_spins=vocab_size,
                               proj_layer_dim=proj_layer_dim, dropout=dropout_rate)

# Load the saved decoder weights
decoder_weights = torch.load('model_decoder/decoder_weights.pth')
new_model.decoder_layer.load_state_dict(decoder_weights)

# Load the saved FC weights
fc_weights = torch.load('model_decoder/transformer_fc_weights.pth')
new_model.fc.load_state_dict(fc_weights)


<All keys matched successfully>

In [None]:
# TensorBoard writer used as a module-level global by `train` below.
writer = SummaryWriter(log_dir='runs/transformer_ablation_run_3')

def _masked_token_loss(logits, positions, target_one_hot, criterion):
    """Compute `criterion` on the masked positions only.

    The raw model outputs (logits) are passed to the criterion. No softmax is
    applied first, because `nn.CrossEntropyLoss` normalises its input itself;
    feeding it probabilities bounds the loss away from zero.
    """
    masked_logits = logits[positions]
    # Recover class indices from the one-hot rows of the masked positions.
    target_tokens = torch.where(target_one_hot[positions] == 1)[1]
    return criterion(masked_logits, target_tokens)


def evaluate(new_model, data_test, data_test_dec, vocab, criterion, device=0):
    """Return the mean masked-token loss of the decoder-only model.

    Args:
        new_model: Model called with the masked decoder sequence only.
        data_test: Encoder test data; only its length is used here (number of
            samples to evaluate and the averaging denominator).
        data_test_dec: Decoder test sequences, indexed in step with `data_test`.
        vocab: Mapping passed to `one_hot_encoding` and `mask_random_spins`.
        criterion: Loss taking (logits, class indices), e.g. CrossEntropyLoss.
        device: Accepted for call compatibility; currently unused (neither the
            model nor the inputs are moved to a device in this function).

    Returns:
        Sum of per-sample losses divided by `len(data_test)`.

    Raises:
        ValueError: If `data_test` is empty.
    """
    num_test = len(data_test)
    if num_test == 0:
        raise ValueError("data_test is empty")

    # Only the model that is actually evaluated is switched to eval mode; the
    # function no longer reaches into the notebook-global `model`.
    new_model.eval()
    epoch_loss = 0.0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i in tqdm.tqdm(range(num_test), total=num_test):
            # Get the inputs
            input_seq_dec = data_test_dec[i]
            input_decoder_one_hot = one_hot_encoding(input_seq_dec.tolist(), vocab)
            # mask a token
            masked_sequence_dec, positions = mask_random_spins(input_seq_dec, vocab, mask_token=2)

            # Forward pass and loss on the masked positions
            outputs = new_model(masked_sequence_dec)
            loss = _masked_token_loss(outputs, positions, input_decoder_one_hot, criterion)
            epoch_loss += loss.item()

    return epoch_loss / num_test


def train(model, new_model, data_train, data_train_dec, data_test, data_test_dec, vocab, optimizer, criterion, num_epochs=20, device=0, early_stop_threshold=1e-3, log_every=10):
    """Train the full model and evaluate the decoder-only model after each epoch.

    Scalars are logged through the notebook-global TensorBoard `writer`.

    Args:
        model: Encoder-decoder model called as `model(encoder_input, decoder_input)`.
        new_model: Decoder-only model; its `decoder_layer` and `fc` attributes
            are pointed at `model`'s modules (shared by reference, not copied)
            before every evaluation.
        data_train: Encoder training samples; each sample is iterated row by row.
        data_train_dec: Decoder training sequences, indexed in step with `data_train`.
        data_test: Encoder test data, forwarded to `evaluate`.
        data_test_dec: Decoder test sequences, forwarded to `evaluate`.
        vocab: Mapping passed to `one_hot_encoding` and `mask_random_spins`.
        optimizer: Optimizer over `model`'s parameters.
        criterion: Loss taking (logits, class indices), e.g. CrossEntropyLoss.
        num_epochs: Maximum number of epochs.
        device: Accepted for call compatibility and forwarded to `evaluate`;
            currently unused (nothing is moved to a device here).
        early_stop_threshold: Training stops as soon as the eval loss drops
            below this value.
        log_every: Number of samples per "Running Loss" point.

    Returns:
        Mean training loss of the last completed epoch (0.0 if no epoch ran).

    Raises:
        ValueError: If `data_train` is empty.
    """
    num_train = len(data_train)
    if num_train == 0:
        raise ValueError("data_train is empty")

    epoch_loss = 0.0
    for epoch in tqdm.tqdm(range(num_epochs), leave=False, position=0):
        # `evaluate` leaves the shared modules in eval mode, so training mode
        # has to be restored at the start of every epoch, not just once.
        model.train()
        running_loss = 0.0
        epoch_loss = 0.0

        for i, input_seq_enc in tqdm.tqdm(enumerate(data_train), total=num_train):
            # Get the inputs
            input_seq_dec = data_train_dec[i]

            # `.to(torch.float)` avoids the copy-construct warning that
            # `torch.tensor(<tensor>)` raises.
            input_encoder_one_hot = torch.stack(
                [one_hot_encoding(seq.tolist(), vocab) for seq in input_seq_enc], dim=0
            ).to(torch.float)
            input_decoder_one_hot = one_hot_encoding(input_seq_dec.tolist(), vocab)

            # mask a token in decoder
            masked_sequence_dec, positions = mask_random_spins(input_seq_dec, vocab, mask_token=2)

            # Forward pass and loss on the masked positions
            prediction = model(input_encoder_one_hot, masked_sequence_dec)
            loss = _masked_token_loss(prediction, positions, input_decoder_one_hot, criterion)

            # Zero gradients, perform a backward pass, and update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            epoch_loss += step_loss
            running_loss += step_loss
            if (i + 1) % log_every == 0:
                # Average over the window that was actually accumulated, and
                # log at a per-sample global step so points do not overlap.
                writer.add_scalar("Running Loss", running_loss / log_every, epoch * num_train + i)
                running_loss = 0.0

        # Average over the number of training samples. `len(data)` was the
        # length of the last sample, which inflated the reported loss.
        train_loss = epoch_loss / num_train

        # Let the decoder-only model evaluate the freshly trained modules.
        new_model.decoder_layer = model.decoder_layer
        new_model.fc = model.fc
        print(f'Epoch {epoch + 1} | Train Loss: {train_loss:.4f}')
        writer.add_scalar("Train Loss", train_loss, epoch)
        eval_loss = evaluate(new_model, data_test, data_test_dec, vocab, criterion, device=device)
        writer.add_scalar("Eval Loss", eval_loss, epoch)
        print(f'Epoch {epoch + 1} | Eval Loss: {(eval_loss):.4f}')

        # Perform early stopping based on eval loss
        if eval_loss < early_stop_threshold:
            break

    # Make sure everything logged during this run reaches disk.
    writer.flush()
    return epoch_loss / num_train

# NOTE(review): these two statements run when this cell is executed, i.e.
# right after `train` is defined and before it is called in the next cell, so
# at this point nothing has been logged yet. Consider moving them after the
# `train(...)` call.
writer.flush()
writer.close()

In [None]:
# Define the parameters
# (kept above their first use: `lr` and `device` are read by the statements
# below, so the configuration must be defined before them in this cell)
vocab_size = 3
vocab = {-1:0,1:1,2:2} 
L = 200
embedding_dim = 200
proj_layer_dim = 128
hidden_dim = 200
num_layers = 1 # have to adapt the model for 2 and 3 layers
dropout_rate = 0.0
lr = 1e-3
num_sequences = 1000
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# Requires `model` (training cell) and `new_model` (ablation cell) to exist in
# the kernel; running this cell without the ablation cell raises
# "NameError: name 'new_model' is not defined".
final_train_loss = train(model, new_model, tensor_samples_train, final_chains_train, tensor_samples_test, final_chains_test, vocab, optimizer, criterion, device=device)

NameError: name 'new_model' is not defined