## Understanding Amino Acid Seq Data --> creating input Tensor

1. Loading the data 
2. Create a Tokenizer for amino acids
3. Create a Tensor object 


### 0. notebook init

In [46]:
import numpy as np
import pandas as pd
import torch
import math

import string
from typing import Iterable, Tuple


### 1. Load the data

- The data is in .txt file, somewhat in a format for two columns. the first column is species-code and the next one is the amino-acid-seq  
- The simplest way to get the data is create lists. 

In [3]:
file_path = 'X_set.txt'

In [4]:
# Initialize lists to hold the phylogenetic position strings and amino acid sequences
specie_code = []
amino_acid_sequences = []

# Read the file
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split(' ')
        specie_code.append(parts[0])
        amino_acid_sequences.append(parts[1])

In [5]:
specie_code[0:3]

['111133333333333333333333333333',
 '111211333333333333333333333333',
 '111212333333333333333333333333']

In [6]:
amino_acid_sequences[0:3]

['---LSQF--LLMLWVPGSKGEIVLTQSPASVSVSPGERVTISCQASESVGNTYLNWLQQKSGQSPRWLIYQVSKLESGIPARFRGSGSGTDFTFTISRVEAEDVAHYYSQQ-----',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPALVSVSPGERVTISCKASQSVGNTYLSWFRQKPGQSPRGLIYKVSNLPSGVPSRFRGSGAEKDFTLTISRVEAVDGAVYYCAQASYSP',
 'MESLSQC--LLMLWVPVSRGAIVLTQSPASVSVSPGERVTISCKASQSLGNTYLHWFQQKPGQSPRRLIYQVSNLLSGVPSRFSGSGAGKDFSLTISSVEAGDGAVYYCFQGSYDP']

### 2. Create a Tokenizer for amino acids

- There are 20 amino acids, each letter in the chain represents one of them. 
- Converting them into 20 tokens, meaning each amino acid would get a number associated with it. 
- Would also need a special character token, which is "-", something related to multiple-sequence-alignment 

In [7]:
# Creating a set of all amino-acids

amino_acid_set = set()

for seq in amino_acid_sequences:
    for acid in seq:
        if acid != "-":
            amino_acid_set.add(acid)

# 20 amino acids
print(f"Num of Amino Acids: {len(amino_acid_set) }")
amino_acids_list = list(amino_acid_set)

Num of Amino Acids: 20


In [8]:
# Creating a Tokenzer class, which ennodes and decodes an amino acid sequence 

class Tokenizer:
    ''' 
    To encode and decode any amino acid string
    '''
    # class attribute 
    amino_acids = amino_acids_list

    def __init__(self, special_tokens = Iterable[str]):
        # define a vocab
        self.vocab = Tokenizer.amino_acids + list(special_tokens)
        # mapping each vocab to a token (a numeric value)
        self.token2idx = {token:i for i, token in enumerate(self.vocab)} 
        # mapping numeric value back to a token
        self.idx2token = {i:token for token, i  in self.token2idx.items()}

    def encode(self, inputs: Iterable[str]) -> Iterable[int]:
        return [self.token2idx[token] for token in inputs]
    
    def decode(self, inputs: Iterable[int]) -> Iterable[str]:
        return [self.idx2token[idx] for idx in inputs]

    def __len__(self):
        return len(self.vocab)

In [9]:
# creating an instance of the Tokenizer. 
amino_acid_tokenizer = Tokenizer(special_tokens=["-", "[MASK]"])

In [10]:
# let's encode the first amino-acid-sequence and see the first 10 positions
print(f"First 20 amino acids         : {[i for i in amino_acid_sequences[0][0:20]]}")
print(f"First 20 encoded amino acids : {amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20]}")
print(f"First 20 decoded amino acids : {amino_acid_tokenizer.decode(amino_acid_tokenizer.encode(amino_acid_sequences[0])[0:20])}")

First 20 amino acids         : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']
First 20 encoded amino acids : [20, 20, 20, 19, 3, 12, 2, 20, 20, 19, 19, 4, 19, 1, 9, 16, 10, 3, 13, 10]
First 20 decoded amino acids : ['-', '-', '-', 'L', 'S', 'Q', 'F', '-', '-', 'L', 'L', 'M', 'L', 'W', 'V', 'P', 'G', 'S', 'K', 'G']


In [11]:
len(amino_acid_tokenizer)

22

In [12]:
print(amino_acid_tokenizer.token2idx)

{'A': 0, 'W': 1, 'F': 2, 'S': 3, 'M': 4, 'C': 5, 'Y': 6, 'I': 7, 'D': 8, 'V': 9, 'G': 10, 'H': 11, 'Q': 12, 'K': 13, 'R': 14, 'N': 15, 'P': 16, 'T': 17, 'E': 18, 'L': 19, '-': 20, '[MASK]': 21}


In [13]:
amino_acid_tokenizer.encode(["A", "[MASK]"])

[0, 21]

### 3. Creating a Tensor object

In [14]:
# making sure that the size of each amino-acid-seq is same

len_amino_acid_seq = set()
for seq in amino_acid_sequences:
    len_amino_acid_seq.add(len(seq))

# this set should have only one value 
len_amino_acid_seq
# perfect! all the seq are 116 character long

{116}

In [15]:

def create_amino_acids_tensor(amino_acid_sequences:list, my_tokenizer:Tokenizer):

    amino_acid_tensors = []

    for seq in amino_acid_sequences:
        amino_acid_tensors.append(torch.Tensor(my_tokenizer.encode(seq)).to(torch.int64))

    # stacking them 
    stacked_tensor =  torch.stack(amino_acid_tensors)

    return stacked_tensor


In [16]:
all_amino_acids_tensor = create_amino_acids_tensor(amino_acid_sequences, amino_acid_tokenizer)

In [17]:
all_amino_acids_tensor

tensor([[20, 20, 20,  ..., 20, 20, 20],
        [ 4, 18,  3,  ...,  6,  3, 16],
        [ 4, 18,  3,  ...,  6,  8, 16],
        ...,
        [20, 20, 20,  ..., 18,  8, 16],
        [20, 20, 20,  ..., 18,  8, 16],
        [20, 20, 20,  ..., 18,  8, 16]])

In [18]:
all_amino_acids_tensor.shape
# the shape is 1001 species * 116 amino acids

torch.Size([1001, 116])

## Create Training data

In [19]:
def create_training_data_old(input_tensor:torch.Tensor, batch_size:int, mask_token:int):

    rows, cols = input_tensor.shape

    idx = torch.randint(rows-1, (batch_size,))

    input_seqs = []
    target_amino_acids = []
    mask_positions = []
    for i in idx:
        # select one amino acid seq
        selected_amino_seq = input_tensor[i].clone()
        # randomly choose a position to mask
        mask_position = torch.randint(cols-1, (1,)) 
        target_amino_acid = selected_amino_seq[mask_position]
        # replace the mask posiiton with mask-token
        selected_amino_seq[mask_position] = mask_token
        train_input_seq = selected_amino_seq

        input_seqs.append(train_input_seq)
        target_amino_acids.append(target_amino_acid)
        mask_positions.append(mask_position)

    return input_seqs, target_amino_acids, mask_positions
        

# create_training_data(all_amino_acids_tensor, batch_size=64, mask_token=21)


In [20]:
def create_training_data(input_tensor: torch.Tensor, batch_size: int, mask_token: int):
    """
    Creates masked training data efficiently using vectorized operations.

    Args:
      input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length)
      batch_size (int): The desired batch size.
      mask_token (int): The token used for masking.

    Returns:
      tuple: (input_seqs, target_amino_acids, mask_positions)
             - input_seqs: Tensor of shape (batch_size, sequence_length) with masked sequences.
             - target_amino_acids: Tensor of shape (batch_size,) containing the masked amino acids.
             - mask_positions: Tensor of shape (batch_size,) indicating mask positions.
    """

    rows = input_tensor.shape[0]
    seq_len = input_tensor.shape[1]
    # Randomly select 'batch_size' rows (amino acid sequences)
    idx = torch.randint(rows, size=(batch_size,))
    input_seqs = input_tensor[idx].clone()

    # Generate random mask positions within each selected sequence
    mask_positions = torch.randint(seq_len, size=(batch_size, 1))

    # Get the target amino acids at the mask positions
    target_amino_acids = input_seqs.gather(1, mask_positions).squeeze()

    # Create a mask for the selected positions 
    mask = torch.zeros(input_seqs.size(), dtype=torch.bool)
    mask.scatter_(1, mask_positions, 1)

    # Apply the mask to replace the target positions with the mask_token
    input_seqs[mask] = mask_token

    return input_seqs, target_amino_acids, mask_positions.squeeze()

In [21]:
input_seqs, targets, mask_pos = create_training_data(all_amino_acids_tensor, batch_size=32, mask_token=21)

In [22]:
input_seqs.shape

torch.Size([32, 116])

In [23]:
# exactly one masked value
(input_seqs[0] == 21).sum()

tensor(1)

In [24]:
input_seqs[0]

tensor([ 4, 14,  2, 16,  0, 12, 19, 20, 20, 19, 19,  4, 19,  1,  0, 16, 10,  3,
         3, 10,  8,  7,  9,  4, 17, 12, 17, 16, 19,  3, 19,  3,  9, 17, 16, 10,
        13, 16,  9,  3,  7,  3,  5, 14,  0,  3, 21,  3, 19, 10,  4, 15,  6, 19,
         6,  1,  6, 19, 12, 13, 16, 10, 12,  3, 16, 12,  3, 19,  7,  6, 19,  0,
         3,  3, 14,  6, 16, 10,  9, 16,  8, 14,  2,  3, 10, 14, 10,  3, 10, 17,
         8,  2, 17, 19, 17,  7,  3,  3,  9, 18,  0, 18,  8,  9, 10,  9,  6,  6,
         5, 19, 12,  3, 19, 18,  2, 16])

In [25]:
mask_pos

tensor([46, 32, 94, 24, 23, 56, 57, 65, 39, 96, 65, 88, 72, 39, 94, 97, 43, 13,
        43,  6, 78, 97, 83, 71, 98, 84, 85, 17, 59,  8,  0, 60])

In [26]:
targets

tensor([12,  0, 17, 17,  4,  6, 12, 13, 17,  3, 14, 10,  3, 17, 17,  3, 13,  1,
        14, 19,  7,  3, 17,  0, 19, 10,  3,  3, 13, 20,  4, 16])

## Creating a Dataset Class

In [27]:
from torch.utils.data import Dataset, DataLoader

class MaskedAminoSeqDataset(Dataset):
    def __init__(self, input_tensor: torch.Tensor, mask_token: int):
            """
            Dataset for masked amino acid sequence prediction.

            Args:
            input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length).
            mask_token (int): The token used for masking.
            """
            self.input_tensor = input_tensor
            self.mask_token = mask_token

    def __len__(self):
        return self.input_tensor.shape[0] 

    def __getitem__(self, idx):
        input_seqs, target_amino_acids, mask_positions = \
            self._create_training_data(self.input_tensor, batch_size=1, mask_token=self.mask_token)
        return input_seqs.squeeze(0), target_amino_acids.squeeze(0), mask_positions.squeeze(0)

    def _create_training_data(self, input_tensor: torch.Tensor, batch_size: int, mask_token: int):
        """
        Creates masked training data efficiently using vectorized operations.

        Args:
        input_tensor (torch.Tensor): Input tensor of shape (num_sequences, sequence_length)
        batch_size (int): The desired batch size.
        mask_token (int): The token used for masking.

        Returns:
        tuple: (input_seqs, target_amino_acids, mask_positions)
            - input_seqs: Tensor of shape (batch_size, sequence_length) with masked sequences.
            - target_amino_acids: Tensor of shape (batch_size,) containing the masked amino acids.
            - mask_positions: Tensor of shape (batch_size,) indicating mask positions.
        """
        rows = input_tensor.shape[0]
        seq_len = input_tensor.shape[1]
        # Randomly select 'batch_size' rows (amino acid sequences)
        idx = torch.randint(rows, size=(batch_size,))
        input_seqs = input_tensor[idx].clone()

        # Generate random mask positions within each selected sequence
        mask_positions = torch.randint(seq_len, size=(batch_size, 1))

        # Get the target amino acids at the mask positions
        target_amino_acids = input_seqs.gather(1, mask_positions).squeeze()

        # Create a mask for the selected positions 
        mask = torch.zeros(input_seqs.size(), dtype=torch.bool)
        mask.scatter_(1, mask_positions, 1)

        # Apply the mask to replace the target positions with the mask_token
        input_seqs[mask] = mask_token

        return input_seqs, target_amino_acids, mask_positions.squeeze()


In [28]:
# Assuming input_tensor is your tensor of amino acid sequences
masked_amino_seq_dataset = MaskedAminoSeqDataset(all_amino_acids_tensor, mask_token=21) # Assuming 0 is your mask token
masked_amino_seq_dataloader = DataLoader(masked_amino_seq_dataset, batch_size=32, shuffle=True)

In [29]:
## each iteration now gives a batch with 32 data points.
for i in masked_amino_seq_dataloader:
    print(f"amino seq with masked: {i[0]}")
    print(f"target amino acid: {i[1]}")
    print(f"mask posittion: {i[2]}")
    break

amino seq with masked: tensor([[ 4, 18,  0,  ..., 15, 12, 16],
        [ 4, 18,  0,  ..., 15, 19,  3],
        [ 4, 14,  9,  ..., 15, 19, 16],
        ...,
        [ 4, 14,  9,  ...,  3, 15, 16],
        [ 4,  3,  9,  ..., 10, 17, 16],
        [ 4,  9,  3,  ..., 21, 17, 16]])
target amino acid: tensor([ 8,  5,  0,  3, 19, 11,  6, 10, 16,  3,  9, 10, 18,  2, 10,  9, 20, 17,
        18, 18, 14,  1,  9,  8, 14,  3, 19,  8,  6,  6, 16,  3])
mask posittion: tensor([102, 108,   4,  76,  30, 111,  69,  88,  64, 113,  71,  16, 101,  93,
         88, 114,  49,  39,  36,  75,  81,  13,  48,  20,  43,  85,  66,  90,
         56,  69, 100, 113])


## Embedding 

- Amino Acid Embneddings
- Position Embeddings

In [36]:
print(amino_acid_tokenizer.token2idx)

{'A': 0, 'W': 1, 'F': 2, 'S': 3, 'M': 4, 'C': 5, 'Y': 6, 'I': 7, 'D': 8, 'V': 9, 'G': 10, 'H': 11, 'Q': 12, 'K': 13, 'R': 14, 'N': 15, 'P': 16, 'T': 17, 'E': 18, 'L': 19, '-': 20, '[MASK]': 21}


In [40]:
len(amino_acid_tokenizer)

22

In [42]:
amino_acid_tokenizer.encode('-')

[20]

In [44]:

torch.nn.Embedding(len(amino_acid_tokenizer), 100, padding_idx=20) # 20 is for '-', we dont want to learn embedding for '-'

Embedding(22, 100, padding_idx=20)

In [78]:
class SinusoidalPositionEncoding(nn.Module):
    def __init__(self, embed_size, max_seq_length=5000):
        super().__init__()
        self.embed_size = embed_size
        
        pe = torch.zeros(max_seq_length, embed_size)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return self.pe[:, :x.size(1)]


In [90]:
x = torch.randn(32, 100, 10)
x.shape

torch.Size([32, 100, 10])

In [93]:
pos = SinusoidalPositionEncoding(embed_size=10, max_seq_length=9)
pos(x).shape

torch.Size([1, 9, 10])

In [None]:
class AminoBERTEmbedding(torch.nn.module):

    def __init__(self, vocab_size, embed_size, max_seq_length, dropout=0.1):

        self.embed_size = embed_size
        self.token = torch.nn.Embedding(vocab_size, embed_size)
        self.position = SinusoidalPositionEncoding(embed_size, max_seq_length=max_seq_length)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, seq, )

In [52]:
# understanding positioning encoding 

pos_ids = torch.arange(10, dtype=torch.long)
pos_ids

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [55]:
x = torch.Tensor([[3,4,7,8,1,3,4,7,4,8], [3,4,7,8,1,3,4,7,4,8]])
pos_ids.unsqueeze(0).expand_as(x)

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [56]:
x

tensor([[3., 4., 7., 8., 1., 3., 4., 7., 4., 8.],
        [3., 4., 7., 8., 1., 3., 4., 7., 4., 8.]])

## Training part

In [30]:
import torch.nn as nn
import math

In [31]:
class ProteinPredictor(nn.Module):
    def init(self, num_variants, seq_length, num_amino_acids, emd_dim=128, nhead=8, num_layers=3):
        super().init()
        
        self.embed = nn.Embedding(num_amino_acids, emd_dim)
        self.pos_encoder = PositionalEncoding(emd_dim, seq_length)
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=emd_dim, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Linear(emd_dim, num_amino_acids)

In [32]:
len(amino_acid_tokenizer)

22

In [33]:
vocab_size = len(amino_acid_tokenizer)
emb_dim = 8


embed = nn.Embedding(vocab_size, emb_dim)
pos_emb = nn.Embedding(vocab_size, )

TypeError: Embedding.__init__() missing 1 required positional argument: 'embedding_dim'

In [None]:
input_seqs.shape

torch.Size([32, 116])

In [None]:
embed(input_seqs).shape

torch.Size([32, 116, 8])

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2) * (-math.log(10000.0) / emb_dim))
        pe = torch.zeros(max_len, 1, emb_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, emb_dim)
        Returns:
            Tensor of shape (batch_size, seq_len, emb_dim) with positional encodings added.
        """
        return x + self.pe[:x.size(1)]

In [None]:
pos_encoder = PositionalEncoding(emb_dim, max_len=116)

In [None]:
pos_encoder(embed(input_seqs))

RuntimeError: The size of tensor a (32) must match the size of tensor b (116) at non-singleton dimension 0