### Notes

First, we need to decide which library to use for handling the data. I will encode sequences numerically and want the option to work with them in that form, to scale up by adding further information (BPP, physical properties, distances, etc.), and to handle them as graphs. The best fit seems to be PyTorch (data type: tensor), which also offers PyTorch Geometric for graph data. Another option would be TensorFlow/Keras, though handling graphs seems a bit harder there.

TO DO:
- set up first NN with X as input and y (coordinates) as output
- incorporate MSA

### Prepare data (X & y)
For now, these are prepared as tensors of one-hot-encoded sequences (padded so that they all have the same length) and tensors of coordinates. MSAs are not yet considered.
Update: since an embedding layer is used, the sequences are instead converted to tensors of integer token indices. The one-hot-encoding code is kept below for now.

In [193]:
import pandas as pd
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.nn import Module, MSELoss
from torch.optim import Adam
from torch.utils.data import random_split

# Load the toy training data: one table with the sequences and one with the labels.
train_seq = pd.read_csv("../toy_data/train_sequences.csv")
train_lbl = pd.read_csv("../toy_data/train_labels.csv")

In [178]:
# Sanity check: the unique sequence IDs derived from the labels ('base_ID') must match,
# element by element and in the same order, the 'target_id' column of the sequence table.
# NOTE(review): relies on train_lbl already having a 'base_ID' column, which is only
# added by make_coord_tensor (or the commented-out cell further down).
all(train_lbl["base_ID"].unique() == train_seq['target_id']) # Always good to check

True

In [304]:
def make_coord_tensor(train_lbl):
    """Build per-sequence coordinate tensors and padded residue-ID lists.

    Rows are grouped by sequence ID, i.e. the residue ``ID`` with its final
    ``_<suffix>`` removed. Sequences keep the order in which they first
    appear in the table, and rows keep their order within each sequence.

    Args:
        train_lbl: DataFrame with an ``ID`` column and the three coordinate
            values in column positions 3, 4 and 5. A ``base_ID`` column
            holding the sequence ID is added to (or overwritten in) this
            frame in place.

    Returns:
        A tuple ``(y_list, y_tensor, og_id_list)``:
        y_list: one float32 tensor of shape (n_residues, 3) per sequence.
        y_tensor: ``y_list`` zero-padded to (n_sequences, max_residues, 3).
        og_id_list: for each sequence, its residue IDs right-padded with the
            string ``'0'`` up to ``max_residues`` entries.
    """
    train_lbl['base_ID'] = train_lbl['ID'].str.rsplit('_', n=1).str[0] # sequence ID for each nt

    y_list = []
    id_lists = [] # residue IDs per sequence, not yet padded
    # One pass over the table; sort=False keeps sequences in order of first appearance.
    for _, subset in train_lbl.groupby('base_ID', sort=False):
        # Take the three coordinate columns for the whole sequence at once.
        coords = subset.iloc[:, 3:6].to_numpy(dtype='float32')
        y_list.append(torch.tensor(coords))
        id_lists.append(list(subset['ID']))

    y_tensor = pad_sequence(y_list, batch_first=True)

    # Pad the ID lists to the same length as the padded coordinates.
    max_len = int(y_tensor.size()[1])
    og_id_list = [ids + ['0'] * (max_len - len(ids)) for ids in id_lists]

    return y_list, y_tensor, og_id_list

# Quick check of the helper: unpack its three outputs (list of per-sequence tensors,
# padded tensor, padded ID lists) and look at the shape of the padded tensor.
temp1, temp2, temp3 = make_coord_tensor(train_lbl)
temp2.size()



torch.Size([30, 35, 3])

35

In [311]:
# Create Dataset & Dataloader

# Nucleotide vocabulary. Token indices start at 1, so index 0 is never produced by
# the mapping and stays free for padding.
nts = ['G', 'U', 'C', 'A', 'X', '-']
mapping = {nt: idx for idx, nt in enumerate(nts, start=1)}


def tokenise_seq(seq, mapping=mapping):
    """Convert a sequence string into a 1-D tensor of integer token indices.

    Args:
        seq: iterable of symbols (e.g. a string of nucleotides).
        mapping: dict from symbol to integer index; defaults to the
            module-level ``mapping`` built from ``nts``.

    Returns:
        A ``torch.long`` tensor with one index per symbol. The dtype is set
        explicitly so that an empty sequence also gives an integer tensor
        (without it, ``torch.tensor([])`` would be float32).

    Raises:
        KeyError: if ``seq`` contains a symbol that is not in ``mapping``.
    """
    return torch.tensor([mapping[nt] for nt in seq], dtype=torch.long)

def make_coord_tensor(train_lbl):
    """Build per-sequence coordinate tensors and padded residue-ID lists.

    Rows are grouped by sequence ID, i.e. the residue ``ID`` with its final
    ``_<suffix>`` removed. Sequences keep the order in which they first
    appear in the table, and rows keep their order within each sequence.

    Args:
        train_lbl: DataFrame with an ``ID`` column and the three coordinate
            values in column positions 3, 4 and 5. A ``base_ID`` column
            holding the sequence ID is added to (or overwritten in) this
            frame in place.

    Returns:
        A tuple ``(y_list, y_tensor, og_id_list)``:
        y_list: one float32 tensor of shape (n_residues, 3) per sequence.
        y_tensor: ``y_list`` zero-padded to (n_sequences, max_residues, 3).
        og_id_list: for each sequence, its residue IDs right-padded with the
            string ``'0'`` up to ``max_residues`` entries.
    """
    train_lbl['base_ID'] = train_lbl['ID'].str.rsplit('_', n=1).str[0] # sequence ID for each nt

    y_list = []
    id_lists = [] # residue IDs per sequence, not yet padded
    # One pass over the table; sort=False keeps sequences in order of first appearance.
    for _, subset in train_lbl.groupby('base_ID', sort=False):
        # Take the three coordinate columns for the whole sequence at once.
        coords = subset.iloc[:, 3:6].to_numpy(dtype='float32')
        y_list.append(torch.tensor(coords))
        id_lists.append(list(subset['ID']))

    y_tensor = pad_sequence(y_list, batch_first=True)

    # Pad the ID lists to the same length as the padded coordinates.
    max_len = int(y_tensor.size()[1])
    og_id_list = [ids + ['0'] * (max_len - len(ids)) for ids in id_lists]

    return y_list, y_tensor, og_id_list

class Rnadataset(Dataset):
    """Dataset of padded token sequences, padded coordinates and residue IDs.

    Item ``i`` is the triple ``(X_tensor[i], y_tensor[i], ids[i])``. Sequences
    and labels are paired purely by position, so both tables must list the
    sequences in the same order.
    """

    def __init__(self, train_seq, train_lbl):
        super().__init__()
        # Tokenise every sequence, then pad them into a single tensor.
        tokenised = []
        for sequence in train_seq['sequence']:
            tokenised.append(tokenise_seq(sequence))
        self.X_list = tokenised
        self.X_tensor = pad_sequence(tokenised, batch_first=True)

        # Coordinates and residue IDs all come from the label table.
        coord_list, coord_tensor, residue_ids = make_coord_tensor(train_lbl)
        self.y_list = coord_list
        self.y_tensor = coord_tensor
        self.ids = residue_ids

    def __len__(self):
        # Number of sequences = size of the first dimension of the padded input.
        return self.X_tensor.size(0)

    def __getitem__(self, index):
        tokens = self.X_tensor[index]
        coords = self.y_tensor[index]
        residue_ids = self.ids[index]
        return tokens, coords, residue_ids
    

# Wrap the sequence and label tables in a Dataset.
dataset = Rnadataset(train_seq, train_lbl)

# Random 80/20 train/test split; the test part takes whatever is left after rounding down.
n_samples = len(dataset)
train_size = int(n_samples * 0.8)
test_size = n_samples - train_size

train_data, test_data = random_split(dataset, [train_size, test_size])

# Both loaders use the same settings: batches of 15, no reshuffling between epochs.
loader_kwargs = dict(batch_size=15, shuffle=False)
train_loader = DataLoader(train_data, **loader_kwargs)
test_loader = DataLoader(test_data, **loader_kwargs)

len(train_data)


24

In [320]:
# Grab the first batch from the training loader. This assignment had been commented
# out, so on a fresh kernel the two lines below raised NameError.
list_temp = next(iter(train_loader))

# Each batch is (tokens, coordinates, residue IDs): look at the coordinate shape
# and at the collated residue IDs.
list_temp[1].size()
list_temp[2]

[('1ATW_A_1',
  '1RNG_A_1',
  '1MME_D_1',
  '1ZIG_A_1',
  '1TLR_A_1',
  '1SLO_A_1',
  '1AFX_A_1',
  '1KPD_A_1',
  '1SCL_A_1',
  '1HMH_E_1',
  '1IKD_A_1',
  '1AQO_A_1',
  '1RHT_A_1',
  '1ATO_A_1',
  '1KAJ_A_1'),
 ('1ATW_A_2',
  '1RNG_A_2',
  '1MME_D_2',
  '1ZIG_A_2',
  '1TLR_A_2',
  '1SLO_A_2',
  '1AFX_A_2',
  '1KPD_A_2',
  '1SCL_A_2',
  '1HMH_E_2',
  '1IKD_A_2',
  '1AQO_A_2',
  '1RHT_A_2',
  '1ATO_A_2',
  '1KAJ_A_2'),
 ('1ATW_A_3',
  '1RNG_A_3',
  '1MME_D_3',
  '1ZIG_A_3',
  '1TLR_A_3',
  '1SLO_A_3',
  '1AFX_A_3',
  '1KPD_A_3',
  '1SCL_A_3',
  '1HMH_E_3',
  '1IKD_A_3',
  '1AQO_A_3',
  '1RHT_A_3',
  '1ATO_A_3',
  '1KAJ_A_3'),
 ('1ATW_A_4',
  '1RNG_A_4',
  '1MME_D_4',
  '1ZIG_A_4',
  '1TLR_A_4',
  '1SLO_A_4',
  '1AFX_A_4',
  '1KPD_A_4',
  '1SCL_A_4',
  '1HMH_E_4',
  '1IKD_A_4',
  '1AQO_A_4',
  '1RHT_A_4',
  '1ATO_A_4',
  '1KAJ_A_4'),
 ('1ATW_A_5',
  '1RNG_A_5',
  '1MME_D_5',
  '1ZIG_A_5',
  '1TLR_A_5',
  '1SLO_A_5',
  '1AFX_A_5',
  '1KPD_A_5',
  '1SCL_A_5',
  '1HMH_E_5',
  '1IKD_A_5',
  

Index mapping:

In [None]:
# nts = ['G', 'U', 'C', 'A', 'X', '-']
# mapping = {nt: idx+1 for idx, nt in enumerate(nts)}


# def tokenise_seq(seq, mapping=mapping):
#     seq_idx = [mapping[nt] for nt in seq]
#     seq_idx = torch.tensor(seq_idx)
#     return seq_idx

# X_list = [tokenise_seq(seq) for seq in train_seq['sequence']]
# X_tensor = pad_sequence(X_list, batch_first=True)

# X_tensor[0] # QC


In [None]:
# # y: Convert coordinates to tensor

# train_lbl['base_ID'] = train_lbl['ID'].str.rsplit('_', n=1).str[0]

# y_list = []
# for idx in list(train_lbl['base_ID'].unique()):

#     coords = []
#     for res in range(len(train_lbl[train_lbl['ID'].str.startswith(idx)])):
#         coord = list(train_lbl.iloc[res, 3:6])
#         coords.append(coord)
    
#     y_list.append(torch.tensor(coords, dtype=torch.float32))
    
# y_tensor = pad_sequence(y_list, batch_first=True)

# y_tensor.size()[0:2] == X_tensor.size()[0:2] # check that it's formatted correctly 

In [170]:
# !!!! Make a padding mask

# One row per sequence, one column per padded position.
# attn_mask is True where the position lies beyond the sequence's real length
# (i.e. on padding) and False on real tokens.
padded_len = X_tensor.size()[1]
attn_mask = torch.tensor(
    [[pos >= len(seq) for pos in range(padded_len)] for seq in X_list]
)

# padding_mask is the element-wise negation: True on real tokens, False on padding.
# NOTE(review): the two names look swapped. PyTorch's `key_padding_mask` expects True
# on positions to ignore, which is what `attn_mask` holds here. Confirm before use.
padding_mask = ~attn_mask

In [15]:
# # Create Dataset & Dataloader

# from torch.utils.data import random_split

# class Rnadataset(Dataset):
#     def __init__(self, X_tensor, y_tensor):
#         super().__init__()
#         self.X_tensor = X_tensor
#         self.y_tensor = y_tensor
    
#     def __len__(self):
#         return len(self.X_tensor)
    
#     def __getitem__(self, index) :
#         return self.X_tensor[index], self.y_tensor[index]
    
# dataset = Rnadataset(X_tensor, y_tensor)

# train_size = int(len(dataset)*0.8)
# test_size = int(len(dataset)-train_size)

# train_data, test_data = random_split(dataset, [train_size, test_size])

# train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# test_loader = DataLoader(test_data, batch_size=16, shuffle=True)

### Note on loss function

The competition uses TM-score to evaluate predictions, which, among other things, is based on distances between residues rather than on absolute coordinate differences. For my task I will therefore convert both the ground-truth and the predicted coordinates to pairwise distance matrices and minimise the loss between the two. Since this compares distances through their squared differences, we'll use MSE (for now).

### Build initial simple model
The architecture will consist of:
- embedding: mapping integers corresponding to nucleotides in sequence to vectors representing semantic meanings
- sequence encoder:  inspired by RibonanzaNet: 9 layers of 1D conv + residual, multi-head self-attention, and a feed-forward network

In [195]:
# Define blocks of the model

class SeqEncoder(nn.Module): # Define single encoder block
    """One encoder block: 1D convolution, self-attention and feed-forward.

    Each of the three sub-layers is wrapped in a residual connection followed
    by its own LayerNorm. Input and output both have shape
    (batch, seq_len, hidden_size).

    Args:
        hidden_size: feature size of the input and of every sub-layer.
        kernel_size: width of the 1D convolution. Must be odd, because the
            padding of ``kernel_size // 2`` only preserves the sequence
            length (needed for the residual sum) for odd kernels.
        n_heads: number of attention heads; ``hidden_size`` must be
            divisible by it.

    Raises:
        ValueError: if ``kernel_size`` is even.
    """

    def __init__(self, hidden_size=256, kernel_size=3, n_heads=8):
        super().__init__()
        if kernel_size % 2 == 0:
            raise ValueError(
                f"kernel_size must be odd to preserve the sequence length, got {kernel_size}"
            )
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.n_heads = n_heads
        self.conv = nn.Conv1d(hidden_size, hidden_size, kernel_size=kernel_size, padding=kernel_size // 2)
        self.attn = nn.MultiheadAttention(hidden_size, n_heads)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.norm3 = nn.LayerNorm(hidden_size)
        self.ff = nn.Sequential(
            nn.Linear(hidden_size, 4*hidden_size),
            nn.GELU(),
            nn.Linear(4*hidden_size, hidden_size)
        )

    def forward(self, X):
        # 1D conv with residual connection + LayerNorm. Conv1d expects
        # (batch, channels, seq_len), hence the transposes around it.
        X = self.norm1(X + self.conv(X.transpose(1, 2)).transpose(1, 2))

        # Self-attention with residual connection + LayerNorm. The attention
        # layer is built without batch_first, so it expects (seq_len, batch, hidden).
        seq_first = X.transpose(0, 1)
        attn_out, _ = self.attn(seq_first, seq_first, seq_first)
        X = self.norm2(attn_out.transpose(0, 1) + X)

        # Position-wise feed-forward with residual connection + LayerNorm.
        X = self.norm3(X + self.ff(X))
        return X
        
class ConvEncoder(nn.Module): # define a whole transformer pipeline
    """Stack of ``n_blocks`` SeqEncoder blocks applied one after another.

    Any extra keyword arguments are passed unchanged to every SeqEncoder.
    """

    def __init__(self, n_blocks = 9, **kwargs):
        super().__init__()
        blocks = []
        for _ in range(n_blocks):
            blocks.append(SeqEncoder(**kwargs))
        self.layers = nn.ModuleList(blocks)

    def forward(self, X):
        # Feed the output of each block into the next one.
        hidden = X
        for block in self.layers:
            hidden = block(hidden)
        return hidden


In [196]:
# Define model 

class InitModel(Module): # define rest of model
    """Token + positional embedding, convolutional encoder, linear head to 3 coordinates.

    Args:
        seq_length: maximum number of positions the positional embedding supports.
        vocab: number of distinct (non-padding) tokens; tokens are expected to
            be the integers 1..vocab, with 0 reserved for padding.
        n_blocks: number of encoder blocks.
        hidden_size: embedding / hidden feature size.
    """

    def __init__(self, seq_length=35, vocab=6, n_blocks=9, hidden_size=256):
        super().__init__()
        self.l = seq_length
        self.b = vocab
        # One row per token plus one for the padding index 0. The table used to be
        # sized by seq_length, which only worked while seq_length > vocab.
        self.embedding = nn.Embedding(vocab + 1, hidden_size, padding_idx=0) # map each base to a vector representation
        self.pos_embedding = nn.Embedding(self.l, hidden_size)
        self.convencoder = ConvEncoder(n_blocks=n_blocks, hidden_size=hidden_size)
        self.output = nn.Linear(hidden_size, 3)

    def forward(self, X):
        """Map a (batch, n_positions) tensor of token indices to (batch, n_positions, 3).

        Raises:
            ValueError: if the input has more positions than ``seq_length``.
        """
        n_positions = X.size(1)
        if n_positions > self.l:
            raise ValueError(
                f"input has {n_positions} positions but the model supports at most {self.l}"
            )

        # Make embeddings (+ positional embeddings). Positions follow the actual
        # input length and device instead of assuming exactly seq_length columns.
        X = self.embedding(X)
        positions = torch.arange(n_positions, device=X.device).unsqueeze(0).expand(X.size(0), n_positions)
        X = X + self.pos_embedding(positions)

        # Pass through convolutional transformer
        X = self.convencoder(X)

        # Project every position to its three coordinates.
        return self.output(X)

        ## TO DO: add padding masks, add layers which map the encoded representations to coords, add distance calculation, minimise loss btwn og dist & encoded dist 


initmodel = InitModel()       

In [197]:
# Define custom loss function on distance matrices rather than coords

def pairwise_distance_matrix(X):
    """Return the Euclidean distance between every pair of points in each batch item.

    For ``X`` of shape (batch, n, d) the result has shape (batch, n, n), where
    entry ``[b, i, j]`` is the distance between points ``i`` and ``j`` of item ``b``.
    """
    rows = X[:, :, None]  # (batch, n, 1, d)
    cols = X[:, None]     # (batch, 1, n, d)
    return (rows - cols).norm(dim=-1)  # broadcast difference is (batch, n, n, d)

class DistanceMatrixLoss(nn.Module):
    """MSE between the pairwise-distance matrices of two coordinate sets.

    Both inputs are turned into distance matrices with
    ``pairwise_distance_matrix`` and compared with ``MSELoss``.
    """

    def __init__(self):
        super().__init__()
        self.loss = MSELoss()

    def forward(self, y_true, y_pred, mask=None):
        """Compute the loss.

        Args:
            y_true: ground-truth coordinates, (batch, n, 3).
            y_pred: predicted coordinates, same shape as ``y_true``.
            mask: optional (batch, n) tensor, truthy where a position is a
                real residue and falsy on padding. When given, only distances
                between two real residues contribute, so zero-padded rows no
                longer pull the loss. When omitted, every pair is used.

        Returns:
            Scalar loss tensor.

        Raises:
            ValueError: if ``mask`` selects no pair at all.
        """
        y_true_m = pairwise_distance_matrix(y_true)
        y_pred_m = pairwise_distance_matrix(y_pred)
        if mask is None:
            return self.loss(y_true_m, y_pred_m)

        valid = mask.bool().to(y_true_m.device)
        # A pair (i, j) counts only if both of its positions are real residues.
        pair_mask = valid.unsqueeze(2) & valid.unsqueeze(1)
        if not pair_mask.any():
            raise ValueError("mask selects no positions; cannot compute a loss")
        return self.loss(y_true_m[pair_mask], y_pred_m[pair_mask])


In [None]:
from func import score

n_epochs = 10

# Define function to convert coordinates to dataframe for TMScore calculation
def coords_to_df_train(tensor_list):
    """Flatten a list of (batch, seq_len, 3) tensors into one long DataFrame.

    Every row is one position of one sequence. ``seq_ID_int`` numbers the
    sequences 0, 1, 2, ... in the order they occur across the batches, and
    ``x_1``/``y_1``/``z_1`` hold the three values of that position.
    """
    stacked = torch.cat(tensor_list, dim=0)  # fuse the batches: (n_seq, seq_len, 3)
    seq_length = tensor_list[0].size()[1]
    n_seq = sum(batch.size()[0] for batch in tensor_list)  # total number of sequences

    # One integer ID per row, repeated for every position of its sequence.
    row_ids = torch.arange(n_seq).repeat_interleave(seq_length).unsqueeze(1)
    table = torch.cat([row_ids, stacked.flatten(0, 1)], dim=1)

    df = pd.DataFrame(
        table.detach().numpy(),
        columns=['seq_ID_int', "x_1", "y_1", "z_1"],
    )
    # Concatenating with the coordinates turned the IDs into floats; make them integers again.
    df['seq_ID_int'] = df['seq_ID_int'].astype(int)
    return df


# Fresh model, loss and optimiser for this training run.
initmodel = InitModel()
criterion = DistanceMatrixLoss()
optimiser = Adam(initmodel.parameters())

# Per-epoch performance table (not filled in yet, see the commented-out line at the end of the loop).
cols = ["Epoch", "Train_Loss", "Test_Loss", "Train_TMScore", "Test_TMScore"]
perf = pd.DataFrame(index=range(n_epochs), columns=cols)

for epoch in range(n_epochs):
    # ---- training ----
    loss_train = []
    epoch_pred_train = []
    epoch_true_train = []
    target_ids = []
    seq_idx = []
    initmodel.train()
    for seq, coords, ids in train_loader:
        optimiser.zero_grad()
        pred_coords = initmodel(seq)
        loss = criterion(coords,pred_coords)
        loss_train.append(loss.item())
        epoch_pred_train.append(pred_coords.detach())
        epoch_true_train.append(coords.detach())
        target_ids.extend(list(ids))
        # Token index of every position, in the same row order as the flattened coordinates.
        seq_idx.extend(seq.flatten(0,1).tolist())
        loss.backward()
        optimiser.step()

    df_pred_train = coords_to_df_train(epoch_pred_train)
    df_true_train = coords_to_df_train(epoch_true_train)

    df_pred_train['seq_idx'] = seq_idx
    df_true_train['seq_idx'] = seq_idx

    # Drop padded positions (token index 0) from BOTH tables so they stay row-aligned;
    # previously only the predictions were filtered.
    df_pred_train = df_pred_train[df_pred_train['seq_idx'] != 0]
    df_true_train = df_true_train[df_true_train['seq_idx'] != 0]

    #df_pred_train['ID'] = target_ids
    #df_true_train['ID'] = target_ids

    # ids_mapping = {seq_n: seq_id for seq_n, seq_id in enumerate(target_ids)} # map integer IDs to actual seq/target IDs
    # df_pred_train['target_id'] = df_pred_train['ID'].map(ids_mapping)
    # df_true_train['target_id'] = df_true_train['ID'].map(ids_mapping)

    # NOTE(review): `score` comes from the project-local `func` module. Confirm its expected
    # argument order and that 'temp_seq_id' is the right row-ID column: the tables built by
    # coords_to_df_train in this cell name their ID column 'seq_ID_int'.
    tm_train = score(df_pred_train, df_true_train, row_id_column_name = 'temp_seq_id')
    loss_train = sum(loss_train)/len(loss_train)

    # ---- evaluation ----
    initmodel.eval()
    with torch.no_grad():
        loss_test = []
        epoch_pred_test = []
        epoch_true_test = []
        # The dataset yields (tokens, coordinates, residue IDs); the IDs are not needed here.
        # Unpacking only two values raised ValueError.
        for seq, coords, _ in test_loader:
            pred_coords_test = initmodel(seq)
            epoch_true_test.extend(coords)
            epoch_pred_test.extend(pred_coords_test)
            loss_test.append(criterion(coords, pred_coords_test).item())

        #TMTest = score(pd.DataFrame(epoch_true_test), pd.DataFrame(epoch_pred_test))
        loss_test = sum(loss_test)/len(loss_test)

    #perf.iloc[epoch, :] = [epoch+1, loss_train, loss_test, TMTrain, TMTest]
    print(f"Epoch {epoch+1}: Loss train {round(loss_train, 2)}, Loss Test {round(loss_test, 2)}, TM TRAIN {tm_train}")


# TO DO: figure out TM Score (expects 2D dataframe of values), add padding masks, refine whole model, and train on full data, figure out how to import from src/func

In [258]:
# Number of rows left in the prediction table from the last epoch of the training cell.
len(df_pred_train['seq_idx'])

#len(train_lbl)

549

In [259]:
# Length of the dataset's `ids` attribute, for comparison with the row count above.
len(dataset.ids)

#df_pred_train['ID'].unique()

683

In [111]:
#pred_coords.flatten(0,1).size()
 
# FOR ONE SEQUENCE


# Take the sizes from the prediction tensor itself instead of hard-coding 35 positions
# and 8 sequences; torch.cat below needs all three pieces to have the same row count.
n_seqs, n_positions = pred_coords.size()[:2]

# One integer ID per sequence, repeated for every position of that sequence.
seq_ids = torch.arange(n_seqs).repeat_interleave(n_positions).unsqueeze(1)
# Constant epoch-number column (10), one entry per row.
epoch_no = torch.full((n_seqs * n_positions, 1), 10.0)

pred_flat = pred_coords.flatten(0, 1)  # (n_seqs * n_positions, 3)
seq_ids.size()
# Columns: epoch number, sequence ID, then the three coordinates.
pred_idxs = torch.cat([epoch_no, seq_ids, pred_flat], dim=1)
pred_idxs.size()

torch.Size([280, 5])

In [None]:

# Stack all training predictions of the epoch and flatten (sequence, position) into rows.
stacked_pred = torch.cat(epoch_pred_train, dim=0)
df = pd.DataFrame(
    stacked_pred.flatten(0, 1).detach().numpy(),
    columns=["x", "y", "z"],
)
df

In [156]:
# DO THIS FOR epoch_pred_train & epoch_true_train - smth different for test
def coords_to_df_train(tensor):
    """Turn a list of (batch, seq_len, 3) tensors into one long DataFrame.

    Despite its name, ``tensor`` is a list of tensors (one per batch). Every
    output row is one position of one sequence: ``temp_seq_id`` numbers the
    sequences 0, 1, 2, ... across all batches, and ``x``/``y``/``z`` hold the
    three values of that position.
    """
    # Total number of sequences over all batches, and the shared sequence length.
    batch_sizes = [batch.size()[0] for batch in tensor]
    n_seq = sum(batch_sizes)
    seq_length = tensor[0].size()[1]

    # Fuse the batches, then merge the batch and sequence dimensions into rows.
    coords_flat = torch.cat(tensor, dim=0).flatten(0, 1)

    # One ID per row: each sequence number repeated seq_length times.
    id_column = torch.arange(n_seq).repeat_interleave(seq_length).unsqueeze(1)
    combined = torch.cat([id_column, coords_flat], dim=1).detach().numpy()

    df = pd.DataFrame(combined, columns=["temp_seq_id", "x", "y", "z"])
    df['temp_seq_id'] = df['temp_seq_id'].astype(int)  # IDs became floats in the concat
    return df

# Try the helper on the ground-truth coordinates collected during the last training epoch.
# NOTE: despite its name, `flat_tensor` is bound here to the list of per-batch tensors,
# not to a flattened tensor.
flat_tensor = epoch_true_train

df_testing = coords_to_df_train(flat_tensor)
df_testing


Unnamed: 0,temp_seq_id,x,y,z
0,0,13.760,-25.974001,0.102
1,0,9.310,-29.638000,2.669
2,0,5.529,-27.813000,5.878
3,0,2.678,-24.900999,9.793
4,0,1.827,-20.136000,11.793
...,...,...,...,...
835,23,0.000,0.000000,0.000
836,23,0.000,0.000000,0.000
837,23,0.000,0.000000,0.000
838,23,0.000,0.000000,0.000


In [152]:
# Scratch cell: bind the epoch's predictions to `tensor_list`, then show the size of `flat_tensor`.
# NOTE(review): `flat_tensor` is not assigned in this cell, so the result depends on which
# cell last set it. Confirm it is a tensor (not the list of batches) before relying on .size().
tensor_list = epoch_pred_train
flat_tensor.size()


torch.Size([840, 3])