In [4]:
import torch
import torch.nn.functional as F
from torch import optim

import yaml
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import quantile_transform 
from x_transformers import XTransformer, TransformerWrapper, Decoder, Encoder, ViTransformerWrapper

from aptamer_transformer.model import *
from aptamer_transformer.factories_model_loss import *
from aptamer_transformer.data_utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
cfg = read_cfg('config.yaml')

# Pick the first CUDA device when one is available and fall back to the CPU
# otherwise, so this cell still runs on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cfg.update({
    'device': device,
})

# Build the model from the config and move it to the selected device.
# (Only the model is created here; no optimizer is set up in this cell.)
model = get_model(cfg).to(device)

# Load the pickled `mfe` object (presumably NUPACK structure results, going by
# the directory name — the path spelling is kept exactly as it is on disk).
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# files produced by this project, never pickles from an untrusted source.
with open('../data/raw_data/nupack_strucutre_data/mfe.pickle', 'rb') as f:
    mfe = pickle.load(f)


In [None]:
# Load the sequence/structure table through the project helper, driven by `cfg`.
# NOTE(review): later cells read `df.dot_bracket_struc`, so the loader is
# assumed to return a DataFrame-like object with that column — confirm.
df = load_seq_and_struc_data(cfg)

In [None]:
# Show the `dot_bracket_struc` attribute of `df` as the cell output
# (presumably the column of dot-bracket structure strings — verify).
df.dot_bracket_struc

In [None]:
# Inspect a single entry of the loaded `mfe` object to see what it exposes.
# NOTE(review): assumes mfe[0] is indexable and its first element carries
# `.energy` and `.structure` attributes — confirm against how the pickle was written.
struc_energy = mfe[0][0]

energy = struc_energy.energy
struc = struc_energy.structure

# Labels fixed: typos ("Secondart", "Strucutre") corrected, and MFE spelled out
# as *minimum* free energy rather than "mean" free energy.
print(f'Dot Bracket Secondary Structure Notation:\n{struc.dotparensplus()}\n')
print(f'Adjacency Matrix (edges):\n{struc.matrix()}\n')
print(f'Secondary Structure Minimum Free Energy:\n{energy}')

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling

# Build a small BPE tokenizer over the structure strings; "N" is used as the
# unknown token and input is pre-split with the Whitespace pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token="N"), )
tokenizer.pre_tokenizer = Whitespace()

# NOTE(review): the plain attributes below are kept from the original cell.
# The special tokens and max length that the wrapped tokenizer actually uses
# are passed explicitly to PreTrainedTokenizerFast further down — confirm
# whether these assignments are still needed.
tokenizer.mask_token = "[MASK]"
tokenizer.cls_token = "[CLS]"
tokenizer.pad_token = "[PAD]"
tokenizer.unknown_token = "N"
tokenizer.model_max_length = 42

tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")

# Prefix every encoded sequence with [CLS]. The ids listed here follow the
# order of `special_tokens` given to the trainer below (0..3).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A",
    special_tokens=[("[PAD]",0), ("N", 1), ("[CLS]", 2), ("[MASK]", 3)]
)

# vocab_size=8 leaves room for four symbols besides the four special tokens.
trainer = BpeTrainer(
    vocab_size=8,
    special_tokens=["[PAD]", "N", "[CLS]", "[MASK]"],
)

# Fix: the original line (`' '.join(df.) for key in mfe.keys()`) was a syntax
# error with an unused loop variable. Build the training corpus from the
# `dot_bracket_struc` column of `df`, inserting a space between characters so
# each symbol is seen as its own whitespace-separated piece.
# NOTE(review): assumes each entry of df.dot_bracket_struc is a dot-bracket
# string; if the corpus was meant to come from `mfe` instead, adjust here.
structures = [' '.join(struc) for struc in df.dot_bracket_struc]
tokenizer.train_from_iterator(structures, trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface,
# declaring the special tokens and the maximum length explicitly.
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, model_max_length=42, cls_token="[CLS]", unk_token="N", pad_token="[PAD]", mask_token="[MASK]", return_special_tokens_mask=1)


In [None]:
# Encode all structure strings in one call with padding enabled, and keep the
# resulting batch encoding for later cells.
tokenized_structures = fast_tokenizer(structures, padding=True, )

In [None]:
# NOTE(review): `out` and `trg` are not assigned in any cell visible here, so
# this cell depends on kernel state from elsewhere and will likely fail on a
# fresh Restart & Run All — define them above or remove this cell.
# movedim(2, 1) moves dimension 2 of `out` to position 1 before the loss call;
# presumably `out` is (batch, seq, vocab) and `trg` is (batch, seq) — confirm.
F.cross_entropy(out.movedim(2,1), trg)

In [None]:
# Write the wrapped tokenizer to disk; the next cell reloads it from this same
# directory with AutoTokenizer.from_pretrained.
fast_tokenizer.save_pretrained('../data/AptamerBERT_tokenizer')

In [None]:
# Reload the saved tokenizer from disk and build a language-modeling data
# collator around it with a 15% masking probability.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained(
    '../data/AptamerBERT_tokenizer',
)
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    tokenizer=tokenizer,
)
tokenizer  # last expression: display the loaded tokenizer as the cell output