In [1]:
import torch
import torch.nn.functional as F
from torch import optim

import yaml
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import quantile_transform 
from x_transformers import XTransformer, TransformerWrapper, Decoder, Encoder, ViTransformerWrapper

from aptamer_transformer.model import *
from aptamer_transformer.factories_model_loss import *
from aptamer_transformer.data_utils import *

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from tokenizers.pre_tokenizers import PreTokenizer


%load_ext autoreload
%autoreload 2

2024-01-17 15:29:48.302150: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-17 15:29:48.302218: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-17 15:29:48.303172: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-17 15:29:48.308767: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_pickle('../data/saved_processed_data/saved_h5/seq_struct_n_classes_2.pkl')


In [None]:
df.head()

In [4]:
structures = df.dot_bracket_struc.apply(lambda x: ' '.join(x)).values.tolist()
seqs = df.Sequence.apply(lambda x: ' '.join(x)).values.tolist()

seq_structs = [f'{seq}{struct}' for seq, struct in zip(df.Sequence, df.dot_bracket_struc)]

seq_structs_white_space = [(seq , struct) for seq, struct in zip(seqs, structures)]

len_seq_structs = [len(struct) for struct in seq_structs]
print(max(len_seq_structs))

82


In [52]:
seq_struct_tokenize = AutoTokenizer.from_pretrained(cfg['seq_tokenizer_path'])

tokenized = seq_struct_tokenize(seqs[:10], padding=True)

In [59]:
np.array(tokenized.input_ids)

array([[2, 4, 7, 5, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 7, 7, 7, 6, 6, 6, 7, 4,
        7, 6, 6, 6, 6, 7, 5, 7, 6, 5, 4, 5, 7, 4, 7, 6, 6, 5, 7, 5],
       [2, 7, 5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 7, 5, 6, 6, 6, 7, 6,
        5, 7, 5, 6, 7, 7, 5, 6, 4, 6, 6, 6, 6, 7, 5, 6, 5, 4, 6, 0],
       [2, 4, 6, 6, 7, 7, 6, 6, 7, 6, 7, 4, 6, 6, 7, 7, 7, 4, 6, 6, 7, 7,
        6, 7, 4, 7, 7, 6, 6, 7, 7, 4, 6, 6, 6, 7, 7, 6, 4, 6, 4, 0],
       [2, 7, 6, 6, 5, 7, 6, 7, 5, 7, 7, 4, 5, 6, 4, 7, 5, 6, 5, 6, 4, 4,
        6, 6, 4, 5, 6, 4, 7, 7, 6, 4, 4, 4, 4, 5, 7, 7, 7, 6, 4, 0],
       [2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0],
       [2, 4, 7, 4, 6, 7, 7, 4, 5, 6, 6, 6, 7, 6, 4, 6, 5, 5, 6, 7, 5, 4,
        7, 7, 5, 4, 4, 6, 7, 7, 7, 4, 5, 7, 4, 5, 7, 7, 5, 7, 6, 0],
       [2, 5, 5, 5, 4, 7, 6, 6, 7, 4, 6, 6, 7, 4, 7, 7, 6, 5, 7, 7, 6, 6,
        7, 4, 6, 6, 6, 4, 7, 4, 6, 7, 6, 6, 6, 5, 7, 7, 6, 4, 7, 6],

In [54]:
seq_struct_tokenize.batch_decode(tokenized.input_ids)

['[CLS] A T C G G G G G G T G G G T T T G G G T A T G G G G T C T G C A C T A T G G C T C',
 '[CLS] T C G C G G G G G G C G G G T C G G G T G C T C G T T C G A G G G G T C G C A G [PAD]',
 '[CLS] A G G T T G G T G T A G G T T T A G G T T G T A T T G G T T A G G G T T G A G A [PAD]',
 '[CLS] T G G C T G T C T T A C G A T C G C G A A G G A C G A T T G A A A A C T T T G A [PAD]',
 '[CLS] G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G G [PAD]',
 '[CLS] A T A G T T A C G G G T G A G C C G T C A T T C A A G T T T A C T A C T T C T G [PAD]',
 '[CLS] C C C A T G G T A G G T A T T G C T T G G T A G G G A T A G T G G G C T T G A T G',
 '[CLS] G G G A G G G A G G G T G G G G G G T T C T C G C T G C G G G T T T T G G T G C [PAD]',
 '[CLS] T G G G T G G G A G G G A G G G G G G C T T T A T T C C G G T G T T T T T T C G [PAD]',
 '[CLS] A G G C T T A T C G T C A G G G G G G T G G G T C C G G G T T C G G G G T C G T [PAD]']

In [None]:
seq_structs_white_space

In [7]:
cfg = read_cfg('../aptamer_transformer/config.yaml')

seq_struct_tokenize = AutoTokenizer.from_pretrained(cfg['seq_struct_tokenizer_path'])

seq_struct_tokenize(seq_structs_white_space[0])

{'input_ids': [2, 7, 10, 8, 9, 9, 9, 9, 9, 9, 10, 9, 9, 9, 10, 10, 10, 9, 9, 9, 10, 7, 10, 9, 9, 9, 9, 10, 8, 10, 9, 8, 7, 8, 10, 7, 10, 9, 9, 8, 10, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [3]:
tokenizer = Tokenizer(BPE(unk_token="N"), )
special_tokens = ["[PAD]", "N", "[CLS]", "[MASK]", "[SEP]"]
tokenizer.add_special_tokens(special_tokens)

tokenizer.pre_tokenizer = Whitespace()
tokenizer.mask_token = "[MASK]"
tokenizer.cls_token = "[CLS]"
tokenizer.pad_token = "[PAD]"
tokenizer.unknown_token = "N"
tokenizer.sep_token = "[SEP]"
tokenizer.model_max_length = 84
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")

# Post processor
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[PAD]", 0), ("N", 1), ("[CLS]", 2), ("[MASK]", 3), ("[SEP]", 4)]
)

# Train the tokenizer
trainer = BpeTrainer(special_tokens=special_tokens)
tokenizer.train_from_iterator(seq_structs_white_space, trainer=trainer)

# Create a fast tokenizer
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer, 
    model_max_length=84, 
    sep_token="[SEP]", 
    cls_token="[CLS]", 
    unk_token="N", 
    pad_token="[PAD]", 
    mask_token="[MASK]", 
    return_special_tokens_mask=1
)

fast_tokenizer

NameError: name 'seq_structs_white_space' is not defined

In [48]:
fast_tokenizer.vocab

{'T': 11,
 'N': 1,
 '[CLS]': 2,
 '.': 7,
 'G': 10,
 '[MASK]': 3,
 ')': 6,
 '[SEP]': 4,
 '[PAD]': 0,
 'C': 9,
 '(': 5,
 'A': 8}

In [16]:
fast_tokenizer.save_pretrained('../data/tokenizers/seq_struct_sep_whitespace')

('../data/tokenizers/seq_struct_sep_whitespace/tokenizer_config.json',
 '../data/tokenizers/seq_struct_sep_whitespace/special_tokens_map.json',
 '../data/tokenizers/seq_struct_sep_whitespace/tokenizer.json')

In [49]:
tokenized_structures = fast_tokenizer(seq_structs_white_space, padding=True,)

Token indices sequence length is longer than the specified maximum sequence length for this model (85 > 84). Running this sequence through the model will result in indexing errors


In [23]:
tokenizer = AutoTokenizer.from_pretrained('../data/tokenizers/seq_struct_sep_whitespace')
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
tokenizer

PreTrainedTokenizerFast(name_or_path='../data/tokenizers/seq_struct_whitespace', vocab_size=11, model_max_length=83, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': 'N', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("N", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [24]:
tokenized_seq_structs = tokenizer(seq_structs_white_space, padding=True)

In [18]:
tokenized_seq_structs.input_ids[0]

NameError: name 'tokenized_seq_structs' is not defined

In [26]:
cfg = read_cfg('../aptamer_transformer/config.yaml')
# Initialize the model and optimizer here as you did during training
device = torch.device("cuda:0")  
cfg.update({
    'device': device,
})

model = get_model(cfg).to(device)

# with open('../data/raw_data/nupack_strucutre_data/mfe.pickle', 'rb') as f:
#     mfe = pickle.load(f)


[autoreload of aptamer_transformer.model failed: Traceback (most recent call last):
  File "/glade/work/mlsample/conda-envs/guess/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/glade/work/mlsample/conda-envs/guess/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/glade/work/mlsample/conda-envs/guess/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/glade/work/mlsample/conda-envs/guess/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "/glade/work/mlsample/conda-envs/guess/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 319, in update_instances
    refs = gc.get_referrers(old)
KeyboardInterrupt
]


TypeError: super(type, obj): obj must be an instance or subtype of type

In [61]:
x = tokenized_seq_structs.input_ids[:10]
attn_mask = tokenized_seq_structs.attention_mask[:10]

outputs = model(x, attn_mask=attn_mask)

ValueError: not enough values to unpack (expected 2, got 1)

In [7]:
cfg = read_cfg('../aptamer_transformer/config.yaml')

dna_dataset = load_dataset(cfg)

In [56]:
tokenizer.vocab

{'[PAD]': 0, 'N': 1, '[CLS]': 2, '(': 4, ')': 5, '.': 6, '[MASK]': 3}

In [55]:
tokenizer.decode(dna_dataset.tokenized_struc[1])

'[CLS]. (. ( (..... ) ). ) ( ( (.......... ) ) ).......... [PAD]'

In [None]:
struc_energy = mfe[0][0]

energy = struc_energy.energy
struc = struc_energy.structure

print(f'Dot Bracket Secondart Strucutre Notation:\n{struc.dotparensplus()}\n')
print(f'Adjacency Matrix (edges):\n{struc.matrix()}\n')
print(f'Secondary Strucutre Mean Free Energy:\n{energy}')

In [None]:
tokenized_structures