In [1]:
import torch
import torch.nn.functional as F
from torch import optim

import yaml
import pandas as pd
import numpy as np

from sklearn.preprocessing import quantile_transform 
from x_transformers import XTransformer, TransformerWrapper, Decoder, Encoder, ViTransformerWrapper

from model import *
from factories_model_loss import *
from data_utils import *

%load_ext autoreload
%autoreload 2

2024-01-09 21:03:22.870751: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 21:03:22.911537: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-09 21:03:22.911577: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-09 21:03:22.912988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-09 21:03:22.920902: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load the experiment configuration from config.yaml.
# `read_cfg` is not imported by name in this notebook; presumably it is provided
# by one of the star-imported project modules — TODO confirm.
# Later cells use the result like a dict (`cfg.update(...)`, `cfg['learning_rate']`).
cfg = read_cfg('config.yaml')

In [None]:
# Rebuild the model and optimizer the same way as during training, then restore
# both from the checkpoint file referenced by cfg['checkpoint_path'].

# Fall back to CPU when CUDA is unavailable (the same selection the later
# model cells in this notebook use) instead of hard-coding "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cfg.update({
    'device': device,
})

model = get_model(cfg).to(device)
optimizer = optim.Adam(model.parameters(), lr=cfg['learning_rate'])

# map_location remaps the stored tensors onto `device`; without it, a checkpoint
# saved from a GPU cannot be loaded on a machine where that GPU is missing.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load(cfg['checkpoint_path'], map_location=device)

# The checkpoint is expected to hold both state dicts under these keys.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Switch to evaluation mode for inference.
model.eval()


In [39]:
# Sanity-check a forward pass on the first two sequences (the same list is
# passed as both model arguments) and inspect the embedding at position 0.
# NOTE(review): `df` is only assigned in cells further down, so on a fresh
# kernel this cell has to be run after one of them.
with torch.no_grad():  # inference only: do not build the autograd graph
    logits, embed, tgt = model(df.Sequence[:2].tolist(), df.Sequence[:2].tolist())

embed[:,0,:].shape

torch.Size([2, 512])

In [None]:
# Build `df` from the enrichment data using the project helper and the loaded config.
# NOTE(review): the following cell reassigns `df` from an HDF5 file, so only one
# of the two loading cells is needed in a given run.
df = load_and_preprocess_enrichment_data(cfg)


In [3]:
# Load the DataFrame stored under key 'df' in the saved HDF5 file.
# Later cells read its `Sequence` and `Normalized_Frequency` columns.
df = pd.read_hdf('../data/saved_h5/dna_dataset_classification.h5', 'df')

In [4]:
# Wrap the DataFrame in the project's dataset class; this object is what gets
# pickled to 'data.pickle' in the cells below.
dna_dataset = DNAEncoderDataSet(df, cfg)

In [5]:
import pickle

In [6]:
# Serialise the dataset object to 'data.pickle' so it can be reloaded later.
with open('data.pickle', 'wb') as pickle_file:
    # HIGHEST_PROTOCOL selects the newest pickle protocol this interpreter supports.
    pickle.dump(dna_dataset, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Read the pickled object back from 'data.pickle'. No protocol argument is
# needed: pickle detects the protocol version used for writing on its own.
with open('data.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [13]:
# Square mask with 0 on and below the main diagonal and -inf strictly above it.
# NOTE(review): `max_seq_len` is not defined in any visible cell — it must
# already exist in the kernel namespace for this cell to run.
mask = torch.full((max_seq_len, max_seq_len), float('-inf')).triu(diagonal=1)

mask

tensor([[0., -inf, -inf,  ..., -inf, -inf, -inf],
        [0., 0., -inf,  ..., -inf, -inf, -inf],
        [0., 0., 0.,  ..., -inf, -inf, -inf],
        ...,
        [0., 0., 0.,  ..., 0., -inf, -inf],
        [0., 0., 0.,  ..., 0., 0., -inf],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:

N = 20      # minimum run length to look for
BASE = 'G'  # nucleotide whose homopolymer runs are searched

# NOTE(review): the pattern previously hard-coded 'C' while the comments and the
# result variable name refer to G. BASE makes the choice explicit — set it to 'C'
# if runs of C were what was intended.

# Regular expression matching a run of at least N consecutive BASE characters
# (`{N,}` means "N or more", not "more than N").
pattern = f'{BASE}{{{N},}}'

# Keep the rows whose sequence contains such a run. na=False treats missing
# sequences as non-matching instead of failing the boolean indexing.
sequences_with_consecutive_Gs = df[df['Sequence'].str.contains(pattern, na=False)]

print(len(sequences_with_consecutive_Gs))

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import DataCollatorForLanguageModeling

# Special tokens listed in id order: the list position is the id passed to the
# post-processor and the order handed to the trainer.
special_tokens = ["[PAD]", "N", "[CLS]", "[MASK]"]

# BPE tokenizer whose unknown token is "N"; input is split on whitespace first.
tokenizer = Tokenizer(BPE(unk_token="N"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.mask_token = "[MASK]"
tokenizer.cls_token = "[CLS]"
tokenizer.pad_token = "[PAD]"
tokenizer.unknown_token = "N"
tokenizer.model_max_length = 42
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")

# Prepend [CLS] to every single-sequence encoding.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A",
    special_tokens=[(token, token_id) for token_id, token in enumerate(special_tokens)],
)

# Vocabulary capped at 8 entries, including the four special tokens above.
trainer = BpeTrainer(vocab_size=8, special_tokens=special_tokens)

# Insert a space between all characters of each sequence before training.
temp_df = df.Sequence.apply(lambda seq: " ".join(seq))
tokenizer.train_from_iterator(temp_df.values, trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    model_max_length=42,
    cls_token="[CLS]",
    unk_token="N",
    pad_token="[PAD]",
    mask_token="[MASK]",
    return_special_tokens_mask=1,
)


In [None]:
# Persist the wrapped tokenizer to disk; a later cell reloads it from the same
# directory with AutoTokenizer.from_pretrained.
fast_tokenizer.save_pretrained('../data/AptamerBERT_tokenizer')

In [11]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
# Reload the saved tokenizer from disk.
# NOTE(review): this rebinds `tokenizer`, which the training cell used for the
# raw `tokenizers.Tokenizer` object, to the loaded fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained('../data/AptamerBERT_tokenizer')
# Masked-language-modelling collator configured with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# Display the tokenizer repr to verify vocab size and special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='../data/AptamerBERT_tokenizer', vocab_size=8, model_max_length=42, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': 'N', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("N", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Check the maximum length carried by the reloaded tokenizer
# (its repr in the previous cell's output reports model_max_length=42).
tokenizer.model_max_length

In [13]:
# Insert a space between all characters of each sequence.
temp_df = df.Sequence.apply(lambda seq: " ".join(seq))

# Tokenize only the first 1000 sequences, padding them to a common length.
batched_data = temp_df.values.tolist()[:1000]
tokenized_batch = tokenizer(batched_data, padding=True)

# Run the MLM collator over the token ids; later cells read 'input_ids' and
# 'labels' from its result.
masked_data = data_collator(tokenized_batch['input_ids'])


In [18]:
# Inverted attention mask: True wherever the attention mask is 0.
torch.logical_not(torch.Tensor(tokenized_batch['attention_mask']).bool())

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False,  True],
        ...,
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False,  True],
        [False, False, False,  ..., False, False, False]])

In [None]:
# Use the first GPU when available, otherwise the CPU, and record it in cfg.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cfg['device'] = device

# Encoder stack; every hyper-parameter comes from the config.
encoder_layers = Encoder(
    dim=cfg['d_model'],
    depth=cfg['num_layers'],
    heads=cfg['nhead'],
    layer_dropout=cfg['dropout_rate'],  # stochastic depth - dropout entire layer
    attn_dropout=cfg['dropout_rate'],   # dropout post-attention
    ff_dropout=cfg['dropout_rate'],     # feedforward dropout
    attn_flash=cfg['attn_flash'],
    attn_num_mem_kv=cfg['attn_num_mem_kv'],
    use_scalenorm=cfg['use_scalenorm'],
    use_simple_rmsnorm=cfg['use_simple_rmsnorm'],
    ff_glu=cfg['ff_glu'],
    ff_swish=cfg['ff_swish'],
    ff_no_bias=cfg['ff_no_bias'],
    attn_talking_heads=cfg['attn_talking_heads'],
)

# Token-level wrapper: 8-token vocabulary, sequences of at most 42 tokens.
x_transformer_encoder = TransformerWrapper(
    num_tokens=8,
    max_seq_len=42,
    num_memory_tokens=cfg['num_memory_tokens'],
    l2norm_embed=cfg['l2norm_embed'],
    attn_layers=encoder_layers,
).to(device)
# Both names refer to the same module, as in the original chained assignment.
model = x_transformer_encoder

# Masked token ids are the input; the collator's labels are the targets.
src = masked_data['input_ids'].to(device)
trg = masked_data['labels'].to(device)
# Boolean mask built from the tokenizer's attention mask.
src_mask = torch.tensor(tokenized_batch.attention_mask, dtype=torch.bool, device=device)

out = model(src, mask=src_mask)

In [None]:
# Cross-entropy between the model output and the collator labels. Dims 1 and 2
# of `out` are swapped so the class dimension sits at index 1, where
# F.cross_entropy expects it.
F.cross_entropy(out.transpose(1, 2), trg)

In [None]:
# Use the first GPU when available, otherwise the CPU, and record it in cfg.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
cfg['device'] = device

# Encoder-decoder model. Both halves use the same vocabulary size (8), depth
# (cfg['num_layers']), head count (8) and maximum sequence length (42).
model = XTransformer(
    dim=cfg['d_model'],
    tie_token_emb=True,  # tie embeddings of encoder and decoder
    # encoder
    enc_num_tokens=8,
    enc_depth=cfg['num_layers'],
    enc_heads=8,
    enc_max_seq_len=42,
    # decoder
    dec_num_tokens=8,
    dec_depth=cfg['num_layers'],
    dec_heads=8,
    dec_max_seq_len=42,
).to(device)

# Source: masked token ids. Target: the unmasked token ids from the tokenizer.
src = masked_data['input_ids'].to(device)
trg = torch.as_tensor(tokenized_batch.input_ids, dtype=torch.long, device=device)
# Boolean mask built from the tokenizer's attention mask.
src_mask = torch.tensor(tokenized_batch.attention_mask, dtype=torch.bool, device=device)

out = model(src, trg, mask=src_mask)

out

out

In [None]:
# Histogram of the 'Normalized_Frequency' column using 100 bins.
df.hist(column='Normalized_Frequency', bins=100)