### Import Requirements

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter
import time
import pickle
from transformers import Trainer,TrainingArguments,BartConfig,BartTokenizer,BartModel,BartForConditionalGeneration,BartTokenizerFast
from torch.utils import data as data_utils
from torch.utils.data import Dataset, DataLoader

import torch

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load Data

"big_data" contains precomputed simulations for 198 length sequences.

Not recommended on CPU-only systems, or if you want a quick run.

In [2]:
# Relative path of the directory that holds the data files loaded below.
data_dir = "../DataSets/"

In [3]:
# Column layout and file path of the large (length-198) simulation output.
big_data_header = ["dna", "length", "energy", "struct", "blank", "prob"]
big_data = f"{data_dir}dna_big_sim_output.txt"

"small_data" contains only length-32 sequences; it has fewer of them and is more manageable.

Future work will be on a more flexible model that can handle variable lengths, but current intended use is only on sequences of a known set length.

In [4]:
# Column layout and file path of the small (length-32) simulation output.
small_data_header = ["dna", "length", "energy", "struct"]
small_data = f"{data_dir}dna_small_sim_output.txt"

Load the preferred data set into a pandas DataFrame

In [5]:
# Read the tab-separated simulation output; the file has no header row, so the
# column names are supplied explicitly.
header_names = small_data_header
# Use a real tab character as the separator. The two-character string "\\t" is
# interpreted by pandas as a regular expression, which forces the slower python
# parsing engine and emits a ParserWarning.
data = pd.read_csv(small_data, sep="\t", header=None, names=header_names)

  


Break the DNA and structure up for the tokenizer 

This is because the tokenizer expects to see words, not individual characters 

(it was made originally for NLP)

(building custom tokenizer may be possible, but model was prebuilt to be used with this tokenizer)

In [6]:
# Keep the raw columns around; `dna` is used later for sizing.
dna, struct = data["dna"], data["struct"]

# Insert a space between every character so the word-level tokenizer sees each
# base / structure symbol as its own "word".
dna_list = [" ".join(sequence) for sequence in dna]
struct_list = [" ".join(structure) for structure in struct]

Split the data set into a train and validation set

In [7]:
# Use every loaded row; the next cell overrides this with a smaller value.
length = len(dna) # total number of elements (this was used in the test version)

In [8]:
# Run this block for an easier, shorter run: cap the working set at 10,000
# sequences, but never exceed the number of sequences actually loaded so the
# train/validation split below stays inside the data.
length = min(10000, len(dna))

In [30]:
split = int(length * 0.75)  # default for this is 75/25 train-val.

# Slice ends are exclusive, so the validation slice must stop at `length`
# (not `length - 1`) to keep the last sample of the working set.
dna_train = dna_list[:split]
dna_val = dna_list[split:length]

struct_train = struct_list[:split]
struct_val = struct_list[split:length]


Using a prebuilt tokenizer that is known to work with this model

NOTE: may take a while on first run

In [10]:
# Load the pretrained "roberta-base" tokenizer files into a BartTokenizerFast.
tokenizer = BartTokenizerFast.from_pretrained("roberta-base")

Tokenize the training and validation dataset

In [31]:
# Token budget per example: length of the first raw DNA string plus two
# (presumably room for the start/end special tokens -- TODO confirm).
train_max_length = len(dna[0]) + 2

# Encode source (DNA) and target (structure) texts together as padded tensors.
tokenized_train = tokenizer.prepare_seq2seq_batch(
    src_texts=dna_train,
    tgt_texts=struct_train,
    max_length=train_max_length,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

In [32]:
# Token budget per example: length of the first raw DNA string plus two
# (presumably room for the start/end special tokens -- TODO confirm).
val_max_length = len(dna[0]) + 2

# Encode the validation source/target texts with the same settings as training.
tokenized_val = tokenizer.prepare_seq2seq_batch(
    src_texts=dna_val,
    tgt_texts=struct_val,
    max_length=val_max_length,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

A custom Dataset class wraps the tokenized batches so the Trainer can index individual examples.

### Model and Training

In [18]:
class MyDataset(data_utils.Dataset):
    """Map-style dataset that serves tokenized encodings one example at a time.

    Parameters
    ----------
    encodings : mapping
        Maps each field name (for the seq2seq batches built in this notebook:
        ``input_ids``, ``attention_mask`` and ``labels``) to a container that
        can be indexed by example. It must contain an ``"input_ids"`` entry,
        whose length defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th entry of every field as a tensor."""
        item = {}
        for key, val in self.encodings.items():
            entry = val[idx]
            if isinstance(entry, torch.Tensor):
                # Copy without re-wrapping: torch.tensor(tensor) emits a
                # UserWarning on every access.
                item[key] = entry.clone().detach()
            else:
                item[key] = torch.tensor(entry)
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``input_ids`` field."""
        # Key lookup works for both BatchEncoding objects and plain dicts.
        return len(self.encodings["input_ids"])

Create PyTorch Datasets for training (tds) and evaluation (eds)

In [33]:
# Wrap the tokenized training and validation batches, in that order.
tds, eds = (MyDataset(encodings) for encodings in (tokenized_train, tokenized_val))

Arguments for the trainer

Training can take a while before a good result is found

In [20]:
# Training configuration for the small (length-32) data set.
small_training_args = TrainingArguments(
    output_dir="./BART_Small_Output",  # where to store the checkpoints
    logging_dir="./BART_Small_log",  # logging directory
    # NOTE: if on CPU or if you get OOM errors, set the batch sizes to a smaller number
    per_device_train_batch_size=256,  # batch size per device during training
    per_device_eval_batch_size=256,  # batch size for evaluation
    # Mixed precision helps run faster but needs a CUDA device, so it is only
    # enabled when one is available. If having NAN errors, set this to False.
    fp16=torch.cuda.is_available(),
    num_train_epochs=100,  # number of training epochs to run
)

In [None]:
# Training configuration for the big (length-198) data set.
big_training_args = TrainingArguments(
    output_dir="./BART_Big_Output",  # where to store the checkpoints
    logging_dir="./BART_Big_log",  # logging directory
    # NOTE: if on CPU or if you get OOM errors, set the batch sizes to a smaller number
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation (was 63, apparently a typo)
    # Mixed precision helps run faster but needs a CUDA device, so it is only
    # enabled when one is available. If having NAN errors, set this to False.
    fp16=torch.cuda.is_available(),
    num_train_epochs=100,  # number of training epochs to run
)

Configure and create model

In [34]:
# Architecture hyperparameters; encoder and decoder use the same settings.
bart_hparams = {
    "d_model": 256,
    "encoder_layers": 6,
    "decoder_layers": 6,
    "encoder_attention_heads": 8,
    "decoder_attention_heads": 8,
    "encoder_ffn_dim": 1024,
    "decoder_ffn_dim": 1024,
}
config = BartConfig(**bart_hparams)

# Build the model from the configuration alone (no pretrained weights are loaded here).
model = BartForConditionalGeneration(config)

Uncomment and fill in the path to a previous training checkpoint to resume from it

In [35]:
#model = BartForConditionalGeneration.from_pretrained('.\<Path To Model>', return_dict=True)

Create the trainer from previous arguments and the model

In [36]:
# Bundle the model, the small-data training arguments and both datasets.
trainer = Trainer(
    model=model,
    args=small_training_args,
    train_dataset=tds,
    eval_dataset=eds,
)

Run the trainer

It will create checkpoints every 500 steps

(recommend to clean out checkpoints, they can take up a lot of space)

In [37]:
# Train `model` on `tds` with the arguments the trainer was built with.
trainer.train()

  import sys


Step,Training Loss
500,2.31242
1000,0.341208
1500,0.304814
2000,0.293921
2500,0.288546
3000,0.285404


  import sys
  import sys
  import sys
  import sys
  import sys


TrainOutput(global_step=3000, training_loss=0.6377189381917318)

Save the final result

In [38]:
# Write the trained model to the local "DNA_BART_32" directory.
model.save_pretrained('DNA_BART_32')

Open the testing notebook to try out your model