### Imports

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter
import time
from transformers import Trainer,TrainingArguments,BartConfig,BartTokenizer,BartModel,BartForConditionalGeneration
from transformers import BartTokenizerFast
from torch.utils import data as data_utils
from torch.utils.data import Dataset, DataLoader

import torch

### Load Data

In [None]:
# Directory holding the data files. File paths are built from it by plain
# string concatenation, so the trailing slash is required.
data_dir = "../DataSets/"

"big_data" contains precomputed simulations for sequences of length 198.

In [None]:
# Path to the large simulation output file.
big_data = data_dir+"dna_big_sim_output.txt"
# Column names for the large file; only used if header_names is pointed at it.
big_data_header = ["dna","length","energy","struct","blank","prob"]

"small_data" contains sequences of length 32 only; it has fewer sequences and is more manageable.

Future work will be on a more flexible model that can handle variable lengths, but current intended use is only on sequences of a known set length.

In [None]:
# Path to the small simulation output file.
small_data = data_dir+"dna_small_sim_output.txt"
# Column names for the small file.
small_data_header = ["dna","length","energy","struct"]

Load the preferred data set into a pandas DataFrame.

In [None]:
# Column names for the data set being loaded (the small one here).
header_names = small_data_header

# The file has no header row, so the column names are supplied explicitly.
# The separator must be a real tab character: a two-character backslash-t
# string is interpreted by pandas as a regular expression, which forces the
# slower Python parsing engine and emits a ParserWarning.
data = pd.read_csv(small_data, sep="\t", header=None, names=header_names)


Break the DNA and structure up for the tokenizer 

This is because the tokenizer expects to see words, not individual characters 

(it was made originally for NLP)

(building custom tokenizer may be possible, but model was prebuilt to be used with this tokenizer)

In [None]:
# Pull out the two columns used below.
dna = data["dna"]
struct = data["struct"]

# Put a space between every character so the tokenizer treats each symbol
# as its own "word".
dna_list = [" ".join(seq) for seq in dna]
struct_list = [" ".join(fold) for fold in struct]

# Sequence length, taken from the first DNA entry.
length = len(dna[0])

Use the same tokenizer that the model was trained with.

In [None]:
# Build the fast BART tokenizer from the pretrained "roberta-base" files.
# NOTE(review): this is a RoBERTa checkpoint name rather than a BART one --
# presumably intentional, to match what the model was trained with; confirm.
tokenizer = BartTokenizerFast.from_pretrained("roberta-base")

### Model and Testing

Load model from trainer output

In [None]:
# Directory holding the saved models; the trailing slash is required because
# the paths below are built by string concatenation.
model_dir = "../Models/"

# Saved-model locations; the suffixes presumably match the sequence lengths
# of the small (32) and big (198) data sets -- confirm.
small_model = model_dir + "DNA_BART_32"
big_model = model_dir + "DNA_BART_198"

In [None]:
# Load the saved model for the small data set (swap in big_model for the
# other one).
model = BartForConditionalGeneration.from_pretrained(small_model, return_dict=True)

This will translate individual sequences or batches of sequences

In [None]:
def translate(dna,model=model,tokenizer=tokenizer,max_length=length+2):#+2 presumably leaves room for the extra leading tokens emitted by generate() -- confirm
    """Translate one sequence, or a batch of sequences, with the model.

    Args:
        dna: A space-separated sequence string, or a list of such strings
            (the format of the entries in ``dna_list``). A list is tokenized
            without padding, so all entries must tokenize to the same length.
        model: Model whose ``generate`` method produces the output tokens.
            Defaults to the module-level ``model``.
        tokenizer: Tokenizer used both to encode the input and to decode the
            output. Defaults to the module-level ``tokenizer``.
        max_length: Maximum length passed to ``model.generate``. Defaults to
            the module-level ``length`` plus 2.

    Returns:
        A list with one decoded string per generated sequence, with all
        whitespace removed.
    """
    # Tokenize the input into PyTorch tensors.
    encoded = tokenizer(dna, return_tensors='pt')

    # Generate the output tokens. The attention mask produced by the
    # tokenizer is passed explicitly so generate() does not have to infer it.
    generated = model.generate(
        encoded['input_ids'],
        attention_mask=encoded.get('attention_mask'),
        max_length=max_length,
        early_stopping=True,
    )

    # Drop the first two tokens and the last token of each generated sequence
    # (presumably start/end markers -- confirm), decode the rest, and remove
    # the spaces between symbols.
    return [
        "".join(tokenizer.decode(token_ids[2:-1].tolist()).split())
        for token_ids in generated
    ]

For this test, compare the output from translating the first 1000 lines to a previously generated reference output.

In [None]:
# Translate the first 1000 sequences in a single call.
output = translate(dna_list[:1000])

In [None]:
val_file_path = data_dir + "small_bart_output_1000.txt"

# Read the reference output, one entry per line, trimming surrounding
# whitespace (including the trailing newline).
with open(val_file_path) as f:
    val_output = [line.strip() for line in f]

# Display the loaded reference output.
val_output

In [None]:
# Pair each new prediction with the reference prediction at the same position.
Z = zip(output,val_output)

# True where the two agree, False where they differ.
comparison = [o == v for o,v in Z]

# zip() stops at the shorter input, so a missing or extra line would go
# unnoticed. Mark every unmatched position as a mismatch instead.
comparison += [False] * abs(len(output) - len(val_output))

If everything went right, this should return an empty index array (no mismatches).

If not, inspect the elements that differ.

In [None]:
# Indices at which the new output disagrees with the reference.
# `comparison` is a plain Python list, so it must be converted to a boolean
# array before negating: comparing a list to False with == yields the single
# value False, which would make np.where report no mismatches at all.
np.where(~np.asarray(comparison, dtype=bool))

save results

In [None]:
output_path = data_dir + "bart_output.txt"

# Write one prediction per line. Without the newline every prediction would
# be concatenated onto a single line and could not be read back line by line.
with open(output_path, 'w') as f:
    f.writelines(line + "\n" for line in output)