First we need to load the sequence file

In [1]:
from Bio import SeqIO

inputFile="./Data/s288c.fasta"
fasta_records = SeqIO.parse(open(inputFile),'fasta')
seqList=[]
for record in fasta_records:
    seqList.append(str(record.seq))
    

    

Then we will need to convert each sequence to codons, each codon is treated as a "word" and the sequence is the "sentence"

In [2]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, './CUB_Code')
import CodonLibraries as CL

sentenceList=[]
for seq in seqList:
    codonList = CL.loadSequence(seq)
    sentence=""
    for codon in codonList:
        sentence+=codon+" "
    sentenceList.append(sentence)
#print(sentenceList[:2])

Now let's try to train a tokenlizer on our data

In [3]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.WordLevel())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.WordLevelTrainer(
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=['<s>', '<pad>', '<unk>', '<mask>'],
)

tokenizer.train_from_iterator(sentenceList, trainer=trainer)
import os



tokenizer.save('yeast_token')

Now the tokens are built and saved, lets try to use it

In [8]:
from transformers import BertTokenizer


# initialize the tokenizer using the tokenizer we initialized and saved to file
tokenizer = BertTokenizer.from_pretrained('./yeast_token')
tokenizer(sentenceList[0])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': [105, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 103], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0