First we need to load the sequence file

In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
from datasets import load_dataset

trainFile="Data/s288c.fasta.train.txt"
testFile="Data/s288c.fasta.test.txt"
    



Then we will need to convert each sequence to codons, each codon is treated as a "word" and the sequence is the "sentence"

In [3]:
dataset=load_dataset('text', data_files=trainFile,split='train')



Using custom data configuration default-00ee6744db450d6f
Reusing dataset text (/home/lu/.cache/huggingface/datasets/text/default-00ee6744db450d6f/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


Now let's try to train a tokenlizer on our data

In [4]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4990
})

In [5]:
batch_size = 1000
all_texts = [dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size)]

In [6]:
batch_size = 1000
all_texts = [dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size)]

def batch_iterator():
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

This is where the tokenizer starts:

In [7]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

tokenizer = Tokenizer(models.WordPiece(unl_token="[UNK]"))

In [8]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False)


In [9]:
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(),normalizers.StripAccents()]
)

In [10]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()


In [11]:
tokenizer.pre_tokenizer.pre_tokenize_str("CAA GTG ACC AGA ATG ATC ACC GGT GTT")


[('CAA', (0, 3)),
 ('GTG', (4, 7)),
 ('ACC', (8, 11)),
 ('AGA', (12, 15)),
 ('ATG', (16, 19)),
 ('ATC', (20, 23)),
 ('ACC', (24, 27)),
 ('GGT', (28, 31)),
 ('GTT', (32, 35))]

In [12]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=5000, special_tokens=special_tokens)

In [13]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)


In [14]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [15]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [16]:
encoding = tokenizer.encode("CAA GTG ACC AGA ATG ATC ACC GGT GTT")


In [17]:
encoding.tokens

['[CLS]',
 'CAA',
 'GTG',
 'ACC',
 'AGA',
 'ATG',
 'ATC',
 'ACC',
 'GGT',
 'GTT',
 '[SEP]']

In [18]:
encoding.type_ids


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
tokenizer.decoder = decoders.WordPiece(prefix="##")
from transformers import BertTokenizerFast
tokenizer.save("./yeast_token")
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)


In [20]:
from transformers import BertTokenizer
from transformers import AutoTokenizer




In [21]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [22]:
datasets = load_dataset("text", data_files={"train": trainFile, "validation": testFile})
print(datasets)
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
print(tokenized_datasets)

Using custom data configuration default-69e50c05f0cb0ce3
Reusing dataset text (/home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4990
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})
      

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4990
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [23]:
import pandas as pd
from datasets import Dataset

def tokenize_function(examples):
    return tokenizer(examples["text"])

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# block_size = tokenizer.model_max_length
block_size = 128

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

print(tokenized_datasets)
print(tokenized_datasets['train'][0])

 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-a825d9129fc02ddc.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-a82cb3888b9404c4.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-e849912bb1131fee.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-2fa0753c28f6a76e.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-175c209bde76ae03.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-6858e6e818df31f9.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-80988a18a56ef861.arrow


 

Loading cached processed dataset at /home/lu/.cache/huggingface/datasets/text/default-69e50c05f0cb0ce3/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-08befb66a9b04248.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4990
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
})
{'input_ids': [2, 56, 27, 25, 37, 56, 81, 54, 45, 54, 41, 73, 45, 27, 25, 78, 25, 53, 25, 30, 78, 81, 33, 46, 78, 25, 81, 76, 78, 80, 41, 70, 25, 44, 40, 25, 34, 78, 31, 23, 69, 44, 53, 44, 48, 25, 54, 55, 52, 31, 58, 23, 54, 36, 78, 31, 40, 78, 40, 31, 86, 41, 30, 25, 78, 53, 35, 27, 45, 31, 33, 31, 54, 28, 36, 53, 44, 28, 34, 23, 67, 59, 66, 44, 79, 81, 57, 38, 67, 28, 56, 52, 25, 71, 79, 42, 36, 53, 51, 53, 42, 85, 40, 33, 69, 64, 40, 34, 69, 40, 65, 37, 40, 31, 40, 30, 38, 65, 67, 36, 25, 54, 34, 67, 40, 25, 33, 25, 61, 28, 34, 40, 81, 35, 27, 27, 61, 61, 23, 72, 63, 23, 31, 31, 25, 47, 52, 81, 40, 38, 23, 34, 33, 28, 57, 52, 65, 30, 49, 71, 23, 27, 27, 54, 30, 45, 50, 54, 25, 61, 33, 47, 41, 27, 37, 31, 61, 81, 45, 31, 25, 

In [24]:
lm_datasets =tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)



      

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [31]:
lm_datasets["train"][1]





In [33]:
torch.cuda.memory_summary(device=None, abbreviated=False)
torch.cuda.empty_cache()


In [26]:
print(lm_datasets["train"])
print(lm_datasets["train"][0])


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 19207
})
{'input_ids': [2, 56, 27, 25, 37, 56, 81, 54, 45, 54, 41, 73, 45, 27, 25, 78, 25, 53, 25, 30, 78, 81, 33, 46, 78, 25, 81, 76, 78, 80, 41, 70, 25, 44, 40, 25, 34, 78, 31, 23, 69, 44, 53, 44, 48, 25, 54, 55, 52, 31, 58, 23, 54, 36, 78, 31, 40, 78, 40, 31, 86, 41, 30, 25, 78, 53, 35, 27, 45, 31, 33, 31, 54, 28, 36, 53, 44, 28, 34, 23, 67, 59, 66, 44, 79, 81, 57, 38, 67, 28, 56, 52, 25, 71, 79, 42, 36, 53, 51, 53, 42, 85, 40, 33, 69, 64, 40, 34, 69, 40, 65, 37, 40, 31, 40, 30, 38, 65, 67, 36, 25, 54, 34, 67, 40, 25, 33, 25], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [27]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    f"CUBERT",
    evaluation_strategy = "epoch",
    learning_rate=1e-3,
    weight_decay=0.01,
)

from transformers import AutoConfig, AutoModelForCausalLM
model_checkpoint = "gpt2"

config = AutoConfig.from_pretrained(model_checkpoint)

model = AutoModelForCausalLM.from_config(config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [28]:
import torch
print(torch.cuda.is_available())


True


In [35]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [34]:
trainer.train()

***** Running training *****
  Num examples = 19207
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7203


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 3.95 GiB total capacity; 2.94 GiB already allocated; 16.50 MiB free; 3.08 GiB reserved in total by PyTorch)

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")