In [1]:
from copy import deepcopy
from importlib import reload
from pathlib import Path
import sys

import accelerate
import datasets
import evaluate
import tokenizers
import transformers

import preprocess
reload(preprocess)
from preprocess import *

accelerate.__version__, datasets.__version__, evaluate.__version__, tokenizers.__version__, transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


('0.19.0', '2.12.0', '0.3.0', '0.11.0', '4.28.1')

In [3]:
USE_CACHE = True
CLR_CACHE = False
SUBSAMPLE = 4
MODEL_MAX_LENGTH = 512

root = Path("./data")
disassemble_path = root / "disassemble"
tokenizers_path = root / "tokenizers"
snippets_path = root / "snippets"

In [4]:
pretraining_dataset = get_pretraining_dataset(
	[p for p in disassemble_path.iterdir() if p.suffix == ".asm"],
)
if CLR_CACHE:
    pretraining_dataset.cleanup_cache_files()
if SUBSAMPLE is not None:
	pretraining_dataset = pretraining_dataset.select(range(SUBSAMPLE))
print(pretraining_dataset)
print(pretraining_dataset[0].keys())
print(pretraining_dataset[0]["text"])

Found cached dataset generator (/home/lk3591/.cache/huggingface/datasets/generator/default-779e027cf20da0b2/0.0.0)


Dataset({
    features: ['text'],
    num_rows: 4
})
dict_keys(['text'])
0x400	push ebp
0x401	mov ebp, esp
0x403	push -1
0x405	push 0x40ece0
0x40a	mov eax, dword ptr fs:[0]
0x410	push eax
0x411	push ecx
0x412	sub esp, 0x114
0x418	mov eax, dword ptr [0x4a3004]
0x41d	xor eax, ebp
0x41f	mov dword ptr [ebp - 0x14], eax
0x422	push ebx
0x423	push esi
0x424	push edi
0x425	push eax
0x426	lea eax, [ebp - 0xc]
0x429	mov dword ptr fs:[0], eax
0x42f	mov dword ptr [ebp - 0x10], esp
0x432	mov dword ptr [ebp - 0x124], 0x55c474f9
0x43c	mov eax, 4
0x441	shl eax, 1
0x443	lea ecx, [ebp - 0x10c]
0x449	mov dword ptr [ebp + eax - 0xf0], ecx
0x450	mov edx, dword ptr [0x4a14d8]
0x456	mov dword ptr [ebp - 0x28], edx
0x459	mov eax, dword ptr [0x4a14dc]
0x45e	mov dword ptr [ebp - 0x24], eax
0x461	mov ecx, dword ptr [0x4a14e0]
0x467	mov dword ptr [ebp - 0x20], ecx
0x46a	mov edx, dword ptr [0x4a14e4]
0x470	mov dword ptr [ebp - 0x1c], edx
0x473	mov al, byte ptr [0x4a14e8]
0x478	mov byte ptr [ebp - 0x18], al
0x47b	m

In [5]:
tokenizer = get_tokenizer(tokenizers_path / "WordLevel.json", "WordLevel", pretraining_dataset)
tokenizer

<tokenizers.Tokenizer at 0x7c3e100>

In [6]:
fast_tokenizer = get_pretrained_tokenizer(tokenizer, model_max_length=MODEL_MAX_LENGTH)
fast_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=311, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<BOS>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<EOS>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<UNK>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("<SEP>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<PAD>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<CLS>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<MSK>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'additional_special_tokens': [AddedToken("<ADR>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("<STR>", rstrip=False, lstrip=F

In [7]:
tokenized_pretraining_dataset = get_processed_pretraining_dataset(
    pretraining_dataset, fast_tokenizer, USE_CACHE
)
if CLR_CACHE:
    tokenized_pretraining_dataset.cleanup_cache_files()
tokenized_pretraining_dataset

Loading cached processed dataset at /home/lk3591/.cache/huggingface/datasets/generator/default-779e027cf20da0b2/0.0.0/cache-29f5795db8dd2a8f.arrow


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 485
})

In [8]:
split_tokenized_pretraining_dataset = tokenized_pretraining_dataset.train_test_split(
    test_size=0.1, load_from_cache_file=USE_CACHE
)
if CLR_CACHE:
    split_tokenized_pretraining_dataset.cleanup_cache_files()
split_tokenized_pretraining_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 436
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 49
    })
})

In [9]:
# TODO: use AutoModelForPreTraining where possible

In [10]:
config = transformers.BertConfig(
    vocab_size=fast_tokenizer.vocab_size,
    hidden_size=64,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=1024,
    max_position_embeddings=MODEL_MAX_LENGTH,
    type_vocab_size=2,
    pad_token_id=fast_tokenizer.pad_token_id,
    position_embedding_type="absolute",
    use_cache=True,
    classifier_dropout=None,
)
encoder = transformers.AutoModelForMaskedLM.from_config(config)
data_collator = transformers.DataCollatorForLanguageModeling(fast_tokenizer)

In [11]:
args = transformers.TrainingArguments(
    output_dir="./tmp",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    optim="adamw_torch",
)

trainer = transformers.Trainer(
    model=encoder,
    args=args,
    data_collator=data_collator,
    train_dataset=split_tokenized_pretraining_dataset["train"],
    eval_dataset=split_tokenized_pretraining_dataset["test"],
    tokenizer=fast_tokenizer,
)

trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TrainOutput(global_step=84, training_loss=5.538658868698847, metrics={'train_runtime': 7.5628, 'train_samples_per_second': 172.951, 'train_steps_per_second': 11.107, 'total_flos': 1794732950088.0, 'train_loss': 5.538658868698847, 'epoch': 3.0})

In [12]:
encoder.save_pretrained("./tmp/encoder")

In [14]:
config = transformers.GPT2Config(
    vocab_size=fast_tokenizer.vocab_size,
    n_positions=MODEL_MAX_LENGTH,
    n_embd=64,
    n_layer=4,
    n_head=4,
    n_inner=1024,
    bos_token_id=fast_tokenizer.bos_token_id,
    eos_token_id=fast_tokenizer.eos_token_id,
)
decoder = transformers.AutoModelForCausalLM.from_config(config)
data_collator = transformers.DataCollatorForLanguageModeling(fast_tokenizer, mlm=False)

In [15]:
args = transformers.TrainingArguments(
    output_dir="./tmp",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    optim="adamw_torch",
)

trainer = transformers.Trainer(
    model=encoder,
    args=args,
    data_collator=data_collator,
    train_dataset=split_tokenized_pretraining_dataset["train"],
    eval_dataset=split_tokenized_pretraining_dataset["test"],
    tokenizer=fast_tokenizer,
)

trainer.train()



Step,Training Loss


TrainOutput(global_step=84, training_loss=5.228447687058222, metrics={'train_runtime': 4.8016, 'train_samples_per_second': 272.41, 'train_steps_per_second': 17.494, 'total_flos': 1794732950088.0, 'train_loss': 5.228447687058222, 'epoch': 3.0})

In [16]:
decoder.save_pretrained("./tmp/decoder")

In [17]:
reload(preprocess)

pseudo_parallel_dataset = preprocess.get_seq2seq_dataset(
    list((snippets_path / "mal").iterdir()),
    list((snippets_path / "ben").iterdir()),
    "min",
)
if CLR_CACHE:
    pseudo_parallel_dataset.cleanup_cache_files()
print(pseudo_parallel_dataset)
print(pseudo_parallel_dataset[0].keys())
print(pseudo_parallel_dataset[0]["mal"])

Found cached dataset generator (/home/lk3591/.cache/huggingface/datasets/generator/default-f57281bd6d616c3b/0.0.0)


Dataset({
    features: ['mal', 'ben'],
    num_rows: 28
})
dict_keys(['mal', 'ben'])
0xc601	push ecx
0xc602	mov ecx, dword ptr [ebx]
0xc604	call dword ptr [0x40f118]
0xc60a	pop ecx
0xc60b	pop edx
0xc60c	pop eax
0xc60d	jmp dword ptr [ebx]
0xc60f	call 0xc723
0xc614	fxch st(1)
0xc616	lea esp, [esp]
0xc61d	lea ecx, [ecx]
0xc620	fstp st(0)
0xc622	lea esp, [esp]
0xc629	lea esp, [esp]
0xc630	ret
0xc631	call 0xc723
0xc636	jmp 0xc620
0xc638	fstp st(0)
0xc63a	fstp st(0)
0xc63c	fldz
0xc63e	ret
0xc63f	nop
0xc640	fstp st(0)
0xc642	fstp st(0)
0xc644	fldz
0xc646	test ch, ch
0xc648	je 0xc64c
0xc64a	fchs
0xc64c	ret
0xc64d	fstp st(0)
0xc64f	nop
0xc650	fstp st(0)
0xc652	fld1
0xc654	ret
0xc655	lea esp, [esp]
0xc65c	lea esp, [esp]
0xc660	fstp xword ptr [ebp - 0x9e]
0xc666	fld xword ptr [ebp - 0x9e]
0xc66c	test byte ptr [ebp - 0x97], 0x40
0xc673	je 0xc67d
0xc675	mov byte ptr [ebp - 0x90], 0
0xc67c	ret
0xc67d	mov byte ptr [ebp - 0x90], 0
0xc684	fadd qword ptr [0x4a0a6e]
0xc68a	ret
0xc68b	jmp 0xc690
0xc68d	i

In [21]:
tokenized_pseudo_parallel_dataset = preprocess.get_processed_seq2seq_dataset(
    pseudo_parallel_dataset, fast_tokenizer, USE_CACHE, truncation=True, max_length=MODEL_MAX_LENGTH,
)
if CLR_CACHE:
    tokenized_pseudo_parallel_dataset.cleanup_cache_files()
print(tokenized_pseudo_parallel_dataset)
print(tokenized_pseudo_parallel_dataset[0].keys())
print(tokenized_pseudo_parallel_dataset[0]["input_ids"])
print(tokenized_pseudo_parallel_dataset[0]["labels"])

                                                            

Dataset({
    features: ['mal', 'ben', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 28
})
dict_keys(['mal', 'ben', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])
[5, 30, 22, 16, 22, 9, 18, 15, 13, 31, 14, 32, 18, 15, 13, 10, 12, 11, 14, 36, 22, 36, 23, 36, 17, 42, 18, 15, 13, 31, 14, 32, 10, 12, 11, 145, 92, 90, 44, 91, 38, 25, 9, 13, 25, 14, 38, 22, 9, 13, 22, 14, 85, 92, 90, 41, 91, 38, 25, 9, 13, 25, 14, 38, 25, 9, 13, 25, 14, 47, 32, 10, 12, 11, 42, 10, 12, 11, 85, 92, 90, 41, 91, 85, 92, 90, 41, 91, 139, 47, 37, 85, 92, 90, 41, 91, 85, 92, 90, 41, 91, 139, 43, 75, 9, 75, 39, 10, 12, 11, 153, 47, 85, 92, 90, 41, 91, 37, 85, 92, 90, 41, 91, 157, 47, 38, 25, 9, 13, 25, 14, 38, 25, 9, 13, 25, 14, 85, 127, 15, 13, 19, 21, 10, 12, 11, 14, 89, 127, 15, 13, 19, 21, 10, 12, 11, 14, 43, 26, 15, 13, 19, 21, 10, 12, 11, 14, 9, 10, 12, 11, 39, 10, 12, 11, 16, 26, 15, 13, 19, 21, 10, 12, 11, 14, 9, 41, 47, 16, 26, 15, 13, 19, 21, 10, 12, 11, 14, 9, 41, 



In [22]:
split_tokenized_pseudo_parallel_dataset = tokenized_pseudo_parallel_dataset.train_test_split(
    test_size=0.1, load_from_cache_file=USE_CACHE
)
if CLR_CACHE:
    split_tokenized_pseudo_parallel_dataset.cleanup_cache_files()
split_tokenized_pseudo_parallel_dataset

DatasetDict({
    train: Dataset({
        features: ['mal', 'ben', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
    test: Dataset({
        features: ['mal', 'ben', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})

In [23]:
seq2seq = transformers.EncoderDecoderModel.from_encoder_decoder_pretrained(
    "./tmp/encoder", "./tmp/decoder"
)
seq2seq.config.decoder_start_token_id = fast_tokenizer.bos_token_id
seq2seq.config.forced_bos_token_id = True
seq2seq.config.eos_token_id = fast_tokenizer.eos_token_id
seq2seq.config.forced_eos_token_id = True
seq2seq.config.pad_token_id = fast_tokenizer.pad_token_id

data_collator = transformers.DataCollatorForSeq2Seq(
    fast_tokenizer,
    model=seq2seq,
    max_length=MODEL_MAX_LENGTH,
    padding="max_length",
)

Some weights of the model checkpoint at ./tmp/encoder were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at ./tmp/decoder and are newly initialized: ['transformer.h.3.crossattention.q_attn.weight', 'transformer.h.

In [24]:
args = transformers.Seq2SeqTrainingArguments(
    output_dir="./tmp",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    optim="adamw_torch",
)

trainer = transformers.Seq2SeqTrainer(
    model=seq2seq,
    args=args,
    data_collator=data_collator,
    train_dataset=split_tokenized_pseudo_parallel_dataset["train"],
    eval_dataset=split_tokenized_pseudo_parallel_dataset["test"],
    tokenizer=fast_tokenizer,
)

trainer.train()



Step,Training Loss


TrainOutput(global_step=6, training_loss=5.62211799621582, metrics={'train_runtime': 0.8187, 'train_samples_per_second': 91.606, 'train_steps_per_second': 7.329, 'total_flos': 291210854400.0, 'train_loss': 5.62211799621582, 'epoch': 3.0})

In [25]:
seq2seq.save_pretrained("./tmp/seq2seq")