In [1]:
import pandas as pd
import transformers
from transformers import BatchEncoding

  from .autonotebook import tqdm as notebook_tqdm


In [60]:
class customTokenizer:
    def __init__(self, all_tokens):
        self.all_tokens = all_tokens
        self.token_dict = {}
        self.gen_token_dict()
        self.vocab_size = len(self.token_dict)+1

    def gen_token_dict(self):
        for i,token in enumerate(self.all_tokens):
            self.token_dict[token] = i+1
        
    def get_token(self, token):
        token_type = token[1]
        token_value = token[0]
        if token_type == "String":
            return "<STR>"
        elif token_type == "Number":
            return "<NUM>"
        elif token_type == "RegularExpression":
            return "<REGEX>"
        elif token_type == "Template":
            return "<TEMPLATE>"
        elif token_value not in self.token_dict:
            return "<UNK>"
        else:
            return token_value
    

    def tokenize(self, row, max_length=512):
        tokenized = {}
        tokens = row["tokens"]
        annotations = row["annotations"]
        
        if len(tokens) < max_length:
            tokenized["input_ids"] = [self.token_dict[self.get_token(token)] for token in tokens] + [0]*(max_length - len(tokens))
            tokenized["token_type_ids"] = annotations + [0]*(max_length - len(tokens))
            tokenized["attention_mask"] = ([1] * len(tokens)) + [0] * (max_length - len(tokens))
        else:
            tokenized["input_ids"] = [self.token_dict[self.get_token(token)] for token in tokens[:max_length]]
            tokenized["token_type_ids"] = annotations[:max_length]
            tokenized["attention_mask"] = [1] * max_length
        
        tokenized["label"] = row["label"]
        return tokenized


def get_all_tokens_from_df(df):
    result = set()
    all_sequences = df["tokens"]
    for i,tokens in all_sequences.items():
        for token in tokens:
            token_type = token[1]
            token_value = token[0]
            if token_type == "String":
                result.add("<STR>")
            elif token_type == "Number":
                result.add("<NUM>")
            elif token_type == "RegularExpression":
                result.add("<REGEX>")
            elif token_type == "Template":
                result.add("<TEMPLATE>")
            else:
                result.add(token_value)
    result.add("<UNK>")
    return list(result)

df = pd.read_pickle("full_escape.pkl")
all_tokens = get_all_tokens_from_df(df)
print(len(all_tokens))
custom_tokenizer = customTokenizer(all_tokens)

27445


In [61]:
df.label.value_counts()

label
0.0    1079
1.0    1003
Name: count, dtype: int64

In [62]:
def get_all_types_from_df(df):
    result = {}
    all_sequences = df["tokens"]
    for i,tokens in all_sequences.items():
        for token in tokens:
            token_type = token[1]
            if token_type in result:
                result[token_type].add(token[0])
            else:
                result[token_type] = set()
                result[token_type].add(token[0])
    return result
res = get_all_types_from_df(df)
for key in res:
    print(key, len(res[key]))

Punctuator 49
Keyword 30
Identifier 23002
Numeric 4357
String 36103
Null 1
RegularExpression 1089
Template 86
Boolean 2


In [63]:
processed_data = []

for i in range(len(df)):
    processed_data.append(custom_tokenizer.tokenize(df.iloc[i]))

In [85]:
from sklearn.model_selection import train_test_split
new_df = pd.DataFrame(processed_data)
new_df["label"] = new_df["label"].astype(int)
train_df, valid_df = train_test_split(
    new_df,
    test_size=0.05,
    random_state=2022
)

In [86]:
valid_df.label.value_counts()

label
0    53
1    52
Name: count, dtype: int64

In [98]:
import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

In [109]:
from transformers import BertConfig, BertForSequenceClassification
config = BertConfig(custom_tokenizer.vocab_size, hidden_size=300, 
                    num_hidden_layers=2, num_attention_heads=2, is_decoder=True,
                    add_cross_attention=True, num_labels=2)
model = BertForSequenceClassification(config)

In [110]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="./result")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg
)
trainer.train()







[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A

{'loss': 0.6848, 'learning_rate': 1.639784946236559e-05, 'epoch': 2.02}








[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A

{'train_runtime': 561.8224, 'train_samples_per_second': 10.557, 'train_steps_per_second': 1.324, 'train_loss': 0.6812012785224504, 'epoch': 3.0}





TrainOutput(global_step=744, training_loss=0.6812012785224504, metrics={'train_runtime': 561.8224, 'train_samples_per_second': 10.557, 'train_steps_per_second': 1.324, 'train_loss': 0.6812012785224504, 'epoch': 3.0})

In [45]:
trainer.evaluate(valid_hg)





[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A

KeyboardInterrupt: 

In [111]:
result = trainer.predict(valid_hg)








[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





100%|██████████| 14/14 [00:03<00:00,  3.69it/s]


In [103]:
preds

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1])

In [112]:
import numpy as np

preds = np.argmax(result.predictions, axis=-1)
print(preds)
result.label_ids

[1 1 1 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0
 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1
 1 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1]


array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

In [113]:

import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=result.label_ids)

{'accuracy': 0.5238095238095238, 'f1': 0.6093750000000001}

In [84]:
{'accuracy': 0.5371702637889688, 'f1': 0.5849462365591398}

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,

In [115]:
precision_recall_fscore_support(y_true, y_pred, average='macro')

0.49523809523809526