In [1]:
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import BertConfig, BertForMaskedLM
from transformers import Trainer
import pandas as pd

In [2]:
data = pd.read_csv('dataset.csv')
data.head()

Unnamed: 0,text,label,link
0,–í 2003-–µ–º –≥–æ–¥—É –ø–æ–¥ —Ä—É–∫–æ–≤–æ–¥—Å—Ç–≤–æ–º –º–∞–ª–æ–∏–∑–≤–µ—Å—Ç–Ω–æ–≥–æ...,0,dataset/neg/1000083-0.txt
1,"–ì—Ä—É—Å—Ç–Ω–æ –∏ –ø–µ—á–∞–ª—å–Ω–æ. –ì—Ä—É—Å—Ç–Ω–æ –æ—Ç —Ç–æ–≥–æ, —á—Ç–æ –¥–æ–≤–æ–ª...",0,dataset/neg/1000083-1.txt
2,–î–∞–≤–Ω—ã–º-–¥–∞–≤–Ω–æ –ö–∏—Ä–∞ –ù–∞–π—Ç–ª–∏ –≤–æ—Ä–≤–∞–ª–∞—Å—å –Ω–∞ —ç–∫—Ä–∞–Ω –æ—Ç...,0,dataset/neg/1000125-3.txt
3,"–Ø, –≤ –æ–±—â–µ–º, –Ω–∏—á–µ–≥–æ –ø—Ä–æ—Ç–∏–≤ —É—Ä–∞–≤–Ω–æ–≤–µ—à–µ–Ω–Ω–æ–≥–æ —Ñ–µ–º–∏...",0,dataset/neg/1000125-4.txt
4,"–ò–∑–º–µ–Ω–∞ –æ–¥–∏–Ω –∏–∑ —Å—é–∂–µ—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–π –≤—Å–µ–≥–¥–∞ –±—É–¥–µ—Ç ...",0,dataset/neg/1000125-6.txt


In [3]:
# —Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –Ω–∞ –∫–æ—Ä–ø—É—Å–µ
bert_tokenizer = BertWordPieceTokenizer()
bert_tokenizer.train('corpus.txt')

In [4]:
len(bert_tokenizer.get_vocab())

30000

In [5]:
# –ø—Ä–æ–≤–µ—Ä–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
tokenized_sentence = bert_tokenizer.encode('–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤–∞—Å –≥—Ä–∞–∂–¥–∞–Ω–µ !')
tokenized_sentence.tokens

['–ø—Ä–∏–≤–µ—Ç', '##—Å—Ç–≤—É—é', '–≤–∞—Å', '–≥—Ä–∞–∂–¥–∞–Ω–µ', '!']

In [6]:
# —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–Ω–∏–µ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
bert_tokenizer.save_model('tokenizer')
tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [7]:
tokenized_sentence = tokenizer.encode('–ü—Ä–∏–≤–µ—Ç—Å—Ç–≤—É—é –≤–∞—Å –≥—Ä–∞–∂–¥–∞–Ω–µ !')
tokenized_sentence.tokens

['[CLS]', '–ø—Ä–∏–≤–µ—Ç', '##—Å—Ç–≤—É—é', '–≤–∞—Å', '–≥—Ä–∞–∂–¥–∞–Ω–µ', '!', '[SEP]']

In [8]:
# –∑–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞ –≤ –æ–±–æ–ª–æ—á–∫–µ transformers
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")

# –æ–±–æ–ª–æ—á–∫–∞ LineByLine –¥–ª—è –±–æ–ª–µ–µ –±—ã—Å—Ç—Ä–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è
dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "corpus.txt",
    block_size = 128
)

# collator MLM –¥–ª—è –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∏ –∫ –æ–±—É—á–µ–Ω–∏—é
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = True,
    mlm_probability = 0.15
)



In [9]:
# –∞—Ä–≥—É–º–µ–Ω—Ç—ã –æ–±—É—á–µ–Ω–∏—è 
training_args = TrainingArguments(
    output_dir = "BERT",
    overwrite_output_dir = True,
    num_train_epochs = 1,
    per_device_train_batch_size = 128
)

# —Å–æ–∑–¥–∞–Ω–∏–µ –º–æ–¥–µ–ª–∏ BERT —Å –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–µ–π –ø–æ —É–º–æ–ª—á–∞–Ω–∏—é (—Ç.–µ. –±–∞–∑–æ–≤–∞—è –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞ –º–æ–¥–µ–ª–∏)
tiny_bert_config = BertConfig(
    max_position_embeddings=512,
    hidden_size=128,
    num_attention_heads=2,
    num_hidden_layers=2,
    intermediate_size=512
)

tiny_bert = BertForMaskedLM(tiny_bert_config)


# –æ–±—ä–µ–∫—Ç Trainer
trainer = Trainer(
    model = tiny_bert,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset
)

# —Ç—Ä–µ–Ω–∏—Ä–æ–≤–∫–∞
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33moudelexsus2010[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1029 [00:00<?, ?it/s]

{'loss': 9.1263, 'grad_norm': 1.171692132949829, 'learning_rate': 2.5704567541302237e-05, 'epoch': 0.49}
{'loss': 8.1207, 'grad_norm': 1.0261869430541992, 'learning_rate': 1.4091350826044704e-06, 'epoch': 0.97}
{'train_runtime': 399.7437, 'train_samples_per_second': 329.396, 'train_steps_per_second': 2.574, 'train_loss': 8.605672586068467, 'epoch': 1.0}


TrainOutput(global_step=1029, training_loss=8.605672586068467, metrics={'train_runtime': 399.7437, 'train_samples_per_second': 329.396, 'train_steps_per_second': 2.574, 'train_loss': 8.605672586068467, 'epoch': 1.0})

In [10]:
# —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ
trainer.save_model("MyBERT")