# Using neural networks for word embeddings

In [1]:
import sentencepiece as spm

## Convert original corpus into pieces

In [2]:
# Load the trained SentencePiece model (a unigram model, going by its filename)
# that is used below to split corpus lines into pieces.
SP_MODEL_PATH = "models/original_60k_250x1/level1/unigram_vs60000_lw.model"

sp_model = spm.SentencePieceProcessor()
sp_model.Load(SP_MODEL_PATH)

True

In [None]:
# Re-encode every line of the original corpus as SentencePiece pieces and
# append the cluster-mapped form of each line to `new_utterances[word]`.
#
# NOTE(review): `new_utterances`, `word` and `clusters` are not defined in any
# cell visible in this notebook, so this cell raises NameError on a fresh
# kernel — define (or load) them before running it.
all_sentences = ""  # NOTE(review): never read or updated in this cell
with open("data/level_wise/level0/corpus_original.txt", "r", encoding="utf-8") as original_corpus_fp:
    # Iterate the file object directly so the corpus is streamed line by line
    # instead of being loaded in full by readlines().
    for line in original_corpus_fp:
        # Drop pieces that consist only of the "▁" marker.
        pieces = [piece for piece in sp_model.EncodeAsPieces(line) if piece != "▁"]

        # Strip the "▁" marker from the remaining pieces.
        units = [piece.replace("▁", "") for piece in pieces]

        # Map each unit to its cluster symbol and concatenate them.
        new_utterances[word].append("".join(
            clusters.word_to_cluster[piece] for piece in units
        ))

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Hyperparameters for the n-gram embedding experiment.
EMBEDDING_DIM = 150  # presumably the embedding vector size — not used in the visible cells
CONTEXT_SIZE = 40    # number of preceding tokens that form the context of each n-gram

In [None]:
# Build (context, target) pairs: for every position from CONTEXT_SIZE onwards,
# the context is the CONTEXT_SIZE preceding tokens (nearest one first) and the
# target is the token at that position.
# NOTE(review): `test_sentence` is not defined in any cell visible here.
ngrams = [
    (
        [test_sentence[target - back] for back in range(1, CONTEXT_SIZE + 1)],
        test_sentence[target],
    )
    for target in range(CONTEXT_SIZE, len(test_sentence))
]

# BERT from scratch

## Step 1 - Train the tokenizer

In [1]:
from tokenizers import BertWordPieceTokenizer

In [2]:
# Tokens passed as special tokens when the WordPiece tokenizer is trained.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]

# Text files the tokenizer is trained on: here only the training corpus.
# To train on several sets, list them all, e.g. ["train.txt", "test.txt"].
files = ["data/level_wise/level0/corpus_original.txt"]

# Target vocabulary size (BERT's own default is 30,522); tune as needed.
vocab_size = 60000

In [17]:
# Create a cased WordPiece tokenizer: clean_text is on, while lower-casing and
# accent stripping are both switched off.
tokenizer = BertWordPieceTokenizer(clean_text=True, strip_accents=False, lowercase=False)

# Learn the vocabulary from the corpus files. min_frequency=0 puts no
# frequency floor on candidate tokens.
tokenizer.train(
    files,
    special_tokens=special_tokens,
    vocab_size=vocab_size,
    min_frequency=0,
    show_progress=True,
)

In [3]:
from os.path import isdir, join
from os import mkdir
from json import dump

In [None]:
# NOTE(review): bare attribute lookup with no call or assignment — looks like
# leftover exploration; consider removing this cell.
tokenizer.mask

In [20]:
# Save the trained tokenizer together with a small JSON record of the
# settings it was trained with.
from os import makedirs

tokenizer_path = "bert-lvl0"

# makedirs(..., exist_ok=True) replaces the isdir()/mkdir() pair: it does not
# fail when the directory already exists and also creates missing parents.
makedirs(tokenizer_path, exist_ok=True)

# Write the tokenizer's model files into the directory.
tokenizer.save_model(tokenizer_path)

# Settings used to train the tokenizer. strip_accents is recorded as well,
# since it was set explicitly (to False) when the tokenizer was created.
tokenizer_config = {
    "path": tokenizer_path,
    "vocab_size": vocab_size,
    "special_tokens": special_tokens,
    "clean_text": True,
    "lowercase": False,
    "strip_accents": False
}

# Plain "w" is enough: the file is only written, never read back here.
with open(join(tokenizer_path, "config.json"), "w", encoding="utf-8") as config_fp:
    dump(tokenizer_config, config_fp)

In [None]:
from transformers import BertTokenizerFast

In [3]:
# Load the trained vocabulary as a transformers tokenizer.
# do_lower_case / strip_accents are passed explicitly because BertTokenizerFast
# lower-cases its input by default, whereas the WordPiece tokenizer in this
# notebook is trained with lowercase=False and strip_accents=False; leaving the
# defaults would tokenize lower-cased text against a cased vocabulary.
# NOTE(review): the tokenizer trained above is saved under "bert-lvl0/";
# confirm that this vocab file is the intended one.
tokenizer = BertTokenizerFast(
    vocab_file="bert/lvl0-wp-tokenizer-vocab.txt",
    do_lower_case=False,
    strip_accents=False,
)

## Step 2 - Train BERT

In [4]:
from transformers import BertConfig, BertForMaskedLM, TrainingArguments, DataCollatorForLanguageModeling, Trainer

In [5]:
# Maximum sequence length; it is also passed to BertConfig as
# max_position_embeddings. Lowering it speeds up training, especially when the
# batch size is increased in exchange.
max_length = 1000

In [6]:
# Describe a BERT model sized to the tokenizer vocabulary and the maximum
# sequence length chosen above.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
)

# Instantiate the masked-LM model from that config (no pretrained weights are
# loaded here).
model = BertForMaskedLM(model_config)

In [64]:
# Collator for masked language modelling: mlm is enabled with a masking
# probability of 0.2.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.2)

In [65]:
training_args = TrainingArguments(
    # Checkpoint location; existing contents of the directory may be overwritten.
    output_dir="models/bert-lvl0/",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=10,              # number of training epochs; tune as needed
    per_device_train_batch_size=10,   # raise as far as GPU memory allows
    gradient_accumulation_steps=8,    # accumulate gradients before each weight update
    # Evaluate, log and save a checkpoint every 1000 steps.
    evaluation_strategy="steps",
    logging_steps=1000,
    save_steps=1000,
    per_device_eval_batch_size=64,    # evaluation batch size
    # Checkpoint retention.
    save_total_limit=3,               # keep only 3 checkpoints on disk to save space
    load_best_model_at_end=True,      # load the best model (in terms of loss) when training ends
)

In [18]:
from datasets import load_dataset, Dataset

In [50]:
# Load the training and dev corpora with the "text" loader as the "train" and
# "test" splits.
dataset = load_dataset(
    "text",
    data_files={
        "train": "data/level_wise/level0/corpus_original.txt",
        "test": "data/level_wise/level0/dev_corpus_original.txt",
    },
)

Downloading and preparing dataset text/default to C:/Users/mj115gl/.cache/huggingface/datasets/text/default-3c406c1161a7d444/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to C:/Users/mj115gl/.cache/huggingface/datasets/text/default-3c406c1161a7d444/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# NOTE(review): encode() is called without any text argument — this looks like
# leftover exploration and would raise as written; consider removing this cell.
tokenizer.encode()

In [51]:
# Tokenize the training split into the three fields used for MLM training.
#
# `tokenizer` is a BertTokenizerFast at this point in the notebook, so it is
# called directly: BertTokenizerFast.encode() returns a plain list of ids,
# which has no .ids / .attention_mask / .special_tokens_mask attributes.
# Sequences are truncated to max_length so they never exceed the model's
# max_position_embeddings.
train_dataset = []

for row in dataset['train']:
    encoding = tokenizer(
        row["text"],
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    train_dataset.append({
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'special_tokens_mask': encoding['special_tokens_mask']
    })

In [52]:
# Tokenize the test split into the three fields used for MLM evaluation.
#
# `tokenizer` is a BertTokenizerFast at this point in the notebook, so it is
# called directly: BertTokenizerFast.encode() returns a plain list of ids,
# which has no .ids / .attention_mask / .special_tokens_mask attributes.
# Sequences are truncated to max_length so they never exceed the model's
# max_position_embeddings.
test_dataset = []

for row in dataset['test']:
    encoding = tokenizer(
        row["text"],
        truncation=True,
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    test_dataset.append({
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'special_tokens_mask': encoding['special_tokens_mask']
    })

In [53]:
# Turn the lists of encoded rows into Dataset objects...
test_dataset = Dataset.from_list(test_dataset)
train_dataset = Dataset.from_list(train_dataset)

# ...and have both return torch tensors when indexed.
test_dataset.set_format(type="torch")
train_dataset.set_format(type="torch")

In [54]:
# Sanity check: number of examples in the train and test splits.
len(train_dataset), len(test_dataset)

(281241, 5567)

In [None]:
# NOTE(review): bare attribute lookup with no call or assignment — looks like
# leftover exploration; consider removing this cell.
tokenizer.pad

In [63]:
# Re-create the transformers tokenizer from the trained vocabulary.
# do_lower_case / strip_accents are passed explicitly because BertTokenizerFast
# lower-cases its input by default, whereas the WordPiece tokenizer in this
# notebook is trained with lowercase=False and strip_accents=False.
# NOTE(review): this repeats an earlier cell; rebinding the name here does not
# change the tokenizer object already handed to `data_collator`.
tokenizer = BertTokenizerFast(
    vocab_file="bert/lvl0-wp-tokenizer-vocab.txt",
    do_lower_case=False,
    strip_accents=False,
)

In [66]:
# initialize the trainer and pass everything to it
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [67]:
# Start training.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e.
# the recorded run was stopped before it finished.
trainer.train()



  0%|          | 0/35150 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 