In [None]:
# Mount Google Drive into the Colab filesystem; the model paths configured
# later in this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Report the total RAM of the runtime and classify it against the 20 GB
# threshold used to tell a high-RAM runtime from a standard one.
from psutil import virtual_memory

HIGH_RAM_THRESHOLD_GB = 20

ram_gb = virtual_memory().total / 1e9  # decimal gigabytes
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb >= HIGH_RAM_THRESHOLD_GB:
    print('You are using a high-RAM runtime!')
else:
    print('Not using a high-RAM runtime')

In [None]:
!pip install transformers

In [None]:
!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/requirements.txt

!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/train.json
!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/dev.json
!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/test.json

!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/Data_for_training_MLM/viquad_mlm_train.txt
!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/Data_for_training_MLM/viquad_mlm_dev.txt
!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/Data_for_training_MLM/viquad_mlm_test.txt

!curl -O https://raw.githubusercontent.com/manhtientran/4998-Scripts/master/vi_squad-translate-train-train-v1.1.json


!pip install -r requirements.txt

In [None]:
import json
import math
import os
import random
from itertools import chain
from pathlib import Path

import datasets
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from huggingface_hub import Repository
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    SchedulerType,
    get_scheduler,
)
from transformers.utils import get_full_repo_name, send_example_telemetry
from transformers.utils.versions import require_version

In [None]:
# Stop early, with an installation hint, when the installed `datasets` package
# does not satisfy the minimum version requirement.
require_version(
    "datasets>=1.8.0",
    hint="To fix: pip install -r examples/pytorch/language-modeling/requirements.txt",
)

# Notebook-level switches: experiment tracking is off and no worker count is
# set for dataset preprocessing.
with_tracking = False
preprocessing_num_workers = None

In [None]:
# Default Config
#
# Single source of truth for every option the cells below read: input files,
# checkpoint location, batch sizes, optimiser settings and tokenisation options.
CONFIG = dict(
    train_file="viquad_mlm_train.txt",
    validation_file="viquad_mlm_test.txt",
    pad_to_max_length=True,
    model_name_or_path="/content/drive/MyDrive/IT4998/models/PhoBERT-Language-Modeling-June_26_2022_02h_46m_53s",
    use_slow_tokenizer=True,
    per_device_train_batch_size=68,
    per_device_eval_batch_size=68,
    learning_rate=4e-5,
    weight_decay=0.01,
    num_train_epochs=3,          # DEFAULT 3
    gradient_accumulation_steps=1,
    lr_scheduler_type="linear",
    num_warmup_steps=0,
    output_dir="/content/drive/MyDrive/IT4998/models",
    seed=42,
    max_seq_length=256,
    line_by_line=True,
    overwrite_cache=True,
    mlm_probability=0.15,
)

# Echo the effective configuration, one "name : value" pair per line.
for option_name in CONFIG:
    print(option_name, ":", CONFIG[option_name])

In [None]:
# Create the Accelerate handle used by the rest of the notebook.
accelerator = Accelerator()

# Library logging: every process except the local main one is limited to
# errors; the local main process gets warnings from `datasets` and info-level
# messages from `transformers`.
if not accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()
else:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

# Seed the run with the configured seed.
set_seed(CONFIG["seed"])

# Only the main process creates the output directory (when one is configured);
# all processes then synchronise before continuing.
if accelerator.is_main_process and CONFIG["output_dir"] is not None:
    os.makedirs(CONFIG["output_dir"], exist_ok=True)

accelerator.wait_for_everyone()

In [None]:
# Get the datasets from the local files named in CONFIG ("train_file" and/or
# "validation_file"). Either entry may be None, but at least one must be set.
#
# The loader is chosen from the file extension; ".txt" maps to the "text"
# builder of `datasets`. For CSV/JSON files, the later preprocessing cell uses
# the column called 'text', or the first column if no 'text' column is found.
#
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.

data_files = {}
if CONFIG["train_file"] is not None:
    data_files["train"] = CONFIG["train_file"]
if CONFIG["validation_file"] is not None:
    data_files["validation"] = CONFIG["validation_file"]
if not data_files:
    raise ValueError('Set CONFIG["train_file"] and/or CONFIG["validation_file"] before loading the data.')

# Infer the extension from the first configured file (the training file when
# present, exactly as before) instead of dereferencing CONFIG["train_file"]
# unconditionally, which crashed when only a validation file was configured.
extension = next(iter(data_files.values())).split(".")[-1]
if extension == "txt":
    extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)

In [None]:
# Load the pretrained config, tokenizer and masked-LM model from the
# checkpoint configured in CONFIG["model_name_or_path"].
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

checkpoint = CONFIG["model_name_or_path"]

config = AutoConfig.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    use_fast=not CONFIG["use_slow_tokenizer"],
)
model = AutoModelForMaskedLM.from_pretrained(
    checkpoint,
    config=config,
    # A ".ckpt" in the path is taken to mean a TensorFlow checkpoint.
    from_tf=".ckpt" in checkpoint,
)

# Resize the token embeddings to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Preprocessing the datasets.
# Decide which column holds the text and which sequence length to tokenize to.
#
# Column names are read from the "train" split when it exists; otherwise from
# the first split that was loaded. This notebook only evaluates the
# "validation" split, so a training split should not be required here.
reference_split = "train" if "train" in raw_datasets else next(iter(raw_datasets))
column_names = raw_datasets[reference_split].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

# Warn when the requested length exceeds what the tokenizer reports as its limit.
if CONFIG["max_seq_length"] > tokenizer.model_max_length:
    print(
        """The max_seq_length passed {} is larger than the maximum length for the
        model {}. Using max_seq_length={}.""".format(CONFIG["max_seq_length"], tokenizer.model_max_length, tokenizer.model_max_length)
    )

# Effective length: the smaller of the configured value and the tokenizer limit.
max_seq_length = min(CONFIG["max_seq_length"], tokenizer.model_max_length)
print("max_seq_length :", max_seq_length)

In [None]:
# Tokenize `raw_datasets` into `tokenized_datasets`. Two modes:
#   * line_by_line: each non-blank line becomes one example, truncated (and
#     optionally padded) to max_seq_length;
#   * otherwise: every text is tokenized, then everything is concatenated and
#     cut into chunks of max_seq_length tokens.

# Options common to every `.map` call in this cell.
shared_map_kwargs = dict(
    batched=True,
    num_proc=None,
    load_from_cache_file=not CONFIG["overwrite_cache"],
)

if CONFIG["line_by_line"]:
    # When using line_by_line, we just tokenize each nonempty line.
    padding = "max_length" if CONFIG["pad_to_max_length"] else False

    def tokenize_function(examples):
        """Tokenize a batch of lines, discarding empty and whitespace-only ones."""
        kept_lines = [
            text for text in examples[text_column_name] if not (len(text) == 0 or text.isspace())
        ]
        examples[text_column_name] = kept_lines
        # `return_special_tokens_mask=True` is requested because
        # DataCollatorForLanguageModeling (used later) is more efficient when it
        # receives the `special_tokens_mask`.
        return tokenizer(
            kept_lines,
            padding=padding,
            truncation=True,
            max_length=max_seq_length,
            return_special_tokens_mask=True,
        )

    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            remove_columns=[text_column_name],
            desc="Running tokenizer on dataset line_by_line",
            **shared_map_kwargs,
        )
else:
    # Otherwise, tokenize every text first; grouping into fixed-size chunks
    # happens in a second pass below.
    def tokenize_function(examples):
        """Tokenize a batch of texts, also returning the special-tokens mask."""
        return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

    with accelerator.main_process_first():
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            remove_columns=column_names,
            desc="Running tokenizer on every text in dataset",
            **shared_map_kwargs,
        )

    def group_texts(examples):
        """Concatenate every column of the batch and split it into chunks of max_seq_length."""
        merged = {name: list(chain(*examples[name])) for name in examples.keys()}
        first_name = list(examples.keys())[0]
        usable_length = len(merged[first_name])
        # Drop the remainder that does not fill a whole chunk (only when at
        # least one full chunk is available).
        if usable_length >= max_seq_length:
            usable_length -= usable_length % max_seq_length
        chunk_starts = range(0, usable_length, max_seq_length)
        return {
            name: [tokens[start : start + max_seq_length] for start in chunk_starts]
            for name, tokens in merged.items()
        }

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. A larger batch size reduces that loss but might be
    # slower to preprocess. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    with accelerator.main_process_first():
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            desc=f"Grouping texts in chunks of {max_seq_length}",
            **shared_map_kwargs,
        )

In [None]:
# The evaluation cells below consume only the "validation" split. Fail with an
# actionable message (still a KeyError, as before) when that split was never
# loaded, e.g. because CONFIG["validation_file"] is None.
if "validation" not in tokenized_datasets:
    raise KeyError(
        'No "validation" split was loaded; set CONFIG["validation_file"] before running the evaluation cells.'
    )
eval_dataset = tokenized_datasets["validation"]

In [None]:
# Collator that builds masked-LM batches with the configured masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=CONFIG["mlm_probability"],
)

# Batches of the evaluation split, collated by the MLM collator above.
eval_dataloader = DataLoader(
    dataset=eval_dataset,
    batch_size=CONFIG["per_device_eval_batch_size"],
    collate_fn=data_collator,
)

In [None]:
# On TPU, the tied weights in our model have been disconnected, so the ties
# need to be restored before the model is used.
running_on_tpu = accelerator.distributed_type == DistributedType.TPU
if running_on_tpu:
    model.tie_weights()

In [None]:
# Hand the model and the evaluation dataloader to Accelerate and keep working
# with the prepared versions under the same names.
prepared_model, prepared_dataloader = accelerator.prepare(model, eval_dataloader)
model, eval_dataloader = prepared_model, prepared_dataloader

In [None]:
# Evaluate the masked-LM on the evaluation dataloader without gradients, then
# report the mean loss and the corresponding perplexity, exp(mean loss).
model.eval()
losses = []
for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    # The model returns one loss per batch. Repeat it once per sample slot so
    # the gathered tensor has one entry per sample.
    loss = outputs.loss
    losses.append(accelerator.gather(loss.repeat(CONFIG["per_device_eval_batch_size"])))

losses = torch.cat(losses)
# The final batch may be smaller than the batch size; keep one entry per
# evaluated sample.
losses = losses[: len(eval_dataset)]
print(losses)

eval_loss = torch.mean(losses)
print("eval_loss: ", eval_loss)
try:
    # math.exp raises OverflowError for very large losses; report infinity then.
    perplexity = math.exp(eval_loss)
except OverflowError:
    perplexity = float("inf")

# Label the result with the file that was actually evaluated. The previous
# hard-coded "Dev Set" label was wrong whenever CONFIG["validation_file"]
# pointed at another split (the default config evaluates the test file).
print(
    "Perplexity on {eval_file}: {perplexity}".format(
        eval_file=CONFIG["validation_file"], perplexity=perplexity
    )
)