In [2]:
%cd ..

/mnt/data3/haryoaw_workspace/projects/2021/2021_2/new-lm/newlm


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import os

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [6]:
!echo $CUDA_VISIBLE_DEVICES

0


In [4]:
from newlm.lm.elmo.modeling_elmo.elmo_model import ELMOGPTHeadModel
from newlm.lm.elmo.modeling_elmo.elmo_config import ELMOConfig

In [4]:
from transformers import GPT2Config

In [5]:
from transformers import GPT2Tokenizer

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [7]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [8]:
tokenizer.pad_token_id

50257

In [9]:
config = GPT2Config(vocab_size=55000, pad_token_id=tokenizer.pad_token_id)

In [10]:
config.pad_token_id

50257

In [11]:
elmo_model = ELMOGPTHeadModel(config)

In [12]:
batch = tokenizer.batch_encode_plus(['my name is haryo', 'i am a student'], return_tensors='pt', padding=True)

In [None]:
elmo_model(**batch)

# Try new LM

In [7]:
import torch
import os

from tqdm import tqdm

from datasets import load_dataset
from pathlib import Path
from typing import Union
from transformers import (
    BertTokenizerFast,
    PreTrainedTokenizer,
    TextDataset,
    LineByLineTextDataset,
    TextDatasetForNextSentencePrediction,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from newlm.utils.file_util import create_dir
import wandb
from loguru import logger
from newlm.lm.elmo.modeling_elmo.elmo_model import ELMOGPTHeadModel
from newlm.lm.elmo.modeling_elmo.elmo_config import ELMOConfig
from transformers import GPT2Config
# TODO:
# - take out data from this class then pass it only on training


class ELMOLMBuilder:
    """
    Wrapper class to train BERT LM. Here, we utilize HuggingFace Trainer to train the model.
    You only need to define your tokenizer and training data, then it would train from scratch.
    """

    def __init__(
        self,
        model_config,
        tokenizer: Union[str, PreTrainedTokenizer],
        max_len: int = 512,
    ):
        self.max_len = max_len
        self.model_config = model_config
        self.tokenizer = tokenizer
        if type(tokenizer) == str:
            self.tokenizer = BertTokenizerFast.from_pretrained(
                tokenizer,
                max_len=self.max_len,
                do_lower_case=False,  # uncased
            )

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
            mlm_probability=0.15,
        )

    def create(
        self,
        train_path: str,
        output_dir: str,
        training_args: dict,
        use_nsp: bool = False,
        train_params={},
    ):
        """
        Train BERT MLM (and NSP (optional)) from scratch.

        Parameters
        ----------
        train_path : str
            Path to training file
        output_dir : str
            Path to output dir
        training_args : dict
            Training params based on transformers.TrainingArguments
        use_nsp : bool
            Wether to train NSP too or not, default: True
        """
        config = GPT2Config(**self.model_config)
        dataset = self.__get_dataset(train_path)
        model = ELMOGPTHeadModel(config=config)

        create_dir(output_dir)
        args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            **training_args,
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=dataset,
            data_collator=self.data_collator,
        )

        self.__resolve_checkpoint(train_params, output_dir)
        if "resume_from_checkpoint" in train_params:
            logger.info(
                f"Resume training from checkpoint {train_params['resume_from_checkpoint']}"
            )
        trainer.train(**train_params)
        trainer.save_model(output_dir)

        wandb.finish()

    def __get_dataset_via_ds(self, train_path):
        dataset = load_dataset("text", data_files=train_path)

        def preprocess_function(examples):
            return self.tokenizer(examples["text"], truncation=True)

        encoded_dataset = dataset.map(preprocess_function, batched=True)
        return encoded_dataset["train"]

    def __get_dataset(self, train_path):
        dataset = self.__get_dataset_via_ds(train_path)["input_ids"]
        print(len(dataset))

        logger.info("Constructing roBERTa style dataset")
        # merge multiple lines to form a single example
        merged_dataset = []
        
        # init the tmp with the first dataset
        tmp = dataset[0]

        for d in tqdm(dataset[1:]):
            # special case, empty line that indicates document breaks
            # i.e. [CLS] [SEP]
            # in this case, we want to keep the [SEP]
            if len(d) == 2:
                d.append(d[-1]) # convert to [CLS] [SEP] [SEP]
            
            d_len = len(d) - 2  # exclude the first [CLS] and last [SEP]

            if len(tmp) + d_len < self.max_len:
                # tmp = [CLS] xxx yyy zzz [SEP]
                # d = [CLS] aaa bbb [SEP]
                # resulting tmp = [CLS] xxx yyy zzz aaa bbb [SEP]

                # for a special case of d = [CLS] [SEP] [SEP]
                # resulting tmp will be:
                # [CLS] xxx yyy zzz [SEP] [SEP]
                # which later be added with the next sentence to form:
                # [CLS] xxx yyy zzz [SEP] ooo ppp [SEP]
                tmp = tmp[:-1] + d[1:]
            else:
                merged_dataset.append(tmp)
                tmp = d
        
        # add the leftover tmp
        merged_dataset.append(tmp)

        merged_dataset = [{"input_ids": d} for d in merged_dataset]
        
        return merged_dataset

    def __resolve_checkpoint(self, train_params, output_dir):
        resume_from = train_params.get("resume_from_checkpoint")
        if resume_from == "latest":
            latest_ckpt = ""
            max_ckpt = 0
            for d in os.listdir(output_dir):
                if "checkpoint" in d:
                    ckpt = int(d.split("checkpoint-")[1])
                    if ckpt > max_ckpt:
                        max_ckpt = ckpt
                        latest_ckpt = str(Path(output_dir) / d)
            train_params["resume_from_checkpoint"] = (
                latest_ckpt if max_ckpt > 0 else output_dir
            )


In [8]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [9]:
model_config = dict(pad_token_id=tokenizer.pad_token_id)

In [10]:
lm_builder = ELMOLMBuilder(model_config, tokenizer, max_len=16)

In [11]:
hf_trainer_args = {
    "per_device_train_batch_size": 4,
      "num_train_epochs": 1,
      "save_steps": 500,
      "save_total_limit": 2,
      "prediction_loss_only": True,
}

In [12]:
 torch.cuda.set_device(0)

In [13]:
lm_builder.create("../untitled.txt", "../coba", hf_trainer_args)

Using custom data configuration default-6e4518841184bfd3
Reusing dataset text (/home/haryoaw/.cache/huggingface/datasets/text/default-6e4518841184bfd3/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

2021-09-06 11:36:17.753 | INFO     | __main__:__get_dataset:119 - Constructing roBERTa style dataset
100%|██████████| 62/62 [00:00<00:00, 305577.96it/s]



63


[34m[1mwandb[0m: Currently logged in as: [33mkata-research[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Step,Training Loss


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/train_runtime,6.4636
train/train_samples_per_second,2.475
train/total_flos,3973611949056.0
train/epoch,1.0
_runtime,6.0
_timestamp,1630928195.0
_step,16.0


0,1
train/train_runtime,▁
train/train_samples_per_second,▁
train/total_flos,▁
train/epoch,▁
_runtime,▁
_timestamp,▁
_step,▁


In [27]:
!nvidia-smi

Mon Sep  6 11:27:01 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3090    Off  | 00000000:01:00.0 Off |                  N/A |
| 32%   61C    P2   124W / 350W |   2051MiB / 24265MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 3090    Off  | 00000000:02:00.0 Off |                  N/A |
| 96%   74C    P2   308W / 350W |  24262MiB / 24268MiB |     88%      Default |
|       

In [13]:
!echo $CUDA_VISIBLE_DEVICES


