In [None]:
from transformers import BertModel, BertConfig

In [None]:
class BertCausalModel(BertModel):
    """BertModel subclass that, as written here, adds no behavior of its own.

    Construction is handed straight to the parent class with the same
    arguments the caller supplied.
    """

    def __init__(self, config, add_pooling_layer=True):
        # Forward the pooling flag by name so the call site documents itself.
        super().__init__(config, add_pooling_layer=add_pooling_layer)

In [None]:
# Build a BertConfig without overriding any argument, i.e. with the library defaults.
bconfig = BertConfig()

In [None]:
# Echo the config so its values appear as the cell output.
bconfig

In [None]:
# Instantiate BertCausalModel directly from the config object
# (no checkpoint is loaded by this call).
bmodel = BertCausalModel(bconfig)

In [None]:
# Inspect the pad token id carried by the model's config.
bmodel.config.pad_token_id

In [None]:
import torch

def get_sequence_lengths(pad_token_id, input_ids=None, inputs_embeds=None, **kwargs):
    """Return the batch size and the per-row position of the last non-pad token.

    Args:
        pad_token_id: Id that marks padding, or ``None`` when no padding token
            is defined (only allowed for a batch of size 1).
        input_ids: Optional tensor whose first two dimensions are read as
            ``(batch, sequence)``. Takes precedence over ``inputs_embeds``.
        inputs_embeds: Optional tensor used only for its first two dimensions
            when ``input_ids`` is ``None``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(batch_size, sequence_lengths)`` tuple. ``sequence_lengths`` is a
        tensor holding ``(number of non-pad ids) - 1`` for every row when both
        ``pad_token_id`` and ``input_ids`` are available; that value is the
        index of the last real token only if padding sits at the end of each
        row. In every other case it is the plain integer ``-1``.

    Raises:
        ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
        AssertionError: If ``pad_token_id`` is ``None`` and the batch holds
            more than one row.
    """
    # Local import: the surrounding cell only imports torch.
    import logging

    if input_ids is not None:
        batch_size, _ = input_ids.shape[:2]
    elif inputs_embeds is not None:
        batch_size, _ = inputs_embeds.shape[:2]
    else:
        raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")

    assert (
        pad_token_id is not None or batch_size == 1
    ), "Cannot handle batch sizes > 1 if no padding token is defined."

    if pad_token_id is None:
        # Without a pad id there is nothing to count; -1 selects the last position.
        sequence_lengths = -1
    elif input_ids is not None:
        sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
    else:
        # Padding cannot be recognised in embeddings, so fall back to the last
        # position and say so. This is a free function: there is no `self` and
        # no module-level `logger`, hence the explicit logger lookup and the
        # hard-coded function name in the message.
        sequence_lengths = -1
        logging.getLogger(__name__).warning(
            "get_sequence_lengths will not detect padding tokens in `inputs_embeds`. Results may be "
            "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
        )

    return (batch_size, sequence_lengths)

In [None]:
# NOTE(review): `batch` is first assigned by the tokenizer cell further down,
# so this cell only works after that cell has been run (fails on a fresh
# top-to-bottom run).
# The literal 0 is passed as pad_token_id — presumably the tokenizer's pad id; confirm.
bs, seqlen = get_sequence_lengths(0, batch['input_ids'])
bs, seqlen

In [None]:
# Despite its name, `embed` is bound to the batch's 'input_ids' entry, not to embeddings.
embed = batch['input_ids']
embed

In [None]:
# For each row i in 0..bs-1, pick the element at column seqlen[i].
embed[range(bs), seqlen]

In [None]:
from transformers import BertModel
from newlm.lm.bert.modeling_bert.bert_model import BertCausalModel

# Load the same checkpoint into the stock BertModel and into BertCausalModel.
# BertCausalModel here is the class imported from newlm at the top of this
# cell; that import rebinds the name of the class defined earlier in the notebook.
bm = BertModel.from_pretrained("indobenchmark/indobert-base-p1")
bcm = BertCausalModel.from_pretrained("indobenchmark/indobert-base-p1")

In [None]:
# Echo the stock model's module structure.
bm

In [None]:
# Echo the causal model's module structure for side-by-side comparison with `bm`.
bcm

In [None]:
from transformers import BertTokenizerFast

# Tokenizer loaded from the same checkpoint name as the models.
tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p1")

# Encode three sentences of different lengths as one batch; padding is
# requested and the result is returned as PyTorch tensors.
batch = tokenizer(
    ["saya pergi ke pasar", "saya makan di warung", "pada hari senin ku turut ayah ke kota"],
    padding=True,
    return_tensors="pt"
)
batch

In [None]:
# Unpack the tokenizer output into keyword arguments (input_ids, attention_mask, ...).
# Passing the batch object itself would bind the whole mapping to the model's
# first positional parameter instead of the individual tensors.
out = bm(**batch)

In [None]:
# Forward the same batch through the causal model, unpacked as keyword arguments.
out2 = bcm(**batch)

In [None]:
from transformers import BertTokenizerFast

# NOTE(review): this cell repeats the earlier tokenizer/batch cell verbatim;
# it rebinds `tokenizer` and `batch` to equivalent values.
tokenizer = BertTokenizerFast.from_pretrained("indobenchmark/indobert-base-p1")

# Encode the same three sentences with padding, returned as PyTorch tensors.
batch = tokenizer(
    ["saya pergi ke pasar", "saya makan di warung", "pada hari senin ku turut ayah ke kota"],
    padding=True,
    return_tensors="pt"
)
batch

In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast
from newlm.lm.bert.modeling_bert.bert_model import BertCausalModel, BertModelCausalForSequenceClassification

# Load the checkpoint into a sequence-classification model with three labels.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=3
)
# Replace the encoder with a BertCausalModel constructed from the config only.
# NOTE(review): no checkpoint is loaded by this constructor call, so the new
# encoder presumably does not carry the pretrained weights — confirm this is intended.
model.bert = BertCausalModel(model.config)

In [None]:
# Forward the tokenized batch through the patched classifier and echo the output.
model(**batch)

In [None]:
from newlm.lm.bert.modeling_bert.bert_model import BertModelCausalForSequenceClassification

# Load the same checkpoint through newlm's causal sequence-classification class, again with three labels.
model2 = BertModelCausalForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p1", num_labels=3)

In [None]:
# Forward the tokenized batch through model2 and echo the output.
model2(**batch)

In [None]:
# NOTE(review): same call as the previous cell, repeated — presumably to compare
# two consecutive outputs; confirm or remove.
model2(**batch)

## Sanity Check

In [1]:
from newlm.lm.elmo.modeling_elmo.elmo_head import ELMOBertLMHeadModel
from newlm.lm.elmo.lm_builder import ELMOLMBuilder
from transformers import BertConfig

In [2]:
from newlm.utils.file_util import read_from_yaml
# Load the run configuration from YAML.
# NOTE(review): the path is relative, so this presumably depends on the kernel's
# working directory being the repository root — confirm.
config_file = read_from_yaml('examples/configs/run.1-percent-bert-causal.yaml')

In [7]:
# Echo the model-config sub-dictionary that is handed to the LM builder.
config_file['lm']['model']['config']

{'vocab_size': 30000,
 'hidden_size': 768,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'intermediate_size': 3072,
 'max_position_embeddings': 1024,
 'is_decoder': True}

In [3]:
# Assemble the LM builder from the YAML model config, naming the tokenizer
# and the model type explicitly.
elmo_lm_builder = ELMOLMBuilder(
    model_config=config_file['lm']['model']['config'],
    tokenizer="bert-base-cased",
    model_type="bert-causal-elmo",
)

In [14]:
# Tokenize one example sentence with the builder's tokenizer to inspect the resulting word pieces.
elmo_lm_builder.tokenizer.tokenize("Jews were indeed infected in numbers similar to their non-Jewish neighbors Yet they were still made scapegoats.")

['Jews',
 'were',
 'indeed',
 'infected',
 'in',
 'numbers',
 'similar',
 'to',
 'their',
 'non',
 '-',
 'Jewish',
 'neighbors',
 'Yet',
 'they',
 'were',
 'still',
 'made',
 's',
 '##cape',
 '##go',
 '##ats',
 '.']

In [11]:
# The file is only read, so open it read-only; 'r+' would additionally require
# write permission on the corpus. The encoding is pinned because the corpus
# contains non-ASCII text and the platform default encoding is not guaranteed
# to be UTF-8.
with open('/mnt/data4/made_workspace/newlm-data/en.1-percent/text.txt', 'r', encoding='utf-8') as fr:
    lines = fr.readlines()

In [15]:
# Echo a 500-line window of the corpus (entries 1000 to 1499).
lines[1000:1500]

['\n',
 'Typha albida is a plant species endemic to Afghanistan.\n',
 'It grows in freshwater marshes.\n',
 '\n',
 'Talczyn-Kolonia [ˈtalt͡ʂɨn\xa0kɔˈlɔɲa] is a village in the administrative district of Gmina Kock, within Lubartów County, Lublin Voivodeship, in eastern Poland.\n',
 '\n',
 'Niveocatharylla romieuxi is a moth in the Crambidae family.\n',
 'It was described by Graziano Bassi in 1999. It is found in the Democratic Republic of the Congo.\n',
 '\n',
 'Floyd Burton Jones (November 22, 1910, Cisco, Texas – April 15, 1999, Santa Barbara, California) was an American mathematician, active mainly in topology.\n',
 "Jones's father was a pharmacist and local politician in Shackelford County, Texas.\n",
 "As the valedictorian of his high school class, Jones earned a Regents' Scholarship to The University of Texas, intending to study law eventually.\n",
 'Jones soon discovered that he had a poor memory for dates and history, and thus changed his major to chemistry.\n',
 'Jones had the 

In [5]:
# dataset
# Build the training dataset from a text file through the builder's
# underscore-prefixed `_get_dataset` method.
# NOTE(review): the path is absolute and machine-specific.
train_path = "/mnt/data4/made_workspace/newlm-data/en.1-percent/text_small.txt"
dataset = elmo_lm_builder._get_dataset(train_path)

Using custom data configuration default-e9c2b1c84264b5e7


Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /mnt/data1/hf_dataset_cache/text/default-e9c2b1c84264b5e7/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /mnt/data1/hf_dataset_cache/text/default-e9c2b1c84264b5e7/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

2021-11-10 07:52:58.791 | INFO     | newlm.lm.elmo.lm_builder:_get_dataset:142 - Constructing roBERTa style dataset
100%|██████████| 12/12 [00:00<00:00, 39787.86it/s]


13





In [9]:
# model
# Expand the builder's model-config mapping into a BertConfig, then construct
# the LM-head model from that config.
# NOTE(review): this rebinds `model`, which an earlier cell bound to a
# sequence-classification model.
config = BertConfig(**elmo_lm_builder.model_config)
model = ELMOBertLMHeadModel(config=config)

In [12]:
# trainer
from transformers import TrainingArguments, Trainer

# Directory handed to TrainingArguments as `output_dir`.
output_dir = "tmp-outputs"

# Hyperparameters are spelled out as ordinary keyword arguments.
args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    max_steps=50,  # scale this based on data size
    save_total_limit=1,
    prediction_loss_only=True,
    # optimizer, according to the paper
    learning_rate=0.0001,
    # do we scale this as well?
    # NOTE(review): warmup_steps (10000) is larger than max_steps (50) above.
    warmup_steps=10000,
    weight_decay=0.01,
)

# Wire the model, the dataset and the builder's collator into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=elmo_lm_builder.data_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [13]:
# Ask the trainer for the dataloader it would use for training.
data_loader = trainer.get_train_dataloader()

In [15]:
# Pull the first batch out of the training dataloader.
b1 = next(iter(data_loader))

In [22]:
# elmo_lm_builder.tokenizer.__dict__

In [31]:
# Print every key of the batch and call .to('cuda:0') on its value.
# NOTE(review): the value returned by .to() is not assigned back, so b1's
# entries appear to be left where they were — confirm whether a device move
# was intended (the later cells move the model to CPU instead).
for k in b1:
    print(k)
    b1[k].to('cuda:0')

input_ids
attention_mask
labels


In [42]:
# elmo_lm_builder.tokenizer.convert_ids_to_tokens(b1['input_ids'][0])

['[CLS]',
 'National',
 'Route',
 'A',
 '##00',
 '##3',
 'also',
 'known',
 'as',
 'T',
 '##ig',
 '##re',
 'Access',
 'is',
 'an',
 '8',
 '.',
 '8',
 'km',
 '(',
 '5',
 '.',
 '5',
 'mi',
 ')',
 '-',
 'long',
 'four',
 '-',
 'lane',
 'highway',
 '.',
 'It',
 'goes',
 'from',
 'the',
 'junction',
 'with',
 'National',
 'Route',
 '9',
 'and',
 'Cam',
 '##ino',
 'de',
 'C',
 '##int',
 '##ura',
 '(',
 'Provincial',
 'Route',
 '4',
 ')',
 'to',
 'the',
 'town',
 'of',
 'T',
 '##ig',
 '##re',
 ',',
 'passing',
 'the',
 'towns',
 'of',
 ':',
 'Bo',
 '##ulo',
 '##gne',
 'San',
 'Is',
 '##id',
 '##ro',
 'B',
 '##é',
 '##cca',
 '##r',
 'Victoria',
 'V',
 '##ir',
 '##rey',
 '##es',
 'San',
 'Fernando',
 'In',
 '1993',
 'the',
 'Federal',
 'Government',
 'opened',
 'a',
 'bid',
 'for',
 'the',
 'Buenos',
 'Aires',
 'road',
 'access',
 'network',
 '.',
 'The',
 'winner',
 'for',
 'the',
 'maintenance',
 'contract',
 'for',
 'the',
 'North',
 'Access',
 'roads',
 'was',
 'Auto',
 '##pis',
 '##tas',
 

In [32]:
# Echo the full batch dictionary.
b1

{'input_ids': tensor([[  101,  1305,  3320,   138,  7629,  1495,  1145,  1227,  1112,   157,
           6512,  1874, 11737,  1110,  1126,   129,   119,   129,  1557,   113,
            126,   119,   126,  1940,   114,   118,  1263,  1300,   118,  7576,
           4083,   119,  1135,  2947,  1121,  1103,  6698,  1114,  1305,  3320,
            130,  1105, 14805,  4559,  1260,   140, 10879,  4084,   113,  9087,
           3320,   125,   114,  1106,  1103,  1411,  1104,   157,  6512,  1874,
            117,  3744,  1103,  4281,  1104,   131,  9326, 22806,  8376,  1727,
           2181,  2386,  2180,   139,  2744, 19495,  1197,  3006,   159,  3161,
          12210,  1279,  1727,  8834,  1130,  1949,  1103,  3467,  2384,  1533,
            170,  6875,  1111,  1103,  8883,  8837,  1812,  2469,  2443,   119,
           1109,  2981,  1111,  1103,  5972,  2329,  1111,  1103,  1456, 11737,
           4744,  1108, 12983, 19093, 10401,  3687, 17135,   119,  1456, 11737,
           2075, 15057,  26

In [35]:
# Move the model to the CPU; the returned module is echoed as the cell output.
model.to('cpu')

ELMOBertLMHeadModel(
  (transformer): ELMOBertModel(
    (l2r_gpt): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30000, 768, padding_idx=0)
        (position_embeddings): Embedding(1024, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [1]:
import torch
# Requires `model` and `b1` from the cells above to exist in the current kernel.
# Run one forward pass with gradient tracking disabled. An expression inside a
# `with` block is not echoed by the notebook, so the result is bound to a name
# and displayed as the cell's last expression.
with torch.no_grad():
    sanity_out = model(**b1)
sanity_out

NameError: name 'model' is not defined