In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoConfig

In [2]:
# Reuse the DeBERTa-v3-base hyperparameters, but pair them with a different
# tokenizer; the fields that depend on the tokenizer are overridden below.
config = AutoConfig.from_pretrained('microsoft/deberta-v3-base')
tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')

# Register a mask token on the tokenizer (needed for masked-LM training).
special_tokens_dict = {"mask_token": "[MASK]"}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Measure the vocabulary *after* adding special tokens so the new id is covered.
config.vocab_size = len(tokenizer)
config.max_position_embeddings = 4096

# pad_token_id was inherited from the DeBERTa checkpoint's config, which belongs
# to a different vocabulary. Align it with this tokenizer's pad id when the
# tokenizer defines one; otherwise keep the inherited value unchanged.
# NOTE(review): if the tokenizer has no pad token, confirm that the inherited id
# does not collide with a real token of this vocabulary.
if tokenizer.pad_token_id is not None:
    config.pad_token_id = tokenizer.pad_token_id

In [4]:
# Display the edited config to check the vocab_size and
# max_position_embeddings overrides before building the model.
config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 4096,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.35.0",
  "type_vocab_size": 0,
  "vocab_size": 32001
}

In [5]:
# Instantiate the masked-LM architecture from the config only: `from_config`
# builds the model without loading pretrained weights.
model = AutoModelForMaskedLM.from_config(config)

In [6]:
# Write the model and the tokenizer (with the added mask token) into the same
# directory so both can be reloaded from 'debertav2-base'.
model.save_pretrained('debertav2-base')
tokenizer.save_pretrained('debertav2-base')

('debertav2-base/tokenizer_config.json',
 'debertav2-base/special_tokens_map.json',
 'debertav2-base/tokenizer.json')

In [7]:
# List the saved files to confirm that config, weights and tokenizer were written.
!ls debertav2-base

config.json	   special_tokens_map.json  tokenizer_config.json
model.safetensors  tokenizer.json


In [8]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import StreamingDataset
import torch
import numpy as np

class UInt16(Encoding):
    """Column encoding that stores a NumPy array as raw ``uint16`` bytes.

    The serialized form carries no shape or dtype information: ``decode``
    always reinterprets the buffer as a flat ``uint16`` array.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as raw uint16 bytes.

        Arrays that are already ``uint16`` are written unchanged. Any other
        input is range-checked and cast first, because ``decode`` always reads
        the buffer as uint16 and would otherwise silently misinterpret bytes
        written with a different item size (e.g. int64 token ids).

        Raises:
            TypeError: if ``obj`` is non-empty and not integer-typed.
            ValueError: if any value falls outside the uint16 range.
        """
        arr = np.asarray(obj)
        if arr.dtype != np.uint16:
            if arr.size:
                if not np.issubdtype(arr.dtype, np.integer):
                    raise TypeError(
                        f'UInt16 encoding expects integer data, got dtype {arr.dtype}')
                bounds = np.iinfo(np.uint16)
                if arr.min() < bounds.min or arr.max() > bounds.max:
                    raise ValueError(
                        'UInt16 encoding received values outside the uint16 range')
            arr = arr.astype(np.uint16)
        return arr.tobytes()

    def decode(self, data: bytes):
        """Reinterpret ``data`` as a 1-D ``uint16`` array.

        ``np.frombuffer`` does not copy, so the result shares memory with
        ``data`` and is read-only when ``data`` is an immutable ``bytes`` object.
        """
        return np.frombuffer(data, np.uint16)

# Add the codec to streaming's `_encodings` mapping under the key 'uint16' --
# presumably so columns declared with that encoding name can be decoded when
# reading the dataset (confirm against the code that wrote the shards).
_encodings['uint16'] = UInt16

class DatasetFixed(torch.utils.data.Dataset):
    """Map-style wrapper around a local ``StreamingDataset``.

    Every sample returned has its ``token_type_ids`` entry removed (when
    present) and all remaining values converted with ``astype(np.int64)``.
    """

    def __init__(self, local):
        """Open the streaming dataset stored under the ``local`` directory."""
        self.dataset = StreamingDataset(local=local)

    def __len__(self):
        """Return the number of samples reported by the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return sample ``idx`` without ``token_type_ids`` and with int64 values."""
        sample = self.dataset[idx]
        sample.pop('token_type_ids', None)
        # Build all converted values first, then write them back into the same dict.
        sample.update(
            {name: values.astype(np.int64) for name, values in sample.items()}
        )
        return sample

# Open the local dataset directory 'tokenized-512'.
# NOTE(review): the name suggests samples pre-tokenized to 512 tokens -- confirm
# against the preprocessing step that produced it.
train_dataset = DatasetFixed(local='tokenized-512')

In [9]:
from transformers import DataCollatorForLanguageModeling

# Masked-LM collator: batches samples and randomly masks tokens for the MLM
# objective, using the tokenizer's mask token.
# NOTE(review): masking is random and no seed is set in the visible cells, so
# batches (and the loss shown below) are not reproducible across runs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,  # probability of selecting a token for masking
    pad_to_multiple_of=None,  # no extra padding to a length multiple
)

In [12]:
# Take the first three samples and collate them into one masked-LM batch
# for a quick sanity check.
batch = [train_dataset[i] for i in range(3)]
b = data_collator(batch)

In [13]:
# Smoke-test one forward pass on the collated batch; the output below includes
# a loss. A loss of about 10.5 is close to ln(32001) ~= 10.37, i.e. roughly
# uniform predictions over the vocabulary.
# NOTE(review): this displays the full logits tensor and tracks gradients;
# consider showing only `.loss` under `torch.no_grad()` to keep the output small.
model(**b)

MaskedLMOutput(loss=tensor(10.5440, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.0000,  0.6299,  0.3935,  ...,  0.1540, -0.2767, -0.8502],
         [ 0.0000,  0.6043, -0.7251,  ..., -0.1069,  0.7736, -0.1743],
         [ 0.0000,  0.5119,  0.2992,  ...,  0.2536, -0.2253, -0.4136],
         ...,
         [ 0.0000,  0.8840,  0.2400,  ..., -0.2831,  0.3892,  0.3696],
         [ 0.0000,  0.6039, -0.1760,  ..., -0.2009,  0.0234, -0.1737],
         [ 0.0000,  0.3058, -0.1681,  ...,  0.8590,  0.1530, -0.2431]],

        [[ 0.0000,  0.0809, -0.0523,  ...,  0.0676, -0.5746,  0.1287],
         [ 0.0000,  0.0800, -0.2212,  ...,  0.4210,  0.4900, -0.0251],
         [ 0.0000,  0.3146,  0.1671,  ..., -0.4739,  0.5507, -0.5570],
         ...,
         [ 0.0000, -0.1627, -0.0639,  ...,  0.6361,  0.6218, -0.2991],
         [ 0.0000,  0.2889,  0.1489,  ...,  0.5500,  0.8279,  0.2799],
         [ 0.0000,  0.7858, -0.8017,  ...,  0.9580,  0.9399,  0.0629]],

        [[ 0.0000,  0.6794,  0.7442,  ...,  