In [1]:
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F

from tests.test import test_layer, summary_layer
from deberta.config import Config

config = Config(
    hidden_dim=768,
    embedding_dim=1024,
    max_seq_len=512,
    padding_idx=0,
    vocab_size=128001,
    absolute_position_biased_input=True,
    num_heads=12,
    num_head_dim=64,
    layernorm_eps=1e-9,
    hidden_dropout_prob=0.1,
    num_hidden_layers=12,
    device='cuda'
)

In [2]:
from deberta.attentions import DisentangledSelfAttention

In [2]:
torch.set_default_device(config.device)

In [None]:
layer = DisentangledSelfAttention(config)
layer

DisentangledSelfAttention(
  (query_layer): Linear(in_features=768, out_features=768, bias=True)
  (key_layer): Linear(in_features=768, out_features=768, bias=True)
  (value_layer): Linear(in_features=768, out_features=768, bias=True)
  (relative_position_embedding): RelativePositionEmbedding(
    (relative_position_embedding_layer): Embedding(512, 768)
    (relative_position_query_layer): Linear(in_features=768, out_features=768, bias=True)
    (relative_position_key_layer): Linear(in_features=768, out_features=768, bias=True)
  )
  (feedforward): AttentionFeedForward(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (layernorm): LayerNorm((768,), eps=1e-09, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [None]:
output = test_layer(layer, (10, 512, 768))

In [None]:
summary_layer(layer, (10, 512, 768))

Layer (type:depth-idx)                   Output Shape              Param #
DisentangledSelfAttention                [10, 512, 768]            --
├─Linear: 1-1                            [10, 512, 768]            590,592
├─Linear: 1-2                            [10, 512, 768]            590,592
├─Linear: 1-3                            [10, 512, 768]            590,592
├─RelativePositionEmbedding: 1-4         [10, 512, 768]            --
│    └─Embedding: 2-1                    [512, 768]                393,216
│    └─Linear: 2-2                       [512, 768]                590,592
│    └─Linear: 2-3                       [512, 768]                590,592
├─AttentionFeedForward: 1-5              [10, 512, 768]            --
│    └─Linear: 2-4                       [10, 512, 768]            590,592
│    └─Dropout: 2-5                      [10, 512, 768]            --
│    └─LayerNorm: 2-6                    [10, 512, 768]            1,536
Total params: 3,938,304
Trainable params: 3,938

In [None]:
from deberta.networks import InputEmbedding

layer = InputEmbedding(config)

In [None]:
arr = torch.randint(0, 30522, (2, 512))
output = test_layer(layer, input_data=arr)

input shape: torch.Size([2, 512])
output type: <class 'dict'>
embeddings shape: torch.Size([2, 512, 768])
position_embeddings shape: torch.Size([2, 512, 768])


In [None]:
summary_layer(layer, input_data=arr)

Layer (type:depth-idx)                   Output Shape              Param #
InputEmbedding                           [2, 512, 768]             --
├─Embedding: 1-1                         [2, 512, 1024]            31,254,528
├─Embedding: 1-2                         [2, 512, 1024]            524,288
├─Linear: 1-3                            [2, 512, 768]             786,432
├─Linear: 1-4                            [2, 512, 768]             786,432
├─LayerNorm: 1-5                         [2, 512, 768]             1,536
Total params: 33,353,216
Trainable params: 33,353,216
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 66.71
Input size (MB): 0.01
Forward/backward pass size (MB): 35.65
Params size (MB): 133.41
Estimated Total Size (MB): 169.07


In [None]:
from deberta.layers import RelativePositionEmbedding

layer = RelativePositionEmbedding(config)

In [None]:
hidden_states = output['embeddings']
output = test_layer(layer, input_data=hidden_states)

input shape: torch.Size([2, 512, 768])
output type: <class 'tuple'>
output 0 shape: torch.Size([2, 512, 768])
output 1 shape: torch.Size([2, 512, 768])


In [None]:
from deberta.networks import BaseNetwork

embedding_layer = InputEmbedding(config)
layer = BaseNetwork(config)

In [None]:
input_data = torch.randint(0, 30522, (10, 512))

output = test_layer(embedding_layer, input_data=input_data)
summary_layer(embedding_layer, input_data=input_data)

input shape: torch.Size([10, 512])
output type: <class 'dict'>
embeddings shape: torch.Size([10, 512, 768])
position_embeddings shape: torch.Size([10, 512, 768])
Layer (type:depth-idx)                   Output Shape              Param #
InputEmbedding                           [10, 512, 768]            --
├─Embedding: 1-1                         [10, 512, 1024]           31,254,528
├─Embedding: 1-2                         [10, 512, 1024]           524,288
├─Linear: 1-3                            [10, 512, 768]            786,432
├─Linear: 1-4                            [10, 512, 768]            786,432
├─LayerNorm: 1-5                         [10, 512, 768]            1,536
Total params: 33,353,216
Trainable params: 33,353,216
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 333.53
Input size (MB): 0.04
Forward/backward pass size (MB): 178.26
Params size (MB): 133.41
Estimated Total Size (MB): 311.71


In [None]:
inputs = output['embeddings']
output = test_layer(layer, input_data=inputs)
summary_layer(layer, input_data=inputs)

input shape: torch.Size([10, 512, 768])
output type: <class 'tuple'>
output 0 shape: torch.Size([10, 512, 768])
output 1 type: <class 'list'>
Layer (type:depth-idx)                                  Output Shape              Param #
BaseNetwork                                             [10, 512, 768]            --
├─ModuleList: 1-1                                       --                        --
│    └─TransformerBlock: 2-1                            [10, 512, 768]            --
│    │    └─DisentangledSelfAttention: 3-1              [10, 512, 768]            --
│    │    │    └─Linear: 4-1                            [10, 512, 768]            590,592
│    │    │    └─Linear: 4-2                            [10, 512, 768]            590,592
│    │    │    └─Linear: 4-3                            [10, 512, 768]            590,592
│    │    │    └─RelativePositionEmbedding: 4-4         [10, 512, 768]            --
│    │    │    │    └─Embedding: 5-1                    [512, 768]       

In [None]:
for i in range(len(output[1])):
    print(output[1][i].shape)

torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])
torch.Size([10, 512, 768])


In [2]:
from deberta.networks import Generator, Discriminator

generator = Generator(config)
discriminator = Discriminator(config)

inputs = torch.randint(0, config.vocab_size, (10, config.max_seq_len))
labels = torch.randint(0, 2, (10, config.max_seq_len))
masks = torch.randint(0, 2, (10, config.max_seq_len))
# output = test_layer(generator, input_data=inputs)
# summary_layer(generator, input_data=inputs)
# output = generator(inputs, labels=labels, labels_mask=masks)

In [None]:
output[0].shape, output[1].shape

(torch.Size([10, 512, 30522]), torch.Size([2509]))

In [None]:
output = discriminator(inputs, labels=labels, labels_mask=masks)

In [None]:
output[0].shape, output[1].shape

(torch.Size([10, 512, 1]), torch.Size([2509]))

In [None]:
labels[masks>0]

tensor([1, 0, 0,  ..., 1, 0, 0], device='cuda:0')

In [None]:
output = test_layer(discriminator, input_data=inputs)
summary_layer(discriminator, input_data=inputs)

input shape: torch.Size([10, 512])
output type: <class 'torch.Tensor'>
output shape: torch.Size([10, 512, 1])
Layer (type:depth-idx)                                       Output Shape              Param #
Discriminator                                                [10, 512, 1]              --
├─InputEmbedding: 1-1                                        [10, 512, 768]            --
│    └─Embedding: 2-1                                        [10, 512, 1024]           31,254,528
│    └─Embedding: 2-2                                        [10, 512, 1024]           524,288
│    └─Linear: 2-3                                           [10, 512, 768]            786,432
│    └─Linear: 2-4                                           [10, 512, 768]            786,432
│    └─LayerNorm: 2-5                                        [10, 512, 768]            1,536
├─BaseNetwork: 1-2                                           [10, 512, 768]            --
│    └─ModuleList: 2-7                           

In [1]:
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F

from tests.test import test_layer, summary_layer
from deberta.config import Config

config = Config(
    hidden_dim=768,
    embedding_dim=1024,
    max_seq_len=512,
    padding_idx=0,
    vocab_size=128001,
    absolute_position_biased_input=True,
    num_heads=12,
    num_head_dim=64,
    layernorm_eps=1e-9,
    hidden_dropout_prob=0.1,
    num_hidden_layers=12,
    device='cuda',
    mask_lm_prob=0.15,
)

In [2]:
# token masking test
from deberta.data import ReplaceTaskData
from transformers import AutoTokenizer
from pprint import pprint

tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
replace_task_data = ReplaceTaskData(config, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sentence = "The quick brown fox jumps over the lazy dog"
masked_data = replace_task_data.get_generator_inputs(sentence)
pprint(masked_data)


{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[     1,    279,   1538,   3258, 128000,  14929,    360,    262, 128000,
           1560,      2]]),
 'labels': tensor([[    0,     0,     0,     0, 16123,     0,     0,     0,  9118,     0,
             0]]),
 'original_input_ids': tensor([[    1,   279,  1538,  3258, 16123, 14929,   360,   262,  9118,  1560,
             2]])}


In [5]:
from deberta.networks import Generator, Discriminator

generator = Generator(config)
discriminator = Discriminator(config)

In [6]:
logits = generator(
    input_ids=masked_data['input_ids'],
    attention_mask=masked_data['attention_mask'],
    labels=masked_data['labels'],
    labels_mask=masked_data['attention_mask'])

In [7]:
logits[0].shape

torch.Size([1, 11, 128001])

In [10]:
masked_data = replace_task_data.get_discriminator_inputs(masked_data, logits[0].squeeze(), True)
masked_data

In [8]:
from deberta.fetch_dataset import fetch_dataset

dataset = fetch_dataset('wikipedia')
dataset = dataset['train']
book_dataset = fetch_dataset('bookcorpus')
book_dataset = book_dataset['train']

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [9]:
dataset.features

{'id': Value(dtype='string', id=None),
 'url': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [10]:
book_dataset.features

{'text': Value(dtype='string', id=None)}

### Batch processing
The map() function supports working with batches of examples. Operate on batches by setting batched=True. The default batch size is 1000, but you can adjust it with the batch_size parameter. Batch processing enables interesting applications such as splitting long sentences into shorter chunks and data augmentation.

### Split long examples
When examples are too long, you may want to split them into several smaller chunks. Begin by creating a function that:
Splits the sentence1 field into chunks of 50 characters.
Stacks all the chunks together to create the new dataset.

In [11]:
from deberta.prep_dataset import split_sentences

dataset = dataset.map(split_sentences, batched=True, num_proc=4, remove_columns=dataset.column_names)
book_dataset = book_dataset.map(split_sentences, batched=True, num_proc=4, remove_columns=book_dataset.column_names)

Map (num_proc=4): 100%|██████████| 6458670/6458670 [00:51<00:00, 126149.32 examples/s]
Map (num_proc=4): 100%|██████████| 74004228/74004228 [00:17<00:00, 4337859.61 examples/s]


In [14]:
dataset, book_dataset

(Dataset({
     features: ['text'],
     num_rows: 28795678
 }),
 Dataset({
     features: ['text'],
     num_rows: 74009525
 }))

In [20]:
from deberta.prep_dataset import SPLIT_IDX, MAX_SEQ_LEN, TOKENIZER

TOKENIZER = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
MAX_SEQ_LEN = 512
def tokenize_fn(sentences):
    tokenized = TOKENIZER(sentences['text'], max_length=MAX_SEQ_LEN, truncation=True, padding=True, return_tensors='pt')
    return tokenized

dataset = dataset.map(tokenize_fn, batched=True, num_proc=12, remove_columns=dataset.column_names)
book_dataset = book_dataset.map(tokenize_fn, batched=True, num_proc=12, remove_columns=book_dataset.column_names)

Map (num_proc=12): 100%|██████████| 28795678/28795678 [22:47<00:00, 21063.15 examples/s]
Map (num_proc=12):   7%|▋         | 4873000/74009525 [00:42<08:16, 139125.34 examples/s]