In [1]:
!pip install datasets
!pip install tokenizers
!pip install transformers==4.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.5.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 9.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 51.4 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 67.4 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 58.6 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 57.1 MB/s 
Installing

In [2]:
import datasets
from datasets import load_dataset
from tqdm.auto import tqdm
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizer, RobertaConfig, RobertaForMaskedLM, AdamW, pipeline
import os
import torch

# Urdu corpus

In [3]:
dataset = load_dataset("nthngdy/oscar-mini", "unshuffled_deduplicated_ur")

Downloading builder script:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/631k [00:00<?, ?B/s]

Downloading and preparing dataset oscar-mini/unshuffled_deduplicated_ur (download: 15.10 MiB, generated: 57.39 MiB, post-processed: Unknown size, total: 72.49 MiB) to /root/.cache/huggingface/datasets/nthngdy___oscar-mini/unshuffled_deduplicated_ur/1.0.0/d61b181331745a38dd31e8c6cc23d46566b96e255384c4421f2396af24a01dff...


Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84559 [00:00<?, ? examples/s]

Dataset oscar-mini downloaded and prepared to /root/.cache/huggingface/datasets/nthngdy___oscar-mini/unshuffled_deduplicated_ur/1.0.0/d61b181331745a38dd31e8c6cc23d46566b96e255384c4421f2396af24a01dff. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text'],
        num_rows: 84559
    })
})

In [5]:
dataset["train"][0]

{'id': 0,
 'text': 'آئیے اہم اسلامی کتب کو یونیکوڈ میں انٹرنیٹ پر پیش کرنے کے لئے مل جل کر آن لائن ٹائپنگ کریں۔ محدث ٹائپنگ پراجیکٹ کے ذریعے آپ روزانہ فقط دس پندرہ منٹ ٹائپنگ کر کے ہزاروں صفحات پر مشتمل اہم ترین کتب کو ٹائپ کرنے میں اہم کردار ادا کرکے صدقہ جاریہ میں شامل ہو سکتے ہیں۔ محدث ٹائپنگ پراجیکٹ میں شمولیت کے لئے یہاں کلک کریں۔'}

loop throug samples

In [6]:
text_data = []
file_count = 0

for sample in tqdm(dataset["train"]):
  sample = sample["text"].replace("\n", " ")
  text_data.append(sample)

  if len(text_data) == 10_000:
    with open(f"pt_{file_count}.txt", "w", encoding="utf-8") as fp:
      fp.write("\n".join(text_data))
    text_data = []
    file_count += 1

with open(f"pt_{file_count}.txt", "w", encoding="utf-8") as fp:
      fp.write("\n".join(text_data))

  0%|          | 0/84559 [00:00<?, ?it/s]

# Building tokenizer

Getting the paths of our subsets

In [7]:
paths = [str(x) for x in Path("./").glob("*.txt")]

paths[:5]

['pt_1.txt', 'pt_3.txt', 'pt_2.txt', 'pt_7.txt', 'pt_6.txt']

Training the tokenizer.

We use a byte-level Byte-pair encoding (BPE) tokenizer. This allows us to build the vocabulary from an alphabet of single bytes, meaning all words will be decomposable into tokens.

In [8]:
tokenizer = ByteLevelBPETokenizer()

In [9]:
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                special_tokens=[
                    "<s>", "<pad>", "</s>", "<unk>", "<mask>"
                ])

Save tokenizer

In [12]:
root = "/content/saeenbert"

In [13]:
os.mkdir(f"{root}/saeen")

tokenizer.save_model(f"{root}/saeen")

['/content/saeenbert/saeen/vocab.json', '/content/saeenbert/saeen/merges.txt']

- merges.txt — performs the initial mapping of text to tokens
- vocab.json — maps the tokens to token IDs


Initializing the Tokenizer

In [14]:
tokenizer = RobertaTokenizer.from_pretrained(f"{root}/saeen", max_len=512)

In [15]:
# test our tokenizer on a simple sentence
tokens = tokenizer('السلام علیکم کیسے ہو')

tokens

{'input_ids': [0, 16530, 10939, 2020, 326, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [16]:
tokens.input_ids

[0, 16530, 10939, 2020, 326, 2]

# Creating the Input Pipeline

In [17]:
# Preparing data
with open("pt_0.txt", "r", encoding="utf-8") as fp:
  lines = fp.read().split("\n")

batch = tokenizer(lines, max_length=512, padding="max_length", truncation=True)
len(batch)

2

creating our tensors. We'll need three tensors:

- input_ids — our token_ids with ~15% of tokens masked using the mask token <mask>.
- attention_mask — a tensor of 1s and 0s, marking the position of ‘real’ tokens/padding tokens — used in attention calculations.
- labels — our token_ids with no masking.

In [18]:
labels = torch.tensor([x for x in batch.input_ids])
mask = torch.tensor([x for x in batch.attention_mask])

In [19]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()

rand = torch.rand(input_ids.shape)

# We gonna mask tokens that have a randonly generated value less than 15% criteria
# and are not special tokens
mask_arr = (rand < 0.15) * (input_ids > 2)

for i in range(input_ids.shape[0]):
  # get indices of mask positions from mask array
  selection = torch.flatten(mask_arr[i].nonzero()).tolist()
  # mask input_ids
  input_ids[i, selection] = 4

In [20]:
input_ids.shape

torch.Size([10000, 512])

In [21]:
input_ids[0][:10]

tensor([    0, 24545,  1135,  1330,  4292,   320,  1760,   329,  2239,   300])

In [22]:
labels[0][:10]

tensor([    0, 24545,  1135,  1330,  4292,   320,  1760,   329,  2239,   300])

## Defining Dataset

In [23]:
encodings = {
    "input_ids": input_ids,
    "attention_mask": mask,
    "labels": labels
}

In [24]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    # store encodings internally
    self.encodings = encodings
  
  def __len__(self):
    # return the number of samples
    return self.encodings["input_ids"].shape[0]
  
  def __getitem__(self, i):
    # return dictionary of input_ids, attention_mask, and labels for index i
    return {key: tensor[i] for key, tensor in self.encodings.items()}

In [25]:
dataset = Dataset(encodings)

In [26]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True)

# Training the model

In [27]:
tokenizer.vocab_size

30522

Create configuration for Roberta

In [28]:
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [29]:
model = RobertaForMaskedLM(config)

Begin to train

In [30]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [31]:
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [32]:
# activate training mode
model.train()

optim = AdamW(model.parameters(), lr=1e-4)

In [33]:
epochs = 7

for epoch in range(epochs):
  loop = tqdm(dataloader, leave=True)
  for batch in loop:
    optim.zero_grad()

    # pull all tensor batches required for training
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    # process
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    # extract loss
    loss = outputs.loss

    # calculate loss for every parameter that needs grad update
    loss.backward()

    # update parameters
    optim.step()

    # print relevant info to progress bar
    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [34]:
model.save_pretrained(f"{root}/saeen")

# Testing

In [35]:
fill = pipeline("fill-mask", model=f"{root}/saeen", tokenizer=f"{root}/saeen")

Some weights of RobertaModel were not initialized from the model checkpoint at /content/saeenbert/saeen and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
fill(f" آپ اس وقت پڑھ سکتے  {fill.tokenizer.mask_token}")

[{'sequence': '<s> آپ اس وقت پڑھ سکتے  کو</s>',
  'score': 0.06309904903173447,
  'token': 320,
  'token_str': 'ĠÚ©ÙĪ'},
 {'sequence': '<s> آپ اس وقت پڑھ سکتے  کے</s>',
  'score': 0.05134507268667221,
  'token': 290,
  'token_str': 'ĠÚ©ÛĴ'},
 {'sequence': '<s> آپ اس وقت پڑھ سکتے  سے</s>',
  'score': 0.05019540712237358,
  'token': 315,
  'token_str': 'ĠØ³ÛĴ'},
 {'sequence': '<s> آپ اس وقت پڑھ سکتے  کی</s>',
  'score': 0.0333869569003582,
  'token': 292,
  'token_str': 'ĠÚ©ÛĮ'},
 {'sequence': '<s> آپ اس وقت پڑھ سکتے  اس</s>',
  'score': 0.03184341639280319,
  'token': 317,
  'token_str': 'ĠØ§Ø³'}]