<a href="https://colab.research.google.com/github/nassim199/2CP_Project/blob/master/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install datasets tokenizers

#!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

#!pip3 install torch==1.10.0+cu102 torchvision==0.11.1+cu102 torchaudio===0.10.0+cu102 -f https://download.pytorch.org/whl/cu102/torch_stable.html



In [None]:
from google.colab import drive
# Mount Google Drive: later cells read and write the tokenizer files and the
# model checkpoint under ./drive/MyDrive/BERT.
# NOTE(review): those relative paths presumably rely on the working directory
# being /content — confirm when running outside Colab.
drive.mount('/content/drive')

Mounted at /content/drive


# The first part is an implementation of the [BERT](https://arxiv.org/abs/1810.04805) model using PyTorch.

BERT stands for Bidirectional Encoder Representations from Transformers. It was first presented by Google in 2018 and had a significant impact on the NLP field, achieving major performance gains. I encourage you to check the paper for more details.

In [None]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

# imports the torch_xla package
# import torch_xla
# import torch_xla.core.xla_model as xm


# Report whether a GPU is visible, then pick the notebook-wide `device` that
# later cells pass to tensors and modules (CUDA when available, else CPU).
print(th.cuda.is_available())
device = th.device('cuda' if th.cuda.is_available() else 'cpu')

# if not th.cuda.is_available():
#   device = xm.xla_device()

True


In [None]:
def weights_init(m):
  """Initialise one module in place: weights ~ N(0, 0.02), biases = 0.

  Meant to be passed to ``nn.Module.apply``, which calls it on every
  sub-module. A module is skipped when it has no ``weight``/``bias``
  attribute or when that attribute is ``None`` (for example
  ``nn.Linear(..., bias=False)``).

  Args:
    m: the module to initialise.
  """
  weight = getattr(m, 'weight', None)
  if weight is not None:
    nn.init.normal_(weight.data, 0.0, 0.02)

  # hasattr() alone is not enough: a bias-free Linear still has a `bias`
  # attribute, it is just None.
  bias = getattr(m, 'bias', None)
  if bias is not None:
    nn.init.constant_(bias.data, 0)

In [None]:
def gelu(x):
   """Exact (erf-based) GELU activation, applied element-wise to tensor ``x``."""
   half_x = x * 0.5
   erf_term = th.erf(x / np.sqrt(2.0))
   return half_x * (1.0 + erf_term)

In [None]:
#Positional encoding
def get_angles(pos, i, d_model):
  """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``.

  These are the arguments later fed to sin/cos by ``positional_encoding``.
  ``pos`` and ``i`` may be NumPy arrays; the result follows NumPy
  broadcasting of the two.
  """
  exponent = (2 * (i//2)) / np.float32(d_model)
  inverse_wavelength = 1 / np.power(10000, exponent)
  return pos * inverse_wavelength

In [None]:
def positional_encoding(position, d_model):
  """Build the fixed sinusoidal position table.

  Args:
    position: number of positions (rows) to generate.
    d_model: width of each position vector.

  Returns:
    float32 tensor of shape (1, position, d_model) on the notebook ``device``.
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dims, d_model)

  # even feature indices (2i) take the sine, odd ones (2i+1) the cosine
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # leading axis of size 1 so the table broadcasts over the batch
  return th.tensor(table[np.newaxis, ...], dtype=th.float32, device=device)

In [None]:
#masking
def create_padding_mask(seq, pad_id=1):
  """Mark padding positions in a batch of token ids.

  Args:
    seq: 2-D batch of token ids, (batch_size, seq_len).
    pad_id: id treated as padding. Defaults to 1, the value this notebook
      pads token sequences with.

  Returns:
    float32 tensor of shape (batch_size, 1, 1, seq_len): 1.0 where
    ``seq == pad_id`` and 0.0 elsewhere. For tensor input the mask stays on
    the input's device; any other input is placed on the notebook ``device``.
  """
  if isinstance(seq, th.Tensor):
    # Cast the comparison result directly. Wrapping an existing tensor in
    # th.tensor(...) copy-constructs it and emits a UserWarning on every call.
    is_pad = (seq == pad_id).to(dtype=th.float32)
  else:
    is_pad = th.tensor(seq == pad_id, dtype=th.float32, device=device)

  batch_size, seq_len = is_pad.size()[0], is_pad.size()[1]
  return is_pad.view(batch_size, 1, 1, seq_len)

In [None]:
#scaled dot product attention
def scaled_dot_product_attention(q, k, v, mask=None):
  '''
  Compute softmax(q k^T / sqrt(depth) + mask * -1e9) v.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k); positions holding 1 are pushed
          towards zero attention weight. Defaults to None.

  Returns:
    output, attention_weights
    '''
  depth = k.size()[-1]

  # swap the last two axes of k, then scale: shape = (..., seq_len_q, seq_len_k)
  scores = th.matmul(q, k.transpose(-1, -2)) / np.sqrt(depth)

  if mask is not None:
    # large negative logits vanish after the softmax
    scores.add_(mask * -1e9)

  weights = F.softmax(scores, dim=-1)

  # shape = (..., seq_len_q, depth_v)
  return th.matmul(weights, v), weights

In [None]:
#Multi head attention
class MultiHeadAttention(nn.Module):
  """Multi-head attention: project, split into heads, attend, merge, project.

  ``d_model`` must be divisible by ``num_heads``; each head works on
  ``depth = d_model // num_heads`` features.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    assert d_model % num_heads == 0

    self.d_model = d_model
    self.num_heads = num_heads
    self.depth = d_model // num_heads

    # input projections for queries, keys and values
    self.wq = nn.Linear(d_model, d_model)
    self.wk = nn.Linear(d_model, d_model)
    self.wv = nn.Linear(d_model, d_model)

    # output projection applied after the heads are merged
    self.dense = nn.Linear(d_model, d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    per_head = x.view(batch_size, -1, self.num_heads, self.depth)
    return per_head.transpose(1, 2)

  def forward(self, v, k, q, mask=None):
    """Return ``(output, attention_weights)`` for the given value/key/query."""
    batch_size = q.size()[0]

    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    per_head_out, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> (batch, seq, d_model)
    merged = per_head_out.transpose(1, 2).reshape(batch_size, -1, self.d_model)

    return self.dense(merged), attention_weights

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Two-layer feed-forward block: d_model -> dff -> d_model with a ReLU between.

  The returned ``nn.Sequential`` is already moved to the notebook ``device``.
  """
  layers = [
      nn.Linear(d_model, dff),
      nn.ReLU(),
      nn.Linear(dff, d_model),
  ]
  return nn.Sequential(*layers).to(device)

In [None]:
# Encoder layer
class EncoderLayer(nn.Module):
  """One Transformer encoder layer: self-attention and feed-forward sub-layers.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalisation. The normalisation is the functional, parameter-free form
  (no learnable scale/shift), so the layer's ``state_dict`` contains only the
  attention and feed-forward weights.

  Args:
    d_model: width of the hidden states.
    num_heads: number of attention heads.
    dff: width of the feed-forward inner layer.
    rate: dropout probability.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.rate = rate

    self.mha = MultiHeadAttention(d_model, num_heads).to(device)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.mha.apply(weights_init)
    self.ffn.apply(weights_init)


  def forward(self, x, training, mask):
    """Run the layer.

    Args:
      x: hidden states, (batch, seq_len, d_model).
      training: whether dropout is active.
      mask: padding mask forwarded to the attention.

    Returns:
      Tensor with the same shape as ``x``.
    """
    # Normalise each position over its feature axis only. Using x.size()[1:]
    # normalised jointly over (seq_len, d_model), which mixes statistics
    # across positions (padding included) and ties the result to the
    # sequence length.
    feature_shape = x.size()[-1:]

    attn_out, _ = self.mha(x, x, x, mask)
    attn_out = F.dropout(attn_out, p=self.rate, training=training)
    out1 = F.layer_norm(x + attn_out, normalized_shape=feature_shape)

    ffn_out = self.ffn(out1)
    ffn_out = F.dropout(ffn_out, p=self.rate, training=training)
    out2 = F.layer_norm(out1 + ffn_out, normalized_shape=feature_shape)

    return out2

In [None]:
class BERT(nn.Module):
    """Transformer encoder with a masked-language-model head.

    Args:
      num_layers: number of stacked ``EncoderLayer`` blocks.
      d_model: width of embeddings and hidden states.
      num_heads: attention heads per layer.
      dff: width of each layer's feed-forward inner layer.
      input_vocab_size: size of the token vocabulary (embedding rows and
        output classes).
      maximum_position_encoding: longest sequence the position table covers.
      rate: dropout probability.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
      super().__init__()

      self.rate = rate

      self.d_model = d_model
      self.num_layers = num_layers

      self.embedding = nn.Embedding(num_embeddings=input_vocab_size, embedding_dim=d_model)
      # Registered as a buffer so the table follows the module through
      # .to()/.cuda(); persistent=False keeps it out of state_dict, so
      # existing checkpoints still load with matching keys.
      self.register_buffer(
          'pos_encoding',
          positional_encoding(maximum_position_encoding, d_model),
          persistent=False)

      self.enc_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dff, rate).to(device) for _ in range(num_layers)])

      # masked-LM head: dense -> GELU -> LayerNorm -> projection to the vocabulary
      self.linear = nn.Linear(d_model, d_model)
      self.activ = gelu
      self.norm = nn.LayerNorm(d_model)
      self.decoder = nn.Linear(d_model, input_vocab_size)

    def forward(self, x, masked_pos, training=None):
      """Encode a batch and predict the tokens at the masked positions.

      Args:
        x: token ids, (batch_size, seq_len).
        masked_pos: positions to predict, (batch_size, max_pred).
        training: whether dropout is active. ``None`` (the default) follows
          the module mode set by ``model.train()`` / ``model.eval()``; a new
          module is in training mode, so the default still enables dropout
          until ``eval()`` is called.

      Returns:
        ``(logits, x)``: vocabulary logits for the masked positions,
        (batch_size, max_pred, input_vocab_size), and the final encoder
        states, (batch_size, seq_len, d_model).
      """
      if training is None:
        training = self.training

      seq_len = x.size()[1]
      mask = create_padding_mask(x)

      x = self.embedding(x)
      x *= np.sqrt(self.d_model)
      x += self.pos_encoding[:, :seq_len, :]

      x = F.dropout(x, p=self.rate, training=training)

      for i in range(self.num_layers):
        x = self.enc_layers[i](x, training, mask)

      masked_pos = masked_pos[:, :, None].expand(-1, -1, x.size(-1)) # [batch_size, max_pred, d_model]
      h_masked = th.gather(x, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
      h_masked = self.norm(self.activ(self.linear(h_masked)))
      logits = self.decoder(h_masked)

      return logits, x

# Let's now download the dataset we'll work on

The dataset we'll use is from the [OSCAR corpus](https://oscar-corpus.com/). 

The language we'll choose is Esperanto, a constructed language designed to be easy to learn. We pick it because it's a low-resource language and its grammar is highly regular.

In [None]:
import os

from datasets import load_dataset

# Never commit an access token to the notebook. The token is read from the
# HF_TOKEN environment variable; if it is unset, True makes `datasets` fall
# back to the token stored by `huggingface-cli login`.
dataset = load_dataset(
    "oscar-corpus/OSCAR-2109",
    "deduplicated_eo",
    use_auth_token=os.environ.get("HF_TOKEN", True),
)

Downloading:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Downloading and preparing dataset oscar2109/deduplicated_eo to /root/.cache/huggingface/datasets/oscar-corpus___oscar2109/deduplicated_eo/2021.9.0/f99db7058ca20335499dad39cda92ee05d57d8aa943fe651fdc5676101ee1e8f...


Downloading:   0%|          | 0.00/157 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/150M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset oscar2109 downloaded and prepared to /root/.cache/huggingface/datasets/oscar-corpus___oscar2109/deduplicated_eo/2021.9.0/f99db7058ca20335499dad39cda92ee05d57d8aa943fe651fdc5676101ee1e8f. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Pull the raw text of every training document into a plain list,
# then take a look at one example.
Corpus = [doc['text'] for doc in dataset['train']]
print(f"corpus length {len(Corpus)}")

Corpus[1]

corpus length 120679


'Parseko (mallongigo pc) estas mezurunuo, uzata en astronomio por esprimi distancojn de steloj, rilate al la Suno. Ĝi egalas al 206265 AU, aŭ 3,085 68 × 1016 m, aŭ 30,856 8 Pm (petametroj), aŭ 3,261 6 lj (lumjaroj). Ĝi ne estas SI-unuo. Ĝi respondas al la longo de latero de triangulo, kies la plej akuta angulo valoras unu angulan sekundon.\nUnu mezurmetodo por taksi la distancon estas mezuri la angulan mezuron de la stelo dum unu tago, kaj ĝian angulan mezuron je duonjaro poste; la diferenco inter la du anguloj mezuritaj en angulaj sekundoj estas la duoblo de la esplorata akuta angulo. Ju pli tia angulo estas malgranda, des pli la stelo estas malproksima.\nLa teksto disponeblas laŭ la permesilo Krea Komunaĵo Atribuite-Samkondiĉe 3.0 Neadaptita; eble aldonaj kondiĉoj aplikeblas. Vidu la uzkondiĉojn por detaloj.'

# Tokenization

Before using our dataset we need to train a tokenizer.
In our case we'll use a byte-level byte-pair encoding (BPE) tokenizer with a vocabulary of size 50,000.

In [None]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace

# Byte-level BPE tokenizer, trained from scratch on the corpus
tokenizer = ByteLevelBPETokenizer()

# Attach a whitespace pre-tokenizer
tokenizer.pre_tokenizer = Whitespace()

# Later cells hard-code <pad> as id 1 and <mask> as id 4, which matches the
# order in which the special tokens are listed here.
tokenizer.train_from_iterator(
    Corpus,
    vocab_size=50000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Save tokenizer to disk
# The dataset classes below reload tokenizer-vocab.json and tokenizer-merges.txt
# from this same Drive folder.
tokenizer.save_model("./drive/MyDrive/BERT", "tokenizer")

['../tokenizer-vocab.json', '../tokenizer-merges.txt']

In [None]:
from torch.utils.data import Dataset
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import Whitespace
import random

class EsperantoDataset(Dataset):
    """Pre-tokenized masked-language-model examples built from raw documents.

    Every document is tokenized (wrapped in ``<s>`` ... ``</s>`` and truncated
    to ``max_length``), a subset of its tokens is selected for prediction and
    corrupted, and everything is padded to fixed sizes so examples can be
    batched.

    Each item is ``[tokens, masked_tokens, masked_pos]``:
      tokens: ``max_length`` ids, after masking and padding.
      masked_tokens: ``MAX_PRED`` original ids of the selected tokens.
      masked_pos: ``MAX_PRED`` positions of the selected tokens.
    Unused prediction slots hold ``PAD_ID`` as the target and position 1.

    Args:
      Corpus: iterable of document strings.
      max_length: fixed length every token sequence is truncated/padded to.
      evaluate: currently unused.
    """

    MAX_PRED = 10          # prediction slots per example
    VOCAB_SIZE = 50000     # vocabulary size the tokenizer was trained with
    PAD_ID = 1             # "<pad>", per the special-token order used in training
    MASK_ID = 4            # "<mask>", per the special-token order used in training
    FIRST_REGULAR_ID = 5   # ids below this are the five special tokens

    def __init__(self, Corpus, max_length = 512, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./drive/MyDrive/BERT/tokenizer-vocab.json",
            "./drive/MyDrive/BERT/tokenizer-merges.txt",
        )

        tokenizer.pre_tokenizer = Whitespace()

        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=max_length)

        self.Corpus = []
        for text in Corpus:
            tokens = tokenizer.encode(text).ids
            self.Corpus.append(self._mask_and_pad(tokens, max_length))

    @staticmethod
    def _mask_and_pad(tokens, max_length, max_pred=MAX_PRED, vocab_size=VOCAB_SIZE,
                      mask_id=MASK_ID, pad_id=PAD_ID, first_regular_id=FIRST_REGULAR_ID):
        """Corrupt ``tokens`` in place for masked LM and pad to fixed sizes.

        Returns ``(tokens, masked_tokens, masked_pos)``; the last two always
        have exactly ``max_pred`` entries.
        """
        n_pred = min(max_pred, max(1, int(round(len(tokens) * 0.15)))) # 15 % of tokens in one sentence

        # the first and last tokens are the <s> / </s> markers: never mask them
        candidates = list(range(1, len(tokens) - 1))
        random.shuffle(candidates)

        masked_tokens, masked_pos = [], []
        for pos in candidates[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(tokens[pos])
            if random.random() < 0.8:  # 80%: replace with <mask>
                tokens[pos] = mask_id
            elif random.random() < 0.5:  # 10%: replace with a random regular token
                tokens[pos] = random.randint(first_regular_id, vocab_size - 1)
            # remaining 10%: keep the original token

        # pad the token sequence with <pad>
        tokens.extend([pad_id] * (max_length - len(tokens)))

        # Fill the unused prediction slots. Count the slots actually filled,
        # not n_pred: a document with fewer candidates than n_pred (e.g. only
        # <s></s>) would otherwise end up shorter than max_pred and break
        # batching.
        n_missing = max_pred - len(masked_pos)
        masked_tokens.extend([pad_id] * n_missing)
        masked_pos.extend([1] * n_missing)

        return tokens, masked_tokens, masked_pos

    def __len__(self):
        return len(self.Corpus)

    def __getitem__(self, i):
        tokens, masked_tokens, masked_pos = self.Corpus[i]
        return [th.tensor(tokens, device=device), th.tensor(masked_tokens, device=device), th.tensor(masked_pos, device=device)]

# The first 120,000 documents are used for training; the rest are held out.
train_docs, test_docs = Corpus[:120000], Corpus[120000:]
train_ds = EsperantoDataset(train_docs)
test_ds = EsperantoDataset(test_docs)

In [None]:
# Six encoder layers, 512-wide hidden states, 8 attention heads.
# input_vocab_size matches the tokenizer's vocab_size (50000) and
# maximum_position_encoding matches the dataset's max_length (512).
model = BERT(
    num_layers=6, d_model=512, num_heads=8, dff=512, 
    input_vocab_size=50000, maximum_position_encoding=512).to(device)

def count_parameters(model: nn.Module):
    """Return the total number of scalar entries in parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 60,969,296 trainable parameters


In [None]:
#optimizer
optimizer = th.optim.Adam(model.parameters(), eps=1e-06, lr=1e-4, weight_decay=0.01)

# EsperantoDataset fills unused prediction slots with the <pad> id (1) as the
# target. ignore_index keeps those filler slots out of the loss, so the model
# is not trained to predict <pad>.
criterion = nn.CrossEntropyLoss(ignore_index=1)

#scheduler = th.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

In [None]:
BATCH_SIZE = 16

# Both loaders draw batches in a freshly shuffled order on every pass.
train_data_loader = th.utils.data.DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_data_loader = th.utils.data.DataLoader(
    test_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Train for two epochs with a batch size of 16 and report masked-token
# accuracy on the held-out split after each epoch.

from tqdm import tqdm

PAD_ID = 1  # target value EsperantoDataset uses for unused prediction slots
EPOCHS = 2
loss_history = []
for epoch in range(EPOCHS):
    loop = tqdm(train_data_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')
    for tokens, masked_tokens, masked_pos in loop:
      optimizer.zero_grad()
      # forward returns (logits, encoder states); only the logits feed the loss
      logits, _ = model(tokens, masked_pos, training=True)
      # CrossEntropyLoss expects the class axis second: (batch, vocab, max_pred)
      loss = criterion(logits.transpose(1, 2), masked_tokens) # for masked LM
      loss.backward()
      optimizer.step()
      loss_history.append(loss.item())
      loop.set_postfix(loss=loss.item())

    # Evaluation: dropout off, no autograd bookkeeping.
    correct_preds = 0
    total_preds = 0
    with th.no_grad():
      for tokens, masked_tokens, masked_pos in test_data_loader:
        logits, _ = model(tokens, masked_pos, training=False)
        # argmax over logits equals argmax over softmax(logits)
        preds = logits.argmax(-1)
        # score only real masked tokens, not the padded filler slots
        is_real = masked_tokens != PAD_ID
        correct_preds += ((preds == masked_tokens) & is_real).sum().item()
        total_preds += is_real.sum().item()

    acc = correct_preds / max(total_preds, 1)

    print()
    print ('Epoch {} accuracy {:.2f}'.format(epoch + 1, acc * 100))

  This is separate from the ipykernel package so we can avoid doing imports until
Epoch 0:  85%|████████▌ | 6404/7500 [1:01:50<10:28,  1.74it/s, loss=6.11]

In [None]:
# Save only the weights (state_dict) to Drive; the next cell reloads them
# from the same path.
PATH = './drive/MyDrive/BERT/model.pth'
th.save(model.state_dict(), PATH)

In [None]:
PATH = './drive/MyDrive/BERT/model.pth'
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only one.
model.load_state_dict(th.load(PATH, map_location=device))

<All keys matched successfully>

## Finetuning on a downstream task:
After the pre-training step, we'll now fine-tune our model on a specific task: part-of-speech (POS) tagging.

POS tagging refers to categorizing the words of a text (corpus) according to their part of speech, depending on the definition of each word and its context.

The dataset can be found at the following [link](https://s3.amazonaws.com/datasets.huggingface.co/EsperBERTo/data/pos-train.txt).
Data format:
Each line contains a word with its corresponding tag; empty lines represent sentence boundaries.

The different POS tags we have are [(link)](https://s3.amazonaws.com/datasets.huggingface.co/EsperBERTo/data/pos-labels.txt):
- O
- NOUN
- ADJ
- ADV
- VERB
- PRON

In [None]:
class POSDataset(Dataset):
    """Part-of-speech tagging examples read from a ``word tag`` text file.

    The file holds one ``word tag`` pair per line, with a blank line between
    sentences. Each word is tokenized on its own and its tag is repeated for
    every sub-word piece, so tokens and labels stay aligned one-to-one.

    Each item is ``[tokens, labels]``, both of length ``max_length``:
      tokens: sub-word ids of one sentence, padded with ``PAD_ID``.
      labels: index into ``LABELS`` per sub-word, ``IGNORE_LABEL`` on padding.

    NOTE(review): the class previously iterated the pre-training ``Corpus``
    and referenced ``masked_tokens`` / ``masked_pos``, which are undefined in
    this scope, so it could not be constructed. The sub-word labelling scheme
    and the item layout here are a proposal — confirm against the fine-tuning
    code before relying on them.

    Args:
      max_length: fixed length every example is truncated/padded to.
      evaluate: currently unused.
      path: location of the ``word tag`` file.

    Raises:
      KeyError: if the file contains a tag that is not in ``LABELS``.
      ValueError: if a non-blank line has fewer than two fields.
    """

    LABELS = ("O", "NOUN", "ADJ", "ADV", "VERB", "PRON")
    PAD_ID = 1            # the id EsperantoDataset pads token sequences with
    IGNORE_LABEL = -100   # default ignore_index of nn.CrossEntropyLoss

    def __init__(self, max_length = 512, evaluate: bool = False,
                 path = "./drive/MyDrive/BERT/pos-train.txt"):
        tokenizer = ByteLevelBPETokenizer(
            "./drive/MyDrive/BERT/tokenizer-vocab.json",
            "./drive/MyDrive/BERT/tokenizer-merges.txt",
        )

        tokenizer.pre_tokenizer = Whitespace()

        # No <s> / </s> post-processor here: words are encoded one at a time,
        # so every produced id belongs to exactly one tagged word.
        tokenizer.enable_truncation(max_length=max_length)

        label_ids = {label: idx for idx, label in enumerate(self.LABELS)}

        with open(path, 'r', encoding='utf-8') as f:
            sentences = self._read_sentences(f)

        self.Corpus = []
        for sentence in sentences:
            tokens, labels = [], []
            for word, tag in sentence:
                pieces = tokenizer.encode(word).ids
                tokens.extend(pieces)
                labels.extend([label_ids[tag]] * len(pieces))

            # truncate, then pad both sequences to the same fixed length
            tokens, labels = tokens[:max_length], labels[:max_length]
            n_pad = max_length - len(tokens)
            tokens.extend([self.PAD_ID] * n_pad)
            labels.extend([self.IGNORE_LABEL] * n_pad)

            self.Corpus.append((tokens, labels))

    @staticmethod
    def _read_sentences(lines):
        """Group ``word tag`` lines into sentences, split on blank lines.

        Returns a list of sentences, each a list of ``(word, tag)`` tuples.
        """
        sentences, current = [], []
        for line in lines:
            fields = line.split()
            if not fields:
                # blank line: close the current sentence, if any
                if current:
                    sentences.append(current)
                    current = []
                continue
            if len(fields) < 2:
                raise ValueError(f"expected 'word tag', got {line!r}")
            current.append((fields[0], fields[-1]))
        if current:
            # the file may end without a trailing blank line
            sentences.append(current)
        return sentences

    def __len__(self):
        return len(self.Corpus)

    def __getitem__(self, i):
        tokens, labels = self.Corpus[i]
        return [th.tensor(tokens, device=device), th.tensor(labels, device=device)]


In [None]:
ds_link = "./drive/MyDrive/BERT/pos-train.txt"
tokenizer = ByteLevelBPETokenizer(
    "./drive/MyDrive/BERT/tokenizer-vocab.json",
    "./drive/MyDrive/BERT/tokenizer-merges.txt",
)

# Show how each word of the first sentence splits into sub-word pieces.
# The data contains non-ASCII letters, so the encoding is set explicitly.
with open(ds_link, 'r', encoding='utf-8') as f:
  for l in f:
    # A blank line ends the first sentence. Iterating the file also stops at
    # end of file; the old `while l != '\n'` loop never terminated there,
    # because readline() keeps returning ''.
    if l == '\n':
      break
    word = l.split(' ')[0]
    t = tokenizer.encode(word)
    print(word)
    print(t.tokens)
    print(t.ids)
    print()

Ĉu
['Ä', 'Ī', 'u']
[154, 256, 90]

.
['.']
[19]

.
['.']
[19]

.
['.']
[19]

preĝi
['pre', 'Ä', 'Ŀ', 'i']
[7697, 154, 277, 78]

mediti
['medi', 'ti']
[8459, 7593]

ricevi
['ricevi']
[10014]

instigojn
['insti', 'gojn']
[9977, 8254]

kanti
['kanti']
[17675]

muziki
['muziki']
[46110]

informiĝi
['informi', 'Ä', 'Ŀ', 'i']
[15227, 154, 277, 78]

legi
['legi']
[9147]

studi
['studi']
[12450]

prepari
['prepari']
[16961]

Diservon
['Diser', 'von']
[45991, 8086]

