## 03_Poysuwop_MLM
Create a masked langauge model for just an experiment

### 0. Library install

In [1]:
! pip install torch accelerate transformers tokenizers huggingface sentencepiece
! pip install fugashi ipadic

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface, accelerate
Successfully installed accelerate-0.27.2 huggingface-0.0.1
Collecting fugashi
  Downloading fugashi-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (600 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created w

In [5]:
import math
import copy
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np

# Change Current Directory
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

# Load preprocess module
#sys.path.append('/content/drive/MyDrive/Colab Notebooks/Poysuwop')
from modules import ainPreprocess

### 1_Set up toknizers

In [6]:
from tokenizers import Tokenizer

from transformers import RobertaTokenizerFast
ain_tokenizer = RobertaTokenizerFast.from_pretrained("/content/drive/MyDrive/AinuBERT")

input_string = ainPreprocess.preprocess("ohonno somo unukar=an")

# Encode text
print(ain_tokenizer(input_string))

{'input_ids': [0, 2338, 387, 3599, 272, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}


### 2_Create Masked Language Model for Ainu

In [8]:
import torch

def mlm(tensor):
    rand = torch.rand(tensor.shape)
    mask_arr = (rand < 0.15) * (tensor > 2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
    return tensor

In [9]:
from tqdm.auto import tqdm

input_ids = []
mask = []
labels = []

paths = ["poysuwop_ain.txt"]

for path in tqdm(paths):
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    sample = ain_tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
input_ids = torch.cat(input_ids)
mask = torch.cat(mask)
labels = torch.cat(labels)

In [11]:
# Check
labels[0][:10]

tensor([  0, 660, 580,   2,   1,   1,   1,   1,   1,   1])

In [12]:
encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

#### Create Dataloader

In [7]:
from transformers import AutoTokenizer

jpn_tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
jpn_tokenizer("久しぶりだね")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

{'input_ids': [2, 1658, 28454, 14657, 75, 1852, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

dataset = Dataset(encodings)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

#### Set up MLM

In [14]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=ain_tokenizer.vocab_size,
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [15]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device,'\n--------------')

model.to(device)

cuda


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(17236, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

#### Train model

In [16]:
model.train()
optim = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [17]:
from tqdm.auto import tqdm

epochs = 2
step = 0

for epoch in range(epochs):

    loop = tqdm(dataloader, leave=True)

    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optim.step()

        loop.set_description(f'Epoch: {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/316 [00:00<?, ?it/s]

  0%|          | 0/316 [00:00<?, ?it/s]

In [18]:
model.save_pretrained('AinuBERTMLM')

### 3_Check model

In [22]:
from transformers import pipeline
import re

fill = pipeline('fill-mask', model='AinuBERTMLM', tokenizer='/content/drive/MyDrive/AinuBERT')

results = fill(f'ohonno somo unukar {fill.tokenizer.mask_token}')

for i in range(len(results)):
    results[i]['token_str'] = re.sub('þ | þ','=',results[i]['token_str'])
    results[i]['sequence'] = re.sub('þ | þ','=',results[i]['sequence'])

results

[{'score': 0.05607295781373978,
  'token': 277,
  'token_str': ' wa',
  'sequence': 'ohonno somo unukar wa'},
 {'score': 0.049445103853940964,
  'token': 868,
  'token_str': ' kuþ',
  'sequence': 'ohonno somo unukar kuþ'},
 {'score': 0.03010522574186325,
  'token': 1170,
  'token_str': ' kþ',
  'sequence': 'ohonno somo unukar kþ'},
 {'score': 0.025947822257876396,
  'token': 326,
  'token_str': ' kusu',
  'sequence': 'ohonno somo unukar kusu'},
 {'score': 0.023167593404650688,
  'token': 280,
  'token_str': ' an',
  'sequence': 'ohonno somo unukar an'}]