In [None]:
!pip install transformers

In [4]:
from tqdm.auto import tqdm
import torch
import shutil
from tokenizers import BertWordPieceTokenizer
import time
from transformers import BertTokenizer

In [5]:
# Location handed to BertTokenizer.from_pretrained in the next cell.
model_path = './RoPOS_Wordpiecetok1'

In [6]:
# Load the tokenizer stored at `model_path`; it is reused for encoding the
# corpus and later by the fill-mask pipeline.
tokenizer = BertTokenizer.from_pretrained(
    model_path,
)

Loading the files

Create the input pipeline

In [7]:
def mlm(tensor, mask_prob=.15, mask_token_id=4, max_special_id=2):
  """Randomly replace token ids with the mask id for masked-LM training.

  Each position is selected independently with probability ``mask_prob``;
  a selected position is overwritten with ``mask_token_id`` only when its
  current id is strictly greater than ``max_special_id``.

  Args:
    tensor: integer tensor of token ids. It is modified IN PLACE, so pass a
      clone if the unmasked ids are still needed (e.g. as labels).
    mask_prob: per-position selection probability (default 0.15).
    mask_token_id: id written at selected positions. The default 4 is the
      value this notebook has always written.
      NOTE(review): confirm it equals ``tokenizer.mask_token_id``.
    max_special_id: ids less than or equal to this value are never masked
      (default 2).
      NOTE(review): if the vocabulary has a special token at id 3 it is
      maskable under this threshold — verify against the tokenizer.

  Returns:
    The same ``tensor`` object, after in-place masking.
  """
  # Draw the random numbers on the tensor's own device so the comparison
  # below also works for non-CPU tensors.
  rand = torch.rand(tensor.shape, device=tensor.device)
  # Boolean mask: randomly selected AND not one of the protected low ids.
  mask_arr = (rand < mask_prob) & (tensor > max_special_id)
  # A single boolean-mask assignment replaces the former per-row Python loop.
  tensor[mask_arr] = mask_token_id
  return tensor


In [None]:
# Accumulators are lists of tensors; a later cell concatenates them with
# torch.cat.
input_ids = []
attention_mask = []
labels = []
input_ids_temp = []

with open('./data/articles.txt', 'r', encoding='utf-8') as f:
    # One sample per line. Whitespace-only lines (including the empty string
    # produced by a trailing newline) are dropped so they do not turn into
    # samples that contain nothing but special/padding tokens.
    lines = [line for line in f.read().split('\n') if line.strip()]

if not lines:
    raise ValueError("'./data/articles.txt' contains no non-empty lines")

# Every line is padded/truncated to exactly 512 token ids and returned as
# PyTorch tensors.
sample = tokenizer(
    lines,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Labels are the unmasked ids; the masked copy is built in the next cell.
labels.append(sample.input_ids)
attention_mask.append(sample.attention_mask)
input_ids_temp.append(sample.input_ids)

In [None]:
# mlm() masks in place, so it is given a detached copy; `sample.input_ids`
# itself stays untouched.
input_ids = [mlm(sample.input_ids.detach().clone())]

In [None]:
# Turn each list of tensors into one tensor (torch.cat joins along dim 0 by
# default). After this cell the three names hold tensors, not lists.
input_ids, attention_mask, labels = [
    torch.cat(chunks) for chunks in (input_ids, attention_mask, labels)
]

In [None]:
# Bundle the three tensors under the keys that the Dataset class and the
# training loop look up.
encodings = dict(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset view over a dict of tensors, indexed along their first dimension."""

    def __init__(self, encodings):
        # Keep a reference to the dict; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        # The sample count is read from the first dimension of 'input_ids'.
        n_samples = self.encodings['input_ids'].shape[0]
        return n_samples

    def __getitem__(self, i):
        # Build one sample: every key of the encodings dict, sliced at index i.
        item = {}
        for key, tensor in self.encodings.items():
            item[key] = tensor[i]
        return item

In [None]:
# Wrap the encodings dict so a DataLoader can index it.
dataset = Dataset(encodings=encodings)

In [None]:
# Shuffled mini-batches of 8 samples for the training loop.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
)

In [9]:
from transformers import RobertaConfig

# Architecture settings for the model that is built from scratch below.
config = RobertaConfig(
    # Vocabulary / embedding sizes.
    # NOTE(review): 30_522 is hard-coded; the original comment says it is
    # aligned to the tokenizer vocab size — confirm against the tokenizer.
    vocab_size=30_522,
    type_vocab_size=1,
    max_position_embeddings=514,
    # Transformer body.
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    # NOTE(review): `hidden_states` does not look like a standard config
    # field; presumably `output_hidden_states` was intended — verify.
    hidden_states=True,
)

In [10]:
from transformers import RobertaForMaskedLM
# Instantiate the masked-LM model directly from the config object (no
# pretrained weights are loaded here).
model = RobertaForMaskedLM(config=config)

In [None]:
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the selected device.
model.to(device)

In [None]:
from transformers import AdamW
# Optimizer over all model parameters, learning rate 1e-4.
# NOTE(review): the AdamW class exported by transformers is reported as
# deprecated in favour of torch.optim.AdamW in newer releases — confirm
# against the installed version before upgrading.
optim = AdamW(
    model.parameters(),
    lr=1e-4,
)

In [None]:
def save_ckp(epoch, network, optimizer, loss, is_best, checkpoint_dir, best_model_dir):
    """Write a training checkpoint and optionally copy it as the best model.

    The checkpoint is saved as ``checkpointWPtok1.pt`` inside
    ``checkpoint_dir``. When ``is_best`` is true that file is also copied to
    ``best_modelWPtok1.pt`` inside ``best_model_dir``. Missing directories
    are created, so the first save cannot fail merely because a folder does
    not exist yet.

    Args:
        epoch: epoch number stored under the ``'epoch'`` key.
        network: model; both the object itself (``'model'``) and its
            ``state_dict()`` (``'model_state_dict'``) are stored.
        optimizer: optimizer whose ``state_dict()`` is stored under
            ``'optimizer'``.
        loss: value stored under the ``'loss'`` key.
        is_best: when true, also copy the checkpoint into ``best_model_dir``.
        checkpoint_dir: directory for the rolling checkpoint file.
        best_model_dir: directory for the best-model copy.
    """
    import os  # local import so this cell does not rely on the import cell

    checkpoint = {
        'epoch': epoch,
        # The whole module is pickled as well as its state dict; kept for
        # compatibility with anything that reads the 'model' key.
        'model': network,
        'model_state_dict': network.state_dict(),
        'optimizer': optimizer.state_dict(),
        'loss': loss
    }
    os.makedirs(checkpoint_dir, exist_ok=True)
    f_path = os.path.join(checkpoint_dir, 'checkpointWPtok1.pt')
    torch.save(checkpoint, f_path)
    if is_best:
        os.makedirs(best_model_dir, exist_ok=True)
        best_fpath = os.path.join(best_model_dir, 'best_modelWPtok1.pt')
        shutil.copyfile(f_path, best_fpath)

In [None]:
def load_ckp(checkpoint_fpath, map_location='cpu'):
    """Load a checkpoint file written with ``torch.save``.

    Args:
        checkpoint_fpath: path of the checkpoint file.
        map_location: forwarded to ``torch.load``. The default ``'cpu'``
            lets a checkpoint saved on a GPU be opened on a CPU-only runtime
            and avoids placing a second copy of the weights on the GPU;
            ``load_state_dict`` copies the values into the live model and
            optimizer afterwards. Pass ``None`` to restore tensors to the
            devices they were saved from.

    Returns:
        The deserialized checkpoint object.
    """
    # SECURITY NOTE: torch.load unpickles the file; only open checkpoints
    # from a trusted source.
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)

    return checkpoint

In [None]:
# Switch read by the training cell: when False, training first restores the
# model/optimizer from the saved checkpoint.
# NOTE(review): presumably "first time" — confirm the intended meaning.
ft = False

In [None]:
# ---- run configuration -------------------------------------------------
epochs = 10
start_epoch = 0
save_chpt = 20                 # minutes between time-based checkpoints; <= 0 disables them
checkpoint_dir = './CheckPoint'
model_dir = './BestModel'
start_time = time.time()
time_count = 1
# Best average epoch loss seen so far. Initialised once here; it used to be
# reset to infinity at the start of every epoch, which made every epoch "best".
best_loss = float('inf')

# ---- optional resume from the last checkpoint --------------------------
if not ft:
    ckp_load = load_ckp('./CheckPoint/checkpointWPtok1.pt')
    model.load_state_dict(ckp_load['model_state_dict'])
    model.to(device)
    # Re-create the optimizer over the restored parameters, then restore its state.
    optim = AdamW(model.parameters(), lr=1e-4)
    optim.load_state_dict(ckp_load['optimizer'])
    # Continue with a smaller learning rate than the initial 1e-4.
    for param_group in optim.param_groups:
        param_group['lr'] = 1e-5
    start_epoch = ckp_load['epoch']
    print(start_epoch)
    # NOTE(review): checkpoints written before the loss-averaging fix below
    # store a value on a different scale; reset best_loss by hand if needed.
    best_loss = ckp_load['loss']

# ---- training -----------------------------------------------------------
model.train()
for epoch in range(start_epoch, epochs):
    is_best = False
    # Accumulated over the whole epoch (previously re-zeroed on every batch).
    total_train_loss = 0.0
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        batch_loss = loss.item()
        total_train_loss += batch_loss
        loss.backward()
        optim.step()

        # Time-based checkpoint: Google Colab can unmount Drive
        # intermittently, so progress is saved every `save_chpt` minutes.
        end_time = time.time()
        if save_chpt > 0 and ((end_time - start_time) // 60) >= save_chpt:
            print(f"saving after {time_count*save_chpt} mins")
            # NOTE(review): this stores epoch+1 although the epoch is still
            # running, so a resume skips the rest of it — confirm intent.
            save_ckp(epoch+1, model, optim, best_loss, is_best, checkpoint_dir, model_dir)
            model.save_pretrained('./KhasiPretrained_mins')
            start_time = end_time
            time_count += 1
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=batch_loss)

    # Mean loss per batch. The old code divided by len(batch), i.e. the
    # number of keys in the batch dict, not the number of batches.
    avg_train_loss = total_train_loss / max(len(dataloader), 1)
    # Save whenever the epoch improves on the best loss. Because best_loss
    # starts at infinity, a fresh run's first epoch is always saved, so
    # './KhasiPretrained' exists for the fill-mask pipeline cell.
    if avg_train_loss < best_loss:
        is_best = True
        best_loss = avg_train_loss
        model.save_pretrained('./KhasiPretrained')
    print(f'epoch : {epoch} Average training Loss: {avg_train_loss}')

In [12]:
from transformers import pipeline

In [15]:
# Fill-mask pipeline over the model saved by the training loop, using the
# tokenizer loaded at the top of the notebook.
fill = pipeline(
    'fill-mask',
    model='./KhasiPretrained',
    tokenizer=tokenizer,
)

In [None]:
# Example query with two masked positions; the mask string is taken from
# the pipeline's own tokenizer.
prompt = f'hadien shi {fill.tokenizer.mask_token} jong ka   {fill.tokenizer.mask_token} vote ?'
fill(prompt)