In [None]:
!pip install transformers
!pip install datasets




In [None]:
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizer, PreTrainedTokenizer
from transformers import RobertaConfig

In [None]:
# Architecture hyper-parameters for the RoBERTa model built in this notebook.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=52_000,
)

# Masked-LM model built directly from the config above
# (constructed from `config`, not via `from_pretrained`).
model = RobertaForMaskedLM(config=config)


**Code requirement**


1.   Dataset class :

        Load and tokenize the dataset to produce input ids.

        *Check whether the `nlp`/`datasets` library could be used here easily.*

        Tokenize the key phrases as well as the text; the key phrases are then masked in the data collator.



      

2.   Data collator for masked LM

    Takes a list of samples from a Dataset and collates them into a batch, applying the masking as well.

    Reference class: DataCollatorForWholeWordMask



**Dataset class**

In [None]:
from torch.utils.data.dataset import Dataset
import json, os

In [None]:
class KLMDataset(Dataset):
    """
    Map-style dataset over a JSON-lines file of texts and their key phrases.

    Every non-blank line of the file must be a JSON object with a ``text`` entry and a
    ``kp`` entry (the key phrases of that text). Both are tokenized eagerly in the
    constructor.

    Args:
        tokenizer: tokenizer used for both the texts and the key phrases.
        file_path: path of the JSON-lines file.
        block_size: maximum length (special tokens included) of a tokenized text;
            longer texts are truncated. Key phrases are never truncated.

    Raises:
        AssertionError: if ``file_path`` is not an existing file.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        texts = []
        key_phrases = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # crashing in json.loads.
                if not line.strip():
                    continue
                record = json.loads(line)
                texts.append(record['text'])
                key_phrases.append(record['kp'])

        # Key phrases are tokenized without special tokens so that their ids can be
        # located inside the tokenized text.
        self.kps = [
            tokenizer(self._with_leading_space(kp), add_special_tokens=False, truncation=False)['input_ids']
            for kp in key_phrases
        ]
        self.abst = tokenizer(texts, add_special_tokens=True, truncation=True, max_length=block_size)["input_ids"]

    @staticmethod
    def _with_leading_space(kp):
        """
        Prefix each key phrase with a single space before tokenization.

        Byte-level BPE tokenizers such as RoBERTa's encode a word differently when it
        follows a space, which is how a key phrase normally occurs inside the text;
        without the space the first token id of the phrase does not match the text.
        A phrase located at the very start of a text still tokenizes differently.
        """
        if isinstance(kp, str):
            return " " + kp.lstrip()
        return [" " + phrase.lstrip() for phrase in kp]

    def __len__(self):
        return len(self.abst)

    def __getitem__(self, i):
        # 'labels' duplicates 'kp' because DataCollatorForKLM reads the key phrases
        # from the 'labels' entry; 'kp' is kept for existing callers.
        return {'input_ids': self.abst[i], 'kp': self.kps[i], 'labels': self.kps[i]}

# super daset from HF


In [None]:
# Tokenizer loaded from the pretrained "roberta-base" checkpoint; the dataset
# loader and the data collator below are both built with it.
tok = RobertaTokenizer.from_pretrained("roberta-base")

**Data collator**

In [None]:
from transformers import DataCollatorForLanguageModeling
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import torch
from transformers.data.data_collator import _collate_batch, tolist
import random

In [None]:
@dataclass
class DataCollatorForKLM(DataCollatorForLanguageModeling):
    """
    Data collator for masked language modeling that masks key phrases in addition to
    randomly chosen tokens.

    Each example must be a dict with:
        - ``input_ids``: token ids of the text (special tokens included);
        - ``labels``: a list of key phrases, each one a list of token ids.

    Args:
        tokenizer: tokenizer used to pad the batch and to map ids back to tokens.
        mlm_probability: fraction of the tokens of a sequence selected for random masking.
        kp_mask_percentage: fraction of the key phrases of a sequence selected for masking.
    """

    def __init__(self, 
        tokenizer: PreTrainedTokenizer,
        mlm_probability= 0.15,
        kp_mask_percentage = 0.8):
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.kp_mask_percentage = kp_mask_percentage

    def __call__(
        self, examples
    ) -> Dict[str, torch.Tensor]:
        """
        Collate ``examples`` into a padded batch and apply token and key-phrase masking.

        Returns:
            A dict with the masked ``input_ids`` and the ``labels`` tensor, in which
            every position that was not selected for masking is set to -100.

        Raises:
            ValueError: if the examples are not dicts.
        """
        if not isinstance(examples[0], dict):
            # Fail with an explicit error instead of printing a message and then
            # crashing on an undefined variable.
            raise ValueError(
                "DataCollatorForKLM expects each example to be a dict with 'input_ids' and 'labels' "
                "(key-phrase token ids), got {}".format(type(examples[0]))
            )

        input_ids = [e["input_ids"] for e in examples]
        batch_input = _collate_batch(input_ids, self.tokenizer)

        mask_labels = []
        kp_mask_labels = []
        for e in examples:
            ref_tokens = self.tokenizer.convert_ids_to_tokens(tolist(e["input_ids"]))
            # kp_tokens_list looks like [["KP1-T1", "KP1-T2"], ["KP2-T1", "KP2-T2", "KP2-T3"]]
            kp_tokens_list = []
            for kp in tolist(e["labels"]):
                kp_tokens = self.tokenizer.convert_ids_to_tokens(list(kp))
                if len(kp_tokens) > 0:
                    kp_tokens_list.append(kp_tokens)
            tok_mask, kp_mask = self.kp_and_whole_word_mask(ref_tokens, kp_tokens_list)
            mask_labels.append(tok_mask)
            kp_mask_labels.append(kp_mask)

        # Pad the 0/1 masks to the batch shape. Padded positions are filled with the
        # pad token id and are reset to 0 in mask_tokens_and_kp.
        batch_mask = _collate_batch(mask_labels, self.tokenizer)
        kp_batch_mask = _collate_batch(kp_mask_labels, self.tokenizer)

        inputs, labels = self.mask_tokens_and_kp(batch_input, batch_mask, kp_batch_mask)

        return {"input_ids": inputs, "labels": labels}

    def kp_and_whole_word_mask(self, input_tokens, kp_tokens_list, max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy, plus 0/1 labels
        for masked key phrases.

        Args:
            input_tokens: tokens of one sequence.
            kp_tokens_list: key phrases of that sequence, each a list of tokens.
            max_predictions: upper bound applied to both masking budgets.

        Returns:
            ``(tok_mask_labels, kp_mask_labels)``: two lists of 0/1 flags, each of
            length ``len(input_tokens)``.
        """
        # Sequence-boundary tokens are never candidates. The tokenizer's own cls/sep
        # tokens are included because "[CLS]"/"[SEP]" only exist for BERT-style vocabularies.
        boundary_tokens = {"[CLS]", "[SEP]", self.tokenizer.cls_token, self.tokenizer.sep_token}

        cand_indexes = []
        kp_indexes = []
        n_tokens = len(input_tokens)
        i = 0
        # An explicit index is used so that a matched key phrase can be skipped as a
        # whole: its tokens must not also become random-mask candidates.
        while i < n_tokens:
            token = input_tokens[i]
            if token in boundary_tokens:
                i += 1
                continue

            span_end = None
            for kp in kp_tokens_list:  # kp = ["KP1-T1", "KP1-T2"]
                if not kp:
                    continue
                j = i + len(kp)
                # j == n_tokens is a valid match that ends on the last token.
                if j <= n_tokens and input_tokens[i:j] == kp:
                    span_end = j
                    break
            if span_end is not None:
                kp_indexes.append(list(range(i, span_end)))
                i = span_end
                continue

            # NOTE(review): "##" is the WordPiece continuation marker; with a
            # byte-level BPE tokenizer (e.g. RoBERTa) no token starts with it, so
            # every token is its own candidate — confirm whether whole-word grouping
            # is wanted for such tokenizers.
            if len(cand_indexes) >= 1 and token.startswith("##"):
                cand_indexes[-1].append(i)
            else:
                cand_indexes.append([i])
            i += 1

        tok_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
        kp_to_predict = min(max_predictions, max(1, int(round(len(kp_tokens_list) * self.kp_mask_percentage))))

        tok_mask_labels = self.get_mask_labels(cand_indexes=cand_indexes, len_input_tokens=len(input_tokens), num_to_predict=tok_to_predict)
        # The key-phrase budget is a number of phrases, so it is consumed per span
        # rather than per token.
        kp_mask_labels = self.get_mask_labels(cand_indexes=kp_indexes, len_input_tokens=len(input_tokens), num_to_predict=kp_to_predict, count_spans=True)
        return tok_mask_labels, kp_mask_labels


    def get_mask_labels(self, cand_indexes, len_input_tokens, num_to_predict, count_spans=False):
        """
        Randomly select candidate index groups until the budget is used up.

        Args:
            cand_indexes: list of index groups (each a list of positions); shuffled in place.
            len_input_tokens: length of the returned label list.
            num_to_predict: masking budget.
            count_spans: when False (default) every selected position counts towards
                the budget; when True every selected group counts as one.

        Returns:
            A list of ``len_input_tokens`` flags, 1 for every selected position.
        """
        random.shuffle(cand_indexes)
        covered_indexes = set()
        used_budget = 0
        for index_set in cand_indexes:
            if used_budget >= num_to_predict:
                break
            cost = 1 if count_spans else len(index_set)
            # If adding this group would exceed the budget, skip the candidate.
            if used_budget + cost > num_to_predict:
                continue
            # Groups are selected as a whole; skip one that overlaps a previous selection.
            if any(index in covered_indexes for index in index_set):
                continue
            covered_indexes.update(index_set)
            used_budget += cost

        return [1 if i in covered_indexes else 0 for i in range(len_input_tokens)]

    def mask_tokens_and_kp(self, inputs, mask_labels, kp_mask_labels): 
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        The same scheme is applied independently to the randomly selected tokens
        (``mask_labels``) and to the selected key-phrase tokens (``kp_mask_labels``).

        Args:
            inputs: padded batch of input ids; modified in place.
            mask_labels: 0/1 tensor, same shape, of randomly selected tokens; modified in place.
            kp_mask_labels: 0/1 tensor, same shape, of selected key-phrase tokens; modified in place.

        Returns:
            ``(inputs, labels)`` where ``labels`` holds the original ids at the selected
            positions and -100 everywhere else.

        Raises:
            ValueError: if the tokenizer has no mask token.
        """

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )
        labels = inputs.clone()

        probability_matrix = mask_labels
        kp_probability_matrix = kp_mask_labels

        # Special tokens are never masked.
        special_tokens_mask = torch.tensor(
            [self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()],
            dtype=torch.bool,
        )
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        kp_probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

        # Padded positions are never masked.
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
            kp_probability_matrix.masked_fill_(padding_mask, value=0.0)

        masked_indices = probability_matrix.bool()
        kp_masked_indices = kp_probability_matrix.bool()
        # We only compute loss on random masked tokens and kp masked tokens; everything else is set to -100
        labels[~(masked_indices | kp_masked_indices)] = -100

        mask_token_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 80% of the time, we replace masked input tokens with the mask token
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = mask_token_id
        # 80% of the time, selected key-phrase tokens are replaced with the mask token as well
        kp_indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & kp_masked_indices
        inputs[kp_indices_replaced] = mask_token_id

        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        # 10% of the time (half of the remaining 20%), we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        inputs[indices_random] = random_words[indices_random]

        # Same 10% random replacement for the selected key-phrase tokens
        kp_indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & kp_masked_indices & ~kp_indices_replaced
        inputs[kp_indices_random] = random_words[kp_indices_random]
        # The rest of the time (10% of the time) we keep the masked input tokens unchanged

        return inputs, labels

  

In [None]:
from datasets import load_dataset
def load_klm_dataset(tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
    """
    Load a JSON dataset of texts and key phrases and tokenize it for DataCollatorForKLM.

    Every record must provide a ``text`` entry and a ``kp`` entry (its key phrases).

    Args:
        tokenizer: tokenizer applied to both the texts and the key phrases.
        file_path: path of the JSON data file handed to ``load_dataset``.
        block_size: maximum length (special tokens included) of a tokenized text;
            longer texts are truncated. Key phrases are never truncated.

    Returns:
        The ``train`` split with its format restricted to two columns: ``input_ids``
        (token ids of the text) and ``labels`` (token ids of each key phrase).
    """

    def with_leading_space(phrase):
        # Byte-level BPE tokenizers such as RoBERTa's encode a word differently when
        # it follows a space, which is how a key phrase normally occurs inside the
        # text. Without the space the first id of the phrase never matches the text
        # ids. A phrase located at the very start of a text still tokenizes differently.
        return " " + phrase.lstrip()

    def pre_process(d):
        raw_kp = d['kp']
        if isinstance(raw_kp, str):
            kp_text = with_leading_space(raw_kp)
        else:
            kp_text = [with_leading_space(phrase) for phrase in raw_kp]
        # No special tokens: the key-phrase ids are searched for inside the text ids.
        kp_pro = tokenizer(kp_text, add_special_tokens=False, truncation=False)["input_ids"]
        d['input_ids'] = tokenizer(d['text'], add_special_tokens=True, truncation=True, max_length=block_size)["input_ids"]
        d['labels'] = kp_pro
        return d

    dataset = load_dataset('json', data_files=file_path, split='train')
    dataset = dataset.map(pre_process)
    # Only expose the two columns that the collator reads.
    dataset.set_format(columns=['labels', 'input_ids'])

    return dataset


In [None]:
#  tok(['iam mam'],add_special_tokens= False, truncation= False)

In [None]:

# Alternative using the torch-style dataset class (kept for reference):
# data_set= KLMDataset(tokenizer=tok, file_path="/content/dummy.txt", block_size= 200)
# Tokenized training set; texts are truncated to 124 tokens.
data_set = load_klm_dataset(tok, "/content/train.json", 124)
# data_set.set_format(columns=[ 'kp', 'input_ids'])
# Collator with its default masking rates.
dc = DataCollatorForKLM(tok)

Using custom data configuration default
Reusing dataset json (/content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)
Loading cached processed dataset at /content/json/default-16dd99a81353c724/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514/cache-6d89745262ae7664.arrow


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content",
    overwrite_output_dir=True,
    num_train_epochs=1,
    # `per_gpu_train_batch_size` is deprecated; `per_device_train_batch_size` is its replacement.
    per_device_train_batch_size=64,
    save_steps=10_000,
    # NOTE: a limit of 2 keeps only the most recent checkpoints in output_dir.
    # TODO: the original note says all models need to be saved — raise or drop this limit for that.
    save_total_limit=2,
)



In [None]:
# Wire the model, the training arguments, the tokenized dataset and the
# key-phrase masking collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=data_set,
    data_collator=dc,
)

In [None]:
# Run the training loop with the Trainer configured in the previous cell.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


collator    [{'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 5, 5718, 526, 4, 252, 465, 14, 951, 34, 2673, 10, 569, 9, 106, 519, 2099, 8, 33, 17199, 24, 7, 5, 3742, 4, 1892, 51, 386, 5, 1015, 7, 465, 5, 17685, 8, 185, 159, 5, 569, 137, 840, 338, 18078, 18, 74, 12, 1610, 1623, 5684, 66, 4, 2], 'labels': [[34103, 526], [12326, 7, 465, 5, 17685]]}, {'input_ids': [0, 574, 23259, 2012, 19, 80, 82, 27744, 301, 8, 744, 8, 51, 2845, 7, 310, 784, 23259, 4, 20, 177, 51, 310, 44555, 7, 5, 1074, 9, 208, 2611, 257, 5371, 1439, 2636, 6, 3773, 1671, 6, 726, 257, 8, 163, 2582, 257, 4, 2], 'labels': [[7109, 82], [5367, 8, 744], [7215, 5982, 1626]]}, {'input_ids': [0, 33282, 1671, 8, 840, 338, 18078, 4620, 

Step,Training Loss


TrainOutput(global_step=1, training_loss=10.954537391662598)

In [None]:
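# Data format: {"text": ..., "keyphrases": [{"surface_form": ..., "start": ..., "end": ...}]}
# NOTE(review): the loaders in this notebook read the keys "text" and "kp", not "keyphrases" — confirm which schema is current.
# File format: JSON Lines, one JSON object per line.
# Directory layout: 1.jsonl, 2.jsonl, 3.jsonl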