In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
# utils
import os, sys
import argparse
from dataclasses import dataclass, field
from typing import Dict, List, Optional
@dataclass
class BasicKPArgs:
    """Model, tokenizer and data options for the key-phrase training run.

    Every attribute is a dataclass ``field`` whose ``metadata["help"]`` entry
    carries its human-readable description. The output directory is not part
    of this class; it is taken from the training arguments instead.
    """

    # --- model / tokenizer selection ---
    model_type: Optional[str] = field(
        default="enc_dec",
        metadata={"help": "encoder decoder type or other generative model like Bart"},
    )
    model_name_path: Optional[str] = field(
        default=None,
        metadata={"help": "path or name to load pretrained model or from checkpoints"},
    )
    decoder_model_name_path: Optional[str] = field(
        default=None,
        metadata={"help": "path or name of decoder part of the model in enc_dec architect"},
    )
    tokenizer_path: Optional[str] = field(
        default=None,
        metadata={"help": "path or name of custom tokenizer saved if provided this tokenizer will be loaded else auto tokenizer"},
    )

    # --- data location and task flavour ---
    data_dir: Optional[str] = field(
        default="",
        metadata={"help": "path to dir containg data"},
    )
    kp_task_type: Optional[str] = field(
        default="one2one",
        metadata={"help": "wether to use one2one or one2many"},
    )

    # --- sequence length limits ---
    max_src_len: Optional[int] = field(
        default=512,
        metadata={"help": "length of source seq"},
    )
    max_tar_len: Optional[int] = field(
        default=64,
        metadata={"help": "length of target seq"},
    )

    # --- run mode ---
    from_pretrained: Optional[bool] = field(
        default=True,
        metadata={"help": "wether to load model weight from a pretrained checkpoint or from scratch"},
    )
    predict_only: Optional[bool] = field(
        default=False,
        metadata={"help": "wether to predict only or train, validate and predict"},
    )
    dataset_class: Optional[str] = field(
        default="single",
        metadata={"help": "single | multiple , type of dataset reader to use, split train data into mltiple train file or from single"},
    )

In [None]:
# dataset
import os, sys
import torch
import json
from torch.utils.data.dataset import Dataset
class KPone2manyDataset(Dataset):
    """Map-style dataset that tokenizes a JSON-lines key-phrase file up front.

    Each non-blank line of the file must be a JSON object with
        "text": str        - the source document
        "kp":   list[str]  - all key phrases of that document
    The key phrases of one document are stripped and joined with
    ``" <kp_sep_token> "`` to form a single target string (one2many setup).
    """

    def __init__(self, tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = "<kp_sep>"):
        """Read ``file_path`` and tokenize every source/target pair.

        Args:
            tokenizer: object exposing ``batch_encode_plus`` that returns a
                mapping with ``input_ids`` and ``attention_mask`` lists.
            file_path: path to the JSON-lines file.
            max_src_len: pad/truncate length for the source text.
            max_tar_len: pad/truncate length for the joined key phrases.
            kp_sep_token: token placed between consecutive key phrases.
        """
        assert os.path.exists(file_path), "data file not found: {}".format(file_path)
        self.tokenizer = tokenizer
        self.src_attn_mask = []  # never filled; kept so the attribute still exists

        separator = " " + kp_sep_token + " "
        sources = []
        targets = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                record = json.loads(line)
                sources.append(record['text'])
                targets.append(separator.join(kp.strip() for kp in record['kp']))

        assert len(targets) == len(sources)
        self.ex_len = len(sources)
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        self.kps = self.tokenizer.batch_encode_plus(
            targets, truncation=True, max_length=max_tar_len, padding="max_length"
        )
        self.abst = self.tokenizer.batch_encode_plus(
            sources, truncation=True, max_length=max_src_len, padding="max_length"
        )

    def __len__(self):
        """Number of examples read from the file."""
        return self.ex_len

    def __getitem__(self, i):
        """Return the i-th example as tensors of token ids and attention masks."""
        return {
            'src_ids': torch.tensor(self.abst['input_ids'][i]),
            'tar_ids': torch.tensor(self.kps['input_ids'][i]),
            'src_attn': torch.tensor(self.abst['attention_mask'][i]),
            'tar_attn': torch.tensor(self.kps['attention_mask'][i])
            }

# class kpone2manyMultiDataset(Dataset):
#     def __init__(self, tokenizer, data_dir, file_prefix, n=10000, max_src_len, max_tar_len, kp_sep_token = "<kp_sep>"):
#         self.tokenizer = tokenizer
#         self.data_dir = data_dir
#         self.file_prefix = file_prefix
#         self.total_examples = n
#         self.max_src_len = max_src_len
#         self.max_tar_len = max_tar_len
#         self.kp_sep_token = kp_sep_token

#         assert os.path.exists(self.data_dir)



        
#         pass
#     def read_files(self):
#         pass

#     def __len__(self):
#         pass
    
#     def __getitem__(self,i):
#         pass



# super dataset class
def load_kp_data_and_dataset_class( tokenizer, file_path, max_src_len, max_tar_len, kp_sep_token = "<kp_sep>"):
    """Load a JSON key-phrase file with `datasets` and tokenize every record.

    Each record must provide ``text`` (source string) and ``kp`` (list of key
    phrases). The key phrases are stripped and joined with
    ``" <kp_sep_token> "`` into one target string.

    Args:
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` for a single string.
        file_path: path handed to ``load_dataset('json', data_files=...)``.
        max_src_len: pad/truncate length applied to the source text.
        max_tar_len: pad/truncate length applied to the joined key phrases.
        kp_sep_token: token placed between consecutive key phrases.

    Returns:
        The mapped dataset, formatted as torch tensors and exposing the columns
        ``input_ids``, ``decoder_input_ids`` and ``attention_mask``.
    """
    from datasets import load_dataset

    separator = " " + kp_sep_token + " "

    def tok_and_process(d):
        # Join all key phrases of this record into a single target sequence.
        curr_kp = separator.join(kp.strip() for kp in d['kp'])
        # The source is limited by max_src_len (it was previously truncated to
        # max_tar_len by mistake, silently dropping most of the document).
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        src_encode = tokenizer(d['text'], truncation=True, max_length=max_src_len, padding="max_length")
        tar_encode = tokenizer(curr_kp, truncation=True, max_length=max_tar_len, padding="max_length")
        d['input_ids'] = src_encode['input_ids']
        d['decoder_input_ids'] = tar_encode['input_ids']  # un-shifted target ids
        d['attention_mask'] = src_encode['attention_mask']
        return d

    dataset = load_dataset('json', data_files=file_path, split='train')
    dataset = dataset.map(tok_and_process)
    dataset.set_format(type='torch', columns=['input_ids', 'decoder_input_ids', 'attention_mask'])

    return dataset






In [None]:
#collate
import os, sys
import torch
class TPBDataCollator():
    """Batch collator for seq2seq key-phrase examples (T5 / Pegasus / BART style).

    Each example must be a mapping holding 1-D tensors under ``input_ids``,
    ``attention_mask`` and ``decoder_input_ids`` (the un-shifted target ids).
    The collator stacks them, derives ``labels`` with pad positions set to
    -100, and optionally shifts the targets one step to the right to build
    the decoder inputs.
    """

    def __init__(self, tokenizer, need_to_shift= False, start_tok_id= None):
        """
        Args:
            tokenizer: object exposing ``pad_token_id``.
            need_to_shift: when True, decoder inputs are the targets shifted
                right; when False the targets are passed through unchanged.
            start_tok_id: id placed at decoder position 0 when shifting.
                Defaults to the tokenizer's pad token id.
        """
        self.tokenizer = tokenizer
        self.shift_right = need_to_shift
        # generally the same as the pad token id
        self.dec_start_tok_id = self.tokenizer.pad_token_id if start_tok_id is None else start_tok_id

    def __call__(self, ex):
        """Collate a list of examples into a model-ready batch dict."""
        src_ids = torch.stack([e['input_ids'] for e in ex])
        tar_ids = torch.stack([e['decoder_input_ids'] for e in ex])
        src_attn_mask = torch.stack([e['attention_mask'] for e in ex])

        # Labels for the loss: pad positions are ignored via -100.
        labels = tar_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # Decoder inputs: either the targets shifted right, or the raw targets.
        # NOTE(review): with need_to_shift=False the decoder sees the very
        # tokens it has to predict; confirm this is what the caller wants.
        if self.shift_right:
            decoder_ids = self.right_shift(tar_ids)
        else:
            decoder_ids = tar_ids

        return {
            "input_ids": src_ids,
            "attention_mask": src_attn_mask,
            "decoder_input_ids": decoder_ids,
            "labels": labels
        }

    def right_shift(self, input_ids):
        """Shift a (batch, seq) id tensor one position to the right.

        Position 0 receives the decoder start token and the last token of
        every row is dropped. Any -100 value is replaced by the tokenizer's
        pad token id so the result is a valid embedding index.

        The previous implementation wrapped the last non-pad token (EOS) to
        position 0 and used the start token id as the pad id; that ignored
        ``start_tok_id`` as a start token and indexed -1 on all-pad rows.
        """
        start_token_id = self.dec_start_tok_id
        assert start_token_id is not None, "self.model.config.pad_token_id has to be defined."

        shifted = input_ids.new_zeros(input_ids.shape)
        shifted[:, 1:] = input_ids[:, :-1].clone()
        shifted[:, 0] = start_token_id

        # replace possible -100 values (label ignore index) by the pad token id
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = start_token_id
        shifted.masked_fill_(shifted == -100, pad_token_id)
        return shifted



In [None]:
#main
import os, sys
# from utils import arg_parse
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    BartTokenizerFast,
    AutoModelForSeq2SeqLM,
    AutoConfig,
    Trainer,
    TrainingArguments,
    HfArgumentParser
)

# from dataset_fn import *
# from collate_fn import *


# Registry: model type -> data collator class.
COLLATE_DICT = {'t5': TPBDataCollator}

# Registry: "<kp_task_type>_<dataset_class>" -> callable that builds the dataset.
# (KPone2manyDataset is the alternative reader for the 'one2many_single' key.)
DATASET_DICT = {'one2many_single': load_kp_data_and_dataset_class}

# Placeholders that are not populated yet.
CONFIG_MAP = dict()
TOKENIZER_MAP = dict()
MODEL_MAP = dict()
# TODO
# modify the tokenizer in the main function if special tokens need to be added, and similar
# check that cross-attention is enabled in the decoder part of the model and that it works
# see whether special tokens, shifting or other requirements are needed ->>>> one model at a time
#   1. bart model
#   2. t5
#   3. pegasus
# add the <kp_sep> token to every tokenizer and keep the rest the same; qg has better logic
# token shifting in bart / t5 / pegasus
    # the t5 tokenizer generates tokens as required (there is a need to shift right), but bart and pegasus add cls/bos and sep/eos at the start and end, and they also shift automatically
    # for bart and pegasus, simply copying the target seq as labels and the target seq as decoder input could be tried, as these models automatically shift to the right
    # final: bart shifts the labels (i.e. the target seq) to the right if the decoder input ids passed are None, so only labels and input ids need to be passed. if you want, you can remove the [cls]/[sep] tokens as required
    # pegasus: same as bart
# encoder-decoder: look into shifting
    # cls can be used as bos and sep as eos: this is mentioned in the HF blogs
# how to leverage the seq2seq trainer or the trainer directly
    # trainer and seq2seq trainer seem to be the same thing; we can try them alternately and see which is best
    #

# add compute metrics

# add do predict and generate function option

def main_fn(args= None, training_args = None):
    """Fine-tune (or run prediction with) a seq2seq key-phrase model.

    Steps: load the tokenizer and add the ``<kp_sep>`` token, save the
    tokenizer under ``<output_dir>/kp_<model_name_path>_tokenizer``, load the
    model and resize its embeddings, build the datasets and collator, then
    either train or predict.

    Args:
        args: object with the attributes of ``BasicKPArgs``.
        training_args: training arguments handed to ``Trainer``; its
            ``output_dir`` is also used for saving the tokenizer.

    Returns:
        The result of ``trainer.predict`` on ``<data_dir>/test.txt`` when
        ``args.predict_only`` is set, otherwise ``None`` after training.

    Raises:
        NotImplementedError: for ``model_type == "enc_dec"``, which has no
            model construction yet.
        KeyError: if the model type or dataset key is not registered in
            ``COLLATE_DICT`` / ``DATASET_DICT``.
    """
    # Fail fast: the enc_dec branch never built a model, so the run used to
    # crash later on `None.resize_token_embeddings`.
    if args.model_type == "enc_dec":
        raise NotImplementedError(
            "model_type 'enc_dec' is not supported yet; use a seq2seq model type such as 't5'"
        )

    # load tokenizer: a custom tokenizer path wins over the model name/path
    if args.tokenizer_path is not None:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_path)
    tokenizer.add_tokens(['<kp_sep>'])

    # save tokenizer; makedirs also creates a missing output_dir and any
    # intermediate directories introduced by a model name containing "/"
    tok_path = training_args.output_dir+"/kp_{}_tokenizer".format(args.model_name_path)
    os.makedirs(tok_path, exist_ok=True)
    tokenizer.save_pretrained(tok_path)

    # load model
    if args.from_pretrained:
        model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_path)
    else:
        # build the architecture from its config only, i.e. with random weights
        config = AutoConfig.from_pretrained(args.model_name_path)
        model = AutoModelForSeq2SeqLM.from_config(config)

    # resize model embedding so the added <kp_sep> token has an embedding row
    model.resize_token_embeddings(len(tokenizer))

    dataset_factory = DATASET_DICT[args.kp_task_type+"_"+args.dataset_class]

    def build_dataset(file_name):
        # one dataset per split file inside data_dir
        return dataset_factory(
            tokenizer=tokenizer,
            file_path=args.data_dir + "/" + file_name,
            max_src_len=args.max_src_len,
            max_tar_len=args.max_tar_len,
        )

    # data collator
    data_collator = COLLATE_DICT[args.model_type](tokenizer=tokenizer, need_to_shift=True)

    if args.predict_only:
        # Prediction only: no training, and train/val files are not required.
        # NOTE(review): this returns the Trainer's forward-pass predictions,
        # not generated text; main_eval is the generation path.
        trainer = Trainer(model=model, args=training_args, data_collator=data_collator)
        return trainer.predict(build_dataset("test.txt"))

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=build_dataset("train.txt"),
        eval_dataset=build_dataset("val.txt"),
        #  compute_metrics= None, # metrics to compute scores,
    )
    trainer.train()


    



In [None]:
def runner():
    """Run main_fn for t5-base on one2many data with hard-coded /content paths."""
    kp_options = dict(
        model_type='t5',
        model_name_path='t5-base',  # todo
        data_dir="/content",  # todo
        kp_task_type="one2many",
        dataset_class='single',
    )
    trainer_options = dict(
        output_dir="/content/tk_out",  # todo
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=8,
        do_eval=True,
        evaluation_strategy="epoch",
        save_steps=1,
    )
    args = BasicKPArgs(**kp_options)
    training_args = TrainingArguments(**trainer_options)
    main_fn(args, training_args)

In [None]:
# Launch the training run with the settings hard-coded in runner().
runner()

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using custom data configuration default
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514)
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-0a0c845d87888c0a/0.0.0/70d89ed4db1394f028c651589f

Epoch,Training Loss,Validation Loss


RuntimeError: ignored

In [None]:
# 3 eval_kp.py
import os, sys
# from utils import arg_parse
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    BartTokenizerFast,
    AutoModelForSeq2SeqLM,
    AutoConfig,
    Trainer,
    TrainingArguments,
    HfArgumentParser
)

import torch


@dataclass
class EvalArgs:
    """Model, tokenizer, data and decoding options for the evaluation run.

    Every attribute is a dataclass ``field`` whose ``metadata["help"]`` entry
    carries its human-readable description.
    """

    # --- model / tokenizer selection ---
    model_type: Optional[str] = field(
        default="enc_dec",
        metadata={"help": "encoder decoder type or other generative model like Bart"},
    )
    model_name_path: Optional[str] = field(
        default=None,
        metadata={"help": "path or name to load pretrained model or from checkpoints"},
    )
    tokenizer_path: Optional[str] = field(
        default=None,
        metadata={"help": "path or name of custom tokenizer saved if provided this tokenizer will be loaded else auto tokenizer"},
    )

    # --- data location and task flavour ---
    data_dir: Optional[str] = field(
        default="",
        metadata={"help": "path to dir containg data"},
    )
    kp_task_type: Optional[str] = field(
        default="one2one",
        metadata={"help": "wether to use one2one or one2many"},
    )
    dataset_class: Optional[str] = field(
        default="single",
        metadata={"help": "single | multiple , type of dataset reader to use, split train data into mltiple train file or from single"},
    )

    # --- decoding and length limits ---
    beam_size: Optional[int] = field(
        default=4,
        metadata={"help": "beam_size"},
    )
    max_pre_len: Optional[int] = field(
        default=64,
        metadata={"help": "length of target seq"},
    )
    max_src_len: Optional[int] = field(
        default=512,
        metadata={"help": "length of source seq"},
    )

# Registry: model type -> data collator class (same mapping as in the training cell).
COLLATE_DICT = {'t5': TPBDataCollator}
def main_eval(args= None):
    """Generate key phrases for ``<data_dir>/test.txt`` and write them to disk.

    Loads the tokenizer and seq2seq model, tokenizes the test file, runs beam
    search generation batch by batch, and writes one decoded prediction per
    line to ``<data_dir>/prediction.txt``.

    Args:
        args: object with the attributes of ``EvalArgs``.

    Raises:
        NotImplementedError: for ``model_type == "enc_dec"``, which has no
            model construction yet.
        KeyError: if the model type or dataset key is not registered in
            ``COLLATE_DICT`` / ``DATASET_DICT``.
    """
    # Fail fast: the enc_dec branch never built a model, so the run used to
    # crash later on `None.to(device)`.
    if args.model_type == "enc_dec":
        raise NotImplementedError(
            "model_type 'enc_dec' is not supported yet; use a seq2seq model type such as 't5'"
        )

    # `is_available` must be *called*; the bare function object is always truthy,
    # which forced 'cuda' even on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("device ", device)

    if args.tokenizer_path is not None:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_path)
    tokenizer.add_tokens(['<kp_sep>'])

    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_name_path)

    data_collator = COLLATE_DICT[args.model_type](tokenizer=tokenizer, need_to_shift=True)
    test_data_set = DATASET_DICT[args.kp_task_type+"_"+args.dataset_class](
        tokenizer=tokenizer,
        file_path=os.path.join(args.data_dir, "test.txt"),
        max_src_len=args.max_src_len,
        max_tar_len=args.max_pre_len,
    )
    data_loader = torch.utils.data.DataLoader(test_data_set, batch_size=16, collate_fn=data_collator)

    model.to(device)
    model.eval()

    # os.path.join keeps the output inside data_dir whether or not it ends
    # with a separator (plain concatenation produced e.g. "/contentprediction.txt").
    out_path = os.path.join(args.data_dir, "prediction.txt")

    # The context manager closes the file even if generation raises.
    with open(out_path, 'w', encoding="utf-8") as out_writer, torch.no_grad():
        for ex in data_loader:
            generated = model.generate(
                input_ids=ex['input_ids'].to(device),
                attention_mask=ex['attention_mask'].to(device),
                num_beams=args.beam_size,
                max_length=args.max_pre_len
            )
            # keyword is `skip_special_tokens` (plural); the misspelt form did
            # not strip pad/eos tokens from the decoded text
            for op in generated:
                out_writer.write(tokenizer.decode(op, skip_special_tokens=True) + "\n")

    print("files written in dir {} as prediction.txt ".format(args.data_dir))







In [None]:
# Evaluation settings: fine-tuned checkpoint plus the tokenizer saved during training.
args = EvalArgs(
    model_type='t5',
    model_name_path="/content/tk_out/checkpoint-6",
    tokenizer_path="/content/tk_out/kp_t5-base_tokenizer/",
    data_dir="/content/",  # todo
    kp_task_type="one2many",
    dataset_class='single',
    beam_size=4,
    max_pre_len=64,
)
main_eval(args)