In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import json
from tqdm import tqdm
import torch.nn.functional as F
import torch

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [2]:
# Hugging Face Hub checkpoint id. The tokenizer and the model must come from the
# same checkpoint, so the id is defined once and reused for both loads.
MODEL_NAME = "biu-nlp/led-base-controlled-text-reduction"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [26]:
# Path of the JSONL test file: one JSON object per line.
indir_jsonl = r"/home/nlp/sloboda1/controlled_reduction/DL_approach/paper_experiments/cleaning_training_data_experiment/data/test__highlights_duc_full_with_doc_id_quark.jsonl"

# Iterate over the file handle directly instead of materialising every line with
# readlines(); decode explicitly as UTF-8 so the result does not depend on the
# platform default; skip blank lines (e.g. a trailing newline at end of file),
# which json.loads would otherwise reject.
with open(indir_jsonl, 'r', encoding='utf-8') as f1:
    data = [json.loads(line) for line in f1 if line.strip()]

# Display the first record as a quick sanity check of the loaded data.
data[0]


{'input': ' <highlight_start>Debi Thomas\' dream of Olympic gold turned into disappointment Saturday as East Germany\'s Katarina Witt won her second straight Olympic championship and Canadian Elizabeth Manley took home the silver<highlight_end> before a crowd of cheering countrymen.\n"It\'s over.\nBack to school," said <highlight_start>Thomas<highlight_end>, who <highlight_start>won the bronze<highlight_end> medal <highlight_start>despite three faulty landings<highlight_end>.\n"I\'m not going to make any excuses.\nI was really skating well this week.\nIt wasn\'t supposed to happen, I guess.\nBut I tried."\nWhile the top two skaters in the world staged a shootout to music from Bizet\'s "Carmen," Manley was so sensational in the freestyle that she finished first with seven judges.\nCombined with a fourth in the compulsory figures and a third-place finish in the short program earlier in the week, the performance put Manley in second place.\nWitt, a three-time world champion from East Germ

### Get Inputs

In [27]:
def get_inputs(inputs, tokenizer):
    """Tokenize a batch of texts and build the masks used for generation.

    Args:
        inputs: Batch of input texts, passed unchanged to ``tokenizer``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` (one list per input), and exposing
            ``additional_special_tokens_ids``.

    Returns:
        A tuple ``(input_ids, attention_mask, global_attention_mask)`` of
        tensors. ``global_attention_mask`` is 1 at the first position of every
        sequence and at every token whose id is one of the tokenizer's
        additional special tokens (the highlight markers), and 0 elsewhere.
    """
    # Pad to the longest sequence in the batch; truncate at 4096 tokens.
    encodings_dict = tokenizer(inputs, max_length=4096, padding=True, truncation=True)
    input_ids = torch.as_tensor(encodings_dict['input_ids'])
    attention_mask = torch.as_tensor(encodings_dict['attention_mask'])

    # Loop-invariant: fetch the special-token ids once and keep them in a set
    # so the per-token membership test is O(1).
    ids_with_global_attention = set(tokenizer.additional_special_tokens_ids)

    # Global attention on the first token and on every highlight token.
    # Folding the "first position" rule into the comprehension also avoids an
    # IndexError on an empty sequence.
    global_attention_mask = torch.as_tensor([
        [
            1 if (position == 0 or input_id in ids_with_global_attention) else 0
            for position, input_id in enumerate(input_ids_instance)
        ]
        for input_ids_instance in encodings_dict['input_ids']
    ])

    return input_ids, attention_mask, global_attention_mask

### Get Reductions

In [39]:
# Decoding settings. generate_reductions reads these module-level names and
# forwards each one to model.generate() under the lower-case keyword noted below.
# The trailing "@param" markers are notebook form annotations; keep each one on
# the same line as its assignment.
DO_SAMPLE = True  # @param {type:"boolean"}  -> do_sample
TOP_P = 0.9  # @param {type:"number"}  -> top_p
MAX_LENGTH = 100  # @param {type:"number"}  -> max_length
NUM_BEAMS = 2  # @param {type:"number"}  -> num_beams
LENGTH_PENALTY = 2.0  # @param {type:"number"}  -> length_penalty
NO_REPEAT_NGRAMS_SIZE = 3  # @param {type:"number"}  -> no_repeat_ngram_size

In [43]:
def generate_reductions(full_data, tokenizer, model, batch_size):
    """Generate one output text per sample, running the model batch by batch.

    Args:
        full_data: Sequence of samples; each sample's ``"input"`` entry is the
            text fed to the model.
        tokenizer: Tokenizer used by ``get_inputs`` to encode each batch and
            here to decode the generated ids.
        model: Model exposing ``eval()`` and ``generate()``.
        batch_size: Positive number of samples per ``generate()`` call.

    Returns:
        List of decoded output strings, in the same order as ``full_data``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    generated_reductions = list()

    # Decoding settings are identical for every batch, so build them once.
    generation_kwargs = {
        "max_length": MAX_LENGTH,
        "num_beams": NUM_BEAMS,
        "top_p": TOP_P,
        "do_sample": DO_SAMPLE,
        "length_penalty": LENGTH_PENALTY,
        "no_repeat_ngram_size": NO_REPEAT_NGRAMS_SIZE,
    }

    # Switching to inference mode is loop-invariant; do it once up front.
    model.eval()

    # Step through the data by batch start offset. Unlike
    # range(int(len / batch_size) + 1), this never produces a trailing empty
    # batch when len(full_data) is an exact multiple of batch_size (or zero).
    for batch_start in tqdm(range(0, len(full_data), batch_size)):

        curr_batch = full_data[batch_start:batch_start + batch_size]

        inputs = [sample["input"] for sample in curr_batch]
        input_ids, attention_mask, global_attention_mask = get_inputs(inputs, tokenizer)

        model_kwargs = dict(generation_kwargs, attention_mask=attention_mask)

        # Identity check: comparing a tensor to None with != is not the
        # intended test here.
        if global_attention_mask is not None:
            model_kwargs["global_attention_mask"] = global_attention_mask

        with torch.no_grad():
            # NOTE(review): only `outputs.sequences` is consumed below; the
            # per-step scores requested via output_scores=True are unused here.
            outputs = model.generate(
                    input_ids,
                    **model_kwargs,
                    return_dict_in_generate=True,
                    output_attentions=False,
                    output_hidden_states=False,
                    output_scores=True
            )

        output_ids = outputs.sequences
        output_texts = [tokenizer.decode(output_ids_instance, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                        for output_ids_instance in output_ids]
        generated_reductions.extend(output_texts)
    return generated_reductions

In [44]:
# Run generation over every loaded record, five samples per generate() call.
# NOTE(review): despite its name, `embeddings` receives the list of decoded
# output strings returned by generate_reductions.
batch_size = 5
embeddings = generate_reductions(
    full_data=data,
    tokenizer=tokenizer,
    model=model,
    batch_size=batch_size,
)

  6%|▌         | 4/72 [02:54<49:22, 43.57s/it]


KeyboardInterrupt: 

# Notes:
1. Make sure every highlighted span in the input text is wrapped in the literal tokens `<highlight_start>` and `<highlight_end>`.