In [2]:
!pip install transformers rouge-score nltk datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 29.8 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 65.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.0 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 77.6 MB/s 
[?2

In [3]:
from torch import nn, Tensor, multiply
from transformers import BartTokenizer, AutoModelForSeq2SeqLM, \
Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, \
TFBartForConditionalGeneration, AutoTokenizer
import pandas as pd
import numpy as np
from google.colab import drive
from datasets import load_dataset, load_metric, load_from_disk
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
import sklearn
import rouge_score
import nltk
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# @title Initialize 
## Home directory of drive
HOME = "/content/gdrive/My Drive/Colab Notebooks/"

In [5]:
# @title Read data
train = load_dataset('csv', data_files={'train': [HOME+"Notebooks/Data/train_sampled.csv"],
                                        'validation':[HOME+"Notebooks/Data/val_sampled.csv"]})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ce025a846814088e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ce025a846814088e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')

# @title Procressing the data
max_input = 512
max_target = 56

def preprocess_data(data_to_process):
  #get the dialogue text
  inputs = [x for x in data_to_process['abstracts']]
  #tokenize text
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  #tokenize labels
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['label'], max_length=max_target, padding='max_length', truncation=True)
    
  model_inputs['labels'] = targets['input_ids']
  #reuturns input_ids, attention_masks, labels
  return model_inputs

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
tokenize_data = train.map(preprocess_data, batched = True, remove_columns=['terms', 'abstracts', 'label'])

  0%|          | 0/5 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [7]:
metric = load_metric('rouge')
def compute_rouge(pred):
  predictions, labels = pred
  #decode the predictions
  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #decode labels
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #compute results
  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  #get %
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}

  metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [8]:
args = Seq2SeqTrainingArguments(
    HOME+"Notebooks/Outputs/pointer_g_finetuned_new", #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=1
    )

class PointerGenerater(nn.Module):
  def __init__(self):
    self.finetunedbart = AutoModelForSeq2SeqLM.from_pretrained(HOME+'Notebooks/Outputs/bart_finetuned/checkpoint-1500/')
    ## Define new layers
    self.linear1 = nn.Linear(1024, 1)
    self.linear2 = nn.Linear(1024, 1)
    self.linear3 = nn.Linear(1024, 1)

    self.sum1 = Tensor.sum()
    self.activation1 = nn.Sigmoid()

  def forward(self, ids, mask):
    model_raw_output = self.finetunedbart.forward(ids)

    linear1_output = self.linear1(model_raw_output.encoder_last_hidden_state)
    linear2_output = self.linear1(model_raw_output.decoder_hidden_states[-1])
    linear3_output = self.linear1(model_raw_output.decoder_hidden_states[0])
    combined_output = self.sum(linear1_output, linear2_output, linear3_output)

    cross_attention = model_raw_output.cross_attentions
    p_gen = self.activation1(combined_output)

    # output of the pre-trained model
    vocab_dist = model_raw_output.logits
    # scale output
    scaled_vocab_dist = multiply(p_gen, vocab_dist)
    # scale cross attention
    scaled_cross_attention = multiply(1-p_gen, cross_attention)
    # final combined
    final_combined = Tensor.sum(scaled_vocab_dist,scaled_cross_attention)

    return final_combined

collator = DataCollatorForSeq2Seq(tokenizer, model=PointerGenerater)

In [9]:
trainer = Seq2SeqTrainer(
    PointerGenerater(), 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

AttributeError: ignored