In [1]:
!pip install transformers rouge-score nltk blurr datasets py7zr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.9 MB/s 
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Collecting blurr
  Downloading blurr-0.4.1-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 86.5 MB/s 
[?25hCollecting py7zr
  Downloading py7zr-0.20.2-py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 84.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11

In [2]:
from transformers import BartTokenizer, AutoModelForSeq2SeqLM, \
Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, \
TFBartForConditionalGeneration, AutoTokenizer
import pandas as pd
import numpy as np
from google.colab import drive
from datasets import load_dataset, load_metric, load_from_disk
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
import sklearn
import rouge_score
import nltk
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# @title Initialize 
## Home directory of drive
HOME = "/content/gdrive/My Drive/Colab Notebooks/"

In [4]:
# @title Read data
X_train = pd.read_csv(HOME+"Notebooks/Data/X_train.csv")
y_train = pd.read_csv(HOME+"Notebooks/Data/y_train.csv")
X_train['label'] = y_train['titles']
X_train.to_csv(HOME+"Notebooks/Data/train.csv", index=False)

X_train.sample(n=5000, random_state=923).to_csv(HOME+"Notebooks/Data/train_sampled.csv", index=False)
X_train.sample(n=2000, random_state=1218).to_csv(HOME+"Notebooks/Data/val_sampled.csv", index=False)

In [5]:
train = load_dataset('csv', data_files={'train': [HOME+"Notebooks/Data/train_sampled.csv"],
                                        'validation':[HOME+"Notebooks/Data/val_sampled.csv"]})



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4a5e720550e05383/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4a5e720550e05383/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# @title Load model
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')

Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

 [Good source for fine-tuning BART](https://github.com/AldoF95/bart-chat-summarizer-finetuning/blob/main/Bart_large_xsum_fine_tuned_samsum.ipynb)

In [7]:
# @title Procressing the data
max_input = 512
max_target = 56

def preprocess_data(data_to_process):
  #get the dialogue text
  inputs = [x for x in data_to_process['abstracts']]
  #tokenize text
  model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

  #tokenize labels
  with tokenizer.as_target_tokenizer():
    targets = tokenizer(data_to_process['label'], max_length=max_target, padding='max_length', truncation=True)
    
  model_inputs['labels'] = targets['input_ids']
  #reuturns input_ids, attention_masks, labels
  return model_inputs

In [8]:
tokenize_data = train.map(preprocess_data, batched = True, remove_columns=['terms', 'abstracts', 'label'])

  0%|          | 0/5 [00:00<?, ?ba/s]



  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
metric = load_metric('rouge')

  metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [10]:
#####################
# metrics
# compute rouge for evaluation 
#####################

def compute_rouge(pred):
  predictions, labels = pred
  #decode the predictions
  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #decode labels
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #compute results
  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  #get %
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  res['gen_len'] = np.mean(pred_lens)

  return {k: round(v, 4) for k, v in res.items()}

In [11]:
args = Seq2SeqTrainingArguments(
    HOME+"Notebooks/Outputs/bart_finetuned_new", #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=1
    )

collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [12]:
trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 3750
  Number of trainable parameters = 406291456
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.6322,0.548273,51.0874,31.5961,45.7446,45.7555,14.9885
2,0.4789,0.51048,52.6191,32.2157,46.1827,46.1291,16.736
3,0.3952,0.504065,52.7945,32.65,46.7693,46.7586,16.083


Saving model checkpoint to /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-500
Configuration saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-500/config.json
Model weights saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-1000
Configuration saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Outputs/bart_finetuned_new/checkpoint-1000/config.json
Model weights saved in /content/gdrive/My Drive/Colab Notebooks/Notebooks/Ou

TrainOutput(global_step=3750, training_loss=0.7119291544596354, metrics={'train_runtime': 2757.1926, 'train_samples_per_second': 5.44, 'train_steps_per_second': 1.36, 'total_flos': 1.625328451584e+16, 'train_loss': 0.7119291544596354, 'epoch': 3.0})