In [None]:
%env WANDB_PROJECT=bart-summarizer

env: WANDB_PROJECT=bart-summarizer


In [None]:
# !pip install transformers datasets evaluate rouge_score wandb

In [5]:
# !curl -LO https://gist.github.com/noppakorn/627605aa61afa4a8f080f9996d0b10dd/raw/66489226825261ceb08aa02f064c15b45dc6a887/rate-my-prof-raw-dataset.csv
# !curl -LO https://gist.githubusercontent.com/noppakorn/8d4a7be866d18cd7dec5ada7aa8d8d0d/raw/f1114e613f302de830500cf4dd2701d661c7eba4/summarized_dataset.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 14.1M  100 14.1M    0     0  28.9M      0 --:--:-- --:--:-- --:--:-- 28.9M


In [4]:
# !wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import re
import pandas as pd
import numpy as np

import torch
import torchtext

from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

from datasets import load_dataset

import evaluate

In [None]:
checkpoint = "sshleifer/distilbart-cnn-12-6"
# checkpoint = "facebook/bart-large-cnn"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def preprocess_function(data):
    inputs = [doc for doc in data["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=data["summary"], max_length=70, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
data = load_dataset("csv", data_files="summarized_dataset.csv")
data = data["train"].train_test_split(test_size=0.1)



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="distilbart-cnn-12-6-rate-prof",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=9,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="wandb",
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnoppakorn[0m ([33mmeen[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.752418,0.3632,0.1564,0.2656,0.2655,64.5333
2,No log,0.736248,0.3291,0.1327,0.2372,0.2365,64.4333
3,No log,0.770152,0.2836,0.0716,0.1861,0.1871,64.6667
4,No log,0.835876,0.3337,0.1259,0.2346,0.2339,64.2667
5,No log,0.883963,0.3102,0.0996,0.2035,0.2039,64.7
6,No log,1.000014,0.2758,0.0849,0.1807,0.1809,65.8
7,No log,1.024567,0.3118,0.1041,0.1975,0.1978,63.6333
8,0.446800,1.07878,0.3095,0.1096,0.202,0.2028,68.3333
9,0.446800,1.13285,0.2999,0.0996,0.1952,0.1944,68.1
10,0.446800,1.134354,0.3151,0.1023,0.204,0.2052,68.6333


TrainOutput(global_step=680, training_loss=0.3496218667310827, metrics={'train_runtime': 651.6428, 'train_samples_per_second': 4.143, 'train_steps_per_second': 1.044, 'total_flos': 4178881931599872.0, 'train_loss': 0.3496218667310827, 'epoch': 10.0})