In [1]:
%env WANDB_PROJECT=bart-summarizer

env: WANDB_PROJECT=bart-summarizer


In [2]:
# !pip install transformers datasets evaluate rouge_score wandb ipywidgets==7.7.1 huggingface_hub

## Dataset

In [3]:
# !curl -LO https://gist.github.com/noppakorn/627605aa61afa4a8f080f9996d0b10dd/raw/66489226825261ceb08aa02f064c15b45dc6a887/rate-my-prof-raw-dataset.csv
# !curl -LO https://gist.githubusercontent.com/noppakorn/8d4a7be866d18cd7dec5ada7aa8d8d0d/raw/f1114e613f302de830500cf4dd2701d661c7eba4/summarized_dataset.csv

## Logger Login

In [4]:
# !wandb login

In [5]:
# from huggingface_hub import notebook_login

# notebook_login()

## Import and Seeds

In [6]:
seed = 42 # The answer to everything

import torch
torch.manual_seed(seed)
torch.use_deterministic_algorithms(True)

import random
random.seed(seed)

import numpy as np
np.random.seed(seed)


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)

import transformers
transformers.enable_full_determinism(seed)

In [7]:
import re
import pandas as pd

import torchtext

from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

from datasets import load_dataset

import evaluate

## Prepare the data

In [8]:
checkpoint = "sshleifer/distilbart-cnn-12-6"
# checkpoint = "facebook/bart-large-cnn"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def preprocess_function(data):
    inputs = [doc for doc in data["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=data["summary"], max_length=70, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
data = load_dataset("csv", data_files="summarized_dataset.csv")
data = data["train"].train_test_split(test_size=0.1)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-99c532b369dc8738/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-99c532b369dc8738/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Training

In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir="distilbart-cnn-12-6-rate-prof",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=4,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to="wandb",
)

In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/noppakornj/distilbart-cnn-12-6-rate-prof into local empty directory.


In [18]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnoppakorn[0m ([33mmeen[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.153038,0.2844,0.0943,0.204,0.2027,67.8
2,No log,1.094786,0.2614,0.0498,0.1672,0.168,67.8
3,No log,1.079701,0.3042,0.0983,0.2068,0.2082,66.6667
4,No log,1.080752,0.2932,0.0914,0.2012,0.2024,67.1333
5,No log,1.092212,0.3041,0.1196,0.2229,0.2241,66.9333


TrainOutput(global_step=340, training_loss=0.7534892362706801, metrics={'train_runtime': 464.4899, 'train_samples_per_second': 2.906, 'train_steps_per_second': 0.732, 'total_flos': 2088453869518848.0, 'train_loss': 0.7534892362706801, 'epoch': 5.0})

In [22]:
trainer.push_to_hub() # Save result to hugging face hub

Upload file pytorch_model.bin:   0%|          | 1.00/1.14G [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.68k [00:00<?, ?B/s]

To https://huggingface.co/noppakornj/distilbart-cnn-12-6-rate-prof
   e517f5c..979ed83  main -> main

   e517f5c..979ed83  main -> main

To https://huggingface.co/noppakornj/distilbart-cnn-12-6-rate-prof
   979ed83..6371c71  main -> main

   979ed83..6371c71  main -> main



'https://huggingface.co/noppakornj/distilbart-cnn-12-6-rate-prof/commit/979ed831c715f93ef73ef003adc8b9ca57766787'

## Predictions

In [19]:
pred_result = trainer.predict(test_dataset=tokenized_data["test"])

In [20]:
pred_decode = tokenizer.batch_decode(pred_result[0], skip_special_tokens=True)

In [31]:
idx = 7

print("Professor id", data["test"]["id"][idx])

print("Target: ", data["test"]["summary"][idx])
print()
print("Generated: ", pred_decode[idx])

Professor id 636
Target:  He can be hard to understand at some points, but he seems to know what he is talking about and is good at teaching the information. Sit close to the front as he is soft spoken. The lab assignments are easy. Projects are mostly incomprehensible.

Generated:  VERY hard to understand. Easy class but I didn't learn a thing. Very difficult to undserstand over his heavy accent. Class is pretty difficult if you dont know computer lingo. It's basically like taking a language class and the professor doesnt help much. 


In [57]:
pred_dict = [{"id": i, "target": j, "generated": k, } for i, j, k in zip(data["test"]["id"], data["test"]["summary"], pred_decode)]

pred_df = pd.DataFrame.from_dict(pred_dict)

rouge_pred = pred_df.apply(lambda x: rouge.compute(predictions=[x["generated"]], references=[x["target"]], use_stemmer=True), axis=1).apply(pd.Series)

pred_df = pred_df.join(rouge_pred)

In [58]:
pred_df

Unnamed: 0,id,target,generated,rouge1,rouge2,rougeL,rougeLsum
0,593,She is such a great person and she has so much...,"Pat rocks! She's a really great teacher, very ...",0.252252,0.018349,0.162162,0.162162
1,668,Dr. Karplus curves the class generously only b...,Andy definitely cares about his students succe...,0.285714,0.038835,0.133333,0.133333
2,202,"Great professor! A bit disorganized, but also ...","Great professor! A bit disorganized, but also ...",0.769231,0.764045,0.769231,0.769231
3,583,His lectures are aimless and unclear a lot of ...,He seems like a really nice guy who is obvious...,0.297521,0.033613,0.132231,0.132231
4,413,By far my favorite professor at UD. Gives grea...,"Awesome dude, well-versed, and genuinely cares...",0.359551,0.183908,0.292135,0.292135
5,137,I was looking forward to this class and was gr...,Worst professor I have ever had. Doesn't liste...,0.285714,0.017094,0.184874,0.184874
6,332,Quite possibly the worst excuse for a teacher ...,The man is the worst professor I have ever had...,0.307692,0.034783,0.17094,0.17094
7,636,"He can be hard to understand at some points, b...",VERY hard to understand. Easy class but I didn...,0.222222,0.045455,0.177778,0.177778
8,110,Dr. Smith is extremely knowledgeable and enthu...,The best professor by far! He knows his stuff ...,0.299213,0.032,0.141732,0.141732
9,586,Great Professor and an easy A. this class is s...,"Eugene was awesome. No textbook, no tests. Its...",0.19802,0.0,0.118812,0.118812


In [60]:
pred_df.to_csv("bart-summarizer-predictions.csv", encoding="utf-8")