In [1]:
# This notebook reuses the Hugging Face summarization example for training:
# https://github.com/huggingface/notebooks/blob/master/examples/summarization.ipynb

In [1]:
import transformers
import os
import pandas as pd
import datasets
# Expose only GPU index 5 to this process.
# NOTE(review): machine-specific value — adjust (or remove) on other hosts.
os.environ["CUDA_VISIBLE_DEVICES"]="5"
import pickle
# Record the transformers version this notebook was run with.
print(transformers.__version__)

4.21.2


## Loading a model

In [5]:
# Checkpoint to fine-tune; other candidate checkpoints are kept commented out.
#model_checkpoint = "t5-base"
# model_checkpoint = "Michau/t5-base-en-generate-headline"
#model_checkpoint = "facebook/bart-large-cnn"
# model_checkpoint = "facebook/bart-large"
model_checkpoint = 'microsoft/prophetnet-large-uncased-cnndm'
# Suffix spliced into the training-file path, the tokenized-dataset cache file name and
# the training output directory. Either '' or '_small' (presumably the reduced training set).
small = '' #'_small' ''

## Loading the dataset

In [6]:
from datasets import load_dataset, load_metric

# raw_datasets = load_dataset("xsum")
# print(raw_datasets)


# test_add = '../Datasets/NewsRoomorg/data_test2_small.csv'
# train_add = '../Datasets/NewsRoomorg/data_train2_small.csv'
# valid_add = '../Datasets/NewsRoomorg/data_valid2_small.csv'
#temp = pd.read_csv(test_add)

# Data files later passed to load_dataset('pandas', ...) as the test/train/valid splits.
# Only the training path follows the `small` suffix; the test and validation paths always
# point at the `_small` files. NOTE(review): confirm this asymmetry is intentional.
# NOTE(review): the files carry a `.py` extension but are read by the 'pandas' loader —
# presumably they are pickled DataFrames; verify.
test_add = '../Datasets/NewsRoomorg/data_test2_small.py'
train_add = f'../Datasets/NewsRoomorg/data_train2{small}.py'
valid_add = '../Datasets/NewsRoomorg/data_valid2_small.py'

In [7]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row indices, and
            exposing a ``features`` mapping of column name to feature type.
        num_examples: number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no draw-and-retry loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # Show class names instead of integer ids for ClassLabel columns.
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

## Preprocessing the data

In [8]:
from transformers import AutoTokenizer
    
# Tokenizer for the selected checkpoint; used by preprocess_function, the data collator,
# compute_metrics and the trainer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Text prepended to every input document before tokenization (see preprocess_function).
# Fixed the "t5-larg" typo so the t5-large checkpoint is recognised by the T5 branch.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "headline: "
else:
    # NOTE(review): identical to the T5 branch, so non-T5 checkpoints also get the
    # prefix — confirm this is intended.
    prefix = "headline: "

In [10]:
# Truncation limits (in tokens) for the source documents and the target titles.
# NOTE(review): 1024 may exceed the positional limit of some checkpoints — confirm
# against the selected model's configuration.
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of documents and their titles for seq2seq training.

    Every entry of ``examples["text"]`` is prepended with the module-level ``prefix`` and
    tokenized with truncation at ``max_input_length``. ``examples["title"]`` is tokenized
    inside ``tokenizer.as_target_tokenizer()`` with truncation at ``max_target_length``,
    and the resulting ``input_ids`` are stored under the ``"labels"`` key.

    Returns:
        The tokenizer output for the documents, extended with ``"labels"``.
    """
    prefixed_docs = []
    for document in examples["text"]:
        prefixed_docs.append(prefix + document)
    model_inputs = tokenizer(prefixed_docs, truncation=True, max_length=max_input_length)

    # Titles are encoded with the tokenizer switched to target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["title"],
            truncation=True,
            max_length=max_target_length,
        )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [14]:
# ROUGE metric object; compute_metrics calls metric.compute(...) on the decoded text.
metric = load_metric("rouge")

In [None]:
# Tokenizing is cached on disk as a pickle: load it when present, otherwise build and store it.
# NOTE(review): the cache file name depends only on `small`, not on `model_checkpoint`, so a
# cache written with a different tokenizer is loaded as-is — delete it when switching checkpoints.
tokenized_cache_path = f'./PData/tokenized_datasets{small}.pickle'
if os.path.isfile(tokenized_cache_path):
    print('datasets exist, trying to load the dataset')
    # pickle.load can execute arbitrary code; only load cache files created by this notebook.
    with open(tokenized_cache_path, 'rb') as cache_file:
        tokenized_datasets = pickle.load(cache_file)
else:
    print('dataset does not exist, loading and tokenizing the dataset')
    # Create the cache directory up front so a missing folder fails before the slow work.
    os.makedirs(os.path.dirname(tokenized_cache_path), exist_ok=True)
    raw_datasets = load_dataset('pandas',
                             data_files={
                                 'train':train_add,
                                 'valid':valid_add,
                                 'test':test_add})
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
    with open(tokenized_cache_path, 'wb') as cache_file:
        pickle.dump(tokenized_datasets, cache_file)

In [25]:
# Reload the tokenized datasets straight from the on-disk cache.
# The context manager closes the file handle, which the bare open() call left dangling.
with open(f'./PData/tokenized_datasets{small}.pickle', 'rb') as cache_file:
    tokenized_datasets = pickle.load(cache_file)

In [24]:
# Second name bound to the same object (no copy is made).
# NOTE(review): not referenced anywhere else in the visible notebook.
tokenized_datasets0 = tokenized_datasets

In [16]:
# tokenized_datasets.save_to_disk('./PData/tokenized_datasets')
# pickle.dump(tokenized_datasets, open('./PData/tokenized_datasets.pickle','wb'))

In [17]:
# tokenized_datasets = pickle.load(open('./PData/tokenized_datasets.pickle','rb'))

## Fine-tuning the model

In [26]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model for the selected checkpoint; it is handed to the data collator
# and the trainer below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [27]:
# Per-device batch size, shared by training and evaluation.
batch_size = 16
# Last path component of the checkpoint id (the part after the final "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]
run_output_dir = 'results/models/' + f"{model_name}-NewsRoom{small}"

args = Seq2SeqTrainingArguments(
    # Output locations.
    output_dir=run_output_dir,
    logging_dir='results/logs',
    # Evaluate and checkpoint once per epoch, keeping at most three checkpoints.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation settings.
    predict_with_generate=True,
    #push_to_hub=True,
)

In [21]:
# Sanity check: display the save strategy stored on the training arguments.
args.save_strategy

<IntervalStrategy.EPOCH: 'epoch'>

In [28]:
# Batch collator built from the tokenizer and the model; passed to the trainer as data_collator.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [29]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``predictions`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid f-measure
        multiplied by 100, plus ``"gen_len"`` (mean number of non-pad prediction tokens);
        all values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the generated ids; keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding/ignore marker, not a token id, and cannot be decoded.
    # Replace it with the pad token in both predictions and labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid f-measure of each score, expressed as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens are not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, data and metric function into the trainer.
# Training uses the "train" split; per-epoch evaluation uses the "valid" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()