In [None]:
# !nvidia-smi
# !pip install --upgrade accelerate
# !pip uninstall -y transformers accelerate
# !pip install transformers accelerate
# !pip install seaborn

In [None]:
# ----------------------------------------------------------------
# Imports
# ----------------------------------------------------------------
import os

from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_metric, load_from_disk
import matplotlib.pyplot as plt, seaborn as sns, pandas as pd, numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch

# Download NLTK's 'punkt' resource through the NLTK downloader.
# NOTE(review): presumably required by `sent_tokenize` imported above — confirm.
nltk.download('punkt')

In [None]:
# Device selection
# ----------------------------------------------------------------
# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Model and tokenizer
# ----------------------------------------------------------------
# Both objects are loaded from the same checkpoint identifier.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the seq2seq weights first, then move the model onto the selected device.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

In [None]:
#!pip install patool


In [None]:
# # Download and Unzip the data
# # ----------------------------------------------------------------
# import patoolib
# data_zip_file = r"G:\My Drive\Study\Data\Project - Text Summarizer (summarizer-data).zip"
# # !unzip -q -o "G:\My Drive\Study\Data\Project - Text Summarizer (summarizer-data).zip"

# patoolib.extract_archive(data_zip_file, outdir=r"G:\My Drive\Study\Data\Project - Text Summarizer (data)")

In [None]:
# Location of the extracted project data. The SUMMARIZER_DATA_DIR environment
# variable overrides it so the notebook can run on another machine; when the
# variable is not set, the original local path is used.
DATA_DIR = os.environ.get(
    "SUMMARIZER_DATA_DIR",
    r"G:\My Drive\Study\Data\Project - Text Summarizer (data)",
)

# Load the dataset previously saved to disk; the bare name on the last line
# displays its structure as the cell output.
dataset_samsum = load_from_disk(os.path.join(DATA_DIR, "samsum_dataset"))
dataset_samsum

In [None]:
# Quick look at the data
# ------------------------------------------------------------------------------
# Number of rows in each split, in the dataset's own split order.
split_lengths = [len(dataset_samsum[name]) for name in dataset_samsum]
print("Split length: ", split_lengths)
print("Features: ", dataset_samsum['train'].column_names)

# Print one test example (index 1): first its dialogue, then its reference summary,
# each under a ruled heading.
divider = "----------------------------------------"
sample = dataset_samsum['test'][1]
for heading, field in (("Dialogue:", 'dialogue'), ("Summary: ", 'summary')):
    print()
    print(divider)
    print(heading)
    print(divider)
    print(sample[field])

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into model features.

    Parameters
    ----------
    example_batch : mapping
        Batch with a 'dialogue' entry (source texts) and a 'summary' entry
        (target texts).

    Returns
    -------
    dict
        'input_ids' and 'attention_mask' taken from the tokenized dialogues,
        and 'labels' holding the token ids of the tokenized summaries.
    """
    # Truncation has to be enabled for max_length to take effect; otherwise
    # dialogues longer than 1024 tokens would pass through uncut.
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # Summaries are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [None]:
# Apply the feature conversion to the dataset in batched mode, then display
# the processed train split as the cell output.
dataset_samsum_pt = dataset_samsum.map(
    convert_examples_to_features,
    batched=True,
)
dataset_samsum_pt['train']

---------------------

Three more columns (`input_ids`, `attention_mask`, and `labels`) have been added to the dataset.

---------------------

In [None]:
# Training setup
# ---------------------

from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Batch collator that is handed to the Trainer; it is built from both the
# tokenizer and the model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_pegasus)

In [None]:
# Defining the Training Arguments
# ------------------------------------------------------------------
# One epoch with a per-device batch size of 1 and 16 gradient-accumulation
# steps; metrics are logged every 10 steps and evaluation is scheduled every
# 500 steps.
# NOTE(review): if the training split yields fewer than 500 optimizer steps,
# warmup never completes and the step-based evaluation never fires — confirm
# against the split size.
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    # Step counts are integers, so the value is written as an int rather than
    # the float 1e6. Presumably chosen this large so that no intermediate
    # checkpoint is written — confirm.
    save_steps=1_000_000,
    gradient_accumulation_steps=16,
)

# Defining the Trainer
# -----------------------------------------------------------------
# NOTE(review): training deliberately uses the 'test' split because the train
# split is large; any later score computed on 'test' would then be measured on
# data the model was trained on — confirm this is acceptable.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt['test'],  # Since here train size is huge
    eval_dataset=dataset_samsum_pt['validation'],
)

In [None]:
# Training Begins
# -----------------------------------------------------------------
# Run training with the `trainer` configured above. Because the call is the
# cell's last expression, its return value is shown as the cell output.
trainer.train()