# Abstract
This notebook fine-tunes the t5-small model checkpoint on a small subset of Reviews.csv. 
This is related to the abstractive-text-summarization-project notebook, but I kept it as a separate notebook because its dependencies and requirements are substantially different: it uses PyTorch and HuggingFace transformers, and it requires CUDA acceleration to run.

# Dependencies

In [None]:
!pip install datasets transformers rouge-score nltk torch -q
import numpy as np
import pandas as pd
import torch
import transformers
import datasets
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Importing the data

In [None]:
def processed_dataset(*, path, nrows, train_split, test_split=0.1, test_val_split=0.5, seed=None):
    """
    Creates a HF DatasetDict from a csv file and splits it into train,
    validation and test sets using the given parameters.

    Parameters
    ----------
    path : path of the csv file; it must contain 'Text' and 'Summary' columns.
    nrows : number of rows to read from the top of the file.
    train_split : forwarded as `train_size` to `Dataset.train_test_split`.
    test_split : forwarded as `test_size` to `Dataset.train_test_split`; this
        held-out portion is then divided again into validation and test.
    test_val_split : share of the held-out portion that becomes the 'test'
        split; the remainder becomes 'valid'.
    seed : optional seed forwarded to both splits so that they are
        reproducible. The default (None) keeps the splits random.

    Returns
    -------
    DatasetDict with the keys 'train', 'test' and 'valid'. Every split has the
    lowercased columns 'document' and 'summary' plus an 'id' column holding the
    row label the example had in the csv file.
    """
    df = pd.read_csv(path, nrows=nrows)

    # Keep only properties of interest, under the names used downstream.
    # rename() without inplace avoids mutating a slice of the original frame.
    df = df[['Text', 'Summary']].rename(columns={"Summary": "summary", "Text": "document"})

    # drop_duplicates() and dropna() return new frames, so the result has to be
    # assigned back (a bare `df.drop_duplicates(...)` call has no effect).
    df = df.drop_duplicates(keep='first').dropna()
    df = df.assign(
        document=df['document'].str.lower(),
        summary=df['summary'].str.lower(),
    )

    # Store the original row label in an explicit 'id' column. Relying on the
    # automatic "__index_level_0__" column is fragile: it is only created when
    # the index is no longer a plain RangeIndex, i.e. only if rows were dropped.
    df = df.assign(id=df.index).reset_index(drop=True)

    # Convert to HF DataSet
    raw_datasets = Dataset.from_pandas(df, preserve_index=False)

    # Split into train, validation and test sets
    raw_datasets = raw_datasets.train_test_split(train_size=train_split,
                                                 test_size=test_split,
                                                 seed=seed)
    test_valid = raw_datasets['test'].train_test_split(test_size=test_val_split, seed=seed)

    train_test_valid_dataset = DatasetDict({
        'train': raw_datasets['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})

    return train_test_valid_dataset
        
# Read the first 100k reviews. 5% of them are used for fine-tuning and a
# further 10% are held out, divided evenly between validation and test.
dataset = processed_dataset(
    path='/kaggle/input/reviews/Reviews.csv',
    nrows=100_000,
    train_split=0.05,
    test_split=0.1,
    test_val_split=0.5,
)

# Model definition
- We will use the t5-small checkpoint
- This section was tested using a cuda enabled machine.

In [None]:
# Model definition
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

# Load the pre-trained checkpoint and move its weights to the chosen device.
model_checkpoint = 't5-small'
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

# Token limits for the source documents and the target summaries.
max_input_length = 1024
max_target_length = 128

# Tokenization
- The input was lowercased in *processed_dataset*.

In [None]:
# Tokenization
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# True when the tokenizer pads on the right-hand side.
# NOTE(review): this flag is not read anywhere else in the visible notebook.
pad_on_right = tokenizer.padding_side == "right"

def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """
    Tokenizes a batch of examples for seq2seq fine-tuning.

    Parameters
    ----------
    examples : batch (as passed by `Dataset.map(..., batched=True)`) with the
        columns "document" and "summary".
    max_input_length : documents are truncated to this many tokens.
    max_target_length : summaries are truncated to this many tokens.

    Returns
    -------
    The tokenizer output for the documents, with the token ids of the
    summaries added under the key "labels".
    """
    # T5 marks the summarization task with the prefix "summarize: " (including
    # the trailing space); the summarization pipeline applies that same prefix
    # at inference time, so training has to use it too.
    inputs = ['summarize: ' + doc for doc in examples["document"]]

    # No padding here: DataCollatorForSeq2Seq pads every batch to its own
    # longest sequence, which avoids padding each review up to
    # max_input_length tokens.
    model_inputs = tokenizer(inputs,
                             max_length=max_input_length,
                             truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"],
                           max_length=max_target_length,
                           truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Short aliases for the three splits; only train and valid are tokenized here.
train, valid, test = dataset['train'], dataset['valid'], dataset['test']

# batched=True hands preprocess_function whole batches of examples at a time.
tokenized_train = train.map(preprocess_function, batched=True)
tokenized_valid = valid.map(preprocess_function, batched=True)

# Fine-tuning
## Preparing the evaluation metrics

In [None]:

import nltk

# compute_metrics calls nltk.sent_tokenize during every evaluation, which needs
# the Punkt sentence tokenizer data. Fetch it here, before training starts, so
# that the first evaluation does not fail on a fresh kernel. Newer NLTK
# releases look for "punkt_tab" instead of "punkt", so both are requested;
# nltk.download only reports a failure, it does not raise.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Batch size used for both training and evaluation.
batch_size = 16
# Pads inputs and labels dynamically, batch by batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """
    Computes the ROUGE F-measures and the mean generated length for one
    evaluation pass.

    Relies on the globals `tokenizer` and `metric`; `metric` has to be defined
    before the first evaluation runs.

    Parameters
    ----------
    eval_pred : pair (predictions, labels) of token id arrays.

    Returns
    -------
    dict with one entry per ROUGE score (mid f-measure, scaled to 0-100) plus
    "gen_len", all rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Replace -100 in the predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding tokens are not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Training

In [None]:
# determine the device we will be using for training
print("[INFO] training using {}".format(torch.cuda.get_device_name(0)))
print('There are %d GPU(s) available.' % torch.cuda.device_count())
torch.cuda.empty_cache()
%env WANDB_DISABLED=True

model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-amazon-fine-goods-reviews",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

metric = load_metric("rouge")
trainer.train()

# Trying your own samples

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
nltk.download("punkt")

# Pick one example of the validation split to summarize.
index= 6
sample_text = valid[index]['document']
sample_summ = valid[index]['summary']

# Run on the first GPU when one is available, otherwise on the CPU (-1),
# so that this cell also works on machines without CUDA.
pipe = pipeline("summarization",
                model=model,
                tokenizer=tokenizer,
                device=0 if torch.cuda.is_available() else -1)
pipe_out = pipe(sample_text)
print(pipe_out[0]['summary_text'])

# Metrics and results

Even though we fine-tuned on 5000 samples from Reviews.csv, the model achieves a Rouge1 score of 14.89, significantly higher than the values obtained by training. 

<img src="./images/t5-small-training-eval.PNG" alt="Training results and metrics" />

However, these scores are not close to 50-60, which are considered state of the art. Some of the reasons are:
- We used 5000 of 500_000 samples available for fine-tuning.
- The pre-trained model is clearly more verbose than the target summaries from the Reviews.csv dataset, and the target summaries contain lots of short titles that summarize the article in a very succinct way, with words that might not appear in the article. Since ROUGE is recall-based (it measures how much of the reference overlaps with the candidate), this can partially justify the lower scores.

## Some examples

**Article 1**
this is my favorite tea of all time. i drink many different varieties including teavana and other loose leaf gourmet teas and teabags, and this one is still my favorite. it is great to drink all year and not just during the holidays.

**Generated summary**
my favorite tea of all time - and not just during the holidays - i love this tea. i'm a big fan of this tea


**Article 2**
shipping was very fast, but the product is so so. many other reviews compared these to slim jims in one way or another,so here is my thoughts. these are alot drier and a whole lot less greaser than slim jims, which is 2 good things about them, but i didn`t care for the flavor or after taste they leave you with. i rate these average at best, but will keep them for a quick snack now and then. along with my order i also recieved 2 samples; maple pepper beef jerky & buffalo bills "moist & tender" beef jerky, both are very tasty and a product i will buy again. i still plan on trying more of choo choo snacks other products in the future.

**Generated summary**
alot drier and a lot less greaser than slim jims, but i didn't care for the flavor or after taste



**Article 3**
we are from europe so our chocolate / christmas candy tastes differs from many in america.  but here is the scoop on this candy.  plum jam infused with liquor is wrapped in marzipan, (a paste made out of finally ground almonds, sugar and cream).  the whole thing is dipped in dark chocolate. this is an exquisite treat, pricy yet worth it, this one time a year.  it is best nibbled and savored slowly.  if you have kids, you can let them have a few bites or even a whole 1 piece.  the alcohol content is not too high.  however, if you intend to scarf up the whole box by yourself, you may spend some time happily sitting under the christmas tree.  ok, ok, so it's only my friends and i who live under there,  (since it is our european costume to hang these and other boozy christmas chocolates on the tree as part of the decorations).  so, give it a whirl!  try a box and enjoy!

**Generated summary**
a whirl of chocolate / christmas candy - if you have kids, you may spend some time under the christmas tree!


**Article 4**
i'm glad to find this again. it's tea with some zip added in. taste good with some lemon and ethier hot or cold. good product at a good price!(and much cheaper than my local health food store)

**Generated summary**
good tea! good price. good product. good price! good product & good quality. good value! ! :-) good product!