In [15]:
import numpy as np                                                                                    # linear algebra
import torch                                                                                          # pytorch
import transformers                                                                                   # huggingface transformers
from sklearn.model_selection import train_test_split                                                  # split data
from transformers import AutoTokenizer                                                                # tokenizer
import pandas as pd                                                                                   # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import load_dataset                                                                     # load dataset
import glob                                                                                           # glob for file path
import os                                                                                             # os for file path
import re                                                                                             # regex
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments,AutoModelWithLMHead   # T5 model

In [9]:
def read_data(path):                                                              # function to read data from the path
    data = []
    for topic in os.listdir(path):
        for file in os.listdir(path + "/" + topic):
            with open(path + "/" + topic + "/" + file) as f:
                data.append(f.read())
    return data

original_text = read_data("files/BBC News Summary/Summaries")                      # read the original text
summary_text = read_data("files/BBC News Summary/News Articles")                   # read the summary text

df = pd.DataFrame({'original':original_text,'summary':summary_text})               # create a dataframe

df.to_csv('files/summary.csv', index=False)                                        # export the dataframe to a csv file

In [10]:
dataset = load_dataset('csv', data_files='files/summary.csv', split='train')       # load the dataset
dataset = dataset.train_test_split(test_size=0.1)                                  # split to train and test
train_dataset = dataset['train']                                                   # train dataset
val_dataset = dataset['test']                                                      # test dataset

Using custom data configuration default-49933766bd4604e6


Downloading and preparing dataset csv/default to C:/Users/moham/.cache/huggingface/datasets/csv/default-49933766bd4604e6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1003.66it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 285.17it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                                 

Dataset csv downloaded and prepared to C:/Users/moham/.cache/huggingface/datasets/csv/default-49933766bd4604e6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.




In [11]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')                                 # load the tokenizer

def tokenize(batch):                                                                 # function to tokenize the data
    tokenized_input = tokenizer(batch['original'],                                   # tokenize the input and label
                                padding='max_length',                                # pad the input and label
                                truncation=True,                                     # truncate the input and label
                                max_length=512)                                      # max length of the input and label
    tokenized_label = tokenizer(batch['summary'], 
                                padding='max_length', 
                                truncation=True, 
                                max_length=159)

    tokenized_input['labels'] = tokenized_label['input_ids']                         # add the label to the input

    return tokenized_input                                                           # return the tokenized input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)            # tokenize the train dataset
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))   # tokenize the test dataset

train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels']) # set the format of the train dataset
val_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])   # set the format of the test dataset

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
100%|██████████| 4/4 [00:08<00:00,  2.24s/ba]
100%|██████████| 1/1 [00:00<00:00,  1.03ba/s]


In [None]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')                         # load the model

output_dir = './output_dir'                                                           # output directory

training_args = TrainingArguments(                                                    # define the training arguments
    output_dir=output_dir,                                                            # output directory
    num_train_epochs=1,                                                               # number of training epochs, we set it to 1 because we are using wandb to track the training process
                                                                                      # and we can stop the training whenever we want, plus each epoch takes a lot of time
    per_device_train_batch_size=8,                                                    # batch size, we set it to 8 because we are using a GPU with 8GB of vRAM
    per_device_eval_batch_size=8,                                                     # batch size, we set it to 8 because we are using a GPU with 8GB of vRAM
    eval_accumulation_steps=1,                                                        # number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True,                                                        # if I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.1,                                                                # learning rate (the higher, the faster the model will learn, 
                                                                                      # but it can also lead to divergence)
    evaluation_strategy='steps',                                                      # run evaluation every eval_steps
    save_steps=1000,                                                                  # how often to save a checkpoint
    save_total_limit=1,                                                               # number of maximum checkpoints to save
    remove_unused_columns=True,                                                       # removes useless columns from the dataset
    run_name='run_name',                                                              # wandb run name
    logging_steps=1000,                                                               # how often to log loss to wandb
    eval_steps=1000,                                                                  # how often to run evaluation on the val_set
    logging_first_step=False,                                                         # whether to log also the very first training step to wandb
    load_best_model_at_end=True,                                                      # whether to load the best model found at each evaluation.
    metric_for_best_model="loss",                                                     # use loss to evaluate best model.
    greater_is_better=False                                                           # best model is the one with the lowest loss, not highest.
)

trainer = Trainer(                                                                    # define the trainer
    model=model,                                                                      # model to train
    args=training_args,                                                               # training arguments
    train_dataset=train_dataset,                                                      # train dataset       
    eval_dataset=val_dataset                                                          # evaluation dataset
)

trainer.train()                                                                       

In [7]:
trainer.save_model(output_dir + '/model_')                                            # save the model in order to continue training later

Saving model checkpoint to ./output_dir/model_
Configuration saved in ./output_dir/model_\config.json
Model weights saved in ./output_dir/model_\pytorch_model.bin


In [10]:
from transformers import pipeline

summarizer = pipeline('summarization',                                                # define the summarizer
                        model=output_dir + '/model_',                                 # model path
                        tokenizer=tokenizer,                                          # tokenizer
                        framework='pt')                                               # framework

loading configuration file ./output_dir/model_\config.json
Model config T5Config {
  "_name_or_path": "./output_dir/model_",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
   

# <p style='color:red;'>Notice</p> 
#### There is a big limitation here, the training is done on a single GPU, so it takes a lot of time to train the model, I tried to train it on a Google Colab Pro, but it was still too slow, so I decided to use wandb to track the training process and stop it whenever I want, then I save the model and use it to generate summaries.

#### Even though, I trained the model for 1 epoch, it took me more than 6 hours to train it, so I decided to use a pretrained model, which I will show you in the next section.


## Generating summaries using t5-small model

In [26]:
def generate_summary(text):                                                              # function to generate the summary

    model = AutoModelWithLMHead.from_pretrained("t5-small")                              # load the model
    tokenizer = AutoTokenizer.from_pretrained("t5-small")                                # load the tokenizer

    inputs = tokenizer.encode("summarize: " + text,                                      # encode the input
                                return_tensors="pt",                                     # return tensors
                                max_length=512,                                          # max length of the input
                                truncation=True)                                         # truncate the input

    outputs = model.generate(inputs,                                                     # generate the summary
                                max_length=250,                                          # max length of the summary (the higher, the more the model will try to generate a longer summary)
                                min_length=80,                                          # min length of the summary (the higher, the more the model will try to generate a longer summary)
                                length_penalty=2.0,                                      # length penalty (the higher, the more the model will try to generate a longer summary)
                                num_beams=4,                                             # number of beams (the higher, the more the model will try to generate a longer summary)
                                early_stopping=True)                                     # early stopping (if the model generates a summary that is longer than the max length, it will stop)
    return tokenizer.decode(outputs[0])                                                  # return the summary


def generate_summary_for_text(text):                                                     # function to generate the summary for a text

    # text = re.sub(r'\d+', '', text)                                                      # remove the numbers (optional because the model is trained on numbers too)
    text = re.sub(' +', ' ', text)                                                       # remove the extra spaces
    text = re.sub('\n+', ' ', text)                                                      # remove the extra new lines
    text = re.sub('\t+', ' ', text)                                                      # remove the extra tabs

    summary = generate_summary(text)                                                     # generate the summary               
    return summary                                                                       # return the summary

In [27]:
summary_ = generate_summary_for_text(df['original'][5])

print('Predicted summary :\n',summary_)
print(" ")
print('Original summary :\n',df['summary'][5])

def count_words(text):
    return len(text.split())

print('Predicted summary word count : ',count_words(summary_))
print('Original summary word count : ',count_words(df['summary'][5]))

Predicted summary :
 <pad> a common technical definition of a recession is two successive quarters of negative growth. on an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. japan's economy teetered on the brink of a technical recession in the three months to September, figures show.</s>
 
Original summary :
 Japan narrowly escapes recession

Japan's economy teetered on the brink of a technical recession in the three months to September, figures show.

Revised figures indicated growth of just 0.1% - and a similar-sized contraction in the previous quarter. On an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. A common technical definition of a recession is two successive quarters of negative growth.

The government was keen to play down the worrying implications of the data. "I maintain the view that Japan's economy 