In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/english-levels/test.txt
/kaggle/input/english-levels/validation.txt
/kaggle/input/english-levels/train.txt


In [8]:
!pip install datasets transformers nltk rouge-score sacrebleu sacremoses evaluate

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: rouge-score, sacremoses
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?2

In [9]:
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from tqdm import tqdm
import torch
from datasets import load_dataset
from datasets import Dataset
import evaluate
from transformers import T5TokenizerFast
from transformers import T5ForConditionalGeneration
from transformers import BartTokenizerFast
from transformers import BartForConditionalGeneration
from transformers import LEDForConditionalGeneration
from transformers import LEDTokenizerFast
from transformers import LongT5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import EarlyStoppingCallback




[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
text1 = """People discovered the oldest human footprints outside of Africa in Britain, dating back around a million years, making these footprints incredibly important finds.

People found them on a beach on the Norfolk coast in the east of England, and they are direct evidence of the earliest known humans in Northern Europe.

People first discovered the prints in May 2013 during low tide after the sand had eroded to reveal hollows resembling human footprints. Scientists recorded the surface using photogrammetry, which is a technique that can stitch together digital photographs to create a permanent record and a 3D image of an imprint. People then unveiled the images and a model at a news conference at the British Museum in London.

Scientists now say that the amazing discovery will rewrite our understanding of the human occupation of Britain and Europe."""

In [10]:
def read_data(path):
    with open(path, encoding="utf-8") as f:
        lines = f.read().split("\n")
    
    data = {"simple": [], "medium": [], "hard": []}
    for i in range(1, len(lines)):
        simple, medium, hard = lines[i].split("\t")
        data["simple"].append(simple)
        data["medium"].append(medium)
        data["hard"].append(hard)
    
    return data

In [11]:
def clean_text(text):
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences 
                         for s in sent.split("\n") 
                         if len(s) > 0]
    text_cleaned = "\n".join(sentences_cleaned)
    return text_cleaned

In [4]:
def create_training_history_df(history_data):
    training_history = []
    for measurement in history_data[:-1]:
        if "eval_loss" in measurement:
            training_history.append(measurement)
    return pd.DataFrame(training_history)

In [12]:
class SimplificationModel:
    def __init__(self,
                 checkpoint,
                 model_type,
                 tokenizer,
                 prefix="Simplify English: ",
                 max_input_length=512,
                 max_target_length=512,
                 batch_size=8):
        if "/" in checkpoint:
            name = checkpoint.split("/")[-1]
        else:
            name = checkpoint
        
        self.model_dir = f"models/{name}"
        self.checkpoint = checkpoint
        self.prefix = prefix
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.batch_size = batch_size
        self.model_type = model_type
        self.tokenizer = tokenizer.from_pretrained(checkpoint, max_length=self.max_input_length)
        self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
        self.model = model_type.from_pretrained(checkpoint)

    def create_tokenized_data(self, train, validation, test):
        tokenized_train = train.map(self.preprocess_data, batched=True)
        tokenized_validation = validation.map(self.preprocess_data, batched=True)
        tokenized_test = test.map(self.preprocess_data, batched=True)
        return tokenized_train, tokenized_validation, tokenized_test

    def preprocess_data(self, examples):
        inputs = [self.prefix + clean_text(text) for text in examples["hard"]]
        model_inputs = self.tokenizer(
            inputs, max_length=self.max_input_length, truncation=True
        )
    
        labels = self.tokenizer(
            text_target=[clean_text(text) for text in examples["simple"]],
            max_length=self.max_target_length,
            truncation=True
        )
        
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def compute_metrics(self, eval_pred):
        predictions, labels, inputs = eval_pred
        predictions = np.where(predictions != -100, predictions, self.tokenizer.pad_token_id)
        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)

        labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        inputs = np.where(inputs != -100, inputs, self.tokenizer.pad_token_id)
        decoded_inputs = self.tokenizer.batch_decode(inputs, skip_special_tokens=True)

        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                         for pred in decoded_preds]
        decoded_inputs = ["\n".join(nltk.sent_tokenize(single_input.strip()))
                          for single_input in decoded_inputs]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                          for label in decoded_labels]
        nested_decoded_labels = [[label] for label in decoded_labels]
       
        result = rouge.compute(
            predictions=decoded_preds, 
            references=decoded_labels,
            use_stemmer=True)
        
        result = {k: v * 100 for k, v in result.items()}
        
        result["BLEU"] = sacrebleu.compute(
            predictions=decoded_preds,
            references=decoded_labels)["score"]

        result["SARI"] = sari.compute(
            sources=decoded_inputs,
            predictions=decoded_preds,
            references=nested_decoded_labels)["sari"]
        
        prediction_lens = [np.count_nonzero(pred != self.tokenizer.pad_token_id)
                          for pred in predictions]
        result["Generated text length"] = np.mean(prediction_lens)

        return {k: round(v, 4) for k, v in result.items()}

    def preprocess_test(self, examples):
        inputs = ["Simplify English: "+ text for text in examples["hard"]]
        model_inputs = self.tokenizer(
            inputs, max_length=self.max_target_length, 
            truncation=True, padding="max_length")
        return model_inputs
    
    def train(self, tokenized_train, tokenized_validation, tokenized_test, epochs):
        not_working_with_fp16 = {"google/flan-t5-small", "google/flan-t5-base", "google/long-t5-tglobal-base"}
        
        args = Seq2SeqTrainingArguments(
            self.model_dir,
            include_inputs_for_metrics=True,
            evaluation_strategy="steps",
            eval_steps=100,
            logging_strategy="steps",
            logging_steps=100,
            save_strategy="steps",
            save_steps=200,
            optim="adamw_torch",
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            save_total_limit=1,
            num_train_epochs=epochs,
            predict_with_generate=True,
            fp16=self.checkpoint not in not_working_with_fp16, 
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            report_to="none",
            generation_max_length=self.max_target_length
        )
        
        early_stop = EarlyStoppingCallback()

        trainer = Seq2SeqTrainer(
            model_init=lambda: self.model,
            args=args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_validation,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics,
            callbacks=[early_stop]
        )

        trainer.train()
        history = trainer.state.log_history
        validation_evaluation = trainer.evaluate()
        test_evaluation = trainer.evaluate(eval_dataset=tokenized_test)
        trainer.save_model()
        self.tokenizer = self.tokenizer.from_pretrained(self.model_dir, max_length=self.max_input_length)
        self.model = self.model_type.from_pretrained(self.model_dir)
        return validation_evaluation, test_evaluation, history

    def simplify(self, text):
        inputs = ["Simplify English: " + text]
        inputs = self.tokenizer(inputs,
            max_length=self.max_input_length,
            truncation=True, return_tensors="pt")
        output = self.model.generate(
            **inputs, num_beams=8, do_sample=True,
            max_length=self.max_target_length)
        decoded_output = self.tokenizer.batch_decode(
            output, skip_special_tokens=True)[0]
        return decoded_output

In [13]:
train_dct = read_data("/kaggle/input/english-levels/train.txt")
validation_dct = read_data("/kaggle/input/english-levels/validation.txt")
test_dct = read_data("/kaggle/input/english-levels/test.txt")

train = Dataset.from_dict(train_dct)
validation = Dataset.from_dict(validation_dct)
test = Dataset.from_dict(test_dct)

In [14]:
rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")
sari = evaluate.load("sari")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

In [None]:
text1 = """People discovered the oldest human footprints outside of Africa in Britain, dating back around a million years, making these footprints incredibly important finds.

People found them on a beach on the Norfolk coast in the east of England, and they are direct evidence of the earliest known humans in Northern Europe.

People first discovered the prints in May 2013 during low tide after the sand had eroded to reveal hollows resembling human footprints. Scientists recorded the surface using photogrammetry, which is a technique that can stitch together digital photographs to create a permanent record and a 3D image of an imprint. People then unveiled the images and a model at a news conference at the British Museum in London.

Scientists now say that the amazing discovery will rewrite our understanding of the human occupation of Britain and Europe."""

In [25]:
text2 = """Well, we have heard of riding tandem, but fitting 8 people on a motorbike is getting a bit ridiculous.

A person in a vehicle filmed an overloaded motorcycle driving along a road in China. The person who filmed was concerned for the safety of the bike’s passengers.

The video shows one man driving the bike while two girls sit at the front and three women with two more kids sit at the back.

China’s Traffic Management Department warned people of the dangers of crazy driving and reminded motorists to bear safety in mind."""

In [26]:
text3 = """Emma Allen, a British artist from London, took over 750 photographs after changing her face paint each time.

She then created a stop-motion animation from the pictures, i.e. showing the pictures in a video at high speed to give the impression of movement.

The artwork called Ruby is about rebirth and the transfer of energy, and it took Emma five days to create, working 17 hours a day."""

## bart-base

In [27]:
bart_base_model = SimplificationModel(
    checkpoint="facebook/bart-base",
    model_type=BartForConditionalGeneration,
    tokenizer=BartTokenizerFast,
    batch_size=16
)

In [28]:
simplification_model = bart_base_model
print(simplification_model.checkpoint)
tokenized_train, tokenized_validation, tokenized_test = simplification_model.create_tokenized_data(train, validation, test)
validation_evaluation, test_evaluation, history = simplification_model.train(
    tokenized_train, tokenized_validation, tokenized_test, epochs=20)

facebook/bart-base


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Sari,Generated text length
100,2.0779,1.657589,48.1925,20.3432,30.4483,47.1865,14.8016,45.8268,101.6738
200,1.7096,1.612496,49.4466,21.003,31.3491,48.3269,15.2194,46.1963,99.44
300,1.5793,1.58959,49.7849,21.4078,31.5711,48.7139,15.3294,47.204,96.7062
400,1.4559,1.604145,50.1902,21.9162,31.8379,49.0543,16.555,47.3693,102.8569
500,1.369,1.616538,50.4242,22.1279,32.2203,49.2793,16.415,47.3554,101.3969


In [30]:
history

[{'loss': 2.0779,
  'learning_rate': 4.7971311475409835e-05,
  'epoch': 0.82,
  'step': 100},
 {'eval_loss': 1.6575886011123657,
  'eval_rouge1': 48.1925,
  'eval_rouge2': 20.3432,
  'eval_rougeL': 30.4483,
  'eval_rougeLsum': 47.1865,
  'eval_BLEU': 14.8016,
  'eval_SARI': 45.8268,
  'eval_Generated text length': 101.6738,
  'eval_runtime': 247.2235,
  'eval_samples_per_second': 2.629,
  'eval_steps_per_second': 0.166,
  'epoch': 0.82,
  'step': 100},
 {'loss': 1.7096,
  'learning_rate': 4.5922131147540986e-05,
  'epoch': 1.64,
  'step': 200},
 {'eval_loss': 1.612496256828308,
  'eval_rouge1': 49.4466,
  'eval_rouge2': 21.003,
  'eval_rougeL': 31.3491,
  'eval_rougeLsum': 48.3269,
  'eval_BLEU': 15.2194,
  'eval_SARI': 46.1963,
  'eval_Generated text length': 99.44,
  'eval_runtime': 218.8108,
  'eval_samples_per_second': 2.971,
  'eval_steps_per_second': 0.187,
  'epoch': 1.64,
  'step': 200},
 {'loss': 1.5793,
  'learning_rate': 4.387295081967213e-05,
  'epoch': 2.46,
  'step': 300}

In [31]:
validation_evaluation

{'eval_loss': 1.604144811630249,
 'eval_rouge1': 50.1902,
 'eval_rouge2': 21.9162,
 'eval_rougeL': 31.8379,
 'eval_rougeLsum': 49.0543,
 'eval_BLEU': 16.555,
 'eval_SARI': 47.3693,
 'eval_Generated text length': 102.8569,
 'eval_runtime': 239.698,
 'eval_samples_per_second': 2.712,
 'eval_steps_per_second': 0.171,
 'epoch': 4.1}

In [32]:
test_evaluation

{'eval_loss': 1.6068183183670044,
 'eval_rouge1': 50.4142,
 'eval_rouge2': 22.1904,
 'eval_rougeL': 32.1049,
 'eval_rougeLsum': 49.2875,
 'eval_BLEU': 16.4303,
 'eval_SARI': 47.2784,
 'eval_Generated text length': 103.7831,
 'eval_runtime': 241.0791,
 'eval_samples_per_second': 2.696,
 'eval_steps_per_second': 0.17,
 'epoch': 4.1}

In [33]:
for text in (text1, text2, text3):
    print(simplification_model.simplify(text))
    print()

People find the oldest human footprints in Britain.
These footprints date back around a million years.
The footprints are on a beach in Norfolk, England.
They are the oldest footprints in Northern Europe.
People find them in May 2013 during low tide.
The sand breaks.
There are hollows in the sand.
This makes the footprints look like footprints.
Scientists use photogrammetry.
This is a technique that takes digital photographs.
It makes a permanent record and a 3D image of an imprint.
Scientists talk about the new footprints at a news conference.
They say that they will change our understanding of the human occupation of Britain and Europe.

This news is about a motorcycle.
It is a tandem.
It has 8 people on it.
One man drives the bike.
Two girls sit at the front.
Three women with two more children sit in the back.
The motorcycle is overloaded.
There are eight people on the motorbike.
This is dangerous.
China’s Traffic Management Department warns people about this dangerous driving.

Emm

## t5-small

In [20]:
t5_small_model = SimplificationModel(
    checkpoint="t5-small",
    model_type=T5ForConditionalGeneration,
    tokenizer=T5TokenizerFast,
    batch_size=16
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
simplification_model = t5_small_model
print(simplification_model.checkpoint)
tokenized_train, tokenized_validation, tokenized_test = simplification_model.create_tokenized_data(train, validation, test)
validation_evaluation, test_evaluation, history = simplification_model.train(
    tokenized_train, tokenized_validation, tokenized_test, epochs=20)

t5-small


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Sari,Generated text length
100,2.791,2.226701,26.9448,9.9677,19.3652,25.9844,4.4428,41.15,248.6108
200,2.3547,2.131125,29.224,10.9908,21.3417,28.2709,6.207,42.0663,185.8108
300,2.2792,2.082598,30.6159,11.4876,22.4185,29.6575,7.5765,42.496,152.6
400,2.2049,2.056505,33.1586,12.9042,23.8185,32.2308,8.5116,43.2876,151.3569
500,2.176,2.037465,33.4125,13.1933,23.852,32.4809,8.4597,43.6042,156.6923
600,2.1507,2.020243,32.6863,12.538,23.513,31.7513,8.7972,43.3502,139.5462
700,2.1233,2.011278,35.0025,13.9307,24.8577,34.013,10.0921,43.8525,129.6015
800,2.0924,2.002599,35.7042,14.2732,25.1728,34.6657,10.3993,44.0579,130.36
900,2.0831,1.990785,35.0276,13.8934,24.7776,34.0605,10.3228,43.9326,126.3523
1000,2.0727,1.98607,36.0806,14.5838,25.3544,35.1099,10.7078,44.4838,128.4031


In [23]:
history

[{'loss': 2.791,
  'learning_rate': 4.799180327868853e-05,
  'epoch': 0.82,
  'step': 100},
 {'eval_loss': 2.226700782775879,
  'eval_rouge1': 26.9448,
  'eval_rouge2': 9.9677,
  'eval_rougeL': 19.3652,
  'eval_rougeLsum': 25.9844,
  'eval_BLEU': 4.4428,
  'eval_SARI': 41.15,
  'eval_Generated text length': 248.6108,
  'eval_runtime': 195.6902,
  'eval_samples_per_second': 3.322,
  'eval_steps_per_second': 0.21,
  'epoch': 0.82,
  'step': 100},
 {'loss': 2.3547,
  'learning_rate': 4.596311475409836e-05,
  'epoch': 1.64,
  'step': 200},
 {'eval_loss': 2.1311252117156982,
  'eval_rouge1': 29.224,
  'eval_rouge2': 10.9908,
  'eval_rougeL': 21.3417,
  'eval_rougeLsum': 28.2709,
  'eval_BLEU': 6.207,
  'eval_SARI': 42.0663,
  'eval_Generated text length': 185.8108,
  'eval_runtime': 179.6805,
  'eval_samples_per_second': 3.618,
  'eval_steps_per_second': 0.228,
  'epoch': 1.64,
  'step': 200},
 {'loss': 2.2792,
  'learning_rate': 4.3913934426229506e-05,
  'epoch': 2.46,
  'step': 300},
 {'e

In [24]:
validation_evaluation

{'eval_loss': 1.9568997621536255,
 'eval_rouge1': 37.6872,
 'eval_rouge2': 15.2518,
 'eval_rougeL': 26.1978,
 'eval_rougeLsum': 36.6748,
 'eval_BLEU': 11.525,
 'eval_SARI': 44.7712,
 'eval_Generated text length': 122.3108,
 'eval_runtime': 100.6309,
 'eval_samples_per_second': 6.459,
 'eval_steps_per_second': 0.407,
 'epoch': 15.57}

In [25]:
test_evaluation

{'eval_loss': 1.9588016271591187,
 'eval_rouge1': 37.5573,
 'eval_rouge2': 15.2255,
 'eval_rougeL': 26.2032,
 'eval_rougeLsum': 36.6787,
 'eval_BLEU': 12.1396,
 'eval_SARI': 44.1714,
 'eval_Generated text length': 119.3677,
 'eval_runtime': 99.9563,
 'eval_samples_per_second': 6.503,
 'eval_steps_per_second': 0.41,
 'epoch': 15.57}

In [22]:
for text in (text1, text2, text3):
    print(simplification_model.simplify(text))
    print()

People find the oldest human footprints outside of Africa in Britain. They find them on a beach in the east of England. They are direct evidence of the earliest known humans in Northern Europe. People first find the prints in May 2013 during low tide. The sand erodes to reveal hollows. People then unveiled the images and a model at a news conference in London. Scientists say that the discovery will rewrite our understanding of the human occupation of Britain and Europe.

This news is from China. A man drives a bike. Two girls sit at the front of the bike. Three women with two more kids sit at the back of the bike. China’s Traffic Management Department warns people of the dangers of crazy driving.

Emma Allen is a British artist from London. She takes 750 photographs. She changes her face paint every time. She creates a stop-motion animation. The animation shows the pictures in a video. It gives the impression of movement. Ruby is about rebirth and the transfer of energy. It takes Emma 

## t5-base

In [41]:
t5_base_model = SimplificationModel(
    checkpoint="t5-base",
    model_type=T5ForConditionalGeneration,
    tokenizer=T5TokenizerFast,
    batch_size=8
)

In [44]:
simplification_model = t5_base_model
print(simplification_model.checkpoint)
tokenized_train, tokenized_validation, tokenized_test = simplification_model.create_tokenized_data(train, validation, test)
validation_evaluation, test_evaluation, history = simplification_model.train(
    tokenized_train, tokenized_validation, tokenized_test, epochs=20)

t5-base


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Sari,Generated text length
100,2.5115,1.815315,25.8765,9.7372,18.5719,25.0487,4.2325,42.2401,258.2846
200,1.9029,1.735013,28.7401,11.6502,20.4785,28.0187,5.3475,43.9381,232.9769
300,1.8172,1.702723,32.3272,13.5914,22.6495,31.5542,6.1586,44.9021,224.0877
400,1.7586,1.680441,32.1988,13.4421,22.5748,31.3823,6.5019,44.9485,213.2908
500,1.7231,1.665341,35.043,14.8573,24.6775,34.2088,8.0396,45.0963,179.4046
600,1.676,1.653485,38.1298,16.3816,26.4315,37.2854,9.5885,45.9404,162.7815
700,1.6255,1.648362,36.8032,15.735,25.3735,36.0024,8.5727,45.6487,179.0062
800,1.5818,1.644971,40.4335,17.3804,27.8094,39.5278,10.8644,46.2851,152.9754
900,1.5616,1.635146,40.6188,17.4912,27.9136,39.7422,12.0214,46.2451,136.34
1000,1.5556,1.641487,43.288,18.7345,29.4282,42.3842,13.4929,46.5577,129.3892


In [45]:
validation_evaluation

{'eval_loss': 1.6289434432983398,
 'eval_rouge1': 40.9063,
 'eval_rouge2': 17.8781,
 'eval_rougeL': 28.0671,
 'eval_rougeLsum': 40.0418,
 'eval_BLEU': 11.6467,
 'eval_SARI': 46.7343,
 'eval_Generated text length': 146.6538,
 'eval_runtime': 429.2086,
 'eval_samples_per_second': 1.514,
 'eval_steps_per_second': 0.191,
 'epoch': 5.33}

In [46]:
test_evaluation

{'eval_loss': 1.6236543655395508,
 'eval_rouge1': 41.8334,
 'eval_rouge2': 18.4783,
 'eval_rougeL': 28.4659,
 'eval_rougeLsum': 40.9309,
 'eval_BLEU': 12.1444,
 'eval_SARI': 46.5778,
 'eval_Generated text length': 144.3262,
 'eval_runtime': 411.4006,
 'eval_samples_per_second': 1.58,
 'eval_steps_per_second': 0.199,
 'epoch': 5.33}

In [47]:
history

[{'loss': 2.5115,
  'learning_rate': 4.8975409836065575e-05,
  'epoch': 0.41,
  'step': 100},
 {'eval_loss': 1.8153151273727417,
  'eval_rouge1': 25.8765,
  'eval_rouge2': 9.7372,
  'eval_rougeL': 18.5719,
  'eval_rougeLsum': 25.0487,
  'eval_BLEU': 4.2325,
  'eval_SARI': 42.2401,
  'eval_Generated text length': 258.2846,
  'eval_runtime': 629.2869,
  'eval_samples_per_second': 1.033,
  'eval_steps_per_second': 0.13,
  'epoch': 0.41,
  'step': 100},
 {'loss': 1.9029,
  'learning_rate': 4.796106557377049e-05,
  'epoch': 0.82,
  'step': 200},
 {'eval_loss': 1.7350130081176758,
  'eval_rouge1': 28.7401,
  'eval_rouge2': 11.6502,
  'eval_rougeL': 20.4785,
  'eval_rougeLsum': 28.0187,
  'eval_BLEU': 5.3475,
  'eval_SARI': 43.9381,
  'eval_Generated text length': 232.9769,
  'eval_runtime': 627.2322,
  'eval_samples_per_second': 1.036,
  'eval_steps_per_second': 0.131,
  'epoch': 0.82,
  'step': 200},
 {'loss': 1.8172,
  'learning_rate': 4.6946721311475414e-05,
  'epoch': 1.23,
  'step': 300

In [48]:
for text in (text1, text2, text3):
    print(simplification_model.simplify(text))
    print()

People find human footprints on a beach in England. The beach is in the east of England. People find the footprints in May 2013. They are a million years old. They are the oldest human footprints outside of Africa. They are the earliest known footprints in Northern Europe. Scientists record the sand. They make a 3D image of the sand. They show the image at a news conference in London. They say that the discovery will change our understanding of the occupation of Britain and Europe.

This news is from China. A man drives a motorbike. He has 8 people on the motorbike. The man is worried about the safety of the people on the motorbike. He films the man. Two girls sit at the front of the motorbike. Three women with two more kids sit at the back of the motorbike. China warns people about crazy driving.

Emma Allen is a British artist. She is from London. She changes her face paint every time. She takes over 750 photographs. She makes a stop-motion animation from the pictures. She shows the 

## flan-t5-small

In [55]:
flan_t5_small_model = SimplificationModel(
    checkpoint="google/flan-t5-small",
    model_type=T5ForConditionalGeneration,
    tokenizer=T5TokenizerFast,
    batch_size=16
)

In [56]:
simplification_model = flan_t5_small_model
print(simplification_model.checkpoint)
tokenized_train, tokenized_validation, tokenized_test = simplification_model.create_tokenized_data(train, validation, test)
validation_evaluation, test_evaluation, history = simplification_model.train(
    tokenized_train, tokenized_validation, tokenized_test, epochs=20)

google/flan-t5-small


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Sari,Generated text length
100,2.3605,2.035811,11.5237,3.9963,9.2081,11.0203,1.5309,39.7966,464.02
200,2.1886,1.971376,13.9665,5.0969,10.7863,13.3975,1.8765,40.8043,438.34
300,2.1232,1.93456,14.1959,5.2476,11.1412,13.6592,1.9246,41.1234,436.7446
400,2.0615,1.920841,18.0577,6.957,13.665,17.4267,2.4242,41.6018,393.9369
500,2.0339,1.902806,16.4892,6.3206,12.6716,15.9642,2.2734,41.7078,414.8
600,2.0109,1.890634,17.2329,6.6211,13.1367,16.6327,2.4053,41.8154,396.5538
700,1.989,1.881843,18.9542,7.3482,14.3106,18.302,2.6569,42.1676,375.9446
800,1.9601,1.879464,21.2941,8.2859,15.7662,20.5488,3.0523,42.5786,354.0769
900,1.9461,1.869575,19.9271,7.7158,14.8705,19.2628,2.8413,42.4109,365.9277
1000,1.9321,1.8637,19.231,7.484,14.4512,18.6052,2.7901,42.691,373.1954


In [57]:
validation_evaluation

{'eval_loss': 1.8531841039657593,
 'eval_rouge1': 21.7309,
 'eval_rouge2': 8.4213,
 'eval_rougeL': 16.2257,
 'eval_rougeLsum': 21.0607,
 'eval_BLEU': 3.2544,
 'eval_SARI': 42.7193,
 'eval_Generated text length': 330.3338,
 'eval_runtime': 265.8721,
 'eval_samples_per_second': 2.445,
 'eval_steps_per_second': 0.154,
 'epoch': 12.3}

In [58]:
test_evaluation

{'eval_loss': 1.853136658668518,
 'eval_rouge1': 21.0938,
 'eval_rouge2': 8.6068,
 'eval_rougeL': 15.7264,
 'eval_rougeLsum': 20.5319,
 'eval_BLEU': 3.393,
 'eval_SARI': 42.819,
 'eval_Generated text length': 336.5785,
 'eval_runtime': 268.032,
 'eval_samples_per_second': 2.425,
 'eval_steps_per_second': 0.153,
 'epoch': 12.3}

In [59]:
history

[{'loss': 2.3605,
  'learning_rate': 4.795081967213115e-05,
  'epoch': 0.82,
  'step': 100},
 {'eval_loss': 2.035810947418213,
  'eval_rouge1': 11.5237,
  'eval_rouge2': 3.9963,
  'eval_rougeL': 9.2081,
  'eval_rougeLsum': 11.0203,
  'eval_BLEU': 1.5309,
  'eval_SARI': 39.7966,
  'eval_Generated text length': 464.02,
  'eval_runtime': 280.2818,
  'eval_samples_per_second': 2.319,
  'eval_steps_per_second': 0.146,
  'epoch': 0.82,
  'step': 100},
 {'loss': 2.1886,
  'learning_rate': 4.59016393442623e-05,
  'epoch': 1.64,
  'step': 200},
 {'eval_loss': 1.9713761806488037,
  'eval_rouge1': 13.9665,
  'eval_rouge2': 5.0969,
  'eval_rougeL': 10.7863,
  'eval_rougeLsum': 13.3975,
  'eval_BLEU': 1.8765,
  'eval_SARI': 40.8043,
  'eval_Generated text length': 438.34,
  'eval_runtime': 276.9207,
  'eval_samples_per_second': 2.347,
  'eval_steps_per_second': 0.148,
  'epoch': 1.64,
  'step': 200},
 {'loss': 2.1232,
  'learning_rate': 4.3852459016393444e-05,
  'epoch': 2.46,
  'step': 300},
 {'ev

In [60]:
for text in (text1, text2, text3):
    print(simplification_model.simplify(text))
    print()

This news is about the oldest human footprints in Britain. It is about a million years ago. People find them on a beach in the east of England. They find them on a beach in the east of England. They make these footprints incredibly important. People find them on a beach on the Norfolk coast in the east of England. They are direct evidence of the earliest humans in Northern Europe. Scientists record the footprints. They make a record and a 3D image of an imprint. They make a model at a news conference at the British Museum in London. Scientists say that the discovery will rewrite our understanding of the human occupation of Britain and Europe.

A man is in a car. He drives a motorbike. The bike is overloaded. The man is in a vehicle. He drives the bike. Two girls sit at the front and three women with two more kids sit at the back. China’s Traffic Management Department warns people of crazy driving.

Emma Allen is a British artist. She is from London. She paints 750 pictures. She changes

## led-base-16384

In [17]:
led_base_model = SimplificationModel(
    checkpoint="allenai/led-base-16384",
    model_type=LEDForConditionalGeneration,
    tokenizer=LEDTokenizerFast,
    batch_size=8
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [18]:
simplification_model = led_base_model
print(simplification_model.checkpoint)
tokenized_train, tokenized_validation, tokenized_test = simplification_model.create_tokenized_data(train, validation, test)
validation_evaluation, test_evaluation, history = simplification_model.train(
    tokenized_train, tokenized_validation, tokenized_test, epochs=20)

allenai/led-base-16384


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a LEDTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Sari,Generated text length
100,1.9021,1.670124,18.429,7.0734,13.6649,17.9383,3.2379,44.2521,393.2785
200,1.6735,1.620038,28.9338,11.6238,20.3679,28.2387,5.4201,45.0383,276.28
300,1.4957,1.609993,36.3943,15.3123,24.9416,35.6411,8.6577,45.8914,197.0923
400,1.366,1.590562,39.4028,16.7347,26.8887,38.5853,10.3347,47.0716,175.8185
500,1.3354,1.632535,43.1034,18.4976,28.9513,42.2868,12.2966,47.5556,159.2031


In [19]:
validation_evaluation

{'eval_loss': 1.590561866760254,
 'eval_rouge1': 39.4028,
 'eval_rouge2': 16.7347,
 'eval_rougeL': 26.8887,
 'eval_rougeLsum': 38.5853,
 'eval_BLEU': 10.3347,
 'eval_SARI': 47.0716,
 'eval_Generated text length': 175.8185,
 'eval_runtime': 253.7053,
 'eval_samples_per_second': 2.562,
 'eval_steps_per_second': 0.323,
 'epoch': 2.05}

In [21]:
history

[{'loss': 1.9021,
  'learning_rate': 4.898565573770492e-05,
  'epoch': 0.41,
  'step': 100},
 {'eval_loss': 1.670123815536499,
  'eval_rouge1': 18.429,
  'eval_rouge2': 7.0734,
  'eval_rougeL': 13.6649,
  'eval_rougeLsum': 17.9383,
  'eval_BLEU': 3.2379,
  'eval_SARI': 44.2521,
  'eval_Generated text length': 393.2785,
  'eval_runtime': 321.8985,
  'eval_samples_per_second': 2.019,
  'eval_steps_per_second': 0.255,
  'epoch': 0.41,
  'step': 100},
 {'loss': 1.6735,
  'learning_rate': 4.796106557377049e-05,
  'epoch': 0.82,
  'step': 200},
 {'eval_loss': 1.6200377941131592,
  'eval_rouge1': 28.9338,
  'eval_rouge2': 11.6238,
  'eval_rougeL': 20.3679,
  'eval_rougeLsum': 28.2387,
  'eval_BLEU': 5.4201,
  'eval_SARI': 45.0383,
  'eval_Generated text length': 276.28,
  'eval_runtime': 312.2238,
  'eval_samples_per_second': 2.082,
  'eval_steps_per_second': 0.263,
  'epoch': 0.82,
  'step': 200},
 {'loss': 1.4957,
  'learning_rate': 4.693647540983607e-05,
  'epoch': 1.23,
  'step': 300},
 {

In [27]:
for text in (text1, text2, text3):
    print(simplification_model.simplify(text))
    print()

People find the oldest human footprints in Britain.
They are around a million years old.
They are on a beach in England.
They are on a beach in the east of England.
They are the earliest human footprints in Northern Europe.
People find the prints in 2013.
They find them during low tide.
The sand gets eroded.
This is a big problem.
Scientists do not know where the footprints come from.
They use photogrammetry.
This is a special technology.
People put together digital photographs.
They make a 3D image of the imprint.
Scientists show the images at a news conference at the British Museum in London.
Scientists are very happy.
They say that this discovery will change our understanding of the human occupation of Britain and Europe.

This news is from China.
It is about a motorcycle.
It is a motorcycle.
There are 8 people on it.
One man drives the motorcycle.
Two girls sit at the front and three women with two more kids sit at the back.
The video shows one man driving the bike.
Two girls sit a