I used the following [link](https://medium.com/@imjeffhi4/creating-a-paraphrase-generator-model-using-t5-and-deploying-on-ainize-7742bc83532a) to implement it

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install -U transformers
!pip install datasets
!pip install pytorch-lightning

### prepre data

In [None]:

from datasets import load_dataset

paws_data = load_dataset('paws', 'labeled_final')['train']
paraphrase_data = []
for item in paws_data:
    if item['label'] == 1:
        paraphrase_data.append(
            {"Source": item['sentence1'], "Target": item['sentence2']})

Reusing dataset paws (/root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
paws_data[0]

{'id': 1,
 'label': 0,
 'sentence1': 'In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland .',
 'sentence2': 'In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England .'}

In [13]:
len(paraphrase_data)

21829

### train

In [None]:
import os
from transformers import T5TokenizerFast, T5ForConditionalGeneration
import pytorch_lightning as pl
import torch
import json
from torch.utils.data import TensorDataset, random_split
from transformers.optimization import AdamW
from pytorch_lightning.callbacks import Callback
from tqdm import tqdm

save_path = '/content/drive/MyDrive/Codes/Experiments/paraphrasing/Models'

try:
    os.mkdir(save_path)
except:
    pass


class ParaphraseGenerator(pl.LightningModule):
    def __init__(self):
        super().__init__()
        model_name = 't5-base'
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = T5TokenizerFast.from_pretrained(model_name)
        self.batch_size = 8
        self.lr = 4e-5

    def encode_text(self, data_path):
        # with open(data_path, 'r', encoding='utf-8') as r:
        #     data = json.load(r)
        for item in tqdm(paraphrase_data):
            # tokenizing original and paraphrase:
            source = self.tokenizer(
                item['Source'], max_length=64, truncation=True, padding='max_length', return_tensors='pt')
            target = self.tokenizer(
                item['Target'], max_length=64, truncation=True, padding='max_length', return_tensors='pt')
            yield source['input_ids'], target['input_ids']

    def to_tensor(self, source_ids, target_ids):
        source_ids = torch.cat(source_ids, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        data = TensorDataset(source_ids, target_ids)
        return random_split(data, [len(data), 0])[0]

    def prepare_data(self):
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text('train_ds.json'))))
        self.train_ds = self.to_tensor(source_ids, target_ids)

        source_ids, target_ids = list(
            zip(*tuple(self.encode_text('test_ds.json'))))
        self.test_ds = self.to_tensor(source_ids, target_ids)

    def forward(self, batch, batch_idx):
        source_ids, target_ids = batch[:2]
        return self.model(input_ids=source_ids, labels=target_ids)

    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train_loss', loss)
        return loss


    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('val_loss', loss)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size, drop_last=True, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.test_ds, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=0)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)

In [None]:
class SaveCallback(Callback):
    def on_epoch_start(self, trainer, pl_module):
        if pl_module.current_epoch > 0:
            current_epoch = str(pl_module.current_epoch)
            fn = f'epoch_{current_epoch}'
            new_path = f"{save_path}/{fn}/"
            if fn not in os.listdir(save_path):
                os.mkdir(new_path)
            pl_module.tokenizer.save_vocabulary(new_path)
            pl_module.model.save_pretrained(new_path)


In [None]:
trainer = pl.Trainer(
    default_root_dir='logs',
    min_epochs=4,
    accelerator='gpu',
    devices=1,
    max_epochs=5,
    val_check_interval=0.5,
    callbacks=[SaveCallback()],
    logger=pl.loggers.TensorBoardLogger('logs/', name='paraphrase', version=0)
)


para_model = ParaphraseGenerator()
trainer.fit(para_model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  f"The `Callback.{hook}` hook was deprecated in v1.6 and"

  0%|          | 0/21829 [00:00<?, ?it/s][A
  1%|          | 191/21829 [00:00<00:11, 1900.76it/s][A
  2%|▏         | 382/21829 [00:00<00:42, 504.02it/s] [A
  2%|▏         | 544/21829 [00:00<00:30, 706.34it/s][A
  3%|▎         | 703/21829 [00:00<00:23, 887.43it/s][A
  4%|▍         | 862/21829 [00:00<00:20, 1046.18it/s][A
  5%|▍         | 1021/21829 [00:01<00:17, 1177.98it/s][A
  5%|▌         | 1198/21829 [00:01<00:15, 1329.99it/s][A
  6%|▋         | 1375/21829 [00:01<00:14, 1447.28it/s][A
  7%|▋         | 1541/21829 [00:01<00:13, 1505.92it/s][A
  8%|▊         | 1720/21829 [00:01<00:12, 1585.26it/s][A
  9%|▊         | 1890/21829 [00:01<00:12, 1542.12it/s][A
  9%|▉         | 2058/21829 [00:01<00:12, 1580.56it/s][A
 10%|█         | 2239/21829 [00:01<00:11, 1644.73it/s][A
 11

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
