In [1]:
!pip install --quiet transformers
!pip install --quiet pytorch-lightning
!pip install --quiet tokenizers
!pip install --quiet SentencePiece 

[0m

In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import(
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer
)
from tqdm.auto import tqdm

In [3]:
class ParaphraseModel(pl.LightningModule):
    """LightningModule wrapping a T5 conditional-generation model for paraphrasing.

    The wrapped Hugging Face model is stored on ``self.model``. Training,
    validation and test steps all run the same forward pass and differ only in
    the name under which the loss is logged.

    Args:
        model_name: Identifier passed to
            ``T5ForConditionalGeneration.from_pretrained``. When ``None``
            (the default) the notebook-level ``MODEL_NAME`` global is used, so
            ``ParaphraseModel()`` and ``load_from_checkpoint`` keep working
            exactly as before.
        learning_rate: Learning rate handed to the optimizer.
    """

    def __init__(self, model_name=None, learning_rate=0.0001):
        super().__init__()
        # Resolve the global lazily (at construction time, not class-definition
        # time) because MODEL_NAME is defined in a later notebook cell.
        if model_name is None:
            model_name = MODEL_NAME
        self.learning_rate = learning_rate
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the underlying model reports on its output object
        (presumably ``None`` when ``labels`` is omitted -- confirm against the
        transformers documentation).
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
        )
        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Forward one batch, log its loss under ``log_name`` and return the loss.

        ``batch`` must provide the keys ``text_input_ids``,
        ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
        """
        loss, _ = self(
            input_ids=batch["text_input_ids"],
            attention_mask=batch["text_attention_mask"],
            decoder_attention_mask=batch["labels_attention_mask"],
            labels=batch["labels"],
        )
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log ``train_loss`` for one training batch."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss`` for one validation batch."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Compute and log ``test_loss`` for one test batch."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of this module."""
        # NOTE(review): this is the AdamW exported by `transformers`, which
        # upstream has deprecated in favour of `torch.optim.AdamW` -- confirm
        # before retraining with a newer transformers release.
        return AdamW(self.parameters(), lr=self.learning_rate)

In [4]:
# Hub identifier shared by the tokenizer and by ParaphraseModel.
MODEL_NAME = "google/mt5-small"
# Location of the fine-tuned Lightning checkpoint to restore.
CHECKPOINT_PATH = "../input/finetune-mt5/checkpoint-checkpoints.ckpt"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Rebuild the module from the checkpoint, then freeze it via Lightning's
# `freeze()` before it is used for generation.
trained_model = ParaphraseModel.load_from_checkpoint(CHECKPOINT_PATH)
trained_model.freeze()




Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [5]:
def paraphraser(text, max_input_length=90, max_output_length=512, num_beams=2):
    """Generate a paraphrase of ``text`` with the fine-tuned model.

    Uses the notebook-level ``tokenizer`` and ``trained_model`` globals.

    Args:
        text: Sentence to paraphrase (anything the tokenizer accepts).
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated to this length.
        max_output_length: ``max_length`` passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        The decoded generation as a single string. If several sequences are
        generated, their decoded texts are concatenated without a separator.
    """
    text_encoding = tokenizer(
        text,
        max_length=max_input_length,
        padding=True,
        # Without an explicit truncation strategy the tokenizer ignores
        # `max_length` when `padding=True`, so over-long inputs were passed
        # through unshortened.
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    # Put the inputs on whatever device the model lives on, so the function
    # also works when the model has been moved to a GPU.
    device = trained_model.model.device
    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"].to(device),
        attention_mask=text_encoding["attention_mask"].to(device),
        max_length=max_output_length,
        num_beams=num_beams,
        early_stopping=True,
    )

    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)

> ***Please press "Run all" at the top of the page, then type your sentence between the red quotation marks ("") in the cell below.***

In [8]:
# Sentence to paraphrase -- replace the text between the quotation marks.
input_sentence = "حتمالا شما مادر بسیار سلطه پذیری داشتید."
paraphraser(input_sentence)


'شما باید مادر بسیار سلطه پذیری داشته باشید'