# Installing packages

In [1]:
!pip install nlp
!pip install captum
!pip install bio
!pip install evaluate
!pip install bert_score

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl.metadata (5.0 kB)
Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nlp
Successfully installed nlp-0.4.0
Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: captum
Successfully installed captum-0.7.0
Collecting bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from bio)
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting gprofiler-official (from bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (1

In [2]:
!pip install git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git


Collecting git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git
  Cloning https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git to /tmp/pip-req-build-3isniy9h
  Running command git clone --filter=blob:none --quiet https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git /tmp/pip-req-build-3isniy9h
  Resolved https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git to commit 03084c54b64019ba5fa0b620b9c70ad81123e458
  Preparing metadata (setup.py) ... [?25ldone
Collecting python-Levenshtein (from parrot==1.0)
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting sentence-transformers (from parrot==1.0)
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein->parrot==1.0)
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python

In [3]:

from typing import Dict

# Basic imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time


# NLP related libraries
import nlp
import torch
from torch.utils.data import Dataset
import transformers
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments, glue_compute_metrics, pipeline,
                         AutoTokenizer, AutoModelForSequenceClassification,
                         T5ForConditionalGeneration, T5TokenizerFast,
                         BartTokenizer, BartForConditionalGeneration,
                         AutoModelForSeq2SeqLM)

# XAI
from captum.attr import (IntegratedGradients, LayerIntegratedGradients,
                         configure_interpretable_embedding_layer,
                         remove_interpretable_embedding_layer)
from captum.attr import visualization as viz

# Alignment
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# BERT score
from evaluate import load

# Suppressing warnings
import warnings
warnings.filterwarnings('ignore')

from parrot import Parrot

# Show the installed transformers version so the run can be traced back to it.
transformers.__version__



'4.45.1'

In [4]:

# Load the training split of the IMDB dataset through the `nlp` package.
# NOTE(review): `train_dataset` is rebound to a TrainerDataset in a later cell,
# so this cell has to be re-run before that one can be executed again.
train_dataset = nlp.load_dataset('imdb', split='train')

Downloading:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.06 MiB, post-processed: Unknown sizetotal: 207.28 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/76cdbd7249ea3548c928bbf304258dab44d09cd3638d9da8d42480d1d1be3743...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/76cdbd7249ea3548c928bbf304258dab44d09cd3638d9da8d42480d1d1be3743. Subsequent calls will reuse this data.


In [5]:
# Prefer the first CUDA GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible part of the notebook.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [6]:
# Tokenizer of the DistilBERT checkpoint. The former `num_labels=2` argument was
# dropped: it configures a classification head and has no meaning for a tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:

class TrainerDataset(Dataset):
    """Torch dataset that pairs eagerly tokenized texts with their labels.

    The complete list of inputs is tokenized once, in the constructor, with
    padding and truncation enabled and PyTorch tensors requested; item access
    afterwards is a plain index lookup.

    Attributes set by the constructor: `inputs`, `targets`, `tokenizer` and
    `tokenized_inputs` (the tokenizer's return value for all inputs).
    """

    def __init__(self, inputs, targets, tokenizer):
        self.inputs, self.targets, self.tokenizer = inputs, targets, tokenizer

        # Encode everything up front so that __getitem__ stays cheap.
        self.tokenized_inputs = self.tokenizer(
            self.inputs, padding=True, truncation=True, return_tensors="pt"
        )

    def __len__(self):
        """Number of examples, i.e. the number of raw inputs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the `InputFeatures` of example `idx`."""
        encoded = self.tokenized_inputs
        # token_type_ids are deliberately not forwarded.
        return InputFeatures(
            input_ids=encoded['input_ids'][idx],
            attention_mask=encoded['attention_mask'][idx],
            label=self.targets[idx],
        )

In [8]:
# Wrap the IMDB texts and labels in a TrainerDataset, which tokenizes them in its
# constructor. This rebinds `train_dataset` from the loaded dataset to the wrapper.
train_dataset = TrainerDataset(train_dataset["text"],
                               train_dataset["label"], tokenizer)

In [9]:
# Demo: re-seeding NumPy with the same value before each in-place shuffle applies
# the same permutation to both lists, so paired elements stay aligned.
a = [1, 2, 3, 4]
b = ["a", "b", "c", "d"]
for sequence in (a, b):
    np.random.seed(123)
    np.random.shuffle(sequence)
a, b

([4, 1, 2, 3], ['d', 'a', 'b', 'c'])

# Alignment measures

In [10]:



# Token-level global alignment demo: a spoken-style transcript is aligned with its
# written form, and the score is normalised by the score of aligning the first
# string with itself.
string1 = 'my channel is youtube dot com slash example and then I also do live streaming on twitch.'
string2 = 'my channel is youtube.com/example and then I also do livestreaming on twitch.'

# First element: string1 vs string2; second element: string1 vs itself.
alignments, alignments2 = (
    pairwise2.align.globalxx(string1.split(), target.split(), gap_char=['-'])
    for target in (string2, string1)
)

print(format_alignment(*alignments[0]))
print(format_alignment(*alignments2[0]))
print(f"Score: {alignments[0].score/alignments2[0].score}")  # <- lower is better


my channel is youtube dot com slash example          -          and then I also do live streaming       -       on twitch. 
 |    |     |                                                    |    |  |   |   |                               |    |    
my channel is    -     -   -    -      -    youtube.com/example and then I also do  -       -     livestreaming on twitch. 
  Score=10

my channel is youtube dot com slash example and then I also do live streaming on twitch. 
 |    |     |    |     |   |    |      |     |    |  |   |   |   |      |      |    |    
my channel is youtube dot com slash example and then I also do live streaming on twitch. 
  Score=17

Score: 0.5882352941176471


# BERT Score

In [11]:

# BERTScore smoke test on two toy prediction/reference pairs, scored with the
# distilbert-base-uncased model.
bertscore = load("bertscore")
predictions = ["hello there", "general kenobi"]
references = ["goodbye here", "admiral skywalker"]
results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",
    model_type="distilbert-base-uncased",
    verbose=0,
)
results

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

{'precision': [0.8584095239639282, 0.6368807554244995],
 'recall': [0.8584095239639282, 0.6684491634368896],
 'f1': [0.8584095239639282, 0.652283251285553],
 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.45.1)'}

In [12]:
# Imports needed from this cell onwards, grouped in one place.
import re

import torch
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          BigBirdPegasusForConditionalGeneration,
                          BigBirdPegasusModel,
                          PegasusForConditionalGeneration, PegasusTokenizer)

# Pick one random review among those whose target equals 1.
positives = np.array(train_dataset.inputs)[np.array(train_dataset.targets)==1]
n_positive = len(positives)
sentences = str(positives[np.random.randint(n_positive)])

# Parrot (T5-based), run on CPU.
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

# BigBird-Pegasus
tokenizer_bigbird = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
paraphraser_bigbird = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

# Pegasus paraphraser
tokenizer_pegasus = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
paraphraser_pegasus = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")

# T5-small paraphraser
paraphraser_t5small = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-small-finetuned-quora-for-paraphrasing")
tokenizer_t5small = T5TokenizerFast.from_pretrained("mrm8488/t5-small-finetuned-quora-for-paraphrasing")

# T5 paraphraser
paraphraser_t5 = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
tokenizer_t5 = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")

# NOTE(review): despite the `_bert` suffix these two hold the BART checkpoint
# 'facebook/bart-base'.
tokenizer_bert = BartTokenizer.from_pretrained('facebook/bart-base')
paraphraser_bert = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.92M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [13]:

# Reuse the Pegasus tokenizer/model already loaded as `tokenizer_pegasus` and
# `paraphraser_pegasus` instead of instantiating the same
# "tuner007/pegasus_paraphrase" checkpoint a second time.
# NOTE(review): this rebinds `tokenizer`, which previously held the DistilBERT tokenizer.
tokenizer = tokenizer_pegasus
paraphraser = paraphraser_pegasus

# input sentences: one random review among those whose target equals 1
positives = np.array(train_dataset.inputs)[np.array(train_dataset.targets)==1]
n_positive = len(positives)
sentences = str(positives[np.random.randint(n_positive)])
print(sentences,end="\n\n\n")

def paraphrase(sentences, paraphraser, tokenizer, cat = False):
    """Paraphrase a text one sentence at a time with a seq2seq model.

    The text is split on '.', '?' and '!'. Empty fragments and fragments whose
    encoding has 50 or more tokens are copied through unchanged; every other
    fragment is replaced by the decoded first sequence returned by
    `paraphraser.generate`.

    Args:
        sentences: Text to paraphrase.
        paraphraser: Object exposing `generate(input_ids, ...)`.
        tokenizer: Object exposing `encode(...)` and `decode(...)`.
        cat: When true, print every original/paraphrase pair.

    Returns:
        Tuple `(output, reference)` of two equally long lists: the processed
        fragments and the fragments the text was split into.
    """
    reference = re.split(r'[.?!]', sentences)
    output = []
    for fragment in reference:
        # Nothing to rewrite for an empty fragment.
        if not fragment:
            output.append(fragment)
            continue

        token_ids = tokenizer.encode(fragment, return_tensors='pt')

        # Long fragments are kept verbatim instead of being paraphrased.
        if len(token_ids[0]) >= 50:
            output.append(fragment)
            continue

        generated = paraphraser.generate(
            token_ids,
            num_beams=5,
            max_length=1024,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )
        rewritten = tokenizer.decode(generated[0], skip_special_tokens=True, verbose=0)

        if cat:
            print(f"Original: {fragment}")
            print(f"Paraphrase: {rewritten}")
            print()
        output.append(rewritten)
    return output, reference


def paraphrase_parrot(sentences,cat=False):
    """Paraphrase a text one sentence at a time with the notebook-level `parrot` model.

    The text is split on '.', '?' and '!'. Empty fragments and fragments of 100
    characters or more are copied through unchanged. Every other fragment is
    replaced by the phrase of the first candidate returned by `parrot.augment`;
    when `augment` returns `None` or no candidates, the fragment is kept as is.

    Args:
        sentences: Text to paraphrase.
        cat: When true, print every original/paraphrase pair.

    Returns:
        Tuple `(output, reference)` of two equally long lists: the processed
        fragments and the fragments the text was split into.
    """
    output = []
    reference = re.split(r'[.?!]', sentences)
    for sentence in reference:
        # Empty or overly long fragments are passed through untouched.
        if len(sentence) == 0 or len(sentence) >= 100:
            output.append(sentence)
            continue

        # Generate paraphrase candidates as (phrase, score) pairs.
        candidates = parrot.augment(input_phrase=sentence)
        if candidates:
            paraphrase, _ = candidates[0]
        else:
            # No usable candidate (None or an empty list): fall back to the input.
            paraphrase = sentence
        output.append(paraphrase)

        if cat:
            print(f"Original: {sentence}")
            print(f"Paraphrase: {paraphrase}")
            print()
    return output, reference


def paraphrase_bigbird(sentences,cat=False):
    """Rewrite a whole text in one pass with the notebook-level BigBird-Pegasus model.

    Unlike the other paraphrase helpers in this notebook, the text is not split
    into sentences: it is encoded, passed to `paraphraser_bigbird.generate` and
    the first generated sequence is decoded.

    Args:
        sentences: Text to rewrite.
        cat: When true, print the original text and the generated text.

    Returns:
        Tuple `(output, sentences)`: the generated text as a string (the empty
        string when `sentences` is empty) and the unchanged input.
    """
    # Always a string, so callers get the same type for empty and non-empty input.
    output = ""

    if len(sentences) > 0:
        input_ids = tokenizer_bigbird.encode(sentences, return_tensors='pt')

        # Generate the rewritten text
        paraphrase_ids = paraphraser_bigbird.generate(input_ids)

        output = tokenizer_bigbird.decode(paraphrase_ids[0], skip_special_tokens=True, verbose=0)

        if cat:
            print(f"Original: {sentences}")
            print(f"Paraphrase: {output}")
            print()

    return output, sentences


# output, reference = paraphrase(sentences, paraphraser, tokenizer, True)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Three horror stories based on members of a transgressive Hindu cult that return home but changed in some way. In the first story our former cult member is now in an insane asylum and is visited by a reported who wants to find out about what went on at the cult. Somewhat slow going as story is told in flashbacks while the two sit on chairs and face each other. Reporter is particularly interested in what lead to the death of the participants. What seemed rather boring suddenly turns very exciting with a surprising twist in the story. Things get quite bloody.<br /><br />Second story has a violent young criminal visiting a psychiatrist for mandatory therapy. The patient seems to have some type of agenda but the psychiatrist is up to the task. Again, things slow down a bit and get weird. Then there's a strange twist in the story that is very well written and surprising.<br /><br />Final story deals with spiritual healer who claims to be able to remove the persons illness from them with his 

In [14]:


def _alignment_ratio(reference_tokens, candidate_tokens):
    """Alignment score of candidate vs. reference, relative to reference vs. itself.

    Both arguments are lists of tokens. Returns 0.0 when either list is empty
    (there is nothing to align).
    """
    if not reference_tokens or not candidate_tokens:
        return 0.0
    # Only the best score is needed, so skip recovering the alignments themselves.
    score = pairwise2.align.globalxx(reference_tokens, candidate_tokens,
                                     gap_char=['-'], score_only=True)
    baseline = pairwise2.align.globalxx(reference_tokens, reference_tokens,
                                        gap_char=['-'], score_only=True)
    return score / baseline


def evaluate_paraphrase(sentences, paraphraser, tokenizer, name):
    """Paraphrase one text with the selected paraphraser and collect quality measures.

    Args:
        sentences: Text to paraphrase.
        paraphraser: Model handed to `paraphrase`; unused for "bigbird" and "parrot".
        tokenizer: Tokenizer handed to `paraphrase`; unused for "bigbird" and "parrot".
        name: Label of the paraphraser. "bigbird" and "parrot" select the dedicated
            helpers, any other value uses the generic `paraphrase`.

    Returns:
        Dict with the keys "paraphraser", "time" (seconds spent generating the
        paraphrase), "avg alignment", "avg BERT Score <metric>" (per-sentence
        means), "entire sentence alignment", "entire sentence BERT Score <metric>",
        "sentence" and "paraphrase".
    """
    # Only the paraphrase generation itself is timed.
    start = time.time()
    if name == "bigbird":
        output, reference = paraphrase_bigbird(sentences)
    elif name == "parrot":
        output, reference = paraphrase_parrot(sentences)
    else:
        output, reference = paraphrase(sentences, paraphraser, tokenizer, False)
    time_stat = time.time() - start

    if name == "bigbird":
        # bigbird rewrites the whole text at once, so there is a single pair and
        # no per-sentence alignment.
        output_cleaned = [output]
        reference_cleaned = [reference]
        alignment_score = 0
    else:
        output_cleaned = []
        reference_cleaned = []
        alignment_score = []
        for ref_sentence, out_sentence in zip(reference, output):
            # Skip pairs where either side is empty or whitespace only: they carry
            # no tokens to align.
            if not ref_sentence.strip() or not out_sentence.strip():
                continue
            output_cleaned.append(out_sentence)
            reference_cleaned.append(ref_sentence)
            alignment_score.append(
                _alignment_ratio(ref_sentence.split(), out_sentence.split())
            )

    results = bertscore.compute(predictions=output_cleaned, references=reference_cleaned, lang="en", model_type="distilbert-base-uncased", verbose=0)

    measures = {"paraphraser": name,
                "time": time_stat,
                "avg alignment": np.mean(alignment_score)}
    for k, v in results.items():
        if k != 'hashcode':
            measures["avg BERT Score "+ k] = np.mean(v)

    # A string output (bigbird) is already the full text; joining it would insert
    # a space between every character.
    result = output if isinstance(output, str) else " ".join(output)

    measures["entire sentence alignment"] = _alignment_ratio(sentences.split(), result.split())
    results = bertscore.compute(predictions=[result], references=[sentences], lang="en", model_type="distilbert-base-uncased", verbose=0)
    for k, v in results.items():
        if k != 'hashcode':
            measures["entire sentence BERT Score "+ k] = v[0]

    measures["sentence"] = sentences
    measures["paraphrase"] = result

    return measures




In [15]:
# Seed NumPy so the same 30 positive reviews are sampled on every run.
np.random.seed(123)
sampled_reviews = [str(positives[np.random.randint(n_positive)]) for _ in range(30)]

# (model, tokenizer, label) triples; "bigbird" and "parrot" use notebook-level
# models inside their own helpers, hence the None placeholders.
# NOTE(review): the "BERT" entry uses `paraphraser_bert`, which is loaded from
# 'facebook/bart-base'.
candidates = [(None, None, "bigbird"),
              (paraphraser_pegasus, tokenizer_pegasus, "Pegasus"),
              (paraphraser_t5small, tokenizer_t5small, "t5-small"),
              (paraphraser_t5, tokenizer_t5, "t5"),
              (paraphraser_bert, tokenizer_bert, "BERT"),
              (None, None, "parrot")]

results = []
# Loop names are deliberately distinct from the notebook-level `sentences`,
# `paraphraser` and `tokenizer`, so those variables are not overwritten.
for review in sampled_reviews:
    for candidate_model, candidate_tokenizer, name in candidates:
        results.append(evaluate_paraphrase(review, candidate_model, candidate_tokenizer, name))
res = pd.DataFrame(results)

def highlight_max(s, props=''):
    """Return `props` for entries equal to the NaN-ignoring maximum of `s`, '' elsewhere."""
    peak = np.nanmax(s.values)
    is_peak = s == peak
    return np.where(is_peak, props, '')

def highlight_min(s, props=''):
    """Return `props` for entries equal to the NaN-ignoring minimum of `s`, '' elsewhere."""
    floor = np.nanmin(s.values)
    is_floor = s == floor
    return np.where(is_floor, props, '')
# Style the `res` DataFrame: the raw `results` list has neither .tail nor .style.
# Green marks the column-wise maximum of the BERT Score columns and the
# column-wise minimum of the time and alignment columns.
(
res.tail(10).style.apply(highlight_max, axis=0, props='background-color:green;', subset=["avg BERT Score precision", "avg BERT Score recall", "avg BERT Score f1", "entire sentence BERT Score precision", "entire sentence BERT Score recall", "entire sentence BERT Score f1"])
         .apply(highlight_min, axis=0, props='background-color:green;', subset=['time', 'avg alignment', "entire sentence alignment"])
)

Attention type 'block_sparse' is not possible if sequence_length: 413 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


[('first comments on this movie were so vicious that i had to go see for myself', 16), ('first comments on this movie were so vicious that i had to see it for myself', 16), ('the first comments made on this movie were so vicious that i had to see for myself', 15), ('the first comments on this film were so vicious that i had to see for myself', 15), ('the first comments on this movie were so vicious that i had to see for myself', 10)]
[(' Michael Caton-Jones is not Paul Verhoeven, neither Henry Bean and Leora Barish are Joe Eszterhas', 0)]
[('unfortunately she seems to be the main target for those who like to trash this movie', 27), ('unfortunately she seems to be the main target for those who like to trash this film', 25), ('unfortunately she appears to be the main target for those who enjoy trashing this flick', 18), ('unfortunately she seems to be the main target for those who enjoy trashing this movie', 18), ('unfortunately she seems to be the main target for those who enjoy trashin

Token indices sequence length is longer than the specified maximum sequence length for this model (89 > 60). Running this sequence through the model will result in indexing errors


[('this is a serious work whose unjust reputation needs to be restored and its reputation restored', 68), ('this is a serious work whose unjust reputation deserves rediscovery and restoration', 65), ('this is a serious work which merits the rediscovery and restoration of its unjustly tarnished reputation', 27), ('this is a serious job requiring the discovery and restoration of its unjustly tarnished reputation', 24), ("it's a serious work requiring rediscovery and restoration of its unjustly tarnished reputation", 23), ('this is a serious work which deserves rediscovery and restoration of its unjustly tarnished reputation', 22), ('this is a serious work that deserves rediscovery and restoration of its unjustly tarnished reputation', 21), ('this is a serious work deserving of rediscovery and restoration of its unjustly tarnished reputation', 16)]
[('the gorilla looks tremendous and the eyes were extremely realistic', 40), ('the gorilla looks fantastic and the eyes were particularly real

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Build the results table from whatever `results` currently holds (same statement
# as the one at the end of the evaluation cell).
res = pd.DataFrame(results)

In [None]:
# Display the results table.
res

In [None]:
import seaborn as sns

# One figure per numeric measure, with one KDE curve per paraphraser.
for measure in [col for col in res.columns if col not in ("paraphraser", "sentence", "paraphrase")]:
    fig, ax = plt.subplots(figsize=(8, 5))
    for par, df in res.groupby("paraphraser"):
        sns.kdeplot(df[measure], label=par, ax=ax)
    ax.legend()
    ax.set_title(measure)
    plt.show()

In [None]:
def get_time(time):
    """Format a duration in seconds as hours, minutes and seconds.

    Each present unit is followed by two spaces, e.g. ``"1 h  2 m  5.5 s  "``.
    Hours and minutes are printed only when at least one full unit is present;
    seconds are rounded to two decimals.

    Args:
        time: Duration in seconds.

    Returns:
        The formatted string. It is never empty: sub-second durations are
        reported as seconds instead of being dropped.
    """
    result = ""
    if time//3600 > 0:
        result += str(int(time//3600)) + " h  "
        time %= 3600
    if time//60 > 0:
        result += str(int(time//60)) + " m  "
        time %= 60
    # Print the seconds whenever a remainder is left (including less than one
    # second), and always when nothing has been printed so far.
    if time > 0 or not result:
        result += str(np.round(time,2)) + " s  "
    return result


In [None]:
# Print the mean paraphrasing time per paraphraser; labels are padded with
# spaces to 25 characters so the durations line up.
for par in np.unique(res.paraphraser):
    mean_seconds = np.mean(res[res.paraphraser==par]['time'])
    text = f"Average time {par}:".ljust(25)
    print(text + get_time(mean_seconds))

In [None]:
# Same printout with the mean time multiplied by 9500.
# NOTE(review): 9500 is presumably the number of texts to be paraphrased, which
# would make this a projected total rather than an average - confirm.
for par in np.unique(res.paraphraser):
    scaled_seconds = np.mean(res[res.paraphraser==par]['time'])*9500
    text = f"Average time {par}:".ljust(25)
    print(text + get_time(scaled_seconds))