# Pegasus paraphrase basic - Fine-tuning a Hugging Face model

This notebook follows the Hugging Face course (https://huggingface.co/course/chapter3/3?fw=tf)
and uses the `tuner007/pegasus_paraphrase` model (https://huggingface.co/tuner007/pegasus_paraphrase).

In [1]:
from datasets import load_dataset

In [2]:
# Load the "quora" dataset through `datasets.load_dataset`; the rows of its
# "train" split hold a pair of questions plus an `is_duplicate` flag.
dataset = load_dataset("quora")

Using custom data configuration default
Reusing dataset quora (/home/pablo/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


  0%|          | 0/1 [00:00<?, ?it/s]

## Duplicate questions
First, we look at the question pairs that are marked as duplicates.

In [9]:
# Show the first training row flagged as a duplicate pair (nothing is printed
# when no row is flagged).
first_duplicate = next(
    (row for row in dataset["train"] if row["is_duplicate"]),
    None,
)
if first_duplicate is not None:
    print(first_duplicate)

{'questions': {'id': [11, 12], 'text': ['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]}, 'is_duplicate': True}


In [10]:
# Materialise every training row whose `is_duplicate` flag is set; these pairs
# are sampled from further down in the notebook.
ds = [pair for pair in dataset["train"] if pair["is_duplicate"]]

In [5]:
# Number of duplicate question pairs kept in `ds`.
len(ds)

149263

In [6]:
# Inspect one duplicate pair to see the row structure.
ds[7]

{'questions': {'id': [37, 38],
  'text': ['Why are so many Quora users posting questions that are readily answered on Google?',
   'Why do people ask Quora questions which can be answered easily by Google?']},
 'is_duplicate': True}

## Loading the model


In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Checkpoint identifier on the Hugging Face Hub; the tokenizer and the model
# are both loaded from this same name.
model_name = 'tuner007/pegasus_paraphrase'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# The first call downloads the weights (about 2 GB, per the progress bar below).
model = PegasusForConditionalGeneration.from_pretrained(model_name)


Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [4]:
# Inference is pinned to the CPU. Device auto-detection is left disabled:
#   'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): no `import torch` is visible in this notebook, so that
# expression needs the import added before it can be re-enabled.
torch_device = "cpu"
model = model.to(torch_device)

In [5]:
def get_response(input_text, num_return_sequences, num_beams, max_length=60):
    """Generate paraphrases of ``input_text`` with beam search.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: The sentence to paraphrase.
        num_return_sequences: How many candidate paraphrases to return.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Token limit used both for truncating the input and for
            the generated output. Defaults to 60, the value that was
            previously hard-coded in both places.

    Returns:
        A list of decoded strings with special tokens removed.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding="longest",
        max_length=max_length,
        return_tensors="pt",
    ).to(torch_device)
    # `temperature` is intentionally not passed: it only influences
    # sample-based decoding (do_sample=True). With plain beam search it is
    # ignored, so the former `temperature=1.5` had no effect on the output.
    translated = model.generate(
        **batch,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

## Testing the model

In [8]:
# Generation settings; `num_beams` and `num_return_sequences` are reused by
# the random-sample cell further down.
num_beams = 10
num_return_sequences = 10
context = "who is Iban Rios?"
# Last expression of the cell, so the list of paraphrases is displayed.
get_response(context,num_return_sequences,num_beams)

['Who is Iban Rios?',
 'Iban Rios, who is he?',
 "I don't know who Iban Rios is.",
 'Iban Rios is a person.',
 'What is the name of Iban Rios?',
 'Iban Rios is not known.',
 'Iban Rios is not known to the public.',
 'Is Iban Rios?',
 'Iban Rios is who?',
 'who is Iban Rios?']

In [11]:
import random

In [24]:
# Draw one duplicate pair at random, show both questions, then paraphrase the
# first question of the pair.
dt = random.choice(ds)
question_pair = dt["questions"]["text"]
print(question_pair)
context = question_pair[0]
get_response(context, num_return_sequences, num_beams)

['How should you start a career in Machine Learning?', 'How do i know i can start career in machine learning?']


['How should you start working in machine learning?',
 'How should you start your career in machine learning?',
 'How should you get started in machine learning?',
 'How should you make a living in machine learning?',
 'Do you know how to start a career in machine learning?',
 'How should you begin your career in machine learning?',
 'Do you know how to start a machine learning career?',
 'How do you get started in machine learning?',
 'What should you do to start a career in machine learning?',
 'How should you get into machine learning?']