# Translating duplicate question pairs from the Quora dataset

Following https://huggingface.co/course/chapter3/3?fw=tf
https://huggingface.co/tuner007/pegasus_paraphrase

In [1]:
from datasets import load_dataset

In [2]:
# Load the Quora question-pairs dataset; each row holds a pair of questions
# plus an `is_duplicate` flag. The call reuses the local cache when present.
dataset = load_dataset("quora")

Using custom data configuration default
Reusing dataset quora (/home/pablo/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


  0%|          | 0/1 [00:00<?, ?it/s]

## Duplicated questions
First, we inspect the question pairs that are flagged as duplicates.

In [3]:
# Peek at the first training row flagged as a duplicate pair (prints nothing
# when no such row exists).
first_duplicate = next(
    (row for row in dataset["train"] if row["is_duplicate"]),
    None,
)
if first_duplicate is not None:
    print(first_duplicate)

{'questions': {'id': [11, 12], 'text': ['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"]}, 'is_duplicate': True}


In [4]:
# Keep only the rows whose two questions are flagged as duplicates of each other.
ds = list(filter(lambda row: row["is_duplicate"], dataset["train"]))

In [5]:
# Number of question pairs that are flagged as duplicates.
len(ds)

149263

In [6]:
# Inspect one duplicate pair (index 7) to see the row structure.
ds[7]

{'questions': {'id': [37, 38],
  'text': ['Why are so many Quora users posting questions that are readily answered on Google?',
   'Why do people ask Quora questions which can be answered easily by Google?']},
 'is_duplicate': True}

## Loading the model


In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# The tokenizer and the seq2seq weights must come from the same checkpoint,
# so the checkpoint name is spelled out once and reused.
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/783k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

In [9]:
# Inference is pinned to the CPU. NOTE(review): to pick a GPU automatically,
# `torch` would first have to be imported (it is not imported in this notebook)
# and the device set to 'cuda' if torch.cuda.is_available() else 'cpu'.
torch_device = "cpu"
# Move the model to the chosen device; the tokenized batches are moved to the
# same device inside get_response.
model = model.to(torch_device)

In [8]:
def get_response(input_text, num_return_sequences, num_beams, max_length=60):
    """Run one text through the globally loaded seq2seq model with beam search.

    Relies on the notebook-level globals ``tokenizer``, ``model`` and
    ``torch_device``.

    Args:
        input_text: The source sentence (a single string).
        num_return_sequences: How many candidate outputs to return.
        num_beams: Beam width used by ``model.generate``.
        max_length: Token limit applied both when truncating the input and
            when generating the output. Defaults to 60, the value previously
            hard-coded in both places.

    Returns:
        A list of decoded output strings with special tokens stripped.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding="longest",
        max_length=max_length,
        return_tensors="pt",
    ).to(torch_device)
    # No `temperature` here: it only takes effect when sampling is enabled
    # (`do_sample=True`). With plain beam search it was silently ignored and
    # merely caused a warning in recent transformers releases.
    translated = model.generate(
        **batch,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

## Testing the model

In [10]:
# Generation settings; later cells reuse these two names.
num_beams, num_return_sequences = 10, 1

# Quick sanity check on a single hand-written question.
context = "who is Iban Rios?"
get_response(
    context,
    num_return_sequences=num_return_sequences,
    num_beams=num_beams,
)

['¿Quién es Iban Ríos?']

In [11]:
import random

In [20]:
# Draw one duplicate pair at random and show its two original questions.
dt = random.choice(ds)
question_pair = dt["questions"]["text"]
print(question_pair)

# Translate both questions of the pair with the same generation settings.
r1, r2 = [
    get_response(question_pair[position], num_return_sequences, num_beams)
    for position in (0, 1)
]
print(r1, r2)

['Where can you find out what needs to be improved if your question was marked for needing revision?', 'Is there a way on Quora to ask why a specific question was marked as needs improvement?']
['¿Dónde puede usted averiguar lo que necesita ser mejorado si su pregunta fue marcada para la necesidad de revisión?'] ['¿Hay alguna manera en Quora de preguntar por qué una pregunta específica fue marcada como mejora de las necesidades?']
