In [41]:
# imports
from transformers import pipeline
from tqdm import tqdm
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

In [42]:
# Paraphrasing model: the Pegasus checkpoint that the later cells use
# to rephrase the dataset.
model_name = 'tuner007/pegasus_paraphrase'

# Pick the device the model will run on: the GPU when CUDA is available,
# otherwise the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Load the tokenizer for the checkpoint (fetched if it is not already
# available locally).
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the model for the checkpoint (fetched if it is not already available
# locally), then move it onto the selected device.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

In [43]:
def get_response(input_text,num_return_sequences,num_beams):
    """Rephrase a single piece of text with the module-level Pegasus model.

    Relies on the globals ``tokenizer``, ``model`` and ``torch_device``.

    Args:
        input_text: The text to be rephrased. It is tokenized as a batch of
            one, truncated to at most 60 tokens.
        num_return_sequences: Passed to ``model.generate`` as the number of
            sequences to return for the input.
        num_beams: Passed to ``model.generate`` as the number of beams.

    Returns:
        The decoded output of ``tokenizer.batch_decode`` for the generated
        sequences, with special tokens stripped.
    """
    # Tokenize as a one-element batch and move the tensors to the model's device.
    encoded = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=60,
        return_tensors="pt",
    )
    encoded = encoded.to(torch_device)

    # NOTE(review): temperature is passed without enabling sampling; whether it
    # influences the result depends on the generation mode -- confirm.
    generation_options = dict(
        max_length=60,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    generated_ids = model.generate(**encoded, **generation_options)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [39]:
def expand(txt_fpath, encoding=None):
    """Append a paraphrase of every non-blank line of a text file to that file.

    Each non-blank line is passed to ``get_response(line, 1, 1)`` and the
    first returned paraphrase is appended to the same file, one per line.

    Args:
        txt_fpath: Path of the text file to expand in place.
        encoding: Text encoding used for both reading and appending.
            ``None`` (the default) uses the platform default encoding.

    Notes:
        * Blank lines (including the empty string produced by a trailing
          newline) are skipped instead of being sent to the model.
        * The file's existing convention is kept: a separating newline is
          written first only when the file does not already end with one,
          and a trailing newline is written only when the file had one.
    """
    # Read the whole file up front; the context manager closes the handle
    # even if reading fails.
    with open(txt_fpath, "r", encoding=encoding) as source:
        content = source.read()

    ends_with_newline = content.endswith("\n")
    # Whitespace-only lines carry nothing to paraphrase.
    lines = [line for line in content.split("\n") if line.strip()]

    expanded = [get_response(line, 1, 1)[0] for line in tqdm(lines)]

    print("finished expanding")

    with open(txt_fpath, "a", encoding=encoding) as target:
        # A separator is needed before the first paraphrase only when the
        # existing content does not already end with a newline.
        needs_separator = bool(content) and not ends_with_newline
        for paraphrase in tqdm(expanded):
            if needs_separator:
                target.write("\n")
            target.write(paraphrase)
            needs_separator = True
        if expanded and ends_with_newline:
            target.write("\n")

    print("finished writing")

In [40]:
# Expand the test file in place. The function defined in this notebook is
# named `expand`; `expand_txt` is not defined in any cell shown here.
expand(r"C:\Users\user\Desktop\simulation\test.txt")

100%|██████████| 200/200 [02:53<00:00,  1.15it/s]


finished expanding


100%|██████████| 200/200 [00:00<?, ?it/s]

finished writing



