In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub checkpoint used for both the tokenizer and the seq2seq model.
MODEL_NAME = "humarin/chatgpt_paraphraser_on_T5_base"

# Inference is pinned to the CPU. The auto-detecting variant below is kept for
# reference; the original author noted that it always ends up selecting MPS.
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights first, then place the model on the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=3,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128,
):
    """Generate paraphrases of ``question`` with the notebook-level model.

    The text is prefixed with ``paraphrase: ``, tokenized (truncated to
    ``max_length`` tokens), passed to ``model.generate`` and the generated
    sequences are decoded back into strings.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        question: Text to paraphrase; it is interpolated into the prompt
            with an f-string.
        num_beams: Forwarded unchanged to ``model.generate``.
        num_beam_groups: Forwarded unchanged to ``model.generate``.
        num_return_sequences: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.
        diversity_penalty: Forwarded unchanged to ``model.generate``.
        no_repeat_ngram_size: Forwarded unchanged to ``model.generate``.
        temperature: Forwarded unchanged to ``model.generate``.
            NOTE(review): sampling is not enabled here, so this value
            presumably has no effect on the beam search - confirm.
        max_length: Used both as the tokenizer truncation length and as the
            ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded strings produced by ``tokenizer.batch_decode`` with
        special tokens stripped.
    """
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )

    # Place the inputs on whatever device the model lives on, so the function
    # keeps working if the model is moved off the CPU.
    target_device = model.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    outputs = model.generate(
        input_ids,
        # Pass the tokenizer's mask explicitly instead of letting generate()
        # infer one from the token ids.
        attention_mask=attention_mask,
        temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [3]:
# Quick manual check: paraphrase a single negated sentence and display the
# candidates returned by paraphrase() as the cell output.
text = 'An architect is not drawing blueprints for a new building .'
paraphrase(text)



['The architect is not in charge of creating the blueprints for a new building.',
 'A new building blueprints are not being prepared by an architect.',
 'Architects are not in the process of creating blueprints for a new building.']

In [4]:
import pandas as pd
# Input CSV holding the augmented sentence variants.
# NOTE(review): this is an absolute, machine-specific path.
NEGATIONS_CSV = "/Users/raunakpandey/Documents/programming/projects/augmentation/augmented_data/negations_final.csv"

df = pd.read_csv(NEGATIONS_CSV)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,premise,swap_sub_obj,negation,numbers_change,change_nouns1,change_nouns2,change_verbs,paraphrase_negations_0,paraphrase_negations_1,paraphrase_negations_2,paraphrase_negations_3,paraphrase_negations_4
0,0,The cat is sleeping on the windowsill.,The windowsill is sleeping on the cat,The cat is not sleeping on the windowsill .,The cat is sleeping on the windowsill .,The dog is sleeping on the windowsill.,The cat is sleeping on the window-sill.,The cat is doze on the windowsill.,The cat is not slumbering on the windowsill.,But the cat is not sleeping on the windowsill.,The cat is no longer residing on the windowsill.,There is no chance that the cat is sleeping on...,It's not the cat sleeping on a window sill.
1,1,A group of friends is playing soccer in the park.,A park of friends is playing soccer in the group,A group of friends is not playing soccer in th...,A group of friends is playing soccer in the pa...,A grouping of friends is playing soccer in the...,A group of friends is playing football in the ...,A group of friends is understudy soccer in the...,"In the park, a group of friends is not engagin...",A group of friends is not participating in soc...,"Rather than playing soccer in the park, a grou...",There are a few friends who are not playing so...,The park is not the site of a soccer match amo...
2,2,The chef is preparing a delicious meal in the ...,The kitchen is preparing a delicious meal in t...,The chef is not preparing a delicious meal in ...,The chef is preparing a delicious meal in the ...,The cook is preparing a delicious meal in the ...,The chef is preparing a delicious breakfast in...,The chef is de-brief a delicious meal in the k...,The chef is not preparing a tasty meal in the ...,There is no indication that the chef is cookin...,A delicious meal is not being prepared by the ...,It appears that the chef is not cooking up a d...,The chef is not cooking a tasty meal in the ki...
3,3,A bright rainbow stretches across the sky afte...,A bright rain stretches across the sky after t...,A bright rainbow doesn't stretches across the ...,A bright rainbow stretches across the sky afte...,A bright anya stretches across the sky after t...,A bright rainbow stretches across the cloud af...,A bright rainbow bicep across the sky after th...,The sky lacks a clear rainbow after the rain.,"After the rain, the sky lacks a bright rainbow.",A bright rainbow is not visible in the sky aft...,The rain does not lead to a bright rainbow in ...,"Following the rainfall, a bright rainbow does ..."
4,4,The students are listening attentively during ...,The lecture are listening attentively during t...,The students are not listening attentively dur...,The students are listening attentively during ...,The students are listening attentively during ...,The students are listening attentively during ...,The students are talk attentively during the l...,The students are not paying enough attention d...,"During the lecture, the students are not payin...",Students are not paying enough attention durin...,The lecture is being overheard by the students.,Lecture: The students are not paying enough at...


In [5]:
# Remove the leftover 'Unnamed: 0' column (apparently an index saved by an
# earlier to_csv call). Rebinding instead of mutating in place, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
def apply_paraphrase(df, column, seed=None):
    """Paraphrase every value of ``column`` and keep one random candidate.

    ``paraphrase`` is applied to each entry of ``df[column]`` (with a tqdm
    progress bar), and one of the returned candidates is picked at random
    per row. The input frame is not modified; a copy is returned.

    Args:
        df: Source DataFrame.
        column: Name of the column whose values are paraphrased.
        seed: Optional seed for the random pick among candidates. When
            ``None`` (the default) the global ``random`` module state is
            used, as before; when given, a dedicated ``random.Random(seed)``
            makes the selection repeatable without touching global state.

    Returns:
        A copy of ``df`` with an extra ``column + "_grammer"`` column that
        holds the selected paraphrase for each row.
    """
    import random
    from tqdm.auto import tqdm

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()

    # Both the module and a Random instance expose ``choice``.
    chooser = random if seed is None else random.Random(seed)

    df_new = df.copy()
    candidates = df_new[column].progress_apply(paraphrase)
    df_new[column + "_grammer"] = candidates.apply(chooser.choice)
    return df_new

In [7]:
# Paraphrase the 'change_verbs' column; the selected paraphrase for each row
# is stored in a new 'change_verbs_grammer' column of the returned copy.
df_new = apply_paraphrase(df, "change_verbs")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/317 [00:00<?, ?it/s]

100%|██████████| 317/317 [08:32<00:00,  1.62s/it]


In [8]:
# Overwrite the original 'change_verbs' values with the selected paraphrases.
df['change_verbs'] = df_new['change_verbs_grammer']

In [9]:
# Show the column list, non-null counts and dtypes after the replacement.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   premise                 317 non-null    object
 1   swap_sub_obj            317 non-null    object
 2   negation                317 non-null    object
 3   numbers_change          317 non-null    object
 4   change_nouns1           317 non-null    object
 5   change_nouns2           317 non-null    object
 6   change_verbs            317 non-null    object
 7   paraphrase_negations_0  317 non-null    object
 8   paraphrase_negations_1  317 non-null    object
 9   paraphrase_negations_2  317 non-null    object
 10  paraphrase_negations_3  317 non-null    object
 11  paraphrase_negations_4  317 non-null    object
dtypes: object(12)
memory usage: 29.8+ KB


In [10]:
# Persist the updated frame next to the input file.
# NOTE(review): to_csv writes the index by default, so the row index is saved
# as an extra unnamed first column (it shows up as 'Unnamed: 0' when the file
# is read back). Consider index=False if no downstream reader relies on it.
df.to_csv("/Users/raunakpandey/Documents/programming/projects/augmentation/augmented_data/negations_final_grammerCorrection.csv")