# paraphrase

In [48]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer that matches the paraphraser checkpoint.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

# Inference is pinned to the CPU. Automatic selection is left disabled
# because it always resolved to MPS:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = "cpu"

# Load the seq2seq weights first, then place them on the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = model.to(device)


In [49]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the notebook-level ``tokenizer`` and ``model``.

    The text is prefixed with ``'paraphrase: '``, tokenized (truncated to
    ``max_length`` tokens) and handed to ``model.generate``. The generated
    sequences are decoded with special tokens stripped.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Forwarded to ``model.generate``; the number of
            sequences requested per input.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
            NOTE(review): no ``do_sample`` flag is passed, and temperature
            usually only matters when sampling — confirm it has any effect.
        max_length: Used both as the tokenizer truncation length and as the
            ``max_length`` passed to ``model.generate``.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode``.
    """
    encoded = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the weights so the function keeps
    # working when the model is moved off the CPU.
    encoded = encoded.to(model.device)

    outputs = model.generate(
        encoded.input_ids,
        # Forward the tokenizer's mask explicitly instead of discarding it.
        attention_mask=encoded.attention_mask,
        temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


In [50]:
# Smoke-test the paraphraser on one negated sentence; the cell displays the
# list returned by paraphrase().
text = 'An architect is not drawing blueprints for a new building .'
paraphrase(text)




['The architect is not in charge of creating the blueprints for a new building.',
 'A new building blueprints are not being prepared by an architect.',
 'Architects are not in the process of creating blueprints for a new building.',
 'There is no architect who has designed the blueprints for a new building.',
 'An architect is refraining from creating plans for a new building.']

In [52]:
import pandas as pd
import numpy as np
# Load the contradictions dataset and preview its first rows.
# NOTE(review): absolute, user-specific path — this cell only runs on a
# machine with the same directory layout.
df = pd.read_csv('/Users/raunakpandey/Documents/programming/projects/augmentation/Contradictions.csv')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,premise,label,swap_sub_obj,negation,numbers_change,change_nouns1,change_nouns2,change_verbs
0,0,0,The cat is sleeping on the windowsill.,1,The windowsill is sleeping on the cat,The cat is not sleeping on the windowsill .,The cat is sleeping on the windowsill .,The dog is sleeping on the windowsill.,The cat is sleeping on the window-sill.,The cat is doze on the windowsill.
1,1,1,A group of friends is playing soccer in the park.,1,A park of friends is playing soccer in the group,A group of friends is not playing soccer in th...,A group of friends is playing soccer in the pa...,A grouping of friends is playing soccer in the...,A group of friends is playing football in the ...,A group of friends is understudy soccer in the...
2,2,2,The chef is preparing a delicious meal in the ...,1,The kitchen is preparing a delicious meal in t...,The chef is not preparing a delicious meal in ...,The chef is preparing a delicious meal in the ...,The cook is preparing a delicious meal in the ...,The chef is preparing a delicious breakfast in...,The chef is de-brief a delicious meal in the k...
3,3,3,A bright rainbow stretches across the sky afte...,1,A bright rain stretches across the sky after t...,A bright rainbow doesn't stretches across the ...,A bright rainbow stretches across the sky afte...,A bright anya stretches across the sky after t...,A bright rainbow stretches across the cloud af...,A bright rainbow bicep across the sky after th...
4,4,4,The students are listening attentively during ...,1,The lecture are listening attentively during t...,The students are not listening attentively dur...,The students are listening attentively during ...,The students are listening attentively during ...,The students are listening attentively during ...,The students are talk attentively during the l...


In [58]:
# Remove the two leftover 'Unnamed' columns in one step (presumably index
# columns from earlier CSV round-trips — TODO confirm).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

## for negations

In [62]:
def apply_paraphrase(df):
    """Return a copy of ``df`` with five paraphrases of its ``'negation'`` column.

    ``paraphrase`` is applied to every value of ``df['negation']`` (with a
    tqdm progress bar) and the i-th returned string is stored in
    ``paraphrase_negations_{i}`` for i in 0..4.

    The caller's frame is left untouched; the new columns are added to a copy.

    Args:
        df: DataFrame containing a ``'negation'`` column.

    Returns:
        A new DataFrame: ``df`` plus ``paraphrase_negations_0`` ..
        ``paraphrase_negations_4``.
    """
    from tqdm.auto import tqdm

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()

    # Work on a copy so the input frame is not mutated as a side effect.
    df_new = df.copy()
    outputs = df_new['negation'].progress_apply(paraphrase)

    # Five columns: paraphrase() returns num_return_sequences=5 strings by default.
    for i in range(5):
        df_new[f'paraphrase_negations_{i}'] = outputs.apply(lambda x, i=i: x[i])
    return df_new

In [63]:
# Paraphrase the 'negation' column of the full dataset. Slow: the recorded run
# took roughly 8.5 minutes for 317 rows.
df_paraphrased = apply_paraphrase(df)

100%|██████████| 317/317 [08:25<00:00,  1.59s/it]


In [64]:
# Preview the first rows, including the new paraphrase_negations_* columns.
df_paraphrased.head()

Unnamed: 0,premise,label,swap_sub_obj,negation,numbers_change,change_nouns1,change_nouns2,change_verbs,paraphrase_negations_0,paraphrase_negations_1,paraphrase_negations_2,paraphrase_negations_3,paraphrase_negations_4
0,The cat is sleeping on the windowsill.,1,The windowsill is sleeping on the cat,The cat is not sleeping on the windowsill .,The cat is sleeping on the windowsill .,The dog is sleeping on the windowsill.,The cat is sleeping on the window-sill.,The cat is doze on the windowsill.,The cat is not slumbering on the windowsill.,But the cat is not sleeping on the windowsill.,The cat is no longer residing on the windowsill.,There is no chance that the cat is sleeping on...,It's not the cat sleeping on a window sill.
1,A group of friends is playing soccer in the park.,1,A park of friends is playing soccer in the group,A group of friends is not playing soccer in th...,A group of friends is playing soccer in the pa...,A grouping of friends is playing soccer in the...,A group of friends is playing football in the ...,A group of friends is understudy soccer in the...,"In the park, a group of friends is not engagin...",A group of friends is not participating in soc...,"Rather than playing soccer in the park, a grou...",There are a few friends who are not playing so...,The park is not the site of a soccer match amo...
2,The chef is preparing a delicious meal in the ...,1,The kitchen is preparing a delicious meal in t...,The chef is not preparing a delicious meal in ...,The chef is preparing a delicious meal in the ...,The cook is preparing a delicious meal in the ...,The chef is preparing a delicious breakfast in...,The chef is de-brief a delicious meal in the k...,The chef is not preparing a tasty meal in the ...,There is no indication that the chef is cookin...,A delicious meal is not being prepared by the ...,It appears that the chef is not cooking up a d...,The chef is not cooking a tasty meal in the ki...
3,A bright rainbow stretches across the sky afte...,1,A bright rain stretches across the sky after t...,A bright rainbow doesn't stretches across the ...,A bright rainbow stretches across the sky afte...,A bright anya stretches across the sky after t...,A bright rainbow stretches across the cloud af...,A bright rainbow bicep across the sky after th...,The sky lacks a clear rainbow after the rain.,"After the rain, the sky lacks a bright rainbow.",A bright rainbow is not visible in the sky aft...,The rain does not lead to a bright rainbow in ...,"Following the rainfall, a bright rainbow does ..."
4,The students are listening attentively during ...,1,The lecture are listening attentively during t...,The students are not listening attentively dur...,The students are listening attentively during ...,The students are listening attentively during ...,The students are listening attentively during ...,The students are talk attentively during the l...,The students are not paying enough attention d...,"During the lecture, the students are not payin...",Students are not paying enough attention durin...,The lecture is being overheard by the students.,Lecture: The students are not paying enough at...


## for premise

In [65]:
def apply_paraphrase(df, column):
    """Return a copy of ``df`` extended with five paraphrases of ``column``.

    ``paraphrase`` is run on every value of ``df[column]`` (with a tqdm
    progress bar). The k-th string of each result is stored in
    ``paraphrase_{column}_{k}`` for k in 0..4. ``df`` itself is not modified.

    Args:
        df: Source DataFrame.
        column: Name of the text column to paraphrase.

    Returns:
        A new DataFrame holding the original columns plus the five
        paraphrase columns.
    """
    from tqdm.auto import tqdm

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()
    candidates = df[column].progress_apply(paraphrase)

    def nth_candidate(rank):
        # Pull the rank-th paraphrase out of every row's result list.
        return candidates.apply(lambda generated: generated[rank])

    extra_columns = {
        f'paraphrase_{column}_{rank}': nth_candidate(rank) for rank in range(5)
    }
    # assign() builds a new frame, so the caller's ``df`` stays untouched.
    return df.assign(**extra_columns)

In [66]:
# Add premise paraphrases on top of the frame that already carries the
# negation paraphrases. Slow: the recorded run took about 8 minutes for 317 rows.
df_premise_neg_para = apply_paraphrase(df_paraphrased, 'premise')

100%|██████████| 317/317 [07:59<00:00,  1.51s/it]


In [86]:
# Paraphrase the 'negation' column again, this time through the two-argument
# helper. Slow: the recorded run took about 8.5 minutes for 317 rows.
df_negations_para = apply_paraphrase(df, 'negation')

100%|██████████| 317/317 [08:34<00:00,  1.62s/it]


In [87]:
# Show only the last five columns of the first five rows.
df_negations_para.iloc[:5,-5:]

Unnamed: 0,paraphrase_negations_0,paraphrase_negations_1,paraphrase_negations_2,paraphrase_negations_3,paraphrase_negations_4
0,The cat is not slumbering on the windowsill.,But the cat is not sleeping on the windowsill.,The cat is no longer residing on the windowsill.,There is no chance that the cat is sleeping on...,It's not the cat sleeping on a window sill.
1,"In the park, a group of friends is not engagin...",A group of friends is not participating in soc...,"Rather than playing soccer in the park, a grou...",There are a few friends who are not playing so...,The park is not the site of a soccer match amo...
2,The chef is not preparing a tasty meal in the ...,There is no indication that the chef is cookin...,A delicious meal is not being prepared by the ...,It appears that the chef is not cooking up a d...,The chef is not cooking a tasty meal in the ki...
3,The sky lacks a clear rainbow after the rain.,"After the rain, the sky lacks a bright rainbow.",A bright rainbow is not visible in the sky aft...,The rain does not lead to a bright rainbow in ...,"Following the rainfall, a bright rainbow does ..."
4,The students are not paying enough attention d...,"During the lecture, the students are not payin...",Students are not paying enough attention durin...,The lecture is being overheard by the students.,Lecture: The students are not paying enough at...


In [84]:
# Persist the frame that actually holds the premise paraphrases.
# `df_paraphrased` only carries the negation paraphrases, so writing it to
# premise_para.csv would leave the premise paraphrases unsaved.
df_premise_neg_para.to_csv('/Users/raunakpandey/Documents/programming/projects/augmentation/premise_para.csv')

In [88]:
# Write the negation paraphrases to disk (the DataFrame index is written too,
# since index=False is not passed).
# NOTE(review): absolute, user-specific output path.
df_negations_para.to_csv('/Users/raunakpandey/Documents/programming/projects/augmentation/negations_final.csv')

## for hypothesis

In [92]:
# Load the premise/hypothesis/label samples used for hypothesis augmentation.
# NOTE(review): absolute, user-specific path.
df_hypothesis = pd.read_csv("/Users/raunakpandey/Documents/programming/projects/augmentation/cgptsamples-1.csv")

In [93]:
# Preview the first rows of the hypothesis dataset.
df_hypothesis.head()

Unnamed: 0,premise,hypothesis,label
0,The cat is sleeping on the windowsill.,The cat is taking a nap.,1
1,A group of friends is playing soccer in the park.,People are enjoying the outdoors.,1
2,The chef is preparing a delicious meal in the ...,A meal is being cooked.,1
3,A bright rainbow stretches across the sky afte...,The rain has stopped.,1
4,The students are listening attentively during ...,The teacher is giving a presentation.,1


In [95]:
# Paraphrase the 'hypothesis' column. The recorded run took about 4 minutes
# for 317 rows.
df_hypothesis_para = apply_paraphrase(df_hypothesis, 'hypothesis')

100%|██████████| 317/317 [03:52<00:00,  1.36it/s]


In [96]:
# Preview the first rows together with the generated paraphrase columns.
df_hypothesis_para.head()

Unnamed: 0,premise,hypothesis,label,paraphrase_negations_0,paraphrase_negations_1,paraphrase_negations_2,paraphrase_negations_3,paraphrase_negations_4
0,The cat is sleeping on the windowsill.,The cat is taking a nap.,1,The cat is taking a break for sleep.,Sleeping time: The cat is enjoying a good nap.,This is the cat taking a nap.,It's sleeping in.,The feline is currently sleeping.
1,A group of friends is playing soccer in the park.,People are enjoying the outdoors.,1,People are fond of being outside.,The outdoors are being enjoyed by people.,Those who like to be outdoors are in the mood.,People have a passion for being outside.,Outdoor activities are favored by many people.
2,The chef is preparing a delicious meal in the ...,A meal is being cooked.,1,A meal is being prepared.,The preparation of food is underway.,Cooking a meal is currently in progress.,It is the meal that is being prepared.,We are currently preparing a meal.
3,A bright rainbow stretches across the sky afte...,The rain has stopped.,1,The rain has ceased.,Rainfall is now non-existent.,There is no more rain on the ground.,It has stopped raining.,We have ceased the rain.
4,The students are listening attentively during ...,The teacher is giving a presentation.,1,The teacher is presenting.,A lecturer is presenting.,Teacher: The teacher is presenting.,There is a presentation from the teacher.,The educator is giving a talk.


In [99]:
# Relabel the five generated columns from the 'negations' prefix to the
# 'hypothesis' prefix, in place. Labels that are not present are left alone.
df_hypothesis_para.rename(
    columns={
        f'paraphrase_negations_{idx}': f'paraphrase_hypothesis_{idx}'
        for idx in range(5)
    },
    inplace=True,
)

In [100]:
# Check the column names after the rename.
df_hypothesis_para.head()

Unnamed: 0,premise,hypothesis,label,paraphrase_hypothesis_0,paraphrase_hypothesis_1,paraphrase_hypothesis_2,paraphrase_hypothesis_3,paraphrase_hypothesis_4
0,The cat is sleeping on the windowsill.,The cat is taking a nap.,1,The cat is taking a break for sleep.,Sleeping time: The cat is enjoying a good nap.,This is the cat taking a nap.,It's sleeping in.,The feline is currently sleeping.
1,A group of friends is playing soccer in the park.,People are enjoying the outdoors.,1,People are fond of being outside.,The outdoors are being enjoyed by people.,Those who like to be outdoors are in the mood.,People have a passion for being outside.,Outdoor activities are favored by many people.
2,The chef is preparing a delicious meal in the ...,A meal is being cooked.,1,A meal is being prepared.,The preparation of food is underway.,Cooking a meal is currently in progress.,It is the meal that is being prepared.,We are currently preparing a meal.
3,A bright rainbow stretches across the sky afte...,The rain has stopped.,1,The rain has ceased.,Rainfall is now non-existent.,There is no more rain on the ground.,It has stopped raining.,We have ceased the rain.
4,The students are listening attentively during ...,The teacher is giving a presentation.,1,The teacher is presenting.,A lecturer is presenting.,Teacher: The teacher is presenting.,There is a presentation from the teacher.,The educator is giving a talk.


In [101]:
# Write the hypothesis paraphrases to disk (the DataFrame index is written
# too, since index=False is not passed).
# NOTE(review): absolute, user-specific output path; to_csv does not create
# the augmented_data/ directory, so it must already exist.
df_hypothesis_para.to_csv('/Users/raunakpandey/Documents/programming/projects/augmentation/augmented_data/hypothesis_para.csv')