In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# Read both splits from the Kaggle input directory in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/ire-major-data-v2/train_df.csv',
        '../input/ire-major-data-v2/test_set_with_paraphrases.csv',
    )
)

In [3]:
# Peek at the first two training rows to confirm the column layout.
train_df.head(n=2)

Unnamed: 0.1,Unnamed: 0,SNO,Intensity,Text,label,#tokens
0,0,839551577044234245,0.533333,Perspective: These agencies helped make Americ...,clickbait,26
1,1,844843703847944192,0.133333,Seven arrests made in raids after London attac...,no-clickbait,21


In [4]:
# Peek at the first two test rows to confirm the column layout.
test_df.head(n=2)

Unnamed: 0.1,Unnamed: 0,SNO,Intensity,Text,label,#tokens,Paraphrased_Text,Paraphrased_Text_Pegasus
0,0,833654544412340224,0.0,Abu Dhabi awards China’s CNPC a stake in its l...,no-clickbait,25,Abu Dhabi awarded China's CNPC a stake in its ...,The largest oil concession in Abu Dhabi has a ...
1,1,821482229213728768,0.066667,Former CIA rendition agent issues plea to Dona...,no-clickbait,17,Former CIA rendition agent issues plea to Dona...,The former CIA rendition agent is facing jail ...


In [5]:
# Report how many rows each split contains.
n_train_rows = len(train_df)
n_test_rows = len(test_df)
print('Length of train set\t:', n_train_rows)
print('Length of test set\t:', n_test_rows)

Length of train set	: 17497
Length of test set	: 4337


In [6]:
# Class balance of the training labels.
train_df.loc[:, 'label'].value_counts()

no-clickbait    13221
clickbait        4276
Name: label, dtype: int64

## Paraphrasing Using Pegasus

In [7]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Hugging Face checkpoint used for paraphrase generation in this notebook.
model_name = 'tuner007/pegasus_paraphrase'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Tokenizer and model are loaded from the same checkpoint; the model is then
# moved to the selected device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [8]:
def get_response(input_text,num_return_sequences,num_beams,max_length=50):
    """Generate paraphrase candidates for one sentence with the Pegasus model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``torch_device``.

    Parameters
    ----------
    input_text : str
        Sentence to paraphrase; it is truncated to ``max_length`` tokens.
    num_return_sequences : int
        Number of decoded candidates to return.
    num_beams : int
        Beam width forwarded to ``model.generate``.
    max_length : int, default 50
        Token limit applied to both the tokenized input and the generated
        output (previously hard-coded to 50 in both places).

    Returns
    -------
    list of str
        Decoded candidates with special tokens removed.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=max_length,
        return_tensors="pt",
    ).to(torch_device)

    # `temperature` is intentionally not passed: it only rescales logits in
    # sampling-based decoding, and sampling is not enabled in this call, so the
    # former `temperature=1.5` had no effect on the beam-search output and only
    # produces warnings on newer transformers releases.
    translated = model.generate(
        **batch,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [9]:
def paraphrase(text, num_return_sequences=5, num_beams=10):
    """Return paraphrase candidates for ``text``.

    Thin wrapper around ``get_response`` that supplies the decoding settings.

    Parameters
    ----------
    text : str
        Sentence to paraphrase.
    num_return_sequences : int, default 5
        Number of candidates to request.
    num_beams : int, default 10
        Beam width to request.

    Returns
    -------
    list of str
        Whatever ``get_response`` returns for these settings.
    """
    return get_response(text, num_return_sequences, num_beams)

### Train

In [10]:
# Accumulators: one entry per generated paraphrase, with the source row's
# text / intensity / label repeated alongside it.
original_sent, original_intensity, original_label, paraphrased_sent = [], [], [], []

# Pull the three source columns out once; rows are then read by position.
train_texts = train_df['Text']
train_intensities = train_df['Intensity']
train_labels = train_df['label']

for row_pos in tqdm(range(len(train_df))):
    source_text = train_texts.iloc[row_pos]
    source_intensity = train_intensities.iloc[row_pos]
    source_label = train_labels.iloc[row_pos]

    candidates = paraphrase(source_text)
    n_candidates = len(candidates)

    paraphrased_sent.extend(candidates)
    original_sent.extend([source_text] * n_candidates)
    original_intensity.extend([source_intensity] * n_candidates)
    original_label.extend([source_label] * n_candidates)

100%|██████████| 17497/17497 [2:12:33<00:00,  2.20it/s]


In [11]:
# Assemble one row per paraphrase: source sentence, its intensity and label,
# followed by the generated paraphrase.
train_output_columns = ['Actual_Sentence','Actual_Intensity','Actual_Label','Paraphrased_Sentence']
train_output_rows = list(zip(original_sent, original_intensity, original_label, paraphrased_sent))
out_train_df = pd.DataFrame(train_output_rows, columns=train_output_columns)

In [12]:
# Persist the paraphrased training set to the working directory.
# NOTE(review): the row index is written as an unnamed first column (the
# default); it will come back as 'Unnamed: 0' on read_csv. Consider
# index=False if no downstream consumer relies on that column.
out_train_df.to_csv(path_or_buf='train_paraphrased_output.csv', index=True)

### Test

In [13]:
# Fresh accumulators for the test split: one entry per generated paraphrase,
# with the source row's text / intensity / label repeated alongside it.
original_sent, original_intensity, original_label, paraphrased_sent = [], [], [], []

# Pull the three source columns out once; rows are then read by position.
test_texts = test_df['Text']
test_intensities = test_df['Intensity']
test_labels = test_df['label']

for row_pos in tqdm(range(len(test_df))):
    source_text = test_texts.iloc[row_pos]
    source_intensity = test_intensities.iloc[row_pos]
    source_label = test_labels.iloc[row_pos]

    candidates = paraphrase(source_text)
    n_candidates = len(candidates)

    paraphrased_sent.extend(candidates)
    original_sent.extend([source_text] * n_candidates)
    original_intensity.extend([source_intensity] * n_candidates)
    original_label.extend([source_label] * n_candidates)

100%|██████████| 4337/4337 [33:36<00:00,  2.15it/s]


In [14]:
# Assemble one row per paraphrase: source sentence, its intensity and label,
# followed by the generated paraphrase.
test_output_columns = ['Actual_Sentence','Actual_Intensity','Actual_Label','Paraphrased_Sentence']
test_output_rows = list(zip(original_sent, original_intensity, original_label, paraphrased_sent))
out_test_df = pd.DataFrame(test_output_rows, columns=test_output_columns)

In [15]:
# Persist the paraphrased test set to the working directory.
# NOTE(review): the row index is written as an unnamed first column (the
# default); it will come back as 'Unnamed: 0' on read_csv. Consider
# index=False if no downstream consumer relies on that column.
out_test_df.to_csv(path_or_buf='test_paraphrased_output.csv', index=True)