In [None]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from google.colab import files
from google.colab import drive
import pandas as pd 

In [None]:
# Open the Google Colab file-upload dialog.
# Select all files the notebook needs here (e.g. the input CSV).
# If the files were already uploaded, just press 'Cancel Upload'.
# `uploaded` stores whatever files.upload() returns; the cells shown in this
# notebook do not read it again.
uploaded = files.upload()

In [None]:
# Directory that contains the data files.
# File names are appended with plain string concatenation (path + file),
# so the value must end with a path separator.
# On a local machine use a relative path, for example
# path = 'NLP labelled data preview/english set/'
# On Google Colab use
# '/content/'
path = '/content/'

In [None]:
# Load the dataset into a DataFrame.
# The later cells read the columns 'Comment', 'Label' and 'Score' from it.
file = 'combined_original_90_to_95.csv'
all_comments = pd.read_csv(path + file)

In [None]:

# Equalise the number of rows with label 0 and label 2 before augmentation.
# The surplus rows of the larger class are set aside in `removed_rows_df`
# (they are not paraphrased), so both labels receive the same number of
# generated comments; the set-aside rows can be appended back afterwards.
#
# NOTE: this cell reassigns `all_comments`. Reload the CSV before re-running
# it, otherwise the second run sees an already balanced frame and
# `removed_rows_df` ends up empty.

# Fixed seed so the same rows are set aside on every fresh run.
SEED = 42

label_counts = all_comments['Label'].value_counts()
# .get() avoids a KeyError when one of the two labels does not occur at all.
count_label_0 = int(label_counts.get(0, 0))
count_label_2 = int(label_counts.get(2, 0))

# Ties are resolved as before: label 2 is treated as the larger class,
# which then simply has a surplus of zero rows.
majority_label = 0 if count_label_0 > count_label_2 else 2
diff = abs(count_label_0 - count_label_2)

# Draw all surplus rows in a single call (sampling without replacement).
# With diff == 0 this yields an empty frame that still has the right columns,
# where concatenating an empty list of sampled rows would raise a ValueError.
surplus_rows = all_comments[all_comments['Label'] == majority_label].sample(
    n=diff, random_state=SEED
)

# Remove the set-aside rows from the frame that will be augmented.
all_comments = all_comments.drop(surplus_rows.index)

# Set-aside rows with a fresh 0..n-1 index.
removed_rows_df = surplus_rows.reset_index(drop=True)



In [None]:
# Tokenizer and model are both loaded from the same pretrained checkpoint
checkpoint = 'tuner007/pegasus_paraphrase'
tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
model = PegasusForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

In [None]:
def get_response(input_text, num_return_sequences, num_beams, max_length=60):
    """Generate paraphrases of a single text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_text: The text to paraphrase. It is tokenized with truncation,
            so anything beyond ``max_length`` tokens is cut off.
        num_return_sequences: How many generated sequences to return.
        num_beams: Number of beams used by the generation call.
        max_length: Token limit applied to both the tokenized input and the
            generated output. Defaults to 60, the value that was previously
            hard-coded.

    Returns:
        The decoded generated sequences, with special tokens skipped.
    """
    batch = tokenizer(
        [input_text],
        truncation=True,
        padding='longest',
        max_length=max_length,
        return_tensors="pt",
    ).to(device)
    # NOTE(review): temperature is passed without do_sample=True; in recent
    # transformers releases it presumably only affects sampling-based decoding,
    # so it may be a no-op here - confirm before relying on it.
    translated = model.generate(
        **batch,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature=1.5,
    )
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [None]:
def expand_comments(df, num_return_sequences, num_beams):
    """Build a new DataFrame in which every comment is followed by its paraphrases.

    For each row of ``df`` the original comment is emitted first, directly
    followed by every text returned by
    ``get_response(comment, num_return_sequences, num_beams)``. All of these
    rows carry the 'Label' and 'Score' of the row they were derived from.
    Only the columns 'Comment', 'Label' and 'Score' are carried over.

    Returns:
        A DataFrame with a fresh 0..n-1 index.
    """
    records = []

    for comment, label, score in zip(df['Comment'], df['Label'], df['Score']):
        paraphrases = get_response(comment, num_return_sequences, num_beams)

        # Original text first, then its generated variants right underneath
        for text in [comment, *paraphrases]:
            records.append({'Comment': text, 'Label': label, 'Score': score})

    return pd.DataFrame(records).reset_index(drop=True)

In [None]:
# Generation settings
num_beams = 10  # beam search parameter: a search strategy used to generate sequences in language generation tasks
num_return_sequences = 2  # how many different sequences to generate per comment

# Paraphrase every comment in all_comments
comments_augmented = expand_comments(
    all_comments,
    num_return_sequences=num_return_sequences,
    num_beams=num_beams,
)



In [None]:
# Combine the augmented comments with the rows that were set aside earlier.
# The set-aside rows are appended unchanged after the augmented ones, and
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
all_comments_augmented = pd.concat([comments_augmented, removed_rows_df], ignore_index=True)

In [None]:
# Write the combined frame to a CSV file in the data directory.
# index=True also writes the DataFrame index as a column of the file.
all_comments_augmented.to_csv(path + 'combined_original_90_to_95_augmented.csv', index=True)


In [None]:
# Download the augmented CSV to your local machine (Google Colab only).
# The file name must match the one used when the CSV was written.
files.download(path + 'combined_original_90_to_95_augmented.csv')