# ParaphraseAugmentation

Here we experiment with augmenting the dataset with paraphrases in order to see if the paraphrased options still yield similar stereotypical biases.

Two approaches were attempted:
+ An LLM-based approach
+ The Parrot paraphraser

Because the generated possible image descriptions follow a relatively canonical structure, we can use **instruction-tuned LLMs** for this task too. Linguistic information can additionally be fed in during prompt creation.

## Main Code

### Preliminaries

In [None]:
# Install boto3 (AWS SDK for Python, used below to create the Amazon Bedrock client)
%pip install boto3

In [None]:
%pip install python-dotenv

In [28]:
# Declare Imports
import os, sys, json
import tabulate
import pandas as pd
pd.set_option('display.max_columns', None)

In [200]:
import sys
sys.path.append("../")

In [211]:
from importlib import reload
import utils.utils as utils
reload(utils)
from utils.utils import \
    KVCache

In [202]:
# Create some relevant folders for data persistence
os.makedirs("./data/augmented", exist_ok=True)

In [283]:
# Define some paths (e.g. to load, save data)

# Remote location of the VLStereoSet CSV; it is downloaded into ./data below.
DATASET_URL = "https://raw.githubusercontent.com/K-Square-00/VLStereo/refs/heads/main/data/VLStereoSet.csv"

# Model identifier passed as `modelId` when invoking the Bedrock model.
# NOTE(review): this first value is immediately overwritten by the assignment below.
MODEL = "meta.llama3-3-70b-instruct-v1:0"

# Effective model id (the "us."-prefixed variant).
# NOTE(review): presumably a region-scoped inference profile id — confirm.
MODEL = "us.meta.llama3-3-70b-instruct-v1:0"

# Destination of the augmented dataset CSV written at the end of the notebook.
DATASET_SAVE_PATH = "./data/VLStereoSet_augm.csv"

# When True, the augmentation loop only processes the first 3 rows.
DEBUG = True

### Download Dataset

In [72]:
import requests

# Download a file and store it in ./data
def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    The request is performed, and its HTTP status checked, before the target
    file is opened. A failed download therefore neither truncates an existing
    file nor stores an HTTP error page as if it were the dataset.

    Args:
        url: Address of the resource to fetch.
        filename: Path the downloaded bytes are written to (binary mode).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(filename, "wb") as file:
        file.write(response.content)

download_file(DATASET_URL, f"data/{ DATASET_URL.split('/')[-1] }")

In [73]:
# Load the downloaded CSV, rename the misspelled "Imaeg URL" column to
# "image_url" and drop the "Unnamed: 8" column.
df = pd.read_csv(f"data/{ DATASET_URL.split('/')[-1] }")
df = df.rename(columns={"Imaeg URL": "image_url"}).drop(columns=["Unnamed: 8"])

In [74]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")

In [None]:
df["stereotype"]

In [None]:
df["anti-stereotype"]

In [None]:
df.unrelated

### Discard samples with unavailable images (using the KV-Cache)

In [212]:
kv = KVCache("../BiasConventionality/cache.pickle")

In [None]:
# Keep only the rows whose image URL has an entry in the KV-cache.
# `.copy()` makes the result an independent frame, so columns can be added to
# it later without pandas treating the assignment as a write to a slice.
df = df[df["image_url"].isin(kv.cache.keys())].copy()  # discard 396 rows

### Build the dataset

In [218]:
# Imports for calling the LLM through Amazon Bedrock
import boto3
import json
from dotenv import load_dotenv

In [243]:
import pprint

PrettyPrinter = pprint.PrettyPrinter(indent=4)

In [None]:
load_dotenv()

In [78]:
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

In [220]:
# Create the Amazon Bedrock runtime client used for all model invocations
bedrock_client = boto3.client(service_name="bedrock-runtime", region_name="us-east-1")

In [None]:
bedrock_client

In [None]:
sentence = df.iloc[10]["stereotype"]

PrettyPrinter.pprint(sentence)

In [276]:

def get_prompt1(sentence):
    """Build the first-turn chat prompt that requests paraphrases of ``sentence``.

    The prompt consists of a system turn, a user turn asking for three
    paraphrases of the most pertinent noun phrase of ``sentence``, and an open
    assistant header for the model to continue from.

    Args:
        sentence: The statement to be paraphrased; interpolated verbatim.

    Returns:
        The complete prompt string.
    """
    return f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful system generating paraphrases.<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Generate three paraphrases of the most pertinent noun phrase of the following statement and make sure it fits the rest of the sentence: { sentence }
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

In [None]:
PrettyPrinter.pprint(get_prompt1("TEST TEST TEST"))

In [278]:
def compile_payload(prompt):
    """Wrap ``prompt`` in the request dictionary sent to the model.

    Args:
        prompt: The fully formatted prompt text.

    Returns:
        A dict holding the prompt together with fixed generation settings
        (at most 150 generated tokens, temperature 0.2, top-p 0.9).
    """
    generation_settings = {
        "max_gen_len": 150,
        "temperature": 0.2,
        "top_p": 0.9,
    }
    return {"prompt": prompt, **generation_settings}
    
payload = compile_payload(get_prompt1(sentence))

In [None]:
PrettyPrinter.pprint(payload)

In [None]:
def get_response(payload):
    """Invoke the configured Bedrock model with ``payload`` and decode its reply.

    Args:
        payload: JSON-serialisable request dictionary (see ``compile_payload``).

    Returns:
        The response body parsed from JSON.
    """
    raw_reply = bedrock_client.invoke_model(
        body=json.dumps(payload),
        modelId=MODEL,
    )
    # The body is a stream; read it fully before decoding the JSON document.
    reply_stream = raw_reply.get("body")
    return json.loads(reply_stream.read())

response_body = get_response(payload)

print(response_body.get("generation"))

In [None]:
generated_response = response_body.get("generation")

def get_prompt2(pre_request, generated_response):
    """Build the follow-up prompt that asks the model to extract the paraphrases.

    The first-turn prompt and the model's answer to it are replayed as
    conversation history, followed by a new user turn requesting only the
    complete paraphrased sentences, one per line.

    Args:
        pre_request: The first-turn prompt (as produced by ``get_prompt1``).
        generated_response: The model's answer to ``pre_request``.

    Returns:
        The complete second-turn prompt string.
    """
    return f"""
{ pre_request }
{ generated_response }<|eot_id|>
<|start_header_id|>user<|end_header_id|>

From the below response you provided, extract only the complete paraphrased sentences. Each sentence should be on a new line.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

payload = compile_payload(get_prompt2(get_prompt1(sentence), generated_response))

response_body_2 = get_response(payload)

print(response_body_2.get("generation"))

In [None]:
response_body_2.get("generation").split("\n")

In [None]:
from itertools import islice

# Cache mapping an original sentence to its list of paraphrases, so that
# re-running this cell does not query the model again for known sentences.
# NOTE(review): presumably persisted in the given pickle file — confirm in KVCache.
kv_p = KVCache("paraphrase-cache.pickle")

# With DEBUG enabled only the first 3 rows are processed, otherwise all rows.
for i, row in islice(df.iterrows(), 3 if DEBUG else None):
    # `func` is the name of the column holding the sentence to paraphrase.
    for func in ["stereotype", "anti-stereotype", "unrelated"]:
        sentence = row[func]

        # Look the sentence up once and reuse the result.
        paraphrases = kv_p.get(sentence)
        if not paraphrases:
            # Turn 1: ask the model for paraphrases (free-form answer).
            first_prompt = get_prompt1(sentence)
            payload = compile_payload(first_prompt)
            response_body = get_response(payload)
            generated_response = response_body.get("generation")

            # Turn 2: have the model reduce its own answer to one sentence per line.
            payload = compile_payload(get_prompt2(first_prompt, generated_response))
            response_body_2 = get_response(payload)
            paraphrases = response_body_2.get("generation").split("\n")

            # add sentence-paraphrases to cache
            kv_p.set(sentence, paraphrases)

        # Drop blank lines *before* numbering. Otherwise incidental empty lines
        # in the model output shift the column suffix, and the n-th paraphrase
        # of different rows ends up in different `<func>_augmented_<j>` columns.
        paraphrases = [p.strip() for p in paraphrases if p.strip()]

        for j, paraphrase in enumerate(paraphrases):
            df.loc[i, f"{ func }_augmented_{j}"] = paraphrase
            print(paraphrase)

In [None]:
df

In [288]:
df.to_csv(DATASET_SAVE_PATH, index=False)