# ReplicationExperiments

This notebook aims to replicate some of the results from the paper **VLStereoSet: A Study of Stereotypical Bias in Pre-trained Vision-Language Models** by Zhou et al. (2022).

+ For evaluation we use:
  + `openai/clip-vit-large-patch14`

## Main Code

### Preliminaries

In [62]:
# Declare Imports
import os, sys, json
import tabulate
import pandas as pd
import requests
pd.set_option('display.max_columns', None)

In [63]:
import sys
sys.path.append("../")

In [64]:
from importlib import reload
import utils.utils as utils
reload(utils)
from utils.utils import \
    calculate_vlrs, \
    calculate_vlbs, \
    calculate_ivlas, \
    read_jsonl, \
    save_jsonl

In [72]:
# Experiment configuration: dataset location, output path, model checkpoint
# and runtime flags. (No folders are created in this cell.)
# Remote location of the VLStereoSet CSV.
DATASET_URL = "https://raw.githubusercontent.com/K-Square-00/VLStereo/refs/heads/main/data/VLStereoSet.csv"
# Directory that receives the per-model result files.
RESULTS_DIR = "./results"
# Checkpoint identifier handed to `from_pretrained` when loading the model.
MODEL = "openai/clip-vit-large-patch14"
# Result file for this model; "/" in the model name is replaced by "_" so the
# identifier forms a single file name inside RESULTS_DIR.
DATASET_TO_SAVE_FILENAME = f"{ RESULTS_DIR }/res_{ MODEL.replace('/', '_') }.jsonl"
# When True, the processing loop only handles the first 3 rows.
DEBUG = False
# NOTE(review): not referenced by any of the visible cells — confirm whether
# a seeding call is missing or this constant can be removed.
RANDOM_SEED = 41
# Torch device string the model and its inputs are moved to. "mps" targets
# Apple's Metal backend; use "cuda" or "cpu" on other hardware.
DEVICE = "mps"

In [73]:
# Create the results directory; exist_ok=True makes re-running this cell
# harmless when the directory is already there.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [74]:
# Load the data as a pandas dataframe. The local copy under data/ is preferred;
# when it is missing, the CSV is read straight from DATASET_URL so that a fresh
# checkout does not fail with FileNotFoundError.
dataset_path = f"data/{ DATASET_URL.split('/')[-1] }"
df = pd.read_csv(dataset_path if os.path.exists(dataset_path) else DATASET_URL)
# The CSV header misspells the image column ("Imaeg URL"); normalise it.
# "Unnamed: 8" looks like pandas' auto-generated name for a header-less
# trailing column; errors="ignore" keeps the cell working if a cleaned-up CSV
# no longer contains it.
df = df.rename(columns={"Imaeg URL": "image_url"}).drop(columns=["Unnamed: 8"], errors="ignore")

In [None]:
df

In [105]:
import requests
from PIL import Image
from time import sleep
import random
from io import BytesIO

def get_image_data(url):
    """Download the image at *url* and return it as a decoded PIL image.

    Every call first sleeps for a random 1-3 seconds (presumably to avoid
    hammering the image hosts).

    Args:
        url: Address of the image to fetch.

    Returns:
        A ``PIL.Image.Image`` whose pixel data has already been loaded.

    Raises:
        Exception: If the request times out, the status code is not 200, or
            the response does not declare an image Content-Type. Other
            ``requests`` errors and PIL decoding errors propagate unchanged.
    """
    sleep(random.randint(1, 3))
    try:
        response = requests.get(url, timeout=20)
    except requests.exceptions.Timeout as err:
        # Chain the cause so the original traceback is not lost.
        raise Exception("Timeout error") from err
    if response.status_code != 200:
        raise Exception(f"Error: { response.status_code }")
    # .get() avoids a bare KeyError when the server omits the header.
    content_type = response.headers.get("Content-Type", "")
    if "image" not in content_type:
        raise Exception(f"Error: { content_type or 'missing Content-Type header' }")
    image = Image.open(BytesIO(response.content))
    # Image.open is lazy; force decoding here so corrupt or truncated payloads
    # fail inside this function (where callers already handle errors) rather
    # than later during preprocessing.
    image.load()
    return image

In [106]:
from transformers import CLIPProcessor, CLIPModel

# Instantiate the CLIP checkpoint named by MODEL, move it onto DEVICE, then
# load the matching processor.
model = CLIPModel.from_pretrained(MODEL)
model = model.to(DEVICE)
processor = CLIPProcessor.from_pretrained(MODEL)


In [107]:
# Start with an empty list of per-row results. Re-running this cell discards
# whatever results are currently held in memory.
samples = []

In [None]:
from pathlib import Path

# Toggle: when enabled, results saved by an earlier run are reloaded into
# `samples` so processing can continue after them.
START_WHERE_LEFT_OFF = True

if START_WHERE_LEFT_OFF:
    print("Taking off from where we left off")
    previous_results = Path(DATASET_TO_SAVE_FILENAME)
    # Nothing to restore on the very first run; `samples` is left untouched.
    if previous_results.exists():
        samples = read_jsonl(DATASET_TO_SAVE_FILENAME)

In [None]:
from tqdm import tqdm
from itertools import islice
import numpy as np

import torch

# Number of rows whose image could not be fetched during this run.
non_available_images = 0
# One past the last row that was skipped because a result already existed;
# passed on to save_jsonl by the following cell.
skip_until = 0

# Process data (only the first 3 rows when DEBUG is set)
for index, row in tqdm(islice(df.iterrows(), 3 if DEBUG else None), total=3 if DEBUG else len(df)):

    # Rows that already have an entry in `samples` (restored from a previous
    # run) are not recomputed.
    # NOTE(review): this compares the DataFrame index label with a list
    # length, so it assumes df carries the default 0..n-1 index — confirm.
    if index < len(samples):
        print("Skipping existing")
        skip_until = index + 1
        continue

    # Get three options
    stereotypical_option = row["stereotype"]
    non_stereotypical_option = row["anti-stereotype"]
    neutral_option = row["unrelated"]

    # Get encoded image
    try:
        img = get_image_data(row["image_url"])
    except Exception as e:
        print(f"Error: { e }")
        non_available_images += 1
        # A placeholder keeps `samples` aligned with the dataframe rows, which
        # the skip logic above relies on.
        # NOTE(review): pd.NA is not serialisable by the standard json module;
        # confirm that utils.save_jsonl copes with it.
        samples.append({ 
            **row.to_dict(),
            "response": "Image URL not available",
            "order": pd.NA
        }) # insert dummy value
        continue

    inputs = processor(
        text=[stereotypical_option, non_stereotypical_option, neutral_option], 
        images=img, 
        return_tensors="pt", 
        padding=True)
    # Pure inference: disabling autograd avoids building a graph for every row.
    with torch.no_grad():
        outputs = model(**inputs.to(DEVICE))
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
    # One image against three captions: the flattened argmax is the position
    # of the best-matching caption in the list passed to the processor.
    choice = int(np.argmax(probs.cpu().numpy()))
    samples.append({ 
        **row.to_dict(),
        "response": ["a", "b", "c"][choice], # encode answer as letter
        "order": [0, 1, 2] # order is not shuffled
    })

print(f"Images not available: { non_available_images }")

In [110]:
# Persist the collected samples to the per-model results file. `skip_until`
# is the count of leading rows that the processing loop skipped as already
# present. NOTE(review): how save_jsonl uses `skip_until` is defined in
# utils.utils and is not visible here — confirm before changing.
save_jsonl(samples, DATASET_TO_SAVE_FILENAME, skip_until=skip_until)

### Evaluation

In [None]:
# Re-read the persisted results so the metrics reflect what is on disk.
processed_samples = read_jsonl(DATASET_TO_SAVE_FILENAME)

# Compute both scores before reporting either of them.
vlrs = calculate_vlrs(processed_samples)
vlbs = calculate_vlbs(processed_samples)
print(f"VLRS: { vlrs }")
print(f"VLBS: { vlbs }")

# The combined score is derived from the first element of each result.
vlrs_score = vlrs[0]
vlbs_score = vlbs[0]
ivlas = calculate_ivlas(vlrs_score, vlbs_score)
print(f"IVLAS: { ivlas }")