In [1]:
!pip install 'transformers[torch]'
!pip install --upgrade openai


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.19.0 (from transformers[torch])
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
# Mount google drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
PROJECT_ROOT = f"/content/gdrive/MyDrive/GPT-VLU"
%load_ext autoreload
%autoreload 2

In [4]:
import sys, os
sys.path.insert(0, PROJECT_ROOT)

In [5]:
from transformers import pipeline, set_seed

from utils.setup_utils import parse_and_validate_df, DFExperimentInfo

import torch
import numpy as np
from tqdm import tqdm
import pandas as pd
from dataclasses import fields
import openai
from sklearn.metrics import accuracy_score

In [6]:
# OPENAI API KEY

openai.api_key = ""

## Load model

In [7]:
# list models
# models = openai.Model.list()

In [8]:
# completion = openai.Completion.create(model="text-davinci-003", prompt="Hello world")
res = openai.Completion.create(
  model="text-davinci-003",
  prompt="Say this is a test",
  max_tokens=3,
  # temperature=0
)

In [None]:
res = openai.Embedding.create(
  model="text-embedding-ada-002",
  input="Say this is a test",
  max_tokens=3,
)

In [None]:
np.array(res['data'][0]['embedding']).shape

(1536,)

## Load data

In [9]:
# Loading data
dataset_path = '/content/gdrive/MyDrive/GPT-VLU/datasets/color-concrete-objects.csv'

task = 'choice'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,object,color
0,ash,grey
1,banana,yellow
2,beaver,brown
3,blood,red
4,broccoli,green


In [10]:
color_options = df['color'].unique()

## Colors

In [11]:
colors_orig_prompts = [
    'A picture of a MASK WORD',
    'A photo of a MASK WORD',
    'A photo of the MASK WORD',
    'A MASK WORD',
    'MASK WORD',
    'The normal color of a WORD is MASK',
    'WORD usually has a MASK color',
    'WORDs have a MASK color',
    'What is a color of a WORD? MASK',
    'The natural color of a WORD is MASK' 
  ]

## Stroop probing scoring 

In [18]:
def score_model(orig_prompts):
  y_true = []
  y_pred = []

  for index, row in tqdm(df.iterrows()):
    w = row['object']
    for p in orig_prompts:
      p = p.replace("WORD", w)

      options_results = [] 
      for option in color_options:
        input_prompt = p.replace("MASK", option)

        # Extract features from GPT3 
        res = openai.Embedding.create(
          model="text-similarity-davinci-001",
          input=input_prompt,
          max_tokens=3,
        )
        v = torch.tensor(res['data'][0]['embedding']).squeeze().flatten()
        v /= v.norm(p=2, dim=-1, keepdim=True)
        v = v.cpu().numpy()
        options_results.append(v)

      base = torch.tensor(openai.Embedding.create(
          model="text-similarity-davinci-001",
          input=p.replace("MASK", ''),
          max_tokens=3,
        )['data'][0]['embedding']).squeeze().flatten()

      base /= base.norm(p=2, dim=-1, keepdim=True)
      base = base.cpu().numpy()

      scores = np.array(options_results) @ np.expand_dims(base, -1)
      ind_class = np.argmax(scores)
      y_pred.append(color_options[ind_class])
      y_true.append(row['color'])

  return y_true, y_pred



In [19]:
y_true, y_pred = score_model(colors_orig_prompts)
accuracy_score(y_true, y_pred)

52it [26:11, 30.22s/it]


0.5634615384615385

### Autoregressive scoring

In [None]:
def retrieve_results_options(orig_prompts, max_new=1):
  y_true, y_pred = [], []

  for index, row in df.iloc[1:].iterrows():
    w = row['object']
    gt = row['color']

    for p in orig_prompts:
      p = p.replace("WORD", w)
      llm_prompt = '. Choose one word from: {} to replace MASK'.format(color_options)
      input_prompt = p + llm_prompt

      res = openai.Completion.create(
          model="text-davinci-003",
          prompt=input_prompt,
          max_tokens=2 + max_new,
        )
      out_p = res['choices'][0]['text'].replace('\n', '').replace(' ', '').lower()
      y_pred.append(out_p)
      y_true.append(gt)
      
  return y_true, y_pred
    


In [None]:
y_true, y_pred = retrieve_results_options(colors_orig_prompts)
accuracy_score(y_true, y_pred)

0.707843137254902

In [None]:
def retrieve_results_nooptions(extra_llm_prompt, orig_prompts, max_new=1):
  y_true, y_pred = [], []

  for index, row in df.iloc[1:].iterrows():
    w = row['object']
    gt = row['color']

    for p in orig_prompts:
      p = p.replace("WORD", w)
      input_prompt = p + extra_llm_prompt
      res = openai.Completion.create(
          model="text-davinci-003",
          prompt=input_prompt,
          max_tokens=2 + max_new,
        )
      out_p = res['choices'][0]['text'].replace('\n', '').replace(' ', '').lower()
      y_pred.append(out_p)
      y_true.append(gt)
      
  return y_true, y_pred

   

# PROMPT: instead of MASK should be

In [None]:
y_true, y_pred = retrieve_results_nooptions('. instead of MASK should be', colors_orig_prompts, max_new=1)
accuracy_score(y_true, y_pred)

A picture of a MASK banana. instead of MASK should be
A photo of a MASK banana. instead of MASK should be
A photo of the MASK banana. instead of MASK should be
A MASK banana. instead of MASK should be
MASK banana. instead of MASK should be
The normal color of a banana is MASK. instead of MASK should be
banana usually has a MASK color. instead of MASK should be
bananas have a MASK color. instead of MASK should be
What is a color of a banana? MASK. instead of MASK should be
The natural color of a banana is MASK. instead of MASK should be
A picture of a MASK beaver. instead of MASK should be
A photo of a MASK beaver. instead of MASK should be
A photo of the MASK beaver. instead of MASK should be
A MASK beaver. instead of MASK should be
MASK beaver. instead of MASK should be
The normal color of a beaver is MASK. instead of MASK should be
beaver usually has a MASK color. instead of MASK should be
beavers have a MASK color. instead of MASK should be
What is a color of a beaver? MASK. instead

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


nan

In [None]:
y_true, y_pred = retrieve_results_nooptions('. MASK can be replaced with', colors_orig_prompts, max_new=1)
accuracy_score(y_true, y_pred)

0.00392156862745098

In [None]:
y_true, y_pred = retrieve_results_nooptions('. Fill in the MASK: ', colors_orig_prompts, max_new=1)
accuracy_score(y_true, y_pred)

0.36470588235294116

In [None]:
y_true[:10], y_pred[:10]

(['yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow'],
 ['________',
  '██',
  '████',
  'a',
  'ban',
  'yellow',
  'yellow',
  'yellow',
  'yellow',
  'yellow'])

In [None]:
:10