In [1]:
from transformers import AutoTokenizer, LlamaForCausalLM
import transformers
import torch

In [2]:
model_name= "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Cuda available: ", torch.cuda.is_available())
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
)

Cuda available:  True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
pairs_to_contrast =[("Red cockaded Woodpecker", "American Three toed Woodpecker"),
                    ("Louisiana Waterthrush", "Northern Waterthrush"),
                    ("Chuck will Widow","Nighthawk"),
                    ("Anna Hummingbird","Ruby throated Hummingbird"),
                    ("Artic Tern","Common Tern")]

In [5]:
def read_cub_classes():
    cub_classes = []
    with open('/export/scratch/ru86qer/datasets/cub_xxx_modified/CUB_200_2011/classes.txt', 'r') as f:
        cub_classes = [line.strip().split(' ')[1].split('.')[1] for line in f.readlines()]
    return cub_classes
cub_classes=read_cub_classes()
cub_classes

['Chuck_will_Widow',
 'Anna_Hummingbird',
 'Ruby_throated_Hummingbird',
 'Nighthawk',
 'Artic_Tern',
 'Common_Tern',
 'Northern_Waterthrush',
 'Louisiana_Waterthrush',
 'American_Three_toed_Woodpecker',
 'Red_cockaded_Woodpecker']

In [6]:
pairs_to_contrast_with_idx = []
for pair in pairs_to_contrast:
    pairs_to_contrast_with_idx.append((pair[0],cub_classes.index(pair[0].replace(' ','_')),pair[1],cub_classes.index(pair[1].replace(' ','_'))))
    pairs_to_contrast_with_idx.append((pair[1],cub_classes.index(pair[1].replace(' ','_')),))
pairs_to_contrast_with_idx.sort(key=lambda x: x[1])
pairs_to_contrast_with_idx

[('Chuck will Widow', 0, []),
 ('Anna Hummingbird', 1, []),
 ('Ruby throated Hummingbird', 2, []),
 ('Nighthawk', 3, []),
 ('Artic Tern', 4, []),
 ('Common Tern', 5, []),
 ('Northern Waterthrush', 6, []),
 ('Louisiana Waterthrush', 7, []),
 ('American Three toed Woodpecker', 8, []),
 ('Red cockaded Woodpecker', 9, [])]

In [7]:
name_to_idx = {pair[0]:pair[1] for pair in pairs_to_contrast_with_idx}
name_to_idx

{'Chuck will Widow': 0,
 'Anna Hummingbird': 1,
 'Ruby throated Hummingbird': 2,
 'Nighthawk': 3,
 'Artic Tern': 4,
 'Common Tern': 5,
 'Northern Waterthrush': 6,
 'Louisiana Waterthrush': 7,
 'American Three toed Woodpecker': 8,
 'Red cockaded Woodpecker': 9}

In [8]:
prompt_index = 3
with open('/export/home/ru86qer/classify_by_description_release/prompts/prompt'+"_{0}.txt".format(prompt_index), 'r') as f:
    base_prompt = f.read()

In [9]:
prompts = []
for pair in pairs_to_contrast:
    prompts.append(base_prompt.format(category_name_1=pair[0], category_name_0=pair[1]))
    prompts.append(base_prompt.format(category_name_1=pair[1], category_name_0=pair[0]))

In [10]:
responses=[]
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to('cuda')
    # Generate
    generate_ids = model.generate(inputs.input_ids, max_length=len(inputs.input_ids[0])+200, top_k=1, num_return_sequences=1, do_sample=True)

    decoded_response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    response_text = decoded_response[(len(prompt)-7):]
    responses.append(response_text)

In [11]:
responses

['cker:\n- medium-sized woodpecker\n- three distinct toes on each foot\n- black and white striped back\n- white underside\n- black crown and nape\n- white face with black stripes\n- black bill with red spot near the base\n- Red cockaded Woodpecker has a red cap on the head\n- Red cockaded Woodpecker has a red stripe on the side of the neck\n- Red cockaded Woodpecker has a red patch on the upperwing\n\n',
 'cker:\n- small to medium-sized woodpecker\n- red cap on the head\n- white stripe above the eye\n- black stripe below the eye\n- white underside of the wing\n- black back and wings with white spots\n- black tail with white outer feathers\n- white outer tail feathers\n- black legs and feet\n\n\n',
 'rush:\n- large, slate-gray bird\n- darker gray on the back and wings\n- pale yellow-green upperparts\n- white undersides\n- dark-gray legs and feet\n- distinctive white outer tail feathers\n- dark-gray bill with a yellow-green base\n- dark brown eyes\n- distinctive white eyering\n- distinct

In [12]:
responses_1 = [response[8:] for response in responses]

In [13]:
responses_1 = [response.split('\n') for response in responses_1]
responses_1

[['medium-sized woodpecker',
  '- three distinct toes on each foot',
  '- black and white striped back',
  '- white underside',
  '- black crown and nape',
  '- white face with black stripes',
  '- black bill with red spot near the base',
  '- Red cockaded Woodpecker has a red cap on the head',
  '- Red cockaded Woodpecker has a red stripe on the side of the neck',
  '- Red cockaded Woodpecker has a red patch on the upperwing',
  '',
  ''],
 ['small to medium-sized woodpecker',
  '- red cap on the head',
  '- white stripe above the eye',
  '- black stripe below the eye',
  '- white underside of the wing',
  '- black back and wings with white spots',
  '- black tail with white outer feathers',
  '- white outer tail feathers',
  '- black legs and feet',
  '',
  '',
  ''],
 ['large, slate-gray bird',
  '- darker gray on the back and wings',
  '- pale yellow-green upperparts',
  '- white undersides',
  '- dark-gray legs and feet',
  '- distinctive white outer tail feathers',
  '- dark-gray

In [16]:
def filtered(sublist):
    return [item for item in sublist if item.startswith('-')]

descriptor_dict = {}
for key in name_to_idx.keys():
        descriptor_dict[key]=responses_1[name_to_idx[key]]

In [17]:
descriptor_dict

{'Chuck will Widow': ['medium-sized woodpecker',
  '- three distinct toes on each foot',
  '- black and white striped back',
  '- white underside',
  '- black crown and nape',
  '- white face with black stripes',
  '- black bill with red spot near the base',
  '- Red cockaded Woodpecker has a red cap on the head',
  '- Red cockaded Woodpecker has a red stripe on the side of the neck',
  '- Red cockaded Woodpecker has a red patch on the upperwing',
  '',
  ''],
 'Anna Hummingbird': ['small to medium-sized woodpecker',
  '- red cap on the head',
  '- white stripe above the eye',
  '- black stripe below the eye',
  '- white underside of the wing',
  '- black back and wings with white spots',
  '- black tail with white outer feathers',
  '- white outer tail feathers',
  '- black legs and feet',
  '',
  '',
  ''],
 'Ruby throated Hummingbird': ['large, slate-gray bird',
  '- darker gray on the back and wings',
  '- pale yellow-green upperparts',
  '- white undersides',
  '- dark-gray legs a

In [52]:
import json
import os
path = "/export/home/ru86qer/classify_by_description_release/descriptors"
with open(os.path.join(path, "contrastive_descriptions_1.json"), 'w') as fp:
    json.dump(descriptor_dict, fp, indent=4)