# Installation

In [None]:
!pip install torch transformers accelerate

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-

# Load model

In [None]:
import torch
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the OpenHermes-2.5 (Mistral-7B) causal LM with float16 weights, mapped onto the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    "teknium/OpenHermes-2.5-Mistral-7B",
    device_map="cuda",
    torch_dtype=torch.float16,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

# Clustering

## Load dataset

In [None]:
!pip install pillow requests datasets

In [None]:
import requests
from PIL import Image
from datasets import load_dataset

# Load the "validation" split of lmms-lab/VQAv2 and print the dataset summary.
dataset = load_dataset(path="lmms-lab/VQAv2", split="validation")
print(dataset)

In [None]:
import csv

# Count the rows of the exported CSV that carry a "Mistral Question" column.
# newline='' is required by the csv module so that line breaks inside quoted
# fields are parsed as part of the field instead of splitting the record.
with open('/content/sample_data/wandb_export_2024-04-29T20_46_58.863-04_00.csv', mode='r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    # Indexing the column (instead of counting raw rows) still raises KeyError
    # if the expected header is missing.
    question_count = len([row["Mistral Question"] for row in csv_reader])
    print("questions #:\t", question_count)

questions #:	 80


## Experiments

In [None]:
import csv

# csv header
fieldnames = ['question', 'category']

# csv data (kept in memory as well as streamed to results.csv below)
rows = []

tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", padding_side="left")

# newline='' lets the csv module handle line breaks inside quoted fields itself.
with open('/content/sample_data/wandb_export_2024-04-29T20_46_58.863-04_00.csv', mode='r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    questions = [row["Mistral Question"] for row in csv_reader]


def build_category_prompt(question):
  """Return the few-shot categorization prompt that ends with `question`."""
  return f"""
  You are given a question which will be prompted to a VLM.
  Please categorize this question so that when evaluating the robustness of the VLM, it would be possible to compare the performance on different categories of questions.
  In case the question does not match any existing category, you need to generate a new one. A category should be a short name (e.g: location, number, color) and should be vague enough to be able to group multiple questions into each category.

  Here are some examples:
  Question #1: What color is the bedspread?
  Category: color
  Explanation: Here the question asks about the color of an object to the question is based on a color

  Question #2: What kind of room is this?
  Category: location
  Explanation: Here the question is about a room so it referes to a specific location that the VLM should define

  Question #3: What are the people in the background doing?
  Category: action
  Explanation: Here the question asked what they are doing, so it refers to an action in progress

  Question #4: What is he on top of?
  Category: object
  Explanation: Here the question asks to define something which appears to be an object

  Question #5: How many are playing ball?
  Category: quantity
  Explanation: Here the question asks for a number of people that are playing ball so it's about quantity

  Question: {question}
  """


def extract_category(completion):
  """Pull the category name out of the text the model generated.

  `completion` must contain only the newly generated text (not the prompt).
  Returns the first line following "Category:", cut before any "Explanation"
  part, or an empty string when the model produced no "Category:" marker.
  """
  marker = "Category:"
  start = completion.find(marker)
  if start == -1:
    # Searching the prompt as well would wrongly pick up the label of the
    # last few-shot example, so an unparseable answer is reported as empty.
    return ""
  answer = completion[start + len(marker):].split("Explanation", 1)[0].strip()
  return answer.splitlines()[0].strip() if answer else ""


# The results file is opened once and flushed after every row: partial results
# survive an interrupted run without rewriting the whole file per question.
with open('/content/sample_data/results.csv', 'w', encoding='UTF8', newline='') as f:
  writer = csv.DictWriter(f, fieldnames=fieldnames)
  writer.writeheader()

  for question in questions:
    model_inputs = tokenizer([build_category_prompt(question)], return_tensors="pt").to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=25, pad_token_id=tokenizer.eos_token_id)
    # generate() returns prompt + continuation; keep only the continuation.
    new_token_ids = generated_ids[:, model_inputs["input_ids"].shape[1]:]
    completion = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

    category = extract_category(completion)
    print("Category:\t", category)

    record = {"question": question, "category": category}
    rows.append(record)
    writer.writerow(record)
    f.flush()

tokenizer_config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Category:	 color
Category:	 object
Category:	 color
Category:	 object
Category:	 symbol
Category:	 markings
Category:	 number
Category:	 railway infrastructure
Category:	 ingredient
Category:	 food
Category:	 ingredient
Category:	 ingredient
Category:	 food
Category:	 object
Category:	 color
Category:	 color
Category:	 object
Category:	 color
Category:	 expression
Category:	 color
Category:	 object
Category:	 action
Category:	 size
Category:	 color
Category:	 food
Category:	 color
Category:	 ingredient
Category:	 ingredient
Category:	 food
Category:	 ingredient
Category:	 nutritional value
Category:	 phrase
Category:	 description
Category:	 color
Category:	 age
Category:	 type
Category:	 object
Category:	 object
Category:	 object
Category:	 family
Category:	 object
Category:	 object
Category:	 assistant
Category:	 color
Category:	 posture
Category:	 feature
Category:	 texture
Category:	 quantity
Category:	 color
Category:	 material
Category:	 color
Category:	 brand
Category:	 object
Ca

In [None]:
tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", padding_side="left")

# len(dataset) is the row count; this avoids materializing the whole "question" column.
print(len(dataset))

# Unused prompt preamble, kept for reference:
# <s>[INST] Your task is to group a question into a type of question given a specific question, to define a type, we ask ourselves: What is the question about? If the type that you suggest is right is not included in the examples, you need to generate a new type. So for instance the question:

# Few-shot (question, group) pairs, in prompt order. Repeated questions are kept
# exactly as they were listed.
# NOTE(review): some questions appear twice with different groups (e.g.
# "Is this a creamy soup?" -> object / food) and "Has the bed been made?" is
# labelled "material" while "Is the bed made?" is "action" - confirm the labels.
# NOTE(review): several of these examples seem to be the same questions that are
# classified below (e.g. "Where is he looking?") - confirm this overlap is intended.
FEW_SHOT_EXAMPLES = (
  ("What color is the bedspread?", "color"),
  ("Why are some people wearing hats?", "explanation"),
  ("How many people are on the field?", "quantity"),
  ("Why is the cow laying down?", "explanation"),
  ("What kind of room is this?", "location"),
  ("Where was the picture taken of the man?", "location"),
  ("Why is there a gap between the roof and wall?", "explanation"),
  ("Is this a creamy soup?", "object"),
  ("What is behind the foot of the bed?", "object"),
  ("What is this vehicle?", "object"),
  ("Is it daylight in this picture?", "environment"),
  ("Are the lights on in this room?", "environment"),
  ("What website copyrighted the picture?", "writing"),
  ("Where is he looking?", "direction"),
  ("What are the people in the background doing?", "action"),
  ("What is he on top of?", "object"),
  ("What website copyrighted the picture?", "writing"),
  ("Is this a creamy soup?", "food"),
  ("Is this rice noodle soup?", "food"),
  ("What is to the right of the soup?", "object"),
  ("What is the man doing in the street?", "action"),
  ("How many photo's can you see?", "quantity"),
  ("What does the truck on the left sell?", "object"),
  ("Why is there a gap between the roof and wall?", "explanation"),
  ("Is it daylight in this picture?", "environment"),
  ("Why is the cow laying down?", "explanation"),
  ("What color is the grass in this picture?", "color"),
  ("Did the batter hit the ball?", "action"),
  ("How many are playing ball?", "quantity"),
  ("Is there a chain link fence in the image?", "object"),
  ("Is the boy playing baseball?", "action"),
  ("Is that a folding chair?", "object"),
  ("How many beds?", "quantity"),
  ("What color is the bedspread?", "color"),
  ("How many pictures are there?", "quantity"),
  ("Are these twin mattresses?", "object"),
  ("How many sources of light are there?", "quantity"),
  ("Is this room decorated for the 1970s?", "environment"),
  ("Are the lights on in this room?", "environment"),
  ("Are the windows big?", "size"),
  ("What is the chair made of?", "material"),
  ("Is this room in someone's home?", "location"),
  ("Which room is this?", "location"),
  ("Is the bed white?", "color"),
  ("Could this be a hotel room?", "location"),
  ("Is the bed made?", "action"),
  ("Are there bed headboards present in the photo?", "object"),
  ("Is there a mirror in the room?", "object"),
  ("What kind of room is this?", "location"),
  ("How many chairs are in the photo?", "quantity"),
  ("Is the desk cluttered?", "appearance"),
  ("Has the bed been made?", "material"),
  ("How many seats are there?", "quantity"),
  ("How many frames are on the wall?", "quantity"),
  ("What color are the walls?", "color"),
  ("Are there any boxes in the room?", "object"),
  ("Could this be a multi-purpose room?", "house"),
  ("What animal print does that chair resemble?", "object"),
  ("What is behind the foot of the bed?", "object"),
  ("What is the size of the bed?", "size"),
  ("How many pictures on the wall?", "quantity"),
  ("Will this kid leave the powdered sugar on his face?", "explanation"),
  ("What is the child eating?", "object"),
  ("Is this person wearing a tie?", "appearance"),
  ("What color is the kids hair?", "color"),
  ("Is the man's tie purple?", "color"),
  ("Where was the picture taken of the man?", "location"),
  ("What is the man wearing on his head?", "object"),
  ("Is this a designer tie?", "object"),
  ("Is this photo color?", "color"),
  ("Does the door open inward or outward?", "house"),
  ("Is there a drainage in the pic?", "observation"),
  ("What does the sentence on the top say?", "writing"),
  ("What kind of vehicle is the RV pulling on the bottom picture?", "object"),
  ("Where is the camping tent?", "location"),
  ("What color are the gym shoes?", "color"),
  ("Is there a red sandal here?", "color"),
  ("What color is the flip flop?", "color"),
  ("Are there players in the dugout?", "location"),
  ("Is the battery looking at the ball?", "observation"),
  ("How many people are on the field?", "quantity"),
  ("How many signs?", "quantity"),
  ("What color is the Salisbury Rd. sign?", "color"),
  ("What does the last sign say?", "writing"),
  ("What color is the sign?", "color"),
  ("Can you turn right or left in the tunnel?", "direction"),
  ("What color is the plant?", "color"),
  ("What color is the woman's shirt on the left?", "color"),
  ("What type of beverage is being displayed?", "object"),
  ("Why are some people wearing hats?", "explanation"),
  ("What kind of event are the people involved in?", "event"),
  ("How many skyscrapers are there?", "quantity"),
  ("What is the color of the building in the background?", "color"),
  ("Is there a tree in front of the building?", "location"),
  ("Is there a person riding a bike?", "observation"),
  ("Are there lights on in the two buildings?", "environment"),
  ("How many stories is the building on the left?", "quantity"),
  ("Is this a hospital?", "location"),
  ("How many umbrellas do you see?", "quantity"),
  ("How many colors is the building on the right painted?", "color"),
  ("What is on the boy's back walking?", "object"),
  ("How many modes of transportation are depicted?", "quantity"),
  ("What color is the building?", "color"),
  ("Is it still snowing in the picture?", "environment"),
  ("Does this appear to be a photo of multiple exposures of the black clad snowboarder?", "observation"),
  ("How many people can you see in the picture?", "quantity"),
  ("Is this an adult party?", "observation"),
  ("What is being celebrated?", "event"),
  ("Who is in front of the cake with candles?", "environment"),
  ("What is this vehicle?", "object"),
  ("Can you see the hook up for the train?", "environment"),
  ("Is this train at the station?", "location"),
)


def build_few_shot_prompt(question, examples=FEW_SHOT_EXAMPLES):
  """Build the [INST]-style few-shot prompt that ends with `question`.

  Every example is rendered with the same four-line pattern (opener, question,
  "would be binded to the group:[/INST]", group). The hand-written prompt had
  one entry without its [INST] opener and one without its [/INST] closer, plus
  mixed indentation; generating the text from `examples` removes those slips.

  Args:
    question: the question to classify; inserted verbatim as the final turn.
    examples: iterable of (question, group) pairs used as demonstrations.

  Returns:
    The full prompt string, ending in "[/INST]".
  """
  lines = ["<s>Few-shot examples:"]
  for index, (example_question, group) in enumerate(examples):
    opener = "[INST]" if index == 0 else "[INST]also, the question:"
    lines += [opener, example_question, "would be binded to the group:[/INST]", group]
  # The question is inserted as-is: the former "${row}" placeholder was a
  # JavaScript-style slip that put a literal "$" in front of every question.
  lines += ["</s>", "[INST]", question, "[/INST]"]
  return "\n  ".join(lines)


# Slicing the dataset first reads only the 50 rows that are needed.
for row in dataset[:50]["question"]:
  prompt = build_few_shot_prompt(row)

  model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

  generated_ids = model.generate(**model_inputs, max_new_tokens=3, pad_token_id=tokenizer.eos_token_id)
  # generate() returns prompt + continuation; decode only the continuation
  # instead of searching the decoded text for the last "[/INST]".
  new_token_ids = generated_ids[:, model_inputs["input_ids"].shape[1]:]
  result = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

  print(row)
  print(result)

In [None]:
# NOTE(review): this tokenizer comes from a different checkpoint than the model
# loaded earlier in the notebook (teknium/OpenHermes-2.5-Mistral-7B) - confirm
# the two vocabularies are compatible.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", padding_side="left")

# len(dataset) is the row count; this avoids materializing the whole "question" column.
print(len(dataset))

# Unused prompt preamble, kept for reference:
# <s>[INST] Your task is to group a question into a type of question given a specific question, to define a type, we ask ourselves: What is the question about? If the type that you suggest is right is not included in the examples, you need to generate a new type. So for instance the question:

# Slicing the dataset first reads only the five rows that are needed.
for row in dataset[:5]["question"]:
  # The prompt ends with the "Category:" cue used by the examples, so the small
  # generation budget is spent on the label itself rather than on re-emitting
  # the "Category" scaffold.
  prompt = f"""
  <s>Your task is to categorize questions that would be asked to a VLM into one of the following predefined categories:

  location
  quantity
  explanation
  object
  action
  color
  writing

  If the text doesn't fit into any of the above categories, classify it as:
  other

  ####
  Here are some examples:

  Question: Is the boy playing baseball?
  Category: action
  Question: What color is the bedspread?
  Category: color
  Question: Why is the cow laying down?
  Category: explanation
  Question: Where was the picture taken of the man?
  Category: location
  ###
  </s>

  Question: {row}
  Category:"""

  model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

  generated_ids = model.generate(**model_inputs, max_new_tokens=3, pad_token_id=tokenizer.eos_token_id)
  # generate() returns prompt + continuation; decode only the continuation so
  # the printed result is the predicted label, not the whole prompt.
  new_token_ids = generated_ids[:, model_inputs["input_ids"].shape[1]:]
  result = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

  print(row)
  print(result)

214354
Where is he looking?

   Your task is to categorize questions that would be asked to a VLM into one of the following predefined categories:

  location
  quantity
  explanation
  object
  action
  color
  writing

  If the text doesn't fit into any of the above categories, classify it as:
  other

  ####
  Here are some examples:

  Question: Is the boy playing baseball?
  Category: action
  Question: What color is the bedspread?
  Category: color
  Question: Why is the cow laying down?
  Category: explanation
  Question: Where was the picture taken of the man?
  Category: location
  ###
   

  Question: Where is he looking?
  
  Category
What are the people in the background doing?

   Your task is to categorize questions that would be asked to a VLM into one of the following predefined categories:

  location
  quantity
  explanation
  object
  action
  color
  writing

  If the text doesn't fit into any of the above categories, classify it as:
  other

  ####
  Here are some 

In [None]:
# NOTE(review): this tokenizer comes from a different checkpoint than the model
# loaded earlier in the notebook (teknium/OpenHermes-2.5-Mistral-7B) - confirm
# the two vocabularies are compatible.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", padding_side="left")

# len(dataset) is the row count; this avoids materializing the whole "question" column.
print(len(dataset))

# Unused prompt preamble, kept for reference:
# <s>[INST] Your task is to group a question into a type of question given a specific question, to define a type, we ask ourselves: What is the question about? If the type that you suggest is right is not included in the examples, you need to generate a new type. So for instance the question:

# Slicing the dataset first reads only the 50 rows that are needed.
for row in dataset[:50]["question"]:
  # "{row}" (not "${row}"): in a Python f-string the "$" is literal text and
  # was being prepended to every question.
  prompt = f"""<s>Your task is to categorize questions that would be asked to a VLM into one of the following predefined categories:

  location
  quantity
  explanation
  object
  action
  color
  writing

  If the text doesn't fit into any of the above categories, classify it as: other. Here are some examples:
  [INST]
  What color is the bedspread?
  would be binded to the group:[/INST]
  color
  [INST]also, the question:
  Why are some people wearing hats?
  would be binded to the group:[/INST]
  explanation
  [INST]also, the question:
  How many people are on the field?
  would be binded to the group:[/INST]
  quantity
  [INST]also, the question:
  What kind of room is this?
  would be binded to the group:[/INST]
  location
  </s>
  [INST]
  {row}
  [/INST]"""

  model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

  generated_ids = model.generate(**model_inputs, max_new_tokens=3, pad_token_id=tokenizer.eos_token_id)
  # generate() returns prompt + continuation; decode only the continuation
  # instead of searching the decoded text for the last "[/INST]".
  new_token_ids = generated_ids[:, model_inputs["input_ids"].shape[1]:]
  result = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

  print(row)
  print(result)

214354
Where is he looking?
  location
What are the people in the background doing?
  action
What is he on top of?
  object
What website copyrighted the picture?
  writing
Is this a creamy soup?
  writing
Is this rice noodle soup?
  object
What is to the right of the soup?
  writing
What is the man doing in the street?
  action
How many photo's can you see?
  quantity
What does the truck on the left sell?
  object
Why is there a gap between the roof and wall?
  explanation
Is it daylight in this picture?
  writing
Why is the cow laying down?
  explanation
What color is the grass in this picture?
  color
Did the batter hit the ball?
  action
How many are playing ball?
  quantity
Is there a chain link fence in the image?
  object
Is the boy playing baseball?
  action
Is that a folding chair?
  object
How many beds?
  quantity
What color is the bedspread?
The category for
How many pictures are there?
  quantity
Are these twin mattresses?
  object
How many sources of light are there?
  qua