In [11]:
from open_flamingo import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import requests
import torch
import json
import random

In [12]:
def model_setup():
    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
        tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
        cross_attn_every_n_layers=1
    )

    checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
    model.load_state_dict(torch.load(checkpoint_path), strict=False)
    
    return model, image_processor, tokenizer

In [13]:
def getImage(num, annotated):
    if annotated:
        return Image.open("../ai2d/annotated_images/" + str(num) + ".png")
    return Image.open("../ai2d/images/" + str(num) + ".png")

In [14]:
def getQuestion(num):
    f = open("../ai2d/questions/" + str(num) + ".png.json")
 
    # returns JSON object as a dictionary
    data = json.load(f)
    
    dictionary = data["questions"]
    question = list(dictionary.keys())[0]

    correctAnswerIndex = dictionary[question]["correctAnswer"]
    correctAnswer = dictionary[question]["answerTexts"][correctAnswerIndex]

    # Closing file
    f.close()

    return question, correctAnswer

In [15]:
def preprocessImage(image_processor, image):
    vision_x = [image_processor(image).unsqueeze(0)]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)
    return vision_x

In [16]:
def preprocessText(tokenizer, question):
    lang_x = tokenizer(
        ["<image>" + question + " The answer is"],
        return_tensors="pt",
    )
    return lang_x

In [None]:
from store_annotations import annotate

for i in range(2557, 4097):
    print(i)
    ann_path = "../ai2d/annotations/"+str(i)+".png.json"
    img_path = "../ai2d/images/"+str(i)+".png"
    out_path = "../ai2d/annotated_images/"+str(i)+".png"
    annotate(ann_path, img_path, out_path)

In [17]:
def generate_text(model, vision_x, lang_x, tokenizer):
    generated_text = model.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"],
        attention_mask=lang_x["attention_mask"],
        max_new_tokens=20,
        num_beams=3,
    )
    return tokenizer.decode(generated_text[0])

In [18]:
model, image_processor, tokenizer = model_setup()
tokenizer.padding_side = "left"

Using pad_token, but it is not set yet.


You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50280. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Flamingo model initialized with 1046992944 trainable parameters


In [29]:
for i in range(100):
    num = random.randint(0, 2555)
    try:
        print("Image", str(num))
        image = getImage(num, False)
        annotated_image = getImage(num, True)
        question, correctAnswer = getQuestion(num)
        print(question)
        lang_x = preprocessText(tokenizer, question)

        print("REGULAR")
        vision_x = preprocessImage(image_processor, image)
        reg_answer = generate_text(model, vision_x, lang_x, tokenizer)
        print(reg_answer)

        print("ANNOTATED")
        vision_x = preprocessImage(image_processor, annotated_image)
        annotated_answer = generate_text(model, vision_x, lang_x, tokenizer)
        print(annotated_answer)

        print("CORRECT", correctAnswer)

        file_path = "../ai2d/responses/" + str(num) + ".txt"
        with open(file_path, "w") as file:
            # Write the text to the file
            text_to_save = reg_answer + "\n" + annotated_answer
            file.write(text_to_save)

        # start = len("<image>") + len(question) + len("The answer is")
        # end = -1 * len("<|endofchunk|>")
        # print(similarityScore(eval_model, correctAnswer, answer[start, end]))
    except:
        continue

Image 2073
Based on the above food web, what will happen if the number of algae decreases.
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Image 1129
Which forms the calyx of a flower?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which forms the calyx of a flower? The answer is the pistil.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which forms the calyx of a flower? The answer is the pistil.<|endofchunk|>
CORRECT sepal
Image 733
What comes out of the larval stage?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What comes out of the larval stage? The answer is the pupa.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What comes out of the larval stage? The answer is a butterfly.<|endofchunk|>
CORRECT Pupa.
Image 920
Image 1898
According to the given food web, what are the consequences if all the fruits are plucked by humans?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food web, what are the consequences if all the fruits are plucked by humans? The answer is given below.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food web, what are the consequences if all the fruits are plucked by humans? The answer is given below.<|endofchunk|>
CORRECT Acorns will suffer from loss of energy.
Image 1863
According to the given food web, select the correct pair
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food web, select the correct pair The answer is:<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food web, select the correct pair The answer is<|endofchunk|>
CORRECT Algae-producer
Image 1156
What part is between the ovary and the stigma?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What part is between the ovary and the stigma? The answer is the stigma.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What part is between the ovary and the stigma? The answer is the stigma.<|endofchunk|>
CORRECT Style.
Image 573
By what process is the flowering stage brought about?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>By what process is the flowering stage brought about? The answer is that the flowering stage is brought about by the process of photosynthesis. Photosynthesis is the process by
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>By what process is the flowering stage brought about? The answer is that the flowering stage is brought about by the process of photosynthesis. Photosynthesis is the process by
CORRECT Flower Forcing
Image 2138
At what phase will the black fly gain wings?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>At what phase will the black fly gain wings? The answer is that the black fly will not gain wings until it is fully grown.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>At what phase will the black fly gain wings? The answer is that the black fly will not gain wings until it is in the larval stage. The black fly is
CORRECT Adult stage
Image 1459
DESCRIBE THE LABEL C IN DIAGRAM?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>DESCRIBE THE LABEL C IN DIAGRAM? The answer is: the label C in the diagram is the collapse of the mountain.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>DESCRIBE THE LABEL C IN DIAGRAM? The answer is yes.<|endofchunk|>
CORRECT FORMATION OF CRATER LAKE AND WIZARD ISLAND
Image 1983
In this diagram, man is
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>In this diagram, man is The answer is man.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>In this diagram, man is The answer is in the middle of the tree.<|endofchunk|>
CORRECT predator
Image 226
Image 946
What animal is shown at letter D?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What animal is shown at letter D? The answer is a bird.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What animal is shown at letter D? The answer is a bird.<|endofchunk|>
CORRECT Duck
Image 710
Image 979
Which fern is an evergreen fern?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which fern is an evergreen fern? The answer is Polypody.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which fern is an evergreen fern? The answer is ferns are evergreen ferns. Evergreen ferns do not lose their leaves
CORRECT Christmas Fern
Image 658
Image 387
Image 1386
What is the bottom-most opening in the figure?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What is the bottom-most opening in the figure? The answer is the stomach.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What is the bottom-most opening in the figure? The answer is at the bottom-most opening in the figure. The answer is at the bottom-most opening in
CORRECT anus
Image 580
What does the given diagram depict?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What does the given diagram depict? The answer is: a frog.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What does the given diagram depict? The answer is in the answer key at the bottom of the page.<|endofchunk|>
CORRECT The frog life cycle
Image 70
What do animals respire into the air?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What do animals respire into the air? The answer is carbon dioxide.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What do animals respire into the air? The answer is carbon dioxide.<|endofchunk|>
CORRECT Carbon Dioxide
Image 1780
Image 370
Image 2539
The diagram shows the phases of the  moon. Which phase is depicted at C?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>The diagram shows the phases of the  moon. Which phase is depicted at C? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>The diagram shows the phases of the  moon. Which phase is depicted at C? The answer is B.<|endofchunk|>
CORRECT Waning Crescent
Image 2475
WHICH STAGE APPEARS IN WINTER?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>WHICH STAGE APPEARS IN WINTER? The answer is: all of them!<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>WHICH STAGE APPEARS IN WINTER? The answer is B.<|endofchunk|>
CORRECT QUEEN HIBERNATES
Image 2238
Do grasshoppers lay eggs?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Do grasshoppers lay eggs? The answer is yes.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Do grasshoppers lay eggs? The answer is yes.<|endofchunk|>
CORRECT Yes
Image 1478
Evaporation is represented at which letter?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Evaporation is represented at which letter? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Evaporation is represented at which letter? The answer is B.<|endofchunk|>
CORRECT A
Image 1629
Label C represents
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Label C represents The answer is Moon<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Label C represents The answer is B. Label D represents The answer is C. Label E represents The answer is D. Label F
CORRECT Earth
Image 1761
From the above food web diagram, if all grass dies then population of mouse will
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>From the above food web diagram, if all grass dies then population of mouse will The answer is no.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>From the above food web diagram, if all grass dies then population of mouse will The answer is no.<|endofchunk|>
CORRECT decrease
Image 1719
which label is give to thrust?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>which label is give to thrust? The answer is thrust.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>which label is give to thrust? The answer is that the thrust is given by the weight of the ship. The weight of the ship is given by
CORRECT d
Image 1076
Image 308
According to the given food chain, what will happen if seeds were not present in this food chain?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food chain, what will happen if seeds were not present in this food chain? The answer is given below.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>According to the given food chain, what will happen if seeds were not present in this food chain? The answer is given below.<|endofchunk|>
CORRECT Population of grouse and chipmunk would decrease.
Image 1836
What doe the grashopper need for nourishment?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What doe the grashopper need for nourishment? The answer is food chain.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What doe the grashopper need for nourishment? The answer is not as simple as you might think.<|endofchunk|>
CORRECT grass
Image 2224
The diagram depicts the life cycle of a butterfly. Which letter in the diagram represents the larva?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>The diagram depicts the life cycle of a butterfly. Which letter in the diagram represents the larva? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>The diagram depicts the life cycle of a butterfly. Which letter in the diagram represents the larva? The answer is B.<|endofchunk|>
CORRECT D
Image 28
Based on the diagram below, how many different food sources does the trout have?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Based on the diagram below, how many different food sources does the trout have? The answer is 4.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Based on the diagram below, how many different food sources does the trout have? The answer is 2.<|endofchunk|>
CORRECT three
Image 83
Night is experienced by the part of the Earth that is farthest from the sun. Which letter represents night?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Night is experienced by the part of the Earth that is farthest from the sun. Which letter represents night? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Night is experienced by the part of the Earth that is farthest from the sun. Which letter represents night? The answer is B.<|endofchunk|>
CORRECT E
Image 135
Which diagram letter shows the moon in the first quarter?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which diagram letter shows the moon in the first quarter? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>Which diagram letter shows the moon in the first quarter? The answer is B.<|endofchunk|>
CORRECT Letter J
Image 761
In the diagram, letters A, B, C and D show stages in the moon's cycle around the earth. Which letter shows apogee?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>In the diagram, letters A, B, C and D show stages in the moon's cycle around the earth. Which letter shows apogee? The answer is B.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>In the diagram, letters A, B, C and D show stages in the moon's cycle around the earth. Which letter shows apogee? The answer is B.<|endofchunk|>
CORRECT C
Image 2112
Image 676
the outermost layer of earth is known as:
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>the outermost layer of earth is known as: The answer is: The outermost layer of earth is known as: The outermost layer of earth is known as
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>the outermost layer of earth is known as: The answer is: The outermost layer of earth is known as: The outermost layer of earth is known as
CORRECT Crust
Image 1128
What does G represent ?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What does G represent? The answer is that it stands for Gynostemma pentaphyllum.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What does G represent? The answer is that it stands for Gynostemma pentaphyllum.<|endofchunk|>
CORRECT The Root System
Image 2537
On which day was the moon a half moon?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>On which day was the moon a half moon? The answer is: July 27, 2012.<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>On which day was the moon a half moon? The answer is: The moon is always a half-moon. The moon is always a half-moon
CORRECT Thursday
Image 559
What organism is showed in the diagram above?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What organism is showed in the diagram above? The answer is:<|endofchunk|>
ANNOTATED


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


<image>What organism is showed in the diagram above? The answer is:<|endofchunk|>
CORRECT Plant
Image 2205
How many stage shown?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


Image 14
According to the given food chain, if sea weeds are washed out due to Tsunami, which one would happen?
REGULAR


Setting `pad_token_id` to `eos_token_id`:50277 for open-end generation.


In [None]:
# pip install -U sentence-transformers

# from sentence_transformers import SentenceTransformer, util

# eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# def similarityScore(eval_model, realAnswer, modelOutput):
#     embedding_1= eval_model.encode(realAnswer, convert_to_tensor=True)
#     embedding_2 = eval_model.encode(modelOutput, convert_to_tensor=True)
#     return util.pytorch_cos_sim(embedding_1, embedding_2)