## **Query Generation Pipeline using LLaMA 3.2** 

In [5]:
import datasets 

# load from disk 
meme_ds = datasets.load_from_disk('./full_meme_cap_ocr')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# filter out the memes with no text

meme_ds = meme_ds.filter(lambda x: len(x["extracted_text"]["<OCR_WITH_REGION>"]["labels"]) > 0 or "" not in x["extracted_text"]["<OCR_WITH_REGION>"]["labels"] or None not in x["extracted_text"]["<OCR_WITH_REGION>"]["labels"])

In [3]:
meme_ds

Dataset({
    features: ['category', 'img_captions', 'meme_captions', 'title', 'url', 'img_fname', 'metaphors', 'post_id', 'extracted_text'],
    num_rows: 6382
})

In [1]:
from together import Together
client = Together(api_key="TOGETHER_API_KEY")

In [5]:
meme_ds[0]

{'category': 'memes',
 'img_captions': ['three heads of Avatars, flash drive and guns '],
 'meme_captions': ['Meme poster is trying to convey that Person freaks out at a gun, is fine when they learn the gun has a flash drive magazine, but then starts freaking out again when learning the usb is filled with virtual bullets. '],
 'title': 'Plot Twist!',
 'url': 'https://i.redd.it/t530tkhpy6w91.png',
 'img_fname': 'memes_ye4wo5.png',
 'metaphors': [{'meaning': 'Humanity', 'metaphor': 'three heads'}],
 'post_id': 'ye4wo5',
 'extracted_text': {'<OCR_WITH_REGION>': {'labels': ['</s>Panik',
    'Kalm',
    'Panik'],
   'quad_boxes': [[171.6479949951172,
     0.7394999861717224,
     266.8800048828125,
     0.6614999771118164,
     274.55999755859375,
     0.7664999961853027,
     178.55999755859375,
     0.8445000052452087],
    [170.8800048828125,
     1.8165000677108765,
     257.66400146484375,
     1.8165000677108765,
     257.66400146484375,
     1.9185000658035278,
     170.8800048828125

In [23]:
def make_string(row):
    img_captions = ', '.join(row['img_captions'])
    meme_captions = ', '.join(row['meme_captions'])
    title = row['title']
    metaphors =str(row['metaphors'])
    try:
        ocr = ', '.join(row['extracted_text']["<OCR_WITH_REGION>"]["labels"]).replace("</s>", "")
    except:
        ocr = "no ocr for this image"
    return f"img_captions: {img_captions}\nmeme_captions: {meme_captions}\ntitle: {title}\ntags: {metaphors}\nocr: {ocr}"

In [24]:
def create_queries(row):
        try:    
                meme_details = make_string(row)
                response = client.chat.completions.create(
                model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
                messages=[
                        {
                                "role": "user",
                                "content": [
                                        {
                                                "type": "text",
                                                "text": "Do you know \"Tip Of My Tongue\" Problem ?"
                                        }
                                ]
                        },
                        {
                                "role": "assistant",
                                "content": "You‛re referring to the \"Tip of the Tongue\" (TOT) phenomenon!\n\Tip-of-the-tongue (ToT) known-item retrieval is defined as \"an item identification task in which the searcher has previously experienced an item but cannot recall a reliable identifier\" (i.e., \"It’s on the tip of my tongue…\"). The TREC ToT track aims to develop IR systems that can successfully resolve ToT information needs. Progress in this area will likely benefit other IR systems that must deal with memory assistance, such as personal information management (PIM) systems (e.g., email re-finding)."
                        },
                        {
                                "role": "user",
                                "content": [
                                        {
                                                "type": "text",
                                                "text": f"Now, I'm creating a queries for a meme dataset. The dataset contains the following details: {meme_details}. I want to create the queries to be used later in an information retrieval system. The meme is going to be retrieved using the queries. Your goal is to take these details and create a \"tip of my tongue\" diverse queries out of them.\n\n The queries should be structured as follows:\n - every query should be one string, short or long.\n - every query can make use of one or more detail from the above details. Preferrably, try to make use of every detail to be put in the query string to make search easier.\n- a query should resemble as if a user is trying to search for this specific meme as if the words are at the \"tip of his tongue\". Create only 5 queries and you should output the queries in a python list format. It's of utmost importance that the output is in the python list format."
                                        }
                                ]
                        },
                ],
                max_tokens=512,
                temperature=0.7,
                top_p=0.7,
                top_k=50,
                repetition_penalty=1,
                stop=["<|eot_id|>","<|eom_id|>"],
                )
                response_str = response.choices[0].message.content
                # parse the list of queries using str parsing from the response between the [ and ]
                queries = response_str[response_str.find("[")+1:response_str.find("]")]
                # create list of queries
                queries = queries.split(",")
                # remove the quotes and \n from the queries
                queries = [query.replace('"', "").replace("\n", "").strip() for query in queries]
                row["queries"] = queries
                return row
        except Exception as e:
                row["queries"] = ["ERROR"]
                return row
        

In [14]:
def split_dataset_into_shards(dataset, shard_size=100):
    shards = []
    for i in range(0, len(dataset), shard_size):
        # Ensure the upper bound doesn't exceed the length of the dataset
        upper_bound = min(i + shard_size, len(dataset))
        shards.append(dataset.select(range(i, upper_bound)))
    return shards


In [15]:
shards = split_dataset_into_shards(meme_ds,10)

In [16]:
for idx, shard in enumerate(shards):
    shard = shard.map(create_queries, num_proc=6)
    # save checkpoint 
    shard.save_to_disk(f"./meme_ds_with_queries/shard_{idx}/")

Map (num_proc=6): 100%|██████████| 10/10 [00:13<00:00,  1.34s/ examples]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 815.35 examples/s]
Map (num_proc=6): 100%|██████████| 10/10 [00:10<00:00,  1.05s/ examples]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1188.63 examples/s]
Map (num_proc=6): 100%|██████████| 10/10 [00:09<00:00,  1.04 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1268.69 examples/s]
Map (num_proc=6): 100%|██████████| 10/10 [00:11<00:00,  1.18s/ examples]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1404.05 examples/s]
Map (num_proc=6): 100%|██████████| 10/10 [00:07<00:00,  1.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1281.88 examples/s]
Map (num_proc=6): 100%|██████████| 10/10 [00:09<00:00,  1.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1319.54 examples/s]
Map (num_proc=6): 100%|

In [6]:
# load all shards and combine to single dataset 
shards = [datasets.load_from_disk(f"./meme_ds_with_queries/shard_{idx}") for idx in range(639)]

In [7]:
full_meme_queries = datasets.concatenate_datasets(shards)   

In [62]:
import random
full_meme_queries["queries"][random.randint(0, len(full_meme_queries))]

['Indiana Jones surrounded by people with guns Uruguay vs Portugal',
 'Meme with Christiano Ronaldo shocked expression Uruguay beating Portugal',
 'Uruguay people with handguns Indiana Jones nervous face',
 'Portugal vs Uruguay World Cup match meme with Indiana Jones',
 'Christiano Ronaldo surprised Uruguay wins with handgun metaphor']