In [None]:
import json
import os
import shutil

# Set the Hugging Face cache location before `datasets` is imported
os.environ["HF_HOME"] = "/your/path/hf_cache"
from datasets import load_dataset

    

In [None]:
# Loading the captions of COCO dataset
# The later cells index `captions_data` as a plain dict with the top-level keys
# "info", "licenses", "images" and "annotations", so the annotation file is read
# with the json module. (load_dataset('json', data_files=<str>, split="val") only
# exposes a "train" split and returns column-oriented data, which does not match
# that usage.)
with open("/your/path/annotations/captions_val2017.json", 'r', encoding='utf-8') as f:
    captions_data = json.load(f)
print(f"Totally {len(captions_data['annotations'])} descriptions, regarding {len(captions_data['images'])} images.")

In [None]:
# Filtering the captions with specific keywords
# Only as an example of some keywords of gestures, we can modify the keywords as needed
# Each keyword carries a leading space (and a trailing one for the bare/-s forms) so that
# it is matched as a separate word inside a caption rather than as part of a longer word.
# ' squatting' is the correct spelling; the misspelled ' squating' is kept so captions
# that contain the same typo are still matched.
keywords = [
    ' stand ', ' stands ', ' standing',
    ' sit ', ' sits ', ' sitting',
    ' jump ', ' jumps ', ' jumping',
    ' lie ', ' lies ', ' lying',
    ' bend ', ' bends ', ' bending',
    ' kneel ', ' kneels ', ' kneeling',
    ' squat ', ' squats ', ' squatting', ' squating',
    ' crawl ', ' crawls ', ' crawling',
]

In [None]:
# The codes below are used to filter the images that are related to the positive keywords
# Inflected forms whose base gesture cannot be obtained by just dropping "ing" or "s".
# 'squating' is listed so the misspelled keyword also resolves to 'squat'.
IRREGULAR_FORMS = {'lying': 'lie', 'sitting': 'sit', 'squatting': 'squat', 'squating': 'squat'}


def keyword_to_category(keyword):
    """Map a keyword such as ' stands ' or ' lying' to its base gesture ('stand', 'lie')."""
    # The keywords are padded with spaces for word matching; the padding must be
    # removed before looking at the suffix, otherwise no suffix test can succeed.
    word = keyword.strip()
    if word in IRREGULAR_FORMS:
        return IRREGULAR_FORMS[word]
    if word.endswith('ing'):
        return word[:-3]
    if word.endswith('s'):
        return word[:-1]
    return word


filtered_annotations = []
for ann in captions_data['annotations']:
    caption = ann['caption'].lower()
    for keyword in keywords:
        if keyword in caption:
            # Store a copy with the extra "category" key so the loaded annotations
            # stay untouched and this cell can be re-run safely.
            filtered_annotations.append({**ann, "category": keyword_to_category(keyword)})
            # Only the first matching keyword decides the category of a caption
            break

In [None]:
# Collecting the image id of every kept annotation (one entry per annotation,
# so an image with several matching captions appears several times)
filtered_image_ids = []
for kept_annotation in filtered_annotations:
    filtered_image_ids.append(kept_annotation['image_id'])


In [None]:

# Dropping the manually rejected images listed in bad_ids.txt (one image id per line)
with open("bad_ids.txt", 'r') as f:
    # The lines are text while the annotation image ids are compared as integers
    # (assumes integer COCO image ids), so convert before comparing; blank lines are skipped.
    bad_image_ids = {int(line) for line in f if line.strip()}

# Remove every annotation that belongs to a rejected image, then rebuild the id list
# from what is left so both stay consistent with each other.
filtered_annotations = [
    ann for ann in filtered_annotations
    if ann['image_id'] not in bad_image_ids
]
filtered_image_ids = [ann['image_id'] for ann in filtered_annotations]

# A set makes the membership test below constant-time per image
kept_image_ids = set(filtered_image_ids)
filtered_images = [
    img for img in captions_data['images']
    if img['id'] in kept_image_ids
]

In [None]:
# Optional
# Saving the filtered annotations and images to a new JSON file
# The output keeps the same top-level layout as the loaded annotation file
filtered_data = {
    "info": captions_data["info"],
    "licenses": captions_data["licenses"],
    "images": filtered_images,
    "annotations": filtered_annotations
}
# os.makedirs works on every platform and does not fail when the cell is re-run
os.makedirs("anno_filter", exist_ok=True)
with open('anno_filter/filtered.json', 'w') as f:
    json.dump(filtered_data, f, indent=2)
print(f"Extracting {len(filtered_annotations)} descriptions, regarding {len(filtered_images)} images.")

In [None]:
# Optional: Copying the filtered images to a new directory
# Note: Helping us to check the filtered images manually
os.makedirs("fil_imgs", exist_ok=True)
missing_files = []
# dict.fromkeys drops repeated ids (one per matching caption) while keeping their order,
# so every image is copied only once.
for image_id in dict.fromkeys(filtered_image_ids):
    # Image files are named after the id, zero-padded to 12 digits
    file_name = str(image_id).zfill(12) + ".jpg"
    src = os.path.join("val2017", file_name)
    dst = os.path.join("fil_imgs", file_name)
    if os.path.isfile(src):
        # Copy in-process instead of starting a shell `cp` for every image
        shutil.copy(src, dst)
    else:
        # Keep going like the shell-based copy did, but remember what was skipped
        missing_files.append(src)
if missing_files:
    print(f"Skipped {len(missing_files)} images that were not found, e.g. {missing_files[0]}")

In [None]:
def generate_sample(gesture, image_ids):
    """Build one retrieval sample for a gesture query.

    Args:
        gesture: Base form of the gesture verb, e.g. "stand", "sit" or "squat".
        image_ids: Candidate image ids for the query; written into the sample
            as the string representation of this object.

    Returns:
        A dict with the keys "qry_text", "qry_img_path", "tgt_text" and
        "tgt_img_path".
    """
    # Verbs whose "-ing" form is not simply "<verb>ing"
    irregular_participles = {"lie": "lying", "sit": "sitting", "squat": "squatting"}
    participle = irregular_participles.get(gesture, f"{gesture}ing")

    sample = {
        "qry_text": f"Find me an everyday image that contains someone {participle}.",
        "qry_img_path": "",
        "tgt_text": "<|image_1|> Represent the given image.",
        # NOTE(review): the ids are stored as one string such as "[1, 2, 3]", not as a
        # list - kept as is for compatibility; confirm this is what the consumer expects.
        "tgt_img_path": f"{image_ids}"
    }
    return sample

In [None]:
import random

# Generating the retrieval samples: for every kept annotation, its image is the positive
# candidate and images showing other gestures are sampled as negatives.
GESTURES = ('stand', 'sit', 'jump', 'lie', 'bend', 'kneel', 'squat', 'crawl')
# 'squat' and 'kneel' are never used as negatives for each other
CONFUSABLE_GESTURES = frozenset(('squat', 'kneel'))
NUM_NEGATIVES = 99
NEGATIVE_SEED = 42  # fixed seed so a re-run produces the same samples

rng = random.Random(NEGATIVE_SEED)

# Gesture categories found for each image (an image can have several matching captions)
image_categories = {}
for a in filtered_annotations:
    image_categories.setdefault(a['image_id'], set()).add(a['category'])

# Negative pools depend only on the excluded gestures, so each one is built once
# instead of being recomputed for every annotation.
negative_pools = {}

final = []
for anno in filtered_annotations:
    image_id = anno['image_id']
    gesture = anno['category']
    if gesture not in GESTURES:
        raise ValueError(f"Unknown gesture category: {gesture!r}")

    if gesture in CONFUSABLE_GESTURES:
        excluded = CONFUSABLE_GESTURES
    else:
        excluded = frozenset((gesture,))

    if excluded not in negative_pools:
        # Unique images that show another known gesture and none of the excluded ones.
        # This also keeps the positive image itself out of its own negatives.
        negative_pools[excluded] = sorted(
            img_id for img_id, categories in image_categories.items()
            if categories.isdisjoint(excluded) and not categories.isdisjoint(GESTURES)
        )
        if len(negative_pools[excluded]) < NUM_NEGATIVES:
            print(f"Only {len(negative_pools[excluded])} negative images available "
                  f"when excluding {sorted(excluded)}; using all of them.")
    pool = negative_pools[excluded]

    # random.sample raises if asked for more items than the pool holds
    nega_ids = rng.sample(pool, min(NUM_NEGATIVES, len(pool)))
    image_ids = [image_id] + nega_ids

    # Generate a sample for the given gesture and image IDs (positive image first)
    sample = generate_sample(gesture, image_ids)
    final.append(sample)

In [None]:
# Writing the generated retrieval samples to disk as an indented, non-ASCII-escaped JSON file
output_dir = "gesture"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "COCO_gesture_retrieval.json")
with open(output_file, mode='w', encoding='utf-8') as out:
    serialized = json.dumps(final, ensure_ascii=False, indent=4)
    out.write(serialized)