In [1]:
from pathlib import Path
from dotenv import load_dotenv
import os
import json

# Let python-dotenv populate the process environment before the team
# settings are read from it.
load_dotenv()
TEAM_NAME = os.environ.get("TEAM_NAME")
TEAM_TRACK = os.environ.get("TEAM_TRACK")

# NOTE: despite its name, input_dir is the path of the annotation file itself
# (vlm.jsonl), not of a directory.
# NOTE: when a variable is unset its value is None, and the f-strings below
# then embed the literal text "None" in the path.
input_dir = Path(f"/home/jupyter/{TEAM_TRACK}/vlm.jsonl")
output_dir = Path(f"/home/jupyter/{TEAM_NAME}/models")

In [2]:
from PIL import Image

def read_jsonl(file_path):
    """Lazily yield one decoded JSON value per non-blank line of a JSON Lines file.

    Args:
        file_path: Path (str or path-like) of the file to read.

    Yields:
        The value returned by ``json.loads`` for every line that contains
        something other than whitespace, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    Because this is a generator, both errors surface when the result is
    iterated, not when the function is called.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so non-ASCII captions read the same on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank / whitespace-only lines (e.g. a stray extra newline at the
            # end of the file) carry no record and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)

def extract_categories(data):
    """Collect the distinct caption fragments found in every annotation.

    Each item's ``'annotations'`` list is walked, every annotation's
    ``'caption'`` string is split on the two-character separator ``', '``,
    and all resulting pieces are gathered.

    Args:
        data: Iterable of dicts, each with an ``'annotations'`` list whose
            entries have a ``'caption'`` string.

    Returns:
        A ``set`` of the unique caption pieces (empty when there are no
        annotations).
    """
    return {
        piece
        for record in data
        for annotation in record['annotations']
        for piece in annotation['caption'].split(', ')
    }

def create_category_mapping(categories):
    """Assign a stable integer id to every distinct category name.

    Ids are handed out in sorted order of the names. Enumerating a ``set``
    directly would follow its hash-based iteration order, which for strings
    can differ from one interpreter session to the next, so the same data
    could get different ids on every kernel restart.

    Args:
        categories: Iterable of category names (duplicates are ignored).

    Returns:
        Dict mapping each category name to an id in ``0 .. n-1``, where ``n``
        is the number of distinct names.
    """
    return {category: idx for idx, category in enumerate(sorted(set(categories)))}

def get_category_id(caption, category_mapping):
    """Look up the category id that best matches ``caption``.

    Matching is tried from most to least specific, so the result does not
    depend on the iteration order of ``category_mapping`` and a short name
    (e.g. ``'car'``) cannot shadow a longer one that contains it
    (e.g. ``'cargo aircraft'``):

    1. the whole caption is a category name;
    2. one of the caption's ``', '``-separated parts is a category name
       (the first such part wins);
    3. a category name occurs somewhere inside the caption (the longest
       such name wins).

    Args:
        caption: Caption string of one annotation.
        category_mapping: Dict mapping category name to integer id.

    Returns:
        The matching category id, or ``-1`` when nothing matches.
    """
    if caption in category_mapping:
        return category_mapping[caption]

    for part in caption.split(', '):
        if part in category_mapping:
            return category_mapping[part]

    # Fallback: plain substring containment, preferring the longest name.
    best = max(
        (category for category in category_mapping if category in caption),
        key=len,
        default=None,
    )
    return -1 if best is None else category_mapping[best]

def translate_annotations(image_data, image_path, image_id, category_mapping):
    """Convert one source record into a per-image dict with column-style objects.

    Args:
        image_data: Dict with an ``'annotations'`` list; every annotation has
            a ``'bbox'`` of four values (unpacked as ``x, y, w, h``) and a
            ``'caption'`` string.
        image_path: Location of the image file; opened only to read its size
            and stored unchanged under ``'image'``.
        image_id: Identifier stored under ``'image_id'``.
        category_mapping: Dict of category name to id, passed to
            ``get_category_id``.

    Returns:
        Dict with ``'image_id'``, ``'image'``, ``'width'``, ``'height'`` and
        ``'objects'``. ``'objects'`` holds four parallel lists: ``'id'``
        (position of the annotation within the image), ``'area'``
        (``w * h``), ``'bbox'`` (``[x, y, w, h]``) and ``'category'``
        (whatever ``get_category_id`` returns for the caption).
    """
    # Only the dimensions are needed. The context manager releases the file
    # handle immediately; without it one handle per image stays open until
    # the object is garbage collected, which can exhaust the process's
    # file-descriptor limit on large datasets.
    with Image.open(image_path) as image:
        width, height = image.size

    objects = {
        'id': [],
        'area': [],
        'bbox': [],
        'category': []
    }

    for obj_id, annotation in enumerate(image_data['annotations']):
        x, y, w, h = annotation['bbox']
        category_id = get_category_id(annotation['caption'], category_mapping)

        objects['id'].append(obj_id)
        objects['area'].append(w * h)
        objects['bbox'].append([x, y, w, h])
        objects['category'].append(category_id)

    return {
        'image_id': image_id,
        'image': image_path,
        'width': width,
        'height': height,
        'objects': objects
    }

In [3]:
# Read every record into memory: the records are walked twice, once to
# collect the categories and once to translate them.
file_path = input_dir
data = [record for record in read_jsonl(file_path)]
categories = extract_categories(data)
category_mapping = create_category_mapping(categories)

# Translate each record; image ids are simply the record's position in the file.
translated_data = []
for image_id, item in enumerate(data):
    image_path = Path(f"/home/jupyter/til-24-siewsugar/vlm-img/images/{item['image']}")
    translated = translate_annotations(item, image_path, image_id, category_mapping)
    translated_data.append(translated)
# Leave the counter one past the last assigned id (0 when there was no data).
image_id = len(data)

# Write the translated records as JSON; default=str turns values json cannot
# encode natively (such as the Path stored under 'image') into strings.
output_file_path = 'train-detr-v.20.json'
with open(output_file_path, 'w') as output_file:
    json.dump(translated_data, output_file, indent=4, default=str)

print(f"Translated data saved to {output_file_path}")

Translated data saved to train-detr-v.20.json


In [7]:
import json
from datasets import Dataset
from PIL import Image

def create_hf_dataset(json_path):
    """Build a ``Dataset`` from a JSON file of per-image records.

    Every record must provide ``'image'`` (a path that ``Image.open`` can
    read), ``'image_id'``, ``'width'``, ``'height'`` and ``'objects'``.

    Note that every image is decoded and held in memory at the same time
    before the dataset is created.

    Args:
        json_path: Path of the JSON file containing a list of records.

    Returns:
        The object returned by ``Dataset.from_list`` for the assembled rows.
    """
    with open(json_path, 'r') as file:
        records = json.load(file)

    rows = []
    for record in records:
        # copy() yields an image that remains usable after the with-block
        # has closed the underlying file.
        with Image.open(record['image']) as img:
            loaded_image = img.copy()
        rows.append({
            'image_id': record['image_id'],
            'image': loaded_image,
            'width': record['width'],
            'height': record['height'],
            'objects': record['objects']
        })

    return Dataset.from_list(rows)

# Build the dataset from the JSON file written by the translation cell and
# print the resulting object.
# NOTE(review): the output recorded for this cell is
# "OSError: [Errno 24] Too many open files" -- presumably file handles were
# leaked earlier in the same kernel session; confirm after a kernel restart.
hf_dataset = create_hf_dataset('train-detr-v.20.json')
print(hf_dataset)

OSError: [Errno 24] Too many open files: 'train-detr-v.20.json'