In [62]:
import xml.etree.ElementTree as ET
import openpyxl

# Inputs for this dataset live under data/<folder_name>/.
folder_name = '792_annotations'
file_path = f'data/{folder_name}/annotations.xml'

# Brand whitelist: the lowercased value of every non-empty cell in column A
# of the active sheet of filtered.xlsx.
filtered_brands = openpyxl.load_workbook(f'data/{folder_name}/filtered.xlsx')
filtered_brands_active = filtered_brands.active
brands = []
for cell in filtered_brands_active['A']:
    if cell.value is None:
        continue
    brands.append(cell.value.lower())

tree = ET.parse(file_path)
root = tree.getroot()

# One record per <image> element: its own attributes plus the attribute dicts
# of its <box> and <points> children (the ElementTree dicts themselves, not copies).
images_data = []
points, bboxes = [], []
for child in root:
    if child.tag != 'image':
        continue
    box, point = [], []
    for subchild in child:
        if subchild.tag == 'box':
            box.append(subchild.attrib)
        elif subchild.tag == 'points':
            point.append(subchild.attrib)
    images_data.append({
        'tag': child.tag,
        'attributes': child.attrib,
        'children': {'box': box, 'points': point},
    })

# for child in root:
#     if child.tag == 'image':
#         box = [subchild.attrib for subchild in child if subchild.tag == 'box' and subchild.attrib.get('label') in brands]
#         if box:
#             point = [subchild.attrib for subchild in child if subchild.tag == 'points']
#             element_info = {
#                 'tag': child.tag,
#                 'attributes': child.attrib,
#                 'children': {'box': box, 'points': point}
#             }
#             images_data.append(element_info)

In [63]:
# Number of <image> records collected from annotations.xml.
len(images_data)

461

In [64]:
# Inspect the first record to check the structure built by the loading cell.
images_data[0]

{'tag': 'image',
 'attributes': {'id': '0',
  'name': '-3avTAO79w.jpg',
  'width': '1125',
  'height': '2000'},
 'children': {'box': [{'label': 'ignore',
    'occluded': '0',
    'source': 'manual',
    'xtl': '285.00',
    'ytl': '463.00',
    'xbr': '356.00',
    'ybr': '532.00',
    'z_order': '0'},
   {'label': 'ignore',
    'occluded': '0',
    'source': 'manual',
    'xtl': '249.00',
    'ytl': '917.00',
    'xbr': '353.00',
    'ybr': '964.00',
    'z_order': '0'},
   {'label': 'ignore',
    'occluded': '0',
    'source': 'manual',
    'xtl': '443.00',
    'ytl': '1331.00',
    'xbr': '535.00',
    'ybr': '1406.00',
    'z_order': '0'},
   {'label': 'ignore',
    'occluded': '0',
    'source': 'manual',
    'xtl': '630.00',
    'ytl': '1534.00',
    'xbr': '696.00',
    'ybr': '1602.00',
    'z_order': '0'},
   {'label': 'ignore',
    'occluded': '0',
    'source': 'manual',
    'xtl': '248.00',
    'ytl': '664.00',
    'xbr': '325.00',
    'ybr': '691.00',
    'z_order': '0'},


In [65]:
from PIL import Image, ImageDraw
import os

def get_unique_filename(file_path):
    """Return a path derived from ``file_path`` that does not exist yet.

    ``file_path`` itself is returned when nothing exists there. Otherwise the
    suffixes ``_1``, ``_2``, ... are inserted before the extension, in order,
    and the first candidate that is not present on disk is returned.
    """
    stem, ext = os.path.splitext(file_path)
    candidate = file_path
    attempt = 0
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{ext}"
    return candidate

def save_crop(img_name, bbox_orig, destination):
    """Crop a region out of an image file and save it to ``destination``.

    Args:
        img_name: Path of the source image.
        bbox_orig: Crop rectangle passed straight to ``Image.crop``
            (the callers in this notebook pass ``[xtl, ytl, xbr, ybr]``).
        destination: Path the cropped image is written to.
    """
    # Imported locally on purpose: a later cell in this notebook rebinds the
    # global name ``Image`` to ``openpyxl.drawing.image.Image``, which would
    # break this function if the cropping cell is re-run afterwards.
    from PIL import Image as PILImage

    # The context manager closes the source file handle, which the previous
    # bare ``Image.open`` left open for every crop.
    with PILImage.open(img_name) as original_image:
        original_image.crop(bbox_orig).save(destination)

In [66]:
# For every point whose label is a known brand, find the boxes that strictly
# contain it, crop each such box, and tag the box dict in place with the
# point's label, its coordinates and the crop's file name.
# NOTE: get_unique_filename never reuses a name, so re-running this cell adds
# new crop files next to the earlier ones instead of overwriting them.
crop_dir = f'data/{folder_name}/cropped'
os.makedirs(crop_dir, exist_ok=True)  # saving a crop fails if the folder is missing

for image_data in images_data:
    checked = 0
    image_name = image_data['attributes']['name']
    # Parse each box's corners once per image instead of once per (point, box) pair.
    parsed_boxes = [
        (box_data, [int(float(box_data[key])) for key in ('xtl', 'ytl', 'xbr', 'ybr')])
        for box_data in image_data['children']['box']
    ]
    for point_data in image_data['children']['points']:
        # The brand test does not depend on the box, so settle it before the inner loop.
        if point_data['label'].lower() not in brands:
            continue
        point_data_coord = list(map(int, map(float, point_data['points'].split(','))))
        for box_data, box_bbox in parsed_boxes:
            inside = (box_bbox[0] < point_data_coord[0] < box_bbox[2] and
                      box_bbox[1] < point_data_coord[1] < box_bbox[3])
            if not inside:
                continue
            box_data['label'] = point_data['label']
            box_data['x'] = point_data_coord[0]
            box_data['y'] = point_data_coord[1]
            destination = get_unique_filename(f'{crop_dir}/cropped_{image_name}')
            save_crop(f'data/{folder_name}/images/{image_name}', box_bbox, destination)
            box_data['cropped_name'] = os.path.basename(destination)
            checked = 1
    # 1 when at least one box of this image was matched and cropped, else 0.
    image_data['checked'] = checked

In [67]:
# Keep only the images flagged with checked == 1 by the matching step.
# The records are the same dict objects as in images_data, not copies.
images_changed_data = []
for record in images_data:
    if record['checked'] == 1:
        images_changed_data.append(record)

In [68]:
# Number of images in which at least one brand point was matched to a box.
len(images_changed_data)

41

In [69]:
# Reduce each matched image to its relabelled boxes: drop every box still
# labelled 'ignore' and discard the raw points list.
# The records are edited in place, so images_data sees the same change.
for image_data in images_changed_data:
    children = image_data['children']
    children['box'] = [box for box in children['box'] if box['label'] != 'ignore']
    # pop() with a default keeps the cell safe to re-run once 'points' is gone,
    # without a bare except that would also hide unrelated errors.
    children.pop('points', None)

In [70]:
# Same list as before the cleanup: boxes were removed inside the records, no record was dropped.
len(images_changed_data)

41

In [71]:
# Full dump of the matched records (prints every remaining box of every image).
images_changed_data

[{'tag': 'image',
  'attributes': {'id': '0',
   'name': '-3avTAO79w.jpg',
   'width': '1125',
   'height': '2000'},
  'children': {'box': [{'label': 'Курганский МК Халяль Говядина "Курганская" 325 г.',
     'occluded': '0',
     'source': 'manual',
     'xtl': '598.00',
     'ytl': '1109.00',
     'xbr': '693.00',
     'ybr': '1155.00',
     'z_order': '0',
     'x': 631,
     'y': 1135,
     'cropped_name': 'cropped_-3avTAO79w_40.jpg'},
    {'label': 'Myaso в банке Оленина тушеная в/с 338 гр.',
     'occluded': '0',
     'source': 'manual',
     'xtl': '243.00',
     'ytl': '1023.00',
     'xbr': '317.00',
     'ybr': '1120.00',
     'z_order': '0',
     'x': 281,
     'y': 1073,
     'cropped_name': 'cropped_-3avTAO79w_38.jpg'},
    {'label': 'Курганский МК Халяль Говядина "Курганская" 325 г.',
     'occluded': '0',
     'source': 'manual',
     'xtl': '598.00',
     'ytl': '1056.00',
     'xbr': '693.00',
     'ybr': '1106.00',
     'z_order': '0',
     'x': 653,
     'y': 1079,
  

In [72]:
import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.utils import get_column_letter
from copy import deepcopy
from PIL import Image as PILImage

def pixels_to_width_units(pixels):
    """Convert a pixel width into the value used for worksheet column widths.

    The conversion is a fixed ratio of 7 pixels per width unit.
    """
    width_units = pixels / 7
    return width_units

def get_image_size_with_aspect_ratio(image_path, max_width):
    """Return the ``(width, height)`` at which an image should be displayed.

    Images no wider than ``max_width`` keep their original size. Wider images
    are scaled down to exactly ``max_width``, with the height multiplied by the
    same factor (so the returned height may be a float).
    """
    with PILImage.open(image_path) as img:
        original_width, original_height = img.size

    if original_width <= max_width:
        return original_width, original_height

    scaling_factor = max_width / original_width
    return max_width, original_height * scaling_factor

# Build output.xlsx: one row per (image, label) pair. Six fixed fields come
# first, then the crops of that label embedded as pictures, one per column.
FIXED_COLS = 6                    # label, reference photo, task id, image id, image name, count
FIRST_IMAGE_COL = FIXED_COLS + 1  # 1-based worksheet column that holds 'фото1'

wb = openpyxl.Workbook()
ws = wb.active

result = []
max_points = 0
copied_data = deepcopy(images_changed_data)

for image_data in copied_data:
    # Group this image's boxes by label (first-seen order), skipping 'ignore'.
    label_groups = {}
    for changed_point in image_data['children']['box']:
        if changed_point['label'] != 'ignore':
            label_groups.setdefault(changed_point['label'], []).append(changed_point)

    for label, points in label_groups.items():
        image_file_paths = [f"data/{folder_name}/cropped/{point['cropped_name']}" for point in points]
        column = [label, None, None, image_data['attributes']['id'],
                  image_data['attributes']['name'], len(points)] + image_file_paths
        max_points = max(max_points, len(points))
        result.append(column)

result.sort(key=lambda x: x[0])

headers = ['Название', 'фото эталон', 'task id', 'id фото в task', 'название фото', 'количество точек на фото'] + [f'фото{x}' for x in range(1, max_points + 1)]
ws.append(headers)

# max_image_widths[k] tracks the widest picture placed in worksheet column
# FIRST_IMAGE_COL + k. The previous code indexed it with `col - 6` and applied
# it from column 6, so slot 0 was never used and the last picture column never
# received a width.
max_image_widths = [0] * max_points

for row, data in enumerate(result, start=2):
    max_height_in_row = 0
    for col, cell_value in enumerate(data, start=1):
        if col < FIRST_IMAGE_COL:
            ws.cell(row=row, column=col, value=cell_value)
            continue

        # Everything after the fixed fields is a crop path: embed it, at most 200 px wide.
        img_width, img_height = get_image_size_with_aspect_ratio(cell_value, 200)
        img = Image(cell_value)
        img.width, img.height = img_width, img_height
        max_height_in_row = max(max_height_in_row, img.height)
        ws.add_image(img, get_column_letter(col) + str(row))

        image_col_index = col - FIRST_IMAGE_COL
        max_image_widths[image_col_index] = max(max_image_widths[image_col_index], img_width)

    if max_height_in_row > 0:
        # Presumably a pixel -> point conversion (72/96) so the row fits its tallest picture.
        ws.row_dimensions[row].height = max_height_in_row * 0.75

for i, max_width in enumerate(max_image_widths, start=FIRST_IMAGE_COL):
    if max_width > 0:
        ws.column_dimensions[get_column_letter(i)].width = pixels_to_width_units(max_width)

ws.column_dimensions['A'].width = 40
ws.column_dimensions['E'].width = 20

wb.save(f'data/{folder_name}/output.xlsx')

# Variant for when label filtering isn't necessary: crop every annotated box (covers all cases)

In [88]:
import xml.etree.ElementTree as ET

# Inputs for this dataset live under data/<folder_name>/.
folder_name = '894_annotations'
file_path = f'data/{folder_name}/annotations.xml'

tree = ET.parse(file_path)
root = tree.getroot()

# One record per <image> element: its own attributes plus the attribute dicts
# of its <box> and <points> children (the ElementTree dicts themselves, not copies).
images_data = []
points, bboxes = [], []
for child in root:
    if child.tag != 'image':
        continue
    box, point = [], []
    for subchild in child:
        if subchild.tag == 'box':
            box.append(subchild.attrib)
        elif subchild.tag == 'points':
            point.append(subchild.attrib)
    images_data.append({
        'tag': child.tag,
        'attributes': child.attrib,
        'children': {'box': box, 'points': point},
    })

In [89]:
# Number of <image> records collected from annotations.xml.
len(images_data)

175

In [90]:
# Inspect the first record to check the structure built by the loading cell.
images_data[0]

{'tag': 'image',
 'attributes': {'id': '0',
  'name': '3Q2D_zCyxmo.jpg',
  'width': '1800',
  'height': '3200'},
 'children': {'box': [{'label': 'Venetto',
    'occluded': '0',
    'source': 'manual',
    'xtl': '1315.15',
    'ytl': '1590.12',
    'xbr': '1396.94',
    'ybr': '1665.18',
    'z_order': '0'},
   {'label': 'MONARCH',
    'occluded': '0',
    'source': 'manual',
    'xtl': '953.47',
    'ytl': '650.55',
    'xbr': '1062.91',
    'ybr': '700.63',
    'z_order': '0'},
   {'label': 'Nescafe',
    'occluded': '0',
    'source': 'manual',
    'xtl': '1169.75',
    'ytl': '1183.87',
    'xbr': '1254.09',
    'ybr': '1211.12',
    'z_order': '0'},
   {'label': 'Migel',
    'occluded': '0',
    'source': 'manual',
    'xtl': '1059.70',
    'ytl': '2825.13',
    'xbr': '1238.74',
    'ybr': '2886.95',
    'z_order': '0'},
   {'label': 'MACCOFFEE',
    'occluded': '0',
    'source': 'manual',
    'xtl': '1583.36',
    'ytl': '2846.30',
    'xbr': '1776.30',
    'ybr': '2909.40',
  

In [91]:
from PIL import Image, ImageDraw
import os

def get_unique_filename(file_path):
    """Return a path derived from ``file_path`` that does not exist yet.

    ``file_path`` itself is returned when nothing exists there. Otherwise the
    suffixes ``_1``, ``_2``, ... are inserted before the extension, in order,
    and the first candidate that is not present on disk is returned.
    """
    stem, ext = os.path.splitext(file_path)
    candidate = file_path
    attempt = 0
    while os.path.exists(candidate):
        attempt += 1
        candidate = f"{stem}_{attempt}{ext}"
    return candidate

def save_crop(img_name, bbox_orig, destination):
    """Crop a region out of an image file and save it to ``destination``.

    Args:
        img_name: Path of the source image.
        bbox_orig: Crop rectangle passed straight to ``Image.crop``
            (the callers in this notebook pass ``[xtl, ytl, xbr, ybr]``).
        destination: Path the cropped image is written to.
    """
    # Imported locally on purpose: the export cell in this notebook rebinds the
    # global name ``Image`` to ``openpyxl.drawing.image.Image``, which would
    # break this function if the cropping cell is re-run afterwards.
    from PIL import Image as PILImage

    # The context manager closes the source file handle, which the previous
    # bare ``Image.open`` left open for every crop.
    with PILImage.open(img_name) as original_image:
        original_image.crop(bbox_orig).save(destination)

In [92]:
# Crop every annotated box of every image (no label filtering) and record the
# crop's file name on the box dict in place.
# NOTE: get_unique_filename never reuses a name, so re-running this cell adds
# new crop files next to the earlier ones instead of overwriting them.
crop_dir = f'data/{folder_name}/cropped'
os.makedirs(crop_dir, exist_ok=True)  # saving a crop fails if the folder is missing

for image_data in images_data:
    checked = 0
    image_name = image_data['attributes']['name']
    for box_data in image_data['children']['box']:
        box_bbox = [int(float(box_data[key])) for key in ('xtl', 'ytl', 'xbr', 'ybr')]
        destination = get_unique_filename(f'{crop_dir}/cropped_{image_name}')
        save_crop(f'data/{folder_name}/images/{image_name}', box_bbox, destination)
        box_data['cropped_name'] = os.path.basename(destination)
        checked = 1
    # 1 when the image had at least one box (and therefore at least one crop), else 0.
    image_data['checked'] = checked

In [94]:
# Still one record per image: the cropping loop only adds keys to existing records.
len(images_data)

175

In [95]:
import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.utils import get_column_letter
from copy import deepcopy
from PIL import Image as PILImage

def pixels_to_width_units(pixels):
    """Convert a pixel width into the value used for worksheet column widths.

    The conversion is a fixed ratio of 7 pixels per width unit.
    """
    width_units = pixels / 7
    return width_units

def get_image_size_with_aspect_ratio(image_path, max_width):
    """Return the ``(width, height)`` at which an image should be displayed.

    Images no wider than ``max_width`` keep their original size. Wider images
    are scaled down to exactly ``max_width``, with the height multiplied by the
    same factor (so the returned height may be a float).
    """
    with PILImage.open(image_path) as img:
        original_width, original_height = img.size

    if original_width <= max_width:
        return original_width, original_height

    scaling_factor = max_width / original_width
    return max_width, original_height * scaling_factor

# Build output.xlsx: one row per (image, label) pair. Six fixed fields come
# first, then the crops of that label embedded as pictures, one per column.
FIXED_COLS = 6                    # label, reference photo, task id, image id, image name, count
FIRST_IMAGE_COL = FIXED_COLS + 1  # 1-based worksheet column that holds 'фото1'

wb = openpyxl.Workbook()
ws = wb.active

result = []
max_points = 0
copied_data = deepcopy(images_data)

for image_data in copied_data:
    # Group this image's boxes by label (first-seen order).
    # No label filtering here: every box is grouped, including any labelled 'ignore'.
    label_groups = {}
    for changed_point in image_data['children']['box']:
        label_groups.setdefault(changed_point['label'], []).append(changed_point)

    for label, points in label_groups.items():
        image_file_paths = [f"data/{folder_name}/cropped/{point['cropped_name']}" for point in points]
        column = [label, None, None, image_data['attributes']['id'],
                  image_data['attributes']['name'], len(points)] + image_file_paths
        max_points = max(max_points, len(points))
        result.append(column)

result.sort(key=lambda x: x[0])

headers = ['Название', 'фото эталон', 'task id', 'id фото в task', 'название фото', 'количество точек на фото'] + [f'фото{x}' for x in range(1, max_points + 1)]
ws.append(headers)

# max_image_widths[k] tracks the widest picture placed in worksheet column
# FIRST_IMAGE_COL + k. The previous code indexed it with `col - 6` and applied
# it from column 6, so slot 0 was never used and the last picture column never
# received a width.
max_image_widths = [0] * max_points

for row, data in enumerate(result, start=2):
    max_height_in_row = 0
    for col, cell_value in enumerate(data, start=1):
        if col < FIRST_IMAGE_COL:
            ws.cell(row=row, column=col, value=cell_value)
            continue

        # Everything after the fixed fields is a crop path: embed it, at most 200 px wide.
        img_width, img_height = get_image_size_with_aspect_ratio(cell_value, 200)
        img = Image(cell_value)
        img.width, img.height = img_width, img_height
        max_height_in_row = max(max_height_in_row, img.height)
        ws.add_image(img, get_column_letter(col) + str(row))

        image_col_index = col - FIRST_IMAGE_COL
        max_image_widths[image_col_index] = max(max_image_widths[image_col_index], img_width)

    if max_height_in_row > 0:
        # Presumably a pixel -> point conversion (72/96) so the row fits its tallest picture.
        ws.row_dimensions[row].height = max_height_in_row * 0.75

for i, max_width in enumerate(max_image_widths, start=FIRST_IMAGE_COL):
    if max_width > 0:
        ws.column_dimensions[get_column_letter(i)].width = pixels_to_width_units(max_width)

ws.column_dimensions['A'].width = 40
ws.column_dimensions['E'].width = 20

wb.save(f'data/{folder_name}/output.xlsx')