# 目标检测数据集清洗
本文档记录目标检测数据集清洗的方法与步骤。原始数据集由图片数据和`Pascal VOC`格式的标注数据组成，数据清洗的主要工作包括：
1. 删除多余的标注xml文件
2. 删除空的标注xml文件
3. 删除多余的图片文件
4. 获取所有的标注类别
5. 检查标注文件中的拼写错误并纠正
6. 检查标注文件中的标注框是否超出图片边界(待定)
7. 校验图片尺寸是否与标注文件中的尺寸一致(待定)
8. 删除长宽比过大或过小的图片(待定)
9. 转换数据格式为YOLO格式

# 获取原始数据集

In [None]:
import os
from pathlib import Path

# Dataset layout: <root>/JPEGImages holds the images, <root>/Annotations the Pascal VOC XML files.
root_dir = Path('scene_data')
image_dir = root_dir.joinpath('JPEGImages')
annotation_dir = root_dir.joinpath('Annotations')

# Both folders must exist and be directories before any cleaning step runs.
required_dirs = (image_dir, annotation_dir)
assert all(folder.exists() for folder in required_dirs) and all(folder.is_dir() for folder in required_dirs), '数据集目录不存在'

# Snapshot of the files currently on disk; later cells refresh these lists after deleting files.
image_files = [*image_dir.glob('*.jpg')]
annotation_files = [*annotation_dir.glob('*.xml')]

print(f'原始数据集共有{len(image_files)}张图片，{len(annotation_files)}个标注文件')

# 删除多余的标注xml文件
`Pascal VOC`格式的标注文件应与图片文件一一对应，但有时会出现多余的标注文件：这些文件没有对应的图片文件，需要删除。

In [None]:
# Walk every annotation file; when no image with the same stem exists,
# the annotation is an orphan and is deleted.
for annotation_file in annotation_files:
    image_file = image_dir / f'{annotation_file.stem}.jpg'
    if image_file.exists():
        continue
    print(f'删除多余的标注文件{annotation_file}')
    annotation_file.unlink()

# Refresh both listings so later cells see the directory state after the deletions.
annotation_files = list(annotation_dir.glob('*.xml'))
image_files = list(image_dir.glob('*.jpg'))
print(f'删除完成，剩余{len(annotation_files)}个标注文件,图片文件{len(image_files)}个')

# 删除空的标注xml文件
部分标注文件中没有任何标注框，这些标注文件对训练没有意义，需要删除。
使用`xml.etree.ElementTree`解析xml文件并查找`object`节点，如果不存在任何`object`节点，则删除该标注文件。

In [None]:
import xml.etree.ElementTree as ET

# An annotation without any <object> element contains no bounding box, so it is deleted.
for annotation_file in annotation_files:
    tree = ET.parse(annotation_file)
    root = tree.getroot()
    if root.find('object') is not None:
        continue
    print(f'删除空的标注文件{annotation_file.name}')
    annotation_file.unlink()

# Refresh both listings so later cells see the directory state after the deletions.
annotation_files = [*annotation_dir.glob('*.xml')]
image_files = [*image_dir.glob('*.jpg')]
print(f'删除完成，剩余{len(annotation_files)}个标注文件,图片文件{len(image_files)}个')

# 删除多余的图片文件
删除多余的图片文件，即图片文件没有对应的标注文件

In [None]:
# An image whose stem has no matching .xml file in the annotation directory is deleted.
for image_file in image_files:
    annotation_file = annotation_dir / f'{image_file.stem}.xml'
    if annotation_file.exists():
        continue
    print(f'删除多余的图片文件{image_file.name}')
    image_file.unlink()

# Refresh both listings so later cells see the directory state after the deletions.
annotation_files = [*annotation_dir.glob('*.xml')]
image_files = [*image_dir.glob('*.jpg')]
print(f'删除完成，剩余{len(annotation_files)}个标注文件,图片文件{len(image_files)}个')

# 获取所有的标注类别
获取所有的标注类别，这些类别将用于后续的标签纠正，标签转换等工作

In [None]:
import xml.etree.ElementTree as ET


def get_classes(annotation_files):
    """Collect the distinct object class names used in Pascal VOC annotation files.

    Args:
        annotation_files: Iterable of paths to Pascal VOC XML annotation files.

    Returns:
        list[str]: The distinct ``<object>/<name>`` values, sorted alphabetically.
        Sorting keeps the order (and any class id derived from a position in
        this list) identical on every run; iterating a ``set`` of strings does not.
    """
    classes = set()
    for annotation_file in annotation_files:
        root = ET.parse(annotation_file).getroot()
        for obj in root.findall('object'):
            # findtext returns None when <name> is missing and '' when it is
            # empty; neither is a usable class name, so both are skipped.
            name = obj.findtext('name')
            if name:
                classes.add(name)
    return sorted(classes)


# Collect the class names present in the current annotation files and report them.
classes = get_classes(annotation_files)
print(f'数据集中共有{len(classes)}个标注类别，分别为\n{classes}')


# 校正拼写错误的标签
数据集人工手动标注过程中，可能会出现标签拼写错误的情况，需要校正这些错误的标签

In [None]:
# Corrigenda: maps each misspelled label to its correct spelling.
corrigenda = {
    "documents_voucher001": "documents_voucher",
    "seal000": "seal"
}

# Walk every annotation file and replace any label that is listed in the corrigenda.
for annotation_file in annotation_files:
    tree = ET.parse(annotation_file)
    root = tree.getroot()
    modified = False
    for obj in root.findall('object'):
        name_node = obj.find('name')
        if name_node is None:
            # An <object> without a <name> has nothing to correct.
            continue
        name = name_node.text
        if name in corrigenda:
            print(f'标签{annotation_file.name}中的标签{name}拼写错误，已经替换为{corrigenda[name]}')
            name_node.text = corrigenda[name]
            modified = True
    # Only rewrite files that actually changed, so untouched annotations
    # keep their original bytes (declaration, encoding, formatting).
    if modified:
        tree.write(annotation_file)

# Re-collect the class names to confirm the misspelled labels are gone.
classes = get_classes(annotation_files)
print(f'数据集中共有{len(classes)}个标注类别，分别为\n{classes}')


In [None]:
# Persist the class list to classes.txt, one class name per line.
class_listing = '\n'.join(classes)
with open('classes.txt', mode='w') as f:
    f.write(class_listing)

# 检查标注文件中的标注框是否超出图片边界

In [None]:
# TODO

# 校验图片尺寸是否与标注文件中的尺寸一致

In [None]:
from PIL import Image
import xml.etree.ElementTree as ET
from tqdm import tqdm


def check_image_size(image_files, annotation_files):
    """Make the ``<size>`` stored in each annotation match the real image size.

    Every image is paired with the annotation file that has the same file
    stem, so the result does not depend on the two lists being in the same
    order. When the recorded width or height differs from the actual image,
    the annotation file is rewritten with the actual values.

    Args:
        image_files: Iterable of image file paths.
        annotation_files: Iterable of Pascal VOC XML annotation file paths.

    Returns:
        None. Mismatching annotation files are modified in place on disk.
    """
    annotation_by_stem = {Path(path).stem: path for path in annotation_files}
    for image_file in tqdm(image_files):
        image_file = Path(image_file)
        annotation_file = annotation_by_stem.get(image_file.stem)
        if annotation_file is None:
            print(f'图片{image_file.name}没有对应的标注文件，已跳过')
            continue

        # Only the size is needed; close the image handle right away.
        with Image.open(image_file) as image:
            image_width, image_height = image.size

        # Keep the tree object itself: it is what gets written back below.
        tree = ET.parse(annotation_file)
        size = tree.getroot().find('size')
        width_node = size.find('width')
        height_node = size.find('height')
        width = int(width_node.text)
        height = int(height_node.text)

        if width != image_width or height != image_height:
            print(f'图片{image_file.name}的尺寸与标注文件中的尺寸不一致')
            print(f'图片尺寸为{image_width}*{image_height},标注文件中的尺寸为{width}*{height}')
            print(f'已经将标注文件中的尺寸修改为{image_width}*{image_height}')
            width_node.text = str(image_width)
            height_node.text = str(image_height)
            tree.write(annotation_file)


check_image_size(image_files, annotation_files)

# 删除长宽比过大或过小的图片

In [None]:
# TODO

# 转换数据格式为YOLO格式

In [None]:
from typing import List, NoReturn, Tuple, Union


def xyxy2xywh(size: Tuple[int, int], box: Tuple[float, float, float, float]) -> Tuple[float, float, float, float]:
    """
    Convert a corner-format box (xmin, ymin, xmax, ymax) into a
    center-format box (x, y, w, h) scaled by the image size.

    Args:
        size(tuple[int,int]): image width and height, in that order
        box(tuple[float,float,float,float]): box corners as (xmin, ymin, xmax, ymax)

    Returns:
        tuple[float, float, float, float]: (x, y, w, h) where x and w are
        divided by the image width and y and h by the image height

    """
    img_w, img_h = size[0], size[1]
    xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]

    # Reciprocal scale factors, applied with a multiplication below.
    scale_x = 1. / img_w
    scale_y = 1. / img_h

    center_x = (xmin + xmax) / 2 * scale_x
    center_y = (ymin + ymax) / 2 * scale_y
    box_w = (xmax - xmin) * scale_x
    box_h = (ymax - ymin) * scale_y
    return center_x, center_y, box_w, box_h

In [None]:
from os import PathLike
import logging

# Basic logging configuration: emit records at DEBUG level and above,
# formatted as "<logger name> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(name)s - %(levelname)s - %(message)s')


def voc2yolo(src_anno_dir: Union[str, Path], classes: List[str], dst_anno_dir: Union[str, Path] = 'labels') -> None:
    """
    Convert Pascal VOC XML annotation files into YOLO text label files.

    For every ``*.xml`` file in ``src_anno_dir`` a ``.txt`` file with the same
    stem is written, holding one ``id x y w h`` line per kept object. Objects
    whose class is not in ``classes`` or that are flagged ``difficult == 1``
    are left out. A ``classes.txt`` listing ``classes`` is written alongside
    the label files.

    Args:
        src_anno_dir(str | Path): directory containing the Pascal VOC XML files
        classes(List[str]): object class names; the position of a name is its YOLO class id
        dst_anno_dir(str | Path): output directory for the YOLO files; a relative
            path is resolved against the parent directory of ``src_anno_dir``

    Returns:
        None

    Raises:
        AssertionError: if ``src_anno_dir`` does not exist or is not a directory

    """
    src_dir = Path(src_anno_dir)
    # Validate the directory that was actually passed in.
    assert src_dir.exists(), "The Annotation Dir Not Exists, Please Check it"
    assert src_dir.is_dir(), "The  Annotation Dir has to be a Directory"

    # Output directory for the YOLO label files; created when missing.
    yolo_annotation_dir = src_dir.parent / dst_anno_dir
    yolo_annotation_dir.mkdir(parents=True, exist_ok=True)

    # All XML annotation files, sorted so the processing order is stable.
    annotation_files = sorted(path for path in src_dir.iterdir() if path.name.endswith('.xml'))

    # Write the class names to classes.txt next to the label files.
    with open(yolo_annotation_dir / "classes.txt", 'w+', encoding='utf-8') as fw:
        fw.write("\n".join(classes))

    # name -> id lookup; setdefault keeps the first index of a duplicated
    # name, which is what list.index would return.
    class_ids = {}
    for class_index, class_name in enumerate(classes):
        class_ids.setdefault(class_name, class_index)

    logging.info("Start Processing Annotation files' format from Pascal VOC to YOLO ...")
    for file in tqdm(annotation_files):
        root = ET.parse(file).getroot()

        # Image shape as recorded in the annotation.
        # TODO: the recorded shape may differ from the real image; read it from the image file later.
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)

        label_lines = []
        for obj in root.iter('object'):
            cls = obj.findtext('name')
            if cls not in class_ids:
                continue
            # A missing or empty <difficult> tag is treated as "not difficult".
            difficult = (obj.findtext('difficult') or '').strip()
            if difficult and int(difficult) == 1:
                continue

            # The XML stores the box as x1, y1, x2, y2.
            bndbox = obj.find('bndbox')
            box = (
                float(bndbox.find('xmin').text),
                float(bndbox.find('ymin').text),
                float(bndbox.find('xmax').text),
                float(bndbox.find('ymax').text)
            )

            # YOLO expects x_center, y_center, w, h relative to the image size.
            bbox = xyxy2xywh((w, h), box)
            label_lines.append(f"{class_ids[cls]} " + " ".join(str(x) for x in bbox) + '\n')

        # Swap only the suffix, so a stem that happens to contain "xml" is
        # left alone; the with-block closes the file after writing.
        label_path = yolo_annotation_dir / file.with_suffix('.txt').name
        with open(label_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(label_lines)
    logging.info("Done Processing.")

In [None]:
# Reload the class list from classes.txt, one class name per line.
with open('classes.txt', 'r') as f:
    # Drop blank lines (e.g. caused by a trailing newline) so they do not
    # turn into an empty class name.
    classes = [line for line in f.read().split('\n') if line]
print(classes)

In [None]:
# Convert the Pascal VOC annotations in annotation_dir to YOLO label files using the loaded class list.
voc2yolo(annotation_dir, classes)

In [None]:
# Rename the image and label files to a common, zero-padded naming scheme.
import os
from pathlib import Path
root_dir = Path('scene_data_yolo')
image_dir = root_dir / 'images'
label_dir = root_dir / 'labels'

import shutil

name_prefix = 'abc_bank_personal_belonging_scene_'
# Each image is renamed to prefix + 8-digit counter, e.g. abc_bank_personal_belonging_scene_00000000.jpg,
# and its label file gets the same new stem, e.g. abc_bank_personal_belonging_scene_00000000.txt,
# so the correspondence between image files and label files is kept.
image_files = sorted(image_dir.glob('*.jpg'))
label_files = sorted(label_dir.glob('*.txt'))

# Pair files by stem instead of by list position: two separate directory
# listings are not guaranteed to line up, and an extra .txt file in the
# label directory would otherwise shift every pair.
labels_by_stem = {label_file.stem: label_file for label_file in label_files}
matched_pairs = []
for image_file in image_files:
    label_file = labels_by_stem.get(image_file.stem)
    if label_file is None:
        print(f"skip {image_file}: no label file with the same name")
        continue
    matched_pairs.append((image_file, label_file))

for index, (image_file, label_file) in enumerate(matched_pairs):
    new_stem = name_prefix + str(index).zfill(8)
    renames = (
        (image_file, image_dir / (new_stem + '.jpg')),
        (label_file, label_dir / (new_stem + '.txt')),
    )
    for src_path, dst_path in renames:
        src = os.path.abspath(str(src_path))
        dst = os.path.abspath(str(dst_path))
        if src == dst:
            # Already carries its final name (e.g. the cell was run before).
            continue
        if os.path.exists(dst):
            # Stop before shutil.move can replace a different, existing file.
            raise FileExistsError(f"refusing to overwrite existing file {dst}")
        print(f"rename {src} to {dst}")
        shutil.move(src, dst)



# 拆分数据集

In [27]:
# The dataset directory is scene_data_yolo; it contains an images and a labels directory.
# The dataset is split into four parts - train, val, test and test_2 - in the ratio 7:1:1:1.
# train and val are used for training and validation, test is used for testing, and
# test_2 is a backup in case the test part is damaged.
# Each part gets its own images and labels directory.

from pathlib import Path
import shutil
import random
from tqdm import tqdm
import os

root_dir = Path('scene_data_yolo')
image_dir = root_dir / 'images'
label_dir = root_dir / 'labels'

# Sorted listings make the split independent of the filesystem's enumeration order.
image_files = sorted(image_dir.glob('*.jpg'))
label_files = sorted(label_dir.glob('*.txt'))

print(f'原始数据集共有{len(image_files)}张图片，{len(label_files)}个标注文件')

# Pair every image with the label file of the same stem, so the
# correspondence never depends on two listings being in the same order.
labels_by_stem = {label_file.stem: label_file for label_file in label_files}
pairs = []
for image_file in image_files:
    label_file = labels_by_stem.get(image_file.stem)
    if label_file is None:
        raise FileNotFoundError(f'{image_file}没有对应的标注文件')
    pairs.append((image_file, label_file))

# Shuffle the pairs as a unit; the fixed seed keeps the split reproducible.
random.seed(2021)
random.shuffle(pairs)

# Keep the two module-level lists aligned with the shuffled order.
image_files = [image_file for image_file, _ in pairs]
label_files = [label_file for _, label_file in pairs]

total = len(pairs)
train_end = int(total * 0.7)
val_end = int(total * 0.8)
test_end = int(total * 0.9)

# (split name, first index, end index, prefix used in the progress message)
splits = [
    ('train', 0, train_end, 'training copy'),
    ('val', train_end, val_end, 'val copy'),
    ('test', val_end, test_end, 'copy'),
    ('test_2', test_end, total, 'copy'),
]

for split_name, start, end, log_prefix in splits:
    dst_image_dir = root_dir / split_name / 'images'
    dst_label_dir = root_dir / split_name / 'labels'
    dst_image_dir.mkdir(parents=True, exist_ok=True)
    dst_label_dir.mkdir(parents=True, exist_ok=True)

    for image_file, label_file in pairs[start:end]:
        src_image = os.path.abspath(str(image_file))
        dst_image = os.path.abspath(str(dst_image_dir / image_file.name))
        src_label = os.path.abspath(str(label_file))
        dst_label = os.path.abspath(str(dst_label_dir / label_file.name))

        print(f"{log_prefix} {src_image} to {dst_image}")
        shutil.copy(src_image, dst_image)
        print(f"{log_prefix} {src_label} to {dst_label}")
        shutil.copy(src_label, dst_label)


原始数据集共有14072张图片，14072个标注文件
training copy E:\Datasets\bank_op_datasets\scene_data_yolo\images\abc_bank_personal_belonging_scene_00011379.jpg to E:\Datasets\bank_op_datasets\scene_data_yolo\train\images\abc_bank_personal_belonging_scene_00011379.jpg
training copy E:\Datasets\bank_op_datasets\scene_data_yolo\labels\abc_bank_personal_belonging_scene_00011379.txt to E:\Datasets\bank_op_datasets\scene_data_yolo\train\labels\abc_bank_personal_belonging_scene_00011379.txt
training copy E:\Datasets\bank_op_datasets\scene_data_yolo\images\abc_bank_personal_belonging_scene_00004056.jpg to E:\Datasets\bank_op_datasets\scene_data_yolo\train\images\abc_bank_personal_belonging_scene_00004056.jpg
training copy E:\Datasets\bank_op_datasets\scene_data_yolo\labels\abc_bank_personal_belonging_scene_00004056.txt to E:\Datasets\bank_op_datasets\scene_data_yolo\train\labels\abc_bank_personal_belonging_scene_00004056.txt
training copy E:\Datasets\bank_op_datasets\scene_data_yolo\images\abc_bank_personal_belon