In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from src.data_access_layer.json_services.dataset_loader import DatasetLoader
from typing import List

import os
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

In [3]:
dataset_folder = os.path.join('..', '..', '..', '..', 'datasets')  #указать свой путь
temp_dataset_folder = "E:\\temp\\datasets"  #указать свой путь

saving_train_dataset_folder = os.path.join(temp_dataset_folder, 'train')
saving_val_dataset_folder = os.path.join(temp_dataset_folder, 'val')

labels_folder = os.path.join(dataset_folder, 'labels')
images_folder = os.path.join(dataset_folder, 'images')

train_images_folder = os.path.join(images_folder, 'train')
val_images_folder = os.path.join(images_folder, 'val')

train_dataset_path = os.path.join(labels_folder, 'bdd100k_labels_images_train.json')
val_dataset_path = os.path.join(labels_folder, 'bdd100k_labels_images_val.json')

In [4]:
train_dataset_loader = DatasetLoader(train_dataset_path)
val_dataset_loader = DatasetLoader(val_dataset_path)

Read raw data...
Reading is done
Prepare labels dataset...
Preparation labels is done
Prepare images metadata...
Preparation images metadata is done
Read raw data...
Reading is done
Prepare labels dataset...
Preparation labels is done
Prepare images metadata...
Preparation images metadata is done


In [5]:
train_labels = train_dataset_loader.get_labels_dataset()
val_labels = val_dataset_loader.get_labels_dataset()

In [6]:
train_imgs = train_labels['image_name'].unique().tolist()
val_imgs = val_labels['image_name'].unique().tolist()

In [7]:
classes = sorted(train_labels.label.unique().tolist())
classes

['bike',
 'bus',
 'car',
 'motor',
 'person',
 'rider',
 'traffic light',
 'traffic sign',
 'train',
 'truck']

In [8]:
classes_tuple = [(k, v) for k, v in enumerate(classes)]
classes_tuple

[(0, 'bike'),
 (1, 'bus'),
 (2, 'car'),
 (3, 'motor'),
 (4, 'person'),
 (5, 'rider'),
 (6, 'traffic light'),
 (7, 'traffic sign'),
 (8, 'train'),
 (9, 'truck')]

Сохранение классов

In [9]:
classes_path = os.path.join(temp_dataset_folder, 'classes.csv')

In [10]:
pd.DataFrame.from_records(classes_tuple).to_csv(classes_path, index=False, header=False, sep='\t')

In [11]:
pd.read_csv(classes_path, sep='\t', header=None)  #Для проверки, что все ок, чтение

Unnamed: 0,0,1
0,0,bike
1,1,bus
2,2,car
3,3,motor
4,4,person
5,5,rider
6,6,traffic light
7,7,traffic sign
8,8,train
9,9,truck


In [12]:
def save_yolo_labels(root_folder: str, imgs: List, labels: pd.DataFrame, classes: List):
    if not os.path.exists(root_folder):
        return

    for img in tqdm(imgs):
        ls_bbox = labels.loc[labels.image_name == img].drop("image_name", axis=1).apply(lambda x: x.to_dict(),
                                                                                        axis=1).to_list()
        img_dim = Image.open(os.path.join(root_folder, img)).size
        ls_out = []

        for bbox in ls_bbox:
            bbox_x = (bbox["x_min"] + bbox["x_max"]) / (2 * img_dim[0])
            bbox_y = (bbox["y_min"] + bbox["y_max"]) / (2 * img_dim[1])
            bbox_w = (bbox["x_max"] - bbox["x_min"]) / img_dim[0]
            bbox_h = (bbox["y_max"] - bbox["y_min"]) / img_dim[1]
            class_id = classes.index(bbox["label"])
            ls_out.append("{} {:.3f} {:.3f} {:.3f} {:.3f}".format(class_id, bbox_x, bbox_y, bbox_w, bbox_h))

        filename = os.path.join(root_folder, img.replace("jpg", "txt"))
        print("\n".join(ls_out), file=open(filename, "w"))


def save_pytorch_labels(root_folder: str, imgs: List, labels: pd.DataFrame):
    if not os.path.exists(root_folder):
        return

    prog_bar = tqdm(list(imgs))

    for img in prog_bar:
        prog_bar.set_description(f"Current image: {img}")

        filepath = os.path.join(root_folder, img.replace("jpg", "csv"))

        if os.path.exists(filepath):
            continue

        filtered_labeles = labels.loc[labels.image_name == img].drop("image_name", axis=1)
        filtered_labeles.to_csv(os.path.join(root_folder, img.replace("jpg", "csv")), index=False, sep=';')

In [13]:
save_pytorch_labels(saving_train_dataset_folder, train_imgs, train_labels)

  0%|          | 0/69863 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

In [14]:
save_pytorch_labels(saving_val_dataset_folder, val_imgs, val_labels)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [15]:
def clean_unused_images(root_dir: str, used_images: List):
    exist_imgs = [x for x in os.listdir(root_dir) if '.jpg' in x]
    base_used_images = [os.path.basename(x).replace('.jpg', '') for x in used_images]

    prog_bar = tqdm(list(exist_imgs))

    for exists_img in prog_bar:

        ex_basename = os.path.basename(exists_img).replace('.jpg', '')
        prog_bar.set_description(f"Current image: {ex_basename}")

        if ex_basename not in base_used_images:

            path = os.path.join(root_dir, exists_img)

            if os.path.exists(path):
                os.remove(path)

Сохранение датасета для формата читаемым yolo

In [None]:
save_yolo_labels(saving_train_dataset_folder, train_imgs, train_labels, classes)

In [None]:
save_yolo_labels(saving_val_dataset_folder, val_imgs, val_labels, classes)

В датасете есть картинки где нет объектов с боксами, но есть сегментация дорог. Поэтому было принято их удалить

In [None]:
clean_unused_images(saving_train_dataset_folder, train_imgs)

In [None]:
clean_unused_images(saving_val_dataset_folder, val_imgs)