<a href="https://colab.research.google.com/github/minoxd/fiber_optic_box_inspection/blob/main/fiftyone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<!-- Autogenerated by `scripts/make_examples.py` -->
<table align="left">
    <td>
        <a target="_blank" href="https://colab.research.google.com/github/voxel51/fiftyone-examples/blob/master/examples/quickstart.ipynb">
            <img src="https://user-images.githubusercontent.com/25985824/104791629-6e618700-5769-11eb-857f-d176b37d2496.png" height="32" width="32">
            Try in Google Colab
        </a>
    </td>
    <td>
        <a target="_blank" href="https://nbviewer.jupyter.org/github/voxel51/fiftyone-examples/blob/master/examples/quickstart.ipynb">
            <img src="https://user-images.githubusercontent.com/25985824/104791634-6efa1d80-5769-11eb-8a4c-71d6cb53ccf0.png" height="32" width="32">
            Share via nbviewer
        </a>
    </td>
    <td>
        <a target="_blank" href="https://github.com/voxel51/fiftyone-examples/blob/master/examples/quickstart.ipynb">
            <img src="https://user-images.githubusercontent.com/25985824/104791633-6efa1d80-5769-11eb-8ee3-4b2123fe4b66.png" height="32" width="32">
            View on GitHub
        </a>
    </td>
    <td>
        <a href="https://github.com/voxel51/fiftyone-examples/raw/master/examples/quickstart.ipynb" download>
            <img src="https://user-images.githubusercontent.com/25985824/104792428-60f9cc00-576c-11eb-95a4-5709d803023a.png" height="32" width="32">
            Download notebook
        </a>
    </td>
</table>


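# Get dataset

> **Note:** run the cells under *Environmental Setup* (further down) first. The cells in this section use `shutil`, `tarfile`, `paths`, and `local_paths`, which are only defined there, so they fail on a fresh kernel if executed in document order.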

In [None]:
# Delete any existing local dataset directory (recursively, ignoring a missing path)
# so the copy/extract cells that follow start from an empty /content/dataset_v2_ALL.
!rm -rf /content/dataset_v2_ALL/

In [None]:
# Copy the Drive dataset folder onto the local disk. dirs_exist_ok=True lets the
# copy merge into a destination directory that already exists instead of raising.
shutil.copytree(
    src=paths['dataset_dir'],
    dst=local_paths['dataset_dir'],
    dirs_exist_ok=True,
)

PosixPath('/content/dataset_v2_ALL')

In [None]:
# Unpack the gzip-compressed processed dataset archive (read from the Drive path)
# into the local dataset directory, then report what was extracted where.
with tarfile.open(name=paths['processed_dataset_compressed'], mode='r:gz') as archive:
    archive.extractall(path=local_paths['dataset_dir'])
    print(f"Extracted {paths['processed_dataset_compressed']} to {local_paths['dataset_dir']}")

Extracted /content/drive/MyDrive/fiber_optic_box_inspection/dataset_v2_ALL/processed_dataset.tar.gz to /content/dataset_v2_ALL


# Environmental Setup

## mount gdrive, import libraries, declare constants, specify paths

In [None]:
# Mount Google Drive at /content/drive. force_remount=True asks Colab to mount
# again even when Drive is already mounted in this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
from pathlib import Path
import shutil
import tarfile
import os
from sklearn.model_selection import train_test_split
import copy
from multiprocessing import Pool
import json
import yaml
import numpy as np
import cv2
import matplotlib.pyplot as plt
import albumentations as A
from google.colab.patches import cv2_imshow
# import torch
# import pandas as pd

# Remove the /content/sample_data directory when it is present.
sample_data_dir = Path('/content/sample_data')
if sample_data_dir.exists():
    shutil.rmtree(sample_data_dir)

In [None]:
# Image-size constant. It is not referenced by any of the cells shown here.
# NOTE(review): presumably the square image side length in pixels used for
# training/inference — confirm against the notebooks that consume it.
IMGSZ = 960

These files are required in the `Data_INF` folder:
- data_label_ALL.tar.gz
- inf_raw_classes_v1.0.json
- matching_name_id_v1.0.xlsx

In [None]:
# Project root on the mounted Google Drive. Fail fast with a clear message when
# it is missing, which normally means Drive has not been mounted yet.
BASE_DIR = '/content/drive/MyDrive/fiber_optic_box_inspection'
if not Path(BASE_DIR).exists():
    raise FileNotFoundError(f'Base directory {BASE_DIR} not found. Check Google Drive mount.')

# Root of the local (non-Drive) working area.
LOCAL_BASE_DIR = '/content'

# --- Drive-side locations -------------------------------------------------
paths = {
    'base_dir': Path(BASE_DIR),
    'Data_INF': Path(BASE_DIR, 'Data_INF'),
    'dataset_dir': Path(BASE_DIR, 'dataset_v2_ALL'),
    # 'yolo_output_dir': Path(BASE_DIR, 'runs_yolo'),
}

# exist_ok=True keeps this cell safe to re-run; no separate existence check needed.
paths['dataset_dir'].mkdir(parents=True, exist_ok=True)

# Input files expected inside the Data_INF folder.
paths['raw_classes'] = Path(paths['Data_INF'], 'inf_raw_classes_v1.0.json')
paths['new_raw_classes'] = Path(paths['Data_INF'], 'inf_raw_classes_v1.0_new.json')
paths['matching_name_id'] = Path(paths['Data_INF'], 'matching_name_id_v1.0.xlsx')
paths['dataset_compressed'] = Path(paths['Data_INF'], 'data_label_ALL.tar.gz')

# Archive of the processed dataset, stored next to the Drive dataset folder.
paths['processed_dataset_compressed'] = Path(paths['dataset_dir'], 'processed_dataset.tar.gz')

# --- Local-side locations -------------------------------------------------
local_paths = {
    'base_dir': Path(LOCAL_BASE_DIR),
    'dataset_dir': Path(LOCAL_BASE_DIR, 'dataset_v2_ALL'),
}

local_paths['dataset_dir'].mkdir(parents=True, exist_ok=True)

local_paths['image_label_dir'] = Path(local_paths['dataset_dir'], 'image_label')
local_paths['label_bbox_backup_dir'] = Path(local_paths['dataset_dir'], 'label_bbox_backup')
local_paths['label_id_backup_dir'] = Path(local_paths['dataset_dir'], 'label_id_backup')
local_paths['ids_from_label_files'] = Path(local_paths['dataset_dir'], 'ids_from_label_files_backup_dir.txt')
local_paths['processed_dataset_compressed'] = Path(local_paths['dataset_dir'], 'processed_dataset.tar.gz')

# local_paths['yolo_data_cfg'] =                    Path(local_paths['dataset_dir'], 'yolo_data_cfg.yaml')
# local_paths['train_paths'] =                      Path(local_paths['dataset_dir'], 'train.txt')
# local_paths['val_paths'] =                        Path(local_paths['dataset_dir'], 'val.txt')
# local_paths['test_paths'] =                       Path(local_paths['dataset_dir'], 'test.txt')


### percentile process paths

In [None]:
# Each percentile variant (P00, P25, P30, P35, P40) registers the same nine
# entries in `local_paths`; only the tag embedded in the key and in the
# file/directory name changes. Keys map to names relative to the dataset dir.
_PERCENTILE_ENTRIES = {
    'image_label_{tag}_dir':     'image_label_{tag}',
    'empty_data_{tag}_dir':      'empty_data_{tag}',
    'label_id_backup_{tag}_dir': 'label_id_backup_{tag}',
    'class_names_{tag}':         'class_names_{tag}.txt',
    'all_paths_{tag}':           'all_{tag}.txt',
    'train_paths_{tag}':         'train_{tag}.txt',
    'val_paths_{tag}':           'val_{tag}.txt',
    'test_paths_{tag}':          'test_{tag}.txt',
    'yolo_data_cfg_{tag}':       'yolo_data_cfg_{tag}.yaml',
}

# The augmented P40 variants only register an image/label directory, a train
# list and a YOLO data config.
_AUGMENTED_ENTRIES = {
    'image_label_{tag}_dir': 'image_label_{tag}',
    'train_paths_{tag}':     'train_{tag}.txt',
    'yolo_data_cfg_{tag}':   'yolo_data_cfg_{tag}.yaml',
}

_dataset_dir = local_paths['dataset_dir']

for _tag in ('P00', 'P25', 'P30', 'P35', 'P40'):
    for _key, _name in _PERCENTILE_ENTRIES.items():
        local_paths[_key.format(tag=_tag)] = Path(_dataset_dir, _name.format(tag=_tag))

for _tag in ('P40_aug_1', 'P40_aug_2'):
    for _key, _name in _AUGMENTED_ENTRIES.items():
        local_paths[_key.format(tag=_tag)] = Path(_dataset_dir, _name.format(tag=_tag))

## Install FiftyOne


In [None]:
!pip install fiftyone



## Learn custom class names (TODO)

In [None]:
# matching_name_id_df

In [None]:
# matching_name_id_df = pd.read_excel(paths['matching_name_id'], sheet_name=0)
# matching_name_id = dict(zip(matching_name_id_df['Label ID'], matching_name_id_df['Tên Label']))
# print(matching_name_id[1.0])

In [None]:
# id_to_names = matching_name_id_df.groupby(matching_name_id_df['Label ID'].astype(int))['Tên Label'].apply(list).to_dict()

# def find_names_by_id(integer_part):
#     return id_to_names.get(int(integer_part), ['ID not found'])

# integer_part = 0
# names = find_names_by_id(integer_part)
# print(f'Names for IDs with integer part {integer_part}: {names}')

In [None]:
# id_to_names

## utils

In [None]:
def write_image_paths(paths, destination):
    """Write every entry of ``paths`` to ``destination``, one entry per line.

    Args:
        paths: Iterable of path-like or string entries; each one is formatted
            into the file followed by a newline.
        destination: File to create (or truncate) and write the entries to.
    """
    with open(destination, 'w') as out_file:
        out_file.writelines(f'{entry}\n' for entry in paths)

In [None]:
def read_image_paths(file_path):
    """Load a list of paths from a text file holding one path per line.

    Args:
        file_path: Location of the list file.

    Returns:
        A list of ``Path`` objects, one per non-blank line (surrounding
        whitespace removed). An empty list when ``file_path`` does not exist.
    """
    if not Path(file_path).exists():
        return []

    with open(file_path, 'r') as listing:
        entries = (raw_line.strip() for raw_line in listing)
        return [Path(entry) for entry in entries if entry]

In [None]:
def read_class_names(class_names_path=None):
    """Read class names from a text file, one name per line.

    Args:
        class_names_path: File to read. When omitted, falls back to
            ``local_paths['class_names']`` as the original implementation did.
            The setup cells shown in this notebook only register per-percentile
            keys (e.g. ``'class_names_P40'``), so pass one of those paths
            explicitly, e.g. ``read_class_names(local_paths['class_names_P40'])``.

    Returns:
        List of names in file order with surrounding whitespace removed.
        Blank lines are kept as empty strings so list indices still match
        line numbers.

    Raises:
        KeyError: No path was given and ``local_paths`` has no ``'class_names'`` entry.
        FileNotFoundError: The file does not exist.
    """
    if class_names_path is None:
        class_names_path = local_paths['class_names']
    with open(class_names_path, 'r') as f:
        return [name.strip() for name in f]

# test

In [None]:
import fiftyone as fo

DATASET_NAME = "fiber_optic_train"


def _yolo_line_to_detection(label_line):
    """Convert one YOLO label line into a ``fo.Detection``.

    The line is parsed as ``class_id x_center y_center width height``
    (whitespace-separated). Values are assumed to be normalised to [0, 1],
    which is the relative format FiftyOne expects for ``bounding_box``.

    The box is clipped to the image: both edges are clamped and the
    width/height are recomputed from the clamped edges, so clamping the
    top-left corner no longer pushes the opposite edge outward.
    """
    parts = label_line.split()
    class_id = parts[0]
    x_center, y_center, bbox_width, bbox_height = map(float, parts[1:5])

    x_min = max(0.0, x_center - bbox_width / 2)
    y_min = max(0.0, y_center - bbox_height / 2)
    x_max = min(1.0, x_center + bbox_width / 2)
    y_max = min(1.0, y_center + bbox_height / 2)

    # FiftyOne boxes are [top-left-x, top-left-y, width, height].
    return fo.Detection(
        label=class_id,
        bounding_box=[x_min, y_min, x_max - x_min, y_max - y_min],
    )


samples = []
for list_file in [local_paths['train_paths_P40']]:
    # Skip blank lines so a trailing newline in the list cannot produce an
    # empty filepath.
    with open(list_file, 'r') as f:
        image_paths = [line.strip() for line in f if line.strip()]

    for image_path in image_paths:
        # The label file sits next to the image and differs only by its suffix.
        with open(Path(image_path).with_suffix('.txt'), 'r') as f:
            detections = [
                _yolo_line_to_detection(label_line)
                for label_line in f
                if label_line.strip()
            ]

        sample = fo.Sample(filepath=image_path)
        sample['ground_truth'] = fo.Detections(detections=detections)
        samples.append(sample)

# overwrite=True replaces a dataset of the same name left over from a previous
# run, so the cell can be re-executed without a manual fo.delete_dataset().
dataset = fo.Dataset(DATASET_NAME, overwrite=True)

# Keep the returned IDs in a variable instead of letting the notebook echo
# thousands of them as cell output.
sample_ids = dataset.add_samples(samples)

 100% |███████████████| 7403/7403 [38.8s elapsed, 0s remaining, 132.8 samples/s]      


INFO:eta.core.utils: 100% |███████████████| 7403/7403 [38.8s elapsed, 0s remaining, 132.8 samples/s]      


['6866f06412d76f96f7059e60',
 '6866f06412d76f96f7059e61',
 '6866f06412d76f96f7059e62',
 '6866f06412d76f96f7059e63',
 '6866f06412d76f96f7059e64',
 '6866f06412d76f96f7059e65',
 '6866f06412d76f96f7059e66',
 '6866f06412d76f96f7059e67',
 '6866f06412d76f96f7059e68',
 '6866f06412d76f96f7059e69',
 '6866f06412d76f96f7059e6a',
 '6866f06412d76f96f7059e6b',
 '6866f06412d76f96f7059e6c',
 '6866f06412d76f96f7059e6d',
 '6866f06412d76f96f7059e6e',
 '6866f06412d76f96f7059e6f',
 '6866f06412d76f96f7059e70',
 '6866f06412d76f96f7059e71',
 '6866f06412d76f96f7059e72',
 '6866f06412d76f96f7059e73',
 '6866f06412d76f96f7059e74',
 '6866f06412d76f96f7059e75',
 '6866f06412d76f96f7059e76',
 '6866f06412d76f96f7059e77',
 '6866f06412d76f96f7059e78',
 '6866f06412d76f96f7059e79',
 '6866f06412d76f96f7059e7a',
 '6866f06412d76f96f7059e7b',
 '6866f06412d76f96f7059e7c',
 '6866f06412d76f96f7059e7d',
 '6866f06412d76f96f7059e7e',
 '6866f06412d76f96f7059e7f',
 '6866f06412d76f96f7059e80',
 '6866f06412d76f96f7059e81',
 '6866f06412d7

In [None]:
# Print the dataset summary (name, media type, sample count, tags and field schema).
print(dataset)

Name:        fiber_optic_train
Media type:  image
Num samples: 7403
Persistent:  False
Tags:        []
Sample fields:
    id:               fiftyone.core.fields.ObjectIdField
    filepath:         fiftyone.core.fields.StringField
    tags:             fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:         fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    created_at:       fiftyone.core.fields.DateTimeField
    last_modified_at: fiftyone.core.fields.DateTimeField
    ground_truth:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)


# launch

In [None]:
# Open the FiftyOne App on the dataset; the returned session object is kept in
# `session` so it stays referenced after the cell finishes.
session = fo.launch_app(dataset)