In [5]:
# データセットの作成
import time, params, oci_client
from oci.data_labeling_service import DataLabelingManagementClient
from oci.data_labeling_service.models import CreateDatasetDetails, LabelSet, Label, \
    Dataset, DatasetSourceDetails, DatasetFormatDetails, ObjectStorageSourceDetails

# Describe the dataset: image records whose files live in the Object Storage
# bucket configured in `params`.
data_format_details = DatasetFormatDetails(format_type='IMAGE')

data_source_details = ObjectStorageSourceDetails(
    source_type = 'OBJECT_STORAGE',
    namespace = params.os_namespace,
    bucket = params.os_bucket
)

# Every record gets exactly one of these labels (annotation_format is SINGLE_LABEL).
label_set = LabelSet(items=[Label(name='good'), Label(name='bad'), Label(name='empty')])

create_dataset_details = CreateDatasetDetails(
    display_name = params.dataset_name,
    annotation_format = "SINGLE_LABEL",
    compartment_id = params.compartment_id,
    dataset_format_details = data_format_details,
    dataset_source_details = data_source_details,
    label_set = label_set
)

dl_mgmt_client = oci_client.get(DataLabelingManagementClient) # type: DataLabelingManagementClient
dataset = dl_mgmt_client.create_dataset(create_dataset_details).data # type: Dataset

# Poll every 5 seconds until the dataset is ACTIVE. A dataset that ends up in
# one of the states below is not expected to become ACTIVE any more, so fail
# loudly instead of polling forever.
unrecoverable_states = ('FAILED', 'DELETING', 'DELETED')
while True:
    latest_dataset = dl_mgmt_client.get_dataset(dataset.id).data # type: Dataset
    if latest_dataset.lifecycle_state == 'ACTIVE':
        break
    if latest_dataset.lifecycle_state in unrecoverable_states:
        raise RuntimeError(
            f'Dataset "{dataset.display_name}" ({dataset.id}) entered state '
            f'{latest_dataset.lifecycle_state} and will not become ACTIVE.'
        )
    print('.', end='')
    time.sleep(5)

print(f'\n"{dataset.display_name}" was created.')
time.sleep(5) # extra to go to the next cell

....
"lemon_dataset" was created.


In [6]:
# データ・レコードの作成
import params, oci_client, oci, re, random
from oci.data_labeling_service_dataplane import DataLabelingClient
from oci.data_labeling_service_dataplane.models import Record, CreateRecordDetails, CreateObjectStorageSourceDetails
from oci.object_storage import ObjectStorageClient
from oci.object_storage.models import ListObjects, ObjectSummary

# Id of the `dataset` object created by create_dataset earlier in this notebook;
# that cell must have been run in the current kernel. create_record() reads this global.
dataset_id = dataset.id

def create_record(obj: ObjectSummary) -> Record:
    """Register one Object Storage object as a record of the dataset.

    The record is named after the last path segment of the object name and
    references the object through ``relative_path``. The target dataset is
    taken from the module-level ``dataset_id``.

    Args:
        obj: Object summary; only its ``name`` is used.

    Returns:
        The ``data`` of the ``create_record`` response.

    Raises:
        ValueError: If the object name has no final path segment
            (it is empty or ends with "/").
    """
    dl_client = oci_client.get(DataLabelingClient) # type: DataLabelingClient
    object_name = obj.name
    # Take the part after the last "/". Unlike the previous regex match this
    # also handles names without any "/" instead of failing on a None match.
    name = object_name.rsplit('/', 1)[-1]
    if not name:
        raise ValueError('Cannot derive a record name from object: {!r}'.format(object_name))

    create_source_details = CreateObjectStorageSourceDetails(
        source_type = 'OBJECT_STORAGE',
        relative_path = object_name
    )
    create_record_details = CreateRecordDetails(
        compartment_id = params.compartment_id,
        dataset_id = dataset_id,
        source_details = create_source_details,
        name = name
    )
    response = dl_client.create_record(create_record_details)
    return response.data

os_client = oci_client.get(ObjectStorageClient) # type: ObjectStorageClient
# list_call_get_all_results follows pagination, so this covers every object under data/.
response = oci.pagination.list_call_get_all_results(os_client.list_objects, params.os_namespace, params.os_bucket, prefix='data/').data # type: ListObjects
count = 0
objects = response.objects # type: list[ObjectSummary]
# Filter to JPEG objects *before* sampling, so the announced record count and
# the sampling ratio describe the records that are actually created.
jpg_objects = [obj for obj in objects if obj.name.endswith('.jpg')]
sample = jpg_objects if params.sampling_factor == 1.0 else random.sample(jpg_objects, int(len(jpg_objects) * params.sampling_factor))
print(f'Creating data - {len(sample)} records (sampling {int(params.sampling_factor*100.0)}%)')
for obj in sample:
    create_record(obj)
    count += 1
    # One progress dot per 25 created records.
    if count % 25 == 0:
        print('.', end='')
print(f'\ndone - total: {count}')

Creating data - 2528 records (sampling 100%)
.....................................................................................................
done - total: 2528


In [7]:
# データ・レコードにラベルをつける
import params, oci_client
from oci.data_labeling_service_dataplane import DataLabelingClient
from oci.data_labeling_service_dataplane.models import Record, RecordCollection, RecordSummary, CreateAnnotationDetails, GenericEntity, Label

# Id of the `dataset` object created by create_dataset earlier in this notebook;
# that cell must have been run in the current kernel.
dataset_id = dataset.id

# Data-plane client used by create_annotation() and by the record listing in this cell.
dl_client = oci_client.get(DataLabelingClient) # type: DataLabelingClient

# レコード名（ファイル名由来）から "good", "bad","empty" のうちのどれかのラベルをつける
def create_annotation(record: RecordSummary):
    """Annotate a record with the label keyword found in its name.

    The keywords "good", "bad" and "empty" are tried in that order and the
    first one contained in ``record.name`` becomes the record's single label.

    Args:
        record: Record to annotate; its ``name`` and ``id`` are used.

    Raises:
        Exception: If the record name contains none of the keywords.
    """
    # Order matters: the first keyword that occurs in the name wins.
    candidates = ("good", "bad", "empty")
    matched = next((keyword for keyword in candidates if keyword in record.name), None)

    label = Label()
    label.label = matched
    if not label.label:
        raise Exception('No label: {}'.format(record.name))

    generic_entity = GenericEntity(entity_type="GENERIC", labels=[label])
    annotation_details = CreateAnnotationDetails(
        compartment_id = params.compartment_id,
        entities = [generic_entity],
        record_id = record.id
    )
    dl_client.create_annotation(annotation_details)

# This cell calls oci.pagination but its import line does not bring `oci` into
# scope; import it here so the cell does not depend on an earlier cell's import.
import oci

print(f'Creating annotations, it will take some time...')
count = 0
# list_call_get_all_results follows pagination; its `data` is iterated directly below.
records = oci.pagination.list_call_get_all_results(dl_client.list_records, params.compartment_id, dataset_id).data # type: list[RecordSummary]
for record in records:
    create_annotation(record)
    count += 1
    # One progress dot per 25 annotated records.
    if count % 25 == 0:
        print('.', end='')
print(f'\ndone - total: {count}')

Creating annotations, it will take some time...
.....................................................................................................
done - total: 2528


In [None]:
# Dataset 情報を保存
import json

# Minimal description of the created dataset, saved for later use outside this kernel.
dataset_info = {
    "id" : dataset.id,
    "display_name" : dataset.display_name
}

# Write the dict itself. Previously a file-name string was passed to json.dump,
# so the file contained only that string and the dataset id was never saved.
with open('dataset_info.json', 'w') as f:
    json.dump(dataset_info, f, indent=2)