# 02. Custom Model

[Kaggle - Lemon Quality Dataset](https://www.kaggle.com/datasets/yusufemir/lemon-quality-dataset) を題材に、OCI Vision + Data Labeling を用いてカスタム・モデルによる画像分類を行うデモです。

### データの準備

データをダウンロードし展開します。

```sh
$ tree lemon_dataset -L 1
lemon_dataset
├── bad_quality # 954 枚
├── empty_background # 1128 枚
└── good_quality # 455 枚
```

テスト用に、各ディレクトリからランダムに 30 枚ずつサンプリングし、`lemon_dataset/test/` に移動させます（残りの画像を学習用として使います）。

```sh
# Make sure the destination exists, then move 30 random regular files into it.
# -type f keeps the class directory itself out of the random sample.
mkdir -p ./lemon_dataset/test
find ./lemon_dataset/bad_quality -type f | sort -R | tail -n 30 | xargs -I {} mv {} ./lemon_dataset/test/
```

```sh
# Move 30 random regular files; -type f keeps the class directory itself out of the sample.
find ./lemon_dataset/empty_background -type f | sort -R | tail -n 30 | xargs -I {} mv {} ./lemon_dataset/test/
```

```sh
# Move 30 random regular files; -type f keeps the class directory itself out of the sample.
find ./lemon_dataset/good_quality -type f | sort -R | tail -n 30 | xargs -I {} mv {} ./lemon_dataset/test/
```

最終的に、`lemon_dataset` 内が以下のようになれば OK です。

```sh
$ tree lemon_dataset -L 1
lemon_dataset
├── bad_quality # 924 枚
├── empty_background # 1098 枚
├── good_quality # 425 枚
└── test # 90 枚
```

In [None]:
# Show the top-level (depth 1) layout of lemon_dataset before the test split is sampled.
!tree lemon_dataset -L 1


In [None]:
!rm -rf ./lemon_dataset/test/*
!find ./lemon_dataset/bad_quality | sort -R | tail -n 30 | xargs -i sh -c 'mv {} ./lemon_dataset/test/'
!find ./lemon_dataset/empty_background | sort -R | tail -n 30 | xargs -i sh -c 'mv {} ./lemon_dataset/test/'
!find ./lemon_dataset/good_quality | sort -R | tail -n 30 | xargs -i sh -c 'mv {} ./lemon_dataset/test/'


In [None]:
# Show the top-level (depth 1) layout of lemon_dataset again, after the sampling cell has run.
!tree lemon_dataset -L 1


### オブジェクト・ストレージへのアップロード

Data Labeling のデータセットを作成するために、Object Storage に学習データをアップロードします。


In [None]:
# Set up the Object Storage client used to create the training-data bucket.

# NOTE(review): the star import is presumably where NAMESPACE, COMPARTMENT_ID,
# LEMON_TRAINING_DATA_BUCKET_NAME and the other upper-case names used in later
# cells come from — confirm against config.py.
from config import *
import os
from dotenv import load_dotenv
from oci.config import from_file
from oci.object_storage import ObjectStorageClient
from oci.object_storage.models import CreateBucketDetails
from oci.exceptions import ServiceError

# Called for its side effect only; the return value is not used.
load_dotenv()

# from_file() is called without arguments, so whatever defaults it applies
# decide which OCI configuration is loaded.
config = from_file()
# Shared client; the bucket-creation and upload cells both use os_client.
os_client = ObjectStorageClient(config=config)


In [None]:
# Create the bucket that will hold the training images.
try:
    create_bucket_resp = os_client.create_bucket(
        namespace_name=NAMESPACE,
        create_bucket_details=CreateBucketDetails(
            name=LEMON_TRAINING_DATA_BUCKET_NAME,
            compartment_id=COMPARTMENT_ID,
        ),
    )
except ServiceError as error:
    # Only an HTTP 409 conflict means the bucket is already there. Any other
    # service error (auth, quota, wrong compartment, ...) is a real failure and
    # must not be reported as "already exists".
    if error.status != 409:
        raise
    print(f"bucket: {LEMON_TRAINING_DATA_BUCKET_NAME} is already exists.")
else:
    print(f"bucket: {create_bucket_resp.data}")


In [None]:
# Upload the training images to the training-data bucket.

import glob
import mimetypes
from oci.object_storage import UploadManager
from concurrent.futures import ThreadPoolExecutor

um = UploadManager(object_storage_client=os_client)


def put_object(file):
    """Upload one local file to the training-data bucket.

    The object name is the local path with the leading
    ``LEMON_DATASET_DIRECTORY_PATH/`` stripped, and the content type is
    guessed from the file name.

    Args:
        file: Path of the local file to upload.
    """
    mimetype, _ = mimetypes.guess_type(file)
    um.upload_file(
        namespace_name=NAMESPACE,
        bucket_name=LEMON_TRAINING_DATA_BUCKET_NAME,
        object_name=file.removeprefix(f"{LEMON_DATASET_DIRECTORY_PATH}/"),
        file_path=file,
        content_type=mimetype,
    )


for prefix in DATASET_PREFIX:
    pattern = f"{LEMON_DATASET_DIRECTORY_PATH}/{prefix}/*.jpg"
    print(pattern)
    files = glob.glob(pattern)
    # Report the bucket that put_object actually uploads to.
    print(f"Uploading {len(files)} to the bucket {LEMON_TRAINING_DATA_BUCKET_NAME}, it will take some times...")
    with ThreadPoolExecutor(NO_OF_PROCESSORS) as executor:
        futures = [executor.submit(put_object, file) for file in files]
    # Leaving the with-block waits for every upload. Calling result() re-raises
    # the first upload error instead of silently dropping it.
    for future in futures:
        future.result()


### Data Labeling

In [None]:
# Data Labeling - import the SDK models and create the clients used by the following cells.

from oci.data_labeling_service import DataLabelingManagementClient
from oci.data_labeling_service_dataplane import DataLabelingClient
# NOTE(review): Label and ObjectStorageSourceDetails are taken from the
# data-plane package here, while CreateDatasetDetails / LabelSet come from the
# management package — confirm the nested models match what each API expects.
from oci.data_labeling_service_dataplane.models import Label, ObjectStorageSourceDetails, CreateAnnotationDetails, GenericEntity
from oci.data_labeling_service.models import CreateDatasetDetails, DatasetFormatDetails, LabelSet, GenerateDatasetRecordsDetails

# dlm_client: used for dataset creation, record generation and work requests.
dlm_client = DataLabelingManagementClient(config = config)
# dls_client: used for listing records and creating annotations.
dls_client = DataLabelingClient(config = config)


In [None]:
# Data Labeling - create the dataset.

# CreateDatasetDetails is a management-plane model, so the models nested in it
# must come from oci.data_labeling_service.models. The notebook-level names
# Label / ObjectStorageSourceDetails are the data-plane classes (which use
# different fields), so the management-plane ones are imported under aliases
# here and the notebook-level names are left untouched for the annotation cell.
from oci.data_labeling_service.models import Label as DatasetLabel
from oci.data_labeling_service.models import ObjectStorageSourceDetails as DatasetSourceDetails

# One label per class directory name.
label_set = [DatasetLabel(name=label) for label in DATASET_PREFIX]

try:
    create_data_labeling_resp = dlm_client.create_dataset(
        create_dataset_details=CreateDatasetDetails(
            display_name="lemon-quality-classification-dataset",
            description="dataset of lemon classification demo.",
            compartment_id=COMPARTMENT_ID,
            annotation_format="SINGLE_LABEL",
            dataset_source_details=DatasetSourceDetails(
                source_type="OBJECT_STORAGE",
                namespace=NAMESPACE,
                bucket=LEMON_TRAINING_DATA_BUCKET_NAME,
            ),
            dataset_format_details=DatasetFormatDetails(
                format_type="IMAGE",
            ),
            label_set=LabelSet(
                items=label_set,
            ),
        )
    )
except ServiceError as error:
    # Only an HTTP 409 conflict is treated as "already exists"; every other
    # service error is a real failure and is re-raised.
    if error.status != 409:
        raise
    print("dataset: lemon-quality-classification-dataset is already exists.")
else:
    print(create_data_labeling_resp.data)


In [None]:
# Data Labeling - generate the data records (this takes a while to complete).
import time

dataset_id = create_data_labeling_resp.data.id

generate_records_resp = dlm_client.generate_dataset_records(
    dataset_id=dataset_id,
    generate_dataset_records_details=GenerateDatasetRecordsDetails(
        limit=5000,
    ),
)

# Track the work request started by this very call. Picking the first entry of
# list_work_requests() may return an unrelated request, so that lookup is only
# kept as a fallback when the response header is missing.
work_request_id = generate_records_resp.headers.get("opc-work-request-id")
if work_request_id is None:
    list_wr_resp = dlm_client.list_work_requests(
        compartment_id=COMPARTMENT_ID,
    )
    work_request_id = list_wr_resp.data.items[0].id

print("processing...")
while True:
    time.sleep(30)
    get_wr_resp = dlm_client.get_work_request(work_request_id=work_request_id)
    print(f"complete: {get_wr_resp.data.percent_complete}")
    status = get_wr_resp.data.status
    if status == "SUCCEEDED":
        print("finished")
        break
    # Stop on terminal failure states instead of polling forever.
    if status in ("FAILED", "CANCELED"):
        raise RuntimeError(f"work request {work_request_id} ended with status {status}")


In [None]:
# Data Labeling - attach a label to each data record.
import json


def create_annotation(name, record_id):
    """Annotate one record with the label whose LABEL_MAP key prefixes its name.

    The first LABEL_MAP key (in mapping order) that ``name`` starts with selects
    the label. Records that match no key, or whose mapped label is the empty
    string, are left without an annotation.

    Args:
        name: Record name; per the original comment the keys look like
            ``bad_quality/``, ``empty_background/``, ``good_quality/``.
        record_id: Id of the record to annotate.
    """
    matched_label = next(
        (LABEL_MAP[prefix] for prefix in LABEL_MAP if name.startswith(prefix)),
        "",
    )
    if matched_label == "":
        return

    entity = GenericEntity(
        entity_type="GENERIC",
        labels=[Label(label=matched_label)],
    )
    details = CreateAnnotationDetails(
        record_id=record_id,
        compartment_id=COMPARTMENT_ID,
        entities=[entity],
    )
    dls_client.create_annotation(create_annotation_details=details)

def list_record(page):
    """Fetch one page (up to 1000) of unlabeled records of the dataset.

    Args:
        page: Page token from a previous call, or None for the first page.

    Returns:
        A ``(names, ids, next_page)`` tuple: the record names, the record ids
        in the same order, and the token of the next page (None when this was
        the last page).

    Raises:
        Exception: whatever ``dls_client.list_records`` raises. The error is
            printed and re-raised; previously it was stored as the response and
            then failed with an unrelated AttributeError on ``.data``.
    """
    try:
        response = dls_client.list_records(
            compartment_id=COMPARTMENT_ID,
            dataset_id=dataset_id,
            is_labeled=False,
            limit=1000,
            page=page,
        )
    except Exception as error:
        print(error)
        raise

    # Read the fields from the SDK models directly rather than serialising the
    # collection to a string and parsing it back as JSON.
    records = response.data.items
    names = [record.name for record in records]
    ids = [record.id for record in records]
    next_page = response.next_page if response.has_next_page else None
    return names, ids, next_page


# Label every unlabeled record, page by page, with a pool of worker processes.
# multiprocessing is imported here because no visible cell imports `mp`; it
# would otherwise only exist if the star import from config happens to leak it.
import multiprocessing as mp

pool = mp.Pool(NO_OF_PROCESSORS)
try:
    page = None
    while True:
        names, ids, page = list_record(page)
        # starmap blocks until every record of this page has been processed.
        pool.starmap(create_annotation, zip(names, ids))
        if not page:
            break
finally:
    # Always release the workers, even when listing or annotating fails.
    pool.close()
    pool.join()



In [None]:
# Vision - import the SDK and create the client used by the project, model and inference cells.
from oci.ai_vision import AIServiceVisionClient
from oci.ai_vision import models as vision_models

# Built from the same `config` object as the other OCI clients in this notebook.
vision_client = AIServiceVisionClient(config = config)


In [None]:

# Vision - create the project that will own the custom model.
try:
    create_project_resp = vision_client.create_project(
        create_project_details=vision_models.CreateProjectDetails(
            display_name="lemon-quality-classification-project",
            compartment_id=COMPARTMENT_ID,
        )
    )
except ServiceError as error:
    # Only an HTTP 409 conflict is treated as "already exists"; every other
    # service error is a real failure and is re-raised.
    if error.status != 409:
        raise
    print("project: lemon-quality-classification-project is already exists.")
else:
    print(create_project_resp.data)


In [None]:
# Vision - create the custom image-classification model from the labeled dataset.
try:
    create_custom_model_resp = vision_client.create_model(
        create_model_details=vision_models.CreateModelDetails(
            display_name="lemon-custom-model",
            model_version="1.0.0",
            model_type="IMAGE_CLASSIFICATION",
            compartment_id=COMPARTMENT_ID,
            is_quick_mode=True,
            training_dataset=vision_models.DataScienceLabelingDataset(
                dataset_type="DATA_SCIENCE_LABELING",
                dataset_id=dataset_id,
            ),
            project_id=create_project_resp.data.id,
        )
    )
except ServiceError as error:
    # Only an HTTP 409 conflict is treated as "already exists"; every other
    # service error (bad dataset, limits, auth, ...) is re-raised.
    if error.status != 409:
        raise
    print("model: lemon-custom-model is already exists.")
else:
    print(create_custom_model_resp.data)


In [None]:
# Vision - wait until the model-creation work request finishes.
# Prefer the work request id returned by the create_model call of this session.
# The first entry of list_work_requests() may be an unrelated request, so that
# lookup is only used when the create response (or its header) is unavailable.
model_resp = globals().get("create_custom_model_resp")
work_request_id = None
if model_resp is not None:
    work_request_id = model_resp.headers.get("opc-work-request-id")
if work_request_id is None:
    list_wr_resp = vision_client.list_work_requests(compartment_id=COMPARTMENT_ID)
    work_request_id = list_wr_resp.data.items[0].id

print("processing...")
while True:
    time.sleep(30)
    get_wr_resp = vision_client.get_work_request(work_request_id=work_request_id)
    print(f"complete: {get_wr_resp.data.percent_complete}")
    status = get_wr_resp.data.status
    if status == "SUCCEEDED":
        print("finished")
        break
    # Stop on terminal failure states instead of polling forever.
    if status in ("FAILED", "CANCELED"):
        raise RuntimeError(f"work request {work_request_id} ended with status {status}")


In [None]:
import base64
import glob
import os

# Use the model created in this session when its response is available;
# otherwise fall back to the previously hard-coded model OCID.
model_resp = globals().get("create_custom_model_resp")
if model_resp is not None:
    custom_model_id = model_resp.data.id
else:
    custom_model_id = "ocid1.aivisionmodel.oc1.ap-tokyo-1.amaaaaaassl65iqagayt2uvd7rrsamq3aokyp3yywxghdekw5ogf7xhautkq"
print(custom_model_id)

# The test split is sampled randomly, so the preferred image may not be in it;
# fall back to the first available test image in that case.
image_path = "./lemon_dataset/test/good_quality_396.jpg"
if not os.path.exists(image_path):
    candidates = sorted(glob.glob("./lemon_dataset/test/*.jpg"))
    if not candidates:
        raise FileNotFoundError("no .jpg files found in ./lemon_dataset/test/")
    image_path = candidates[0]
    print(f"test image: {image_path}")

with open(image_path, "rb") as file:
    image_file = file.read()

# INLINE: the image bytes are embedded (base64-encoded) directly in the request payload.
image_classification_analyze_details = vision_models.AnalyzeImageDetails(
    features=[
        vision_models.ImageClassificationFeature(
            feature_type="IMAGE_CLASSIFICATION",
            model_id=custom_model_id,
        )
    ],
    image=vision_models.InlineImageDetails(
        source="INLINE",
        data=base64.b64encode(image_file).decode("utf-8"),
    ),
    compartment_id=COMPARTMENT_ID,
)
image_classification_response = vision_client.analyze_image(
    analyze_image_details=image_classification_analyze_details
)

print(image_classification_response.data)
