In [8]:
%load_ext autoreload
%autoreload 2
import sys
# Add /workspace to the import path (presumably where the `config` package lives — confirm).
sys.path.append('/workspace')
from clearml import StorageManager, Dataset
from config.default import TrainingConfig
# Instantiate the training configuration with its defaults; later cells read `conf.*`.
conf = TrainingConfig()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from dataclasses import asdict, dataclass, fields
# Display the whole configuration as a nested dict for inspection.
asdict(conf)

{'PROJECT_NAME': 'bousteud',
 'TASK_NAME': 'maturity',
 'TYPE_TASK': <TaskTypes.training: 'training'>,
 'OUTPUT_URI': 's3://10.8.0.66:9000/clearml-test',
 'db': {'bucket_experiment': 's3://10.8.0.66:9000/clearml-test/training/experiment',
  'bucket_dataset': 's3://10.8.0.66:9000/clearml-test/dataset/simple'},
 'data': {'random_seed': 76,
  'dir': '/workspace/dataset/simple',
  'batch': 24,
  'train_ratio': 0.8,
  'val_ratio': 0.1,
  'test_ratio': 0.1,
  'input_size': 224,
  'input_resize': 256},
 'aug': {'augmentor': 'albumentations',
  'type_executions': 'online',
  'augmentor_task': {'train': {'OneOf_1': [{'VerticalFlip': {'always_apply': False,
       'p': 0.95}},
     {'HorizontalFlip': {'always_apply': False, 'p': 0.95}}],
    'ShiftScaleRotate': {'always_apply': True,
     'p': 0.5,
     'shift_limit_x': (-0.12, 0.12),
     'shift_limit_y': (-0.12, 0.12),
     'scale_limit': (-0.050000000000000044, 0.1499999999999999),
     'rotate_limit': (-90, 90),
     'interpolation': 0,
    

#### Manage Data

In [None]:
# Download the dataset folder referenced by the config into the local workspace.
# NOTE(review): the saved output of this cell prints the S3 access key and secret;
# clear the cell output before sharing or committing the notebook.
manager = StorageManager()
manager.download_folder(
    remote_url=conf.db.bucket_dataset,
    local_folder='/workspace/dataset/'
)

{'endpoint_url': 'http://10.8.0.66:9000', 'region_name': 'binsho-server-2', 'use_ssl': False, 'verify': True, 'config': <botocore.config.Config object at 0x7f7243f3fa10>, 'aws_access_key_id': 'agfian_test_1', 'aws_secret_access_key': 'clearml_secret_key_test'}


'/workspace/dataset/'

In [None]:
# Create a new ClearML dataset whose files are stored under the configured output URI.
# NOTE(review): the tag 'lerning-clearml' looks like a typo for 'learning-clearml';
# left unchanged because it is a runtime value — confirm before renaming.
ds = Dataset.create(
    dataset_project='fruit-oil',
    dataset_name='sample-data-bousteud', 
    dataset_tags=['lerning-clearml'],
    output_uri=f'{conf.OUTPUT_URI}/dataset/sample-from-clearml'
)

ClearML results page: http://10.8.0.10:7080/projects/e3ab1f2c806947eea2ea59994d35ec50/experiments/411403ce990b433aaeac743d154d52a2/output/log
ClearML dataset page: http://10.8.0.10:7080/datasets/simple/e3ab1f2c806947eea2ea59994d35ec50/experiments/411403ce990b433aaeac743d154d52a2


In [6]:
# Register every file under the local dataset folder with the ClearML dataset.
ds.add_files(path='/workspace/dataset/simple')

Generating SHA2 hash for 420 files


100%|██████████| 420/420 [00:00<00:00, 4999.31it/s]

Hash generation completed





420

In [7]:
import os

root_folder = '/workspace/dataset/simple'

# Only sub-directories are class folders. A stray regular file under the root
# would otherwise make the per-folder os.listdir below raise NotADirectoryError.
folders = sorted(
    name for name in os.listdir(root_folder)
    if os.path.isdir(os.path.join(root_folder, name))
)

# One single-element row per class folder: the number of entries it contains.
counts = []
for folder in folders:
    count = len(os.listdir(os.path.join(root_folder, folder)))
    counts.append([count])


# Attach the per-class counts to the dataset task as a histogram plot.
ds.get_logger().report_histogram(
    title='Dataset Histogram',
    series='Training Simple Dataset',
    values=counts,
    labels=folders,
    xaxis='class',
    yaxis='count of data'
)

In [8]:
# Upload any pending files, then finalize the dataset.
ds.finalize(auto_upload=True)

Pending uploads, starting dataset upload to s3://10.8.0.66:9000/clearml-test/dataset/sample-from-clearml
Uploading dataset changes (420 files compressed to 69.21 MiB) to s3://10.8.0.66:9000/clearml-test/dataset/sample-from-clearml
File compression and upload completed: total size 69.21 MiB, 1 chunk(s) stored (average size 69.21 MiB)


True

In [10]:
# Show the storage URI the dataset was uploaded to.
ds.get_default_storage()

's3://10.8.0.66:9000/clearml-test/dataset/sample-from-clearml'

# Data Preparation

In [5]:
import glob
import torch
import albumentations as al
from os.path import join
from typing import Union
from PIL import Image
# NOTE: this rebinds `Dataset`, which an earlier cell imported from clearml;
# from this cell onward `Dataset` refers to the torch class.
from torch.utils.data import Dataset, DataLoader
import cv2
from torchvision.datasets import ImageFolder
import torchvision.transforms as tt
import numpy as np



### Prepare Data

In [2]:
from random import shuffle

def get_list_data(root_path, conf:TrainingConfig):
    """Build per-class train/val/test lists of ``(image_path, label_index)`` tuples.

    Every label in ``conf.data.category`` is expected to be a sub-folder of
    ``root_path``. Files that OpenCV cannot decode are skipped. Each class is
    shuffled and split on its own, so the class proportions are kept in every
    subset.

    The shuffle uses a private generator seeded with ``conf.data.random_seed``
    (when that attribute exists) and the files are visited in sorted order, so
    repeated calls with the same data and config return the same split.

    Args:
        root_path: Directory containing one sub-folder per label.
        conf: Training configuration; reads ``conf.data.category``,
            ``train_ratio``, ``val_ratio``, ``test_ratio`` and, optionally,
            ``random_seed``.
            NOTE(review): the config dump earlier in this notebook shows no
            ``category`` field under ``data`` — confirm it exists.

    Returns:
        Tuple ``(ls_train_set, ls_val_set, ls_test_set, d_metadata)``.
        ``d_metadata`` holds the ratios under ``'ratio'``, per-class counts under
        ``'counts'`` and the totals under ``'train_count'``, ``'val_count'`` and
        ``'test_count'``.
    """
    # Local import: a dedicated generator keeps the split independent of the global RNG state.
    from random import Random

    def check_health_img(fp):
        """Return True when OpenCV decodes ``fp`` into an image with non-zero height and width."""
        try:
            img = cv2.imread(fp)
            if img is None:
                # cv2.imread reports an unreadable file by returning None instead of raising.
                raise ValueError('cv2.imread returned None')
            h, w = img.shape[:2]
            return h > 0 and w > 0
        except Exception as e:
            print('[ERROR] Image Corrupt: ', fp, e)
            return False

    def split_list(ls_fp_image):
        """Shuffle ``ls_fp_image`` in place and cut it into train/val/test slices."""
        count_imgs = len(ls_fp_image)
        tr_count = int(tr*count_imgs)
        va_count = int(va*count_imgs)
        rng.shuffle(ls_fp_image)
        train = ls_fp_image[:tr_count]
        val = ls_fp_image[tr_count:tr_count+va_count]
        # Whatever is left after train and val becomes the test slice.
        test = ls_fp_image[tr_count+va_count:]
        return train, val, test

    d_metadata = {
        'ratio': [],
        'counts' : {
            'train': {},
            'val': {},
            'test': {},
        }
    }

    tr = conf.data.train_ratio
    va = conf.data.val_ratio
    te = conf.data.test_ratio
    d_metadata['ratio'] = [tr, va, te]

    # Random(None) seeds from the OS, matching the previous unseeded behaviour
    # when the config carries no seed.
    rng = Random(getattr(conf.data, 'random_seed', None))

    labels = conf.data.category
    d_data = {lbl:[] for lbl in labels}

    for label in labels:
        fp_folder = join(root_path, label)
        label_idx = labels.index(label)
        # Sorted so the pre-shuffle order does not depend on the filesystem.
        for file in sorted(os.listdir(fp_folder)):
            fp_image = join(fp_folder, file)
            if check_health_img(fp_image):
                d_data[label].append((fp_image, label_idx))

    ls_train_set, ls_val_set, ls_test_set = [], [], []
    for key, ls_fp_image in d_data.items():
        ls_train, ls_val, ls_test = split_list(ls_fp_image)
        ls_train_set.extend(ls_train)
        ls_val_set.extend(ls_val)
        ls_test_set.extend(ls_test)
        d_metadata['counts']['train'][key] = len(ls_train)
        d_metadata['counts']['val'][key] = len(ls_val)
        d_metadata['counts']['test'][key] = len(ls_test)

    d_metadata['train_count'] = len(ls_train_set)
    d_metadata['val_count'] = len(ls_val_set)
    d_metadata['test_count'] = len(ls_test_set)

    return ls_train_set, ls_val_set, ls_test_set, d_metadata

### Create Dataset

In [3]:
class ImageDatasetBinsho(Dataset):
    """Dataset over ``(image_path, label)`` pairs with an Albumentations pipeline.

    Args:
        data: Indexable collection of ``(image_path, label)`` pairs.
        transform: List of Albumentations transforms; wrapped in ``al.Compose``.
    """

    def __init__(self, data, transform):
        self.data = data
        self.transform = al.Compose(transform)

    def __len__(self): return len(self.data)

    def __getitem__(self, index):
        """Load, convert to RGB, and transform the sample at ``index``.

        Returns:
            ``(x_image, y_label)`` where ``x_image`` is the ``"image"`` entry
            produced by the transform pipeline and ``y_label`` is an integer
            ``torch`` scalar tensor.
        """
        fp_img, y = self.data[index]
        y_label = torch.tensor(int(y))
        # The context manager closes the file handle as soon as the pixels are copied.
        # convert('RGB') gives every sample three channels, whatever the file's mode
        # (grayscale, palette, RGBA, ...).
        with Image.open(fp_img) as img:
            x_image = np.array(img.convert('RGB'))
        x_image = self.transform(image=x_image)["image"]
        return x_image, y_label


### Create DataLoader

In [141]:
def meanstd(dl):
    """Compute the per-channel mean and standard deviation over all batches of ``dl``.

    Each batch ``x`` is reduced over axes 0, 2 and 3, so it must be 4-D with the
    channel on axis 1. Sums are accumulated over every value, which keeps the
    result exact when the last batch is smaller than the others.

    Args:
        dl: Iterable yielding ``(x, y)`` pairs; ``y`` is ignored.

    Returns:
        Tuple ``(mean, std)`` of 1-D tensors with one entry per channel. The
        same values are also printed.

    Raises:
        ValueError: If ``dl`` yields no values.
    """
    n_values = 0
    sum_ = 0
    sqr_ = 0
    for x, _ in dl:
        if not torch.is_floating_point(x):
            # Integer tensors would overflow when squared.
            x = x.float()
        sum_ = sum_ + x.sum(dim=(0, 2, 3))
        sqr_ = sqr_ + (x ** 2).sum(dim=(0, 2, 3))
        # Number of values contributing to each channel in this batch.
        n_values += x.shape[0] * x.shape[2] * x.shape[3]

    if n_values == 0:
        raise ValueError('meanstd() received an empty data loader')

    mean = sum_ / n_values
    # Var[x] = E[x^2] - E[x]^2; clamp guards against tiny negatives from rounding.
    var = torch.clamp(sqr_ / n_values - mean ** 2, min=0)
    std = torch.sqrt(var)
    print(mean, std)
    return mean, std

In [142]:
# ImageDatasetBinsho takes a ready-made list of (image_path, label) pairs,
# so build that list first and feed all three subsets to the dataset.
ls_train_set, ls_val_set, ls_test_set, d_metadata = get_list_data(root_path='/workspace/dataset/simple', conf=conf)
imgs1 = ImageDatasetBinsho(data=ls_train_set + ls_val_set + ls_test_set, transform=conf.aug.get_ls_val())
len(imgs1)

420

In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# Note - you must have torchvision installed for this example
from torchvision import transforms


class ImageDataModule(pl.LightningDataModule):
    """Lightning data module serving the train/val/test splits of the image dataset.

    The split is computed once per instance and reused by every ``setup`` call,
    so the test set can never contain samples the fit stage trained on.

    Args:
        data_dir: Stored on the instance.
            NOTE(review): ``setup`` reads from a hard-coded dataset path and
            does not use this value — confirm which one is intended.
    """

    def __init__(self, data_dir: str = "./"):
        super().__init__()
        self.data_dir = data_dir
        # NOTE(review): no method of this class reads this transform, and the
        # single-value mean/std fit one channel only — presumably template
        # leftover. Kept so the attribute stays available.
        self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        # Cache for (train, val, test) lists, filled on the first setup() call.
        self._splits = None

    def setup(self, stage: str):
        """Create the datasets needed for ``stage`` (``"fit"``, ``"validate"`` or ``"test"``)."""
        self.conf = TrainingConfig()

        # Split only once: get_list_data shuffles, so recomputing it per stage
        # could place training samples into the test set.
        if self._splits is None:
            ls_train_set, ls_val_set, ls_test_set, _ = get_list_data(root_path='/workspace/dataset/simple', conf=self.conf)
            self._splits = (ls_train_set, ls_val_set, ls_test_set)
        ls_train_set, ls_val_set, ls_test_set = self._splits

        # Training data gets the training augmentations.
        if stage == "fit":
            self.data_train = ImageDatasetBinsho(ls_train_set, transform=self.conf.aug.get_ls_train())

        # Validation data is needed for both fitting and stand-alone validation;
        # it uses the validation pipeline rather than the training augmentations.
        if stage in ("fit", "validate"):
            self.data_val = ImageDatasetBinsho(ls_val_set, transform=self.conf.aug.get_ls_val())

        # Test data is evaluated with the validation pipeline as well.
        if stage == "test":
            self.data_test = ImageDatasetBinsho(ls_test_set, transform=self.conf.aug.get_ls_val())

    def train_dataloader(self):
        """Return the training loader; shuffled because the sample list is grouped by class."""
        return DataLoader(self.data_train, batch_size=self.conf.data.batch, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader."""
        return DataLoader(self.data_val, batch_size=self.conf.data.batch)

    def test_dataloader(self):
        """Return the test loader."""
        return DataLoader(self.data_test, batch_size=self.conf.data.batch)