使用MindSpore的GeneratorDataset和GAS配合的简单样例

安装tensorbay：

In [None]:
!pip install tensorbay

In [2]:
import numpy as np
from PIL import Image
from tensorbay import GAS
from tensorbay.dataset import Dataset
import mindspore.dataset as ds

使用GAS SDK访问数据集：

In [3]:
gas = GAS("Your Access Key")
dataset = Dataset("RP2K", gas)
dataset

Dataset("RP2K") [
  Segment("test") [...],
  Segment("train") [...]
]

生成label字典：

In [4]:
dataset_client = gas.get_dataset("RP2K")
statistics = dataset_client.get_label_statistics()
statistics = statistics.dumps()
categories = [i['name'] for i in statistics['CLASSIFICATION']['categories']]
category_dict = {name: idx for idx, name in enumerate(categories)}
len(category_dict)

2424

选择数据集(train/test)

In [5]:
segment = dataset["test"]
segment

Segment("test") [
  RemoteData("-1013482322316369151.jpg")(...),
  RemoteData("-1080441667073922393.jpg")(...),
  RemoteData("-1493574444992800688.jpg")(...),
  RemoteData("-1553082000360211978.jpg")(...),
  RemoteData("-1602115196667906420.jpg")(...),
  RemoteData("-1660766814815816162.jpg")(...),
  RemoteData("-1713888261473361265.jpg")(...),
  RemoteData("-1863960068078052738.jpg")(...),
  RemoteData("-1896647297294998436.jpg")(...),
  RemoteData("-1981790978490656488.jpg")(...),
  RemoteData("-2078593936457174218.jpg")(...),
  RemoteData("-2130423476235569574.jpg")(...),
  RemoteData("-2164004255165872026.jpg")(...),
  RemoteData("-2168295182164522597.jpg")(...),
  ... (39395 items are folded),
  RemoteData("黄山松(醉翁亭).png")(...)
]

自定义迭代器：

In [6]:
def generator_rp2k(segment, category_dict):
    for data in segment:
        category = data.label.classification.category
        label = np.array(category_dict[category])
        with data.open() as fp:
            image = Image.open(fp)
            img = np.array(image)
        yield img, label
        
iterator = generator_rp2k(segment, category_dict)

In [7]:
next(iterator)

(array([[[ 30,  30,  38],
         [ 30,  30,  38],
         [ 29,  29,  37],
         ...,
         [ 36,  37,  41],
         [  9,  10,  12],
         [  9,  10,  12]],
 
        [[ 30,  30,  38],
         [ 30,  30,  38],
         [ 29,  29,  37],
         ...,
         [ 23,  24,  28],
         [ 27,  28,  30],
         [ 27,  28,  30]],
 
        [[ 30,  30,  38],
         [ 30,  30,  38],
         [ 30,  30,  38],
         ...,
         [ 24,  25,  29],
         [ 33,  34,  36],
         [ 33,  34,  36]],
 
        ...,
 
        [[193, 193, 191],
         [192, 192, 190],
         [191, 191, 189],
         ...,
         [112,  52,  54],
         [116,  56,  55],
         [121,  61,  60]],
 
        [[188, 188, 186],
         [189, 189, 187],
         [191, 191, 189],
         ...,
         [ 71,  29,  30],
         [ 84,  36,  32],
         [ 95,  47,  43]],
 
        [[193, 193, 191],
         [192, 192, 190],
         [192, 192, 190],
         ...,
         [199, 175, 173],
  

使用MindSpore的GeneratorDataset加载数据：

In [8]:
dataset = ds.GeneratorDataset(source=iterator, column_names=["img", "label"])

In [9]:
dataset_iter = dataset.create_tuple_iterator()
next(dataset_iter)

[Tensor(shape=[284, 128, 3], dtype=UInt8, value=
 [[[233, 128,  80],
   [233, 128,  80],
   [234, 129,  81],
   ...
   [217, 218, 213],
   [231, 231, 223],
   [186, 186, 178]],
  [[232, 130,  81],
   [233, 131,  82],
   [233, 131,  82],
   ...
   [210, 211, 206],
   [227, 226, 221],
   [213, 213, 205]],
  [[235, 133,  85],
   [235, 133,  85],
   [235, 133,  85],
   ...
   [204, 205, 200],
   [219, 218, 213],
   [240, 239, 234]],
  ...
  [[166, 167, 161],
   [166, 167, 161],
   [167, 168, 162],
   ...
   [192, 195, 176],
   [188, 191, 172],
   [182, 185, 166]],
  [[167, 168, 162],
   [168, 169, 163],
   [168, 169, 163],
   ...
   [181, 184, 165],
   [182, 185, 166],
   [181, 184, 165]],
  [[169, 170, 164],
   [169, 170, 164],
   [170, 171, 165],
   ...
   [167, 170, 151],
   [170, 173, 154],
   [173, 176, 157]]]),
 Tensor(shape=[], dtype=Int64, value= 1217)]