In [1]:
import json
import logging
import os
import sys
from time import time
from typing import List, Dict, Any, Tuple

import numpy as np
import ray
import torch
from torchvision import transforms

import data_utilities as du

### Ray Data

In [2]:
# Read the MinIO connection details from the local credentials file.
with open('credentials.json') as f:
    credentials = json.load(f)

# Copy each credential into the process environment
# (environment variable name -> key inside credentials.json).
for env_name, credential_key in (
    ('MINIO_URL', 'url'),
    ('MINIO_ACCESS_KEY', 'accessKey'),
    ('MINIO_SECRET_ACCESS_KEY', 'secretKey'),
):
    os.environ[env_name] = credentials[credential_key]

# The loader returns the four train/test sequences plus its own load time.
dataset_splits = du.get_train_test_data()
X_train, y_train, X_test, y_test, load_time_sec = dataset_splits

print('Training set size: ', len(X_train))
print('Test set size:', len(X_test))
print('Load time (seconds):', load_time_sec)

23774 2023-12-04 08:08:51,221 | INFO | get_train_test_data called. smoke_test_size: 0
Training set size:  60000
Test set size: 10000
Load time (seconds): 20.208586931228638


In [3]:
# Peek at the first training example and its label.
for caption, sequence in (
    ('Training sample:', X_train),
    ('Label for sample:', y_train),
):
    print(caption, sequence[0])

Training sample: train/0/000c2bff-4fa5-4e95-b90a-e39be79cf5e7.jpeg
Label for sample: 0


In [4]:
def _to_records(samples, labels):
    """Pair every sample with its label as a ``{'X': sample, 'y': label}`` row.

    Args:
        samples: Sequence of samples (object paths in this notebook).
        labels: Sequence of labels aligned one-to-one with ``samples``.

    Returns:
        A list holding one dict per (sample, label) pair, in input order.

    Raises:
        ValueError: If ``samples`` and ``labels`` differ in length, which would
            otherwise silently drop or mis-pair rows.
    """
    if len(samples) != len(labels):
        raise ValueError(
            f'samples and labels must have the same length, '
            f'got {len(samples)} and {len(labels)}'
        )
    return [{'X': sample, 'y': label} for sample, label in zip(samples, labels)]


train_dict_list = _to_records(X_train, y_train)
test_dict_list = _to_records(X_test, y_test)

# Each list of row dicts becomes a Ray dataset; parallelism=5 requests five blocks.
train_data = ray.data.from_items(train_dict_list, parallelism=5)
test_data = ray.data.from_items(test_dict_list, parallelism=5)
type(train_data)

2023-12-04 08:09:13,430	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


ray.data.dataset.MaterializedDataset

In [5]:
# Turn on verbose progress reporting in the current Ray Data context.
data_context = ray.data.DataContext.get_current()
data_context.execution_options.verbose_progress = True

# Pull the first three rows so the record layout can be inspected.
rows = train_data.take(3)
rows

2023-12-04 08:09:14,122	INFO dataset.py:2383 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2023-12-04 08:09:14,124	INFO streaming_executor.py:104 -- Executing DAG InputDataBuffer[Input] -> LimitOperator[limit=3]
2023-12-04 08:09:14,125	INFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=True)


- limit=3 1:   0%|          | 0/5 [00:00<?, ?it/s]

Running 0:   0%|          | 0/5 [00:00<?, ?it/s]

[{'X': 'train/0/000c2bff-4fa5-4e95-b90a-e39be79cf5e7.jpeg', 'y': 0},
 {'X': 'train/0/00143885-bede-4b19-8ef0-99135c8f2290.jpeg', 'y': 0},
 {'X': 'train/0/00289886-490f-4966-ada8-2bfe1e165aa9.jpeg', 'y': 0}]

In [6]:
# Function-based alternative, kept for reference:
#   train_data = train_data.map_batches(du.preprocess_batch, fn_kwargs={'bucket_name':'mnist'})
# The class-based transform is used instead: it runs on a pool of two actors,
# and ProcessBatch is constructed with the bucket name.
actor_pool = ray.data.ActorPoolStrategy(size=2)
train_data = train_data.map_batches(
    du.ProcessBatch,
    compute=actor_pool,
    fn_constructor_kwargs={'bucket_name': 'mnist'},
)

In [7]:
# Print the dataset's plain-text description: the MapBatches stage stacked on
# the source dataset. NOTE(review): this appears not to run the pipeline (no
# execution logs are emitted for this cell) — confirm.
print(train_data)

MapBatches(ProcessBatch)
+- Dataset(num_blocks=5, num_rows=60000, schema={X: string, y: int64})


In [8]:
# Stream the preprocessed training data as Torch batches, recording the size of
# the first batch and the total number of batches produced.
# NOTE(review): passing a single dtype presumably casts the 'y' labels to
# float32 as well as the images — confirm that is intended.
batch_count = 0
batch_size = 0
# Pre-bind the loop variables so an empty dataset cannot raise NameError below.
batch = images = labels = None
for batch in train_data.iter_torch_batches(batch_size=400, dtypes=torch.float32):
    images, labels = batch['X'], batch['y']
    if batch_size == 0:
        batch_size = len(images)
    batch_count += 1

# A plain conditional replaces the original for/else: with no `break` in the
# loop, the `else` branch always ran, which hid the empty-dataset case.
if batch_count == 0:
    print('No batches were produced.')
else:
    # `batch`, `images` and `labels` refer to the last batch that was yielded.
    print(type(batch))
    print(type(images))
    print('Batch size: ', batch_size)
    print('Batch count:', batch_count)
    print(images[0])
    print(labels[0])

2023-12-04 08:09:14,475	INFO streaming_executor.py:104 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(ProcessBatch)]
2023-12-04 08:09:14,476	INFO streaming_executor.py:105 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=True)
2023-12-04 08:09:14,488	INFO actor_pool_map_operator.py:114 -- MapBatches(ProcessBatch): Waiting for 2 pool actors to start...


- MapBatches(ProcessBatch) 1:   0%|          | 0/5 [00:00<?, ?it/s]

Running 0:   0%|          | 0/5 [00:00<?, ?it/s]

[36m(_MapWorker pid=23826)[0m 23826 2023-12-04 08:09:15,610 | INFO | ProcessBatch object created. bucket_name: mnist.
[36m(MapWorker(MapBatches(ProcessBatch)) pid=23826)[0m 23826 2023-12-04 08:09:16,874 | INFO | Batch retrieval successful for bucket: mnist in MinIO object storage.
[36m(_MapWorker pid=23825)[0m 23825 2023-12-04 08:09:15,648 | INFO | ProcessBatch object created. bucket_name: mnist.
[36m(MapWorker(MapBatches(ProcessBatch)) pid=23826)[0m 23826 2023-12-04 08:09:22,700 | INFO | Batch retrieval successful for bucket: mnist in MinIO object storage.[32m [repeated 10x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(MapWorker(MapBatches(ProcessBatch)) pid=23826)[0m 23826 2023-12-04 08:09:28,616 | INFO | Batch retrieval successful for bucket: mnist in MinIO object storage.[32m [repeated 10x across cl

In [9]:
#train_data.count()