# Ray Data (still in beta)
Ray Datasets are the standard way to load and exchange data in Ray libraries and applications. Datasets provide basic distributed data transformations such as map, filter, and repartition, and are compatible with a variety of file formats, datasources, and distributed frameworks.

In [1]:
import ray

# Start Ray for this notebook session. ignore_reinit_error=True keeps the cell
# safe to re-run: a second ray.init() call in the same kernel would otherwise
# raise a RuntimeError.
ray.init(ignore_reinit_error=True)

2021-12-25 19:43:17,674	INFO services.py:1340 -- View the Ray dashboard at http://127.0.0.1:8265


{'node_ip_address': '192.168.1.105',
 'raylet_ip_address': '192.168.1.105',
 'redis_address': '192.168.1.105:6379',
 'object_store_address': '/tmp/ray/session_2021-12-25_19-43-15_166874_13523/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-25_19-43-15_166874_13523/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-12-25_19-43-15_166874_13523',
 'metrics_export_port': 52808,
 'node_id': '376046ac8c34102d1bb47b5c912ffef92a95074fbd1efca1a4c79fce'}

In [2]:
# Create a Dataset holding the integers 0..9999.
ds = ray.data.range(10000)

In [3]:
# Show the concrete class of the object returned by ray.data.range.
print(type(ds))

<class 'ray.data.dataset.Dataset'>


In [4]:
# Bare expression: display the dataset's repr (block count, row count, schema).
ds

Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)

In [5]:
# Return the first 20 rows as a Python list.
ds.take(20)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [6]:
# Grab the per-block object references, then show only the first three.
block_refs = ds.get_internal_block_refs()
print(block_refs[:3])

[ObjectRef(69a6825d641b4613ffffffffffffffffffffffff0100000002000000), ObjectRef(ee4e90da584ab0ebffffffffffffffffffffffff0100000002000000), ObjectRef(4ee449587774c1f0ffffffffffffffffffffffff0100000002000000)]


In [7]:
# Resolve the first block's object reference to inspect the rows it holds.
first_block_ref = ds.get_internal_block_refs()[0]
print(ray.get(first_block_ref))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [8]:
# Aggregate over every row; for the integers 0..9999 the mean is 4999.5.
ds.mean()

GroupBy Map: 100%|██████████| 200/200 [00:00<00:00, 306.37it/s]
GroupBy Reduce: 100%|██████████| 1/1 [00:00<00:00, 182.65it/s]


4999.5

## Read Parquet and CSV files

In [9]:
import pandas as pd

# Build the 1,000,000-row sample frame once and reuse it for both writes
# (constructing the identical frame twice only doubles the setup cost).
sample_df = pd.DataFrame([(1, 2, 3)] * 1000000, columns=['col1', 'col2', 'col3'])
sample_df.to_parquet('/tmp/tmp_ex_07.parquet')
# NOTE: to_csv writes the row index by default, so the CSV carries one extra
# unnamed index column. That is the 'mean()' / 'Unnamed: 0' entry (499999.5)
# in the CSV results further down. Pass index=False to drop it.
sample_df.to_csv('/tmp/tmp_ex_07.csv')

In [10]:
# List /tmp with sizes in MiB units to compare the on-disk size of the CSV and Parquet files.
!ls -l --block-size M /tmp/

total 13M
drwxrwxrwx 18 root root  1M Dec 25 19:43 ray
-rw-------  1 root root  1M Dec  3 02:34 tmp7xk4pjtb
-rw-r--r--  1 root root 13M Dec 25 19:43 tmp_ex_07.csv
-rw-r--r--  1 root root  1M Dec 25 19:43 tmp_ex_07.parquet


### read parquet

In [11]:
# Pass the same Parquet path three times so the files are read into a single Dataset.
df = ray.data.read_parquet(['/tmp/tmp_ex_07.parquet'] * 3)


In [12]:
# Bare expression: display the repr (block count, row count, column schema).
df

Dataset(num_blocks=3, num_rows=3000000, schema={col1: int64, col2: int64, col3: int64})

##### check re-partition effects

In [25]:
df = ray.data.read_parquet(['/tmp/tmp_ex_07.parquet'] * 3)
df.mean()

GroupBy Map: 100%|██████████| 3/3 [00:21<00:00,  7.28s/it]
GroupBy Reduce: 100%|██████████| 1/1 [00:00<00:00, 482.77it/s]

CPU times: user 470 ms, sys: 71.5 ms, total: 541 ms
Wall time: 22 s





{'mean(col1)': 1.0, 'mean(col2)': 2.0, 'mean(col3)': 3.0}

In [27]:
%%time
# parquet
df = ray.data.read_parquet(['/tmp/tmp_ex_07.parquet'] * 3).repartition(8)
df.mean()

Repartition: 100%|██████████| 8/8 [00:00<00:00, 339.51it/s]
GroupBy Map: 100%|██████████| 8/8 [00:17<00:00,  2.22s/it]
GroupBy Reduce: 100%|██████████| 1/1 [00:00<00:00, 371.77it/s]

CPU times: user 480 ms, sys: 77.8 ms, total: 558 ms
Wall time: 18 s





{'mean(col1)': 1.0, 'mean(col2)': 2.0, 'mean(col3)': 3.0}

### read_csv

In [28]:
%%time
df = ray.data.read_csv(['/tmp/tmp_ex_07.csv'] * 3)
df.mean()

GroupBy Map: 100%|██████████| 3/3 [00:28<00:00,  9.60s/it]
GroupBy Reduce: 100%|██████████| 1/1 [00:00<00:00, 479.62it/s]

CPU times: user 548 ms, sys: 120 ms, total: 668 ms
Wall time: 29.2 s





{'mean()': 499999.5, 'mean(col1)': 1.0, 'mean(col2)': 2.0, 'mean(col3)': 3.0}

In [29]:
%%time
df = ray.data.read_csv(['/tmp/tmp_ex_07.csv'] * 3).repartition(8)
df.mean()

Repartition: 100%|██████████| 8/8 [00:00<00:00, 318.12it/s]
GroupBy Map: 100%|██████████| 8/8 [00:27<00:00,  3.44s/it]
GroupBy Reduce: 100%|██████████| 1/1 [00:00<00:00, 347.24it/s]

CPU times: user 692 ms, sys: 170 ms, total: 861 ms
Wall time: 28.1 s





{'mean()': 499999.5, 'mean(col1)': 1.0, 'mean(col2)': 2.0, 'mean(col3)': 3.0}

# Modin

In [30]:
import modin.pandas as mod_pd
import pandas as pd

In [31]:
%%time
files = ['/tmp/tmp_ex_07.parquet']* 3
mod_df = mod_pd.read_parquet(files)
mod_df.mean()

CPU times: user 186 ms, sys: 33.8 ms, total: 220 ms
Wall time: 474 ms


col1    1.0
col2    2.0
col3    3.0
dtype: float64

In [34]:
%%time
file = '/tmp/tmp_ex_07.csv'
mod_df = mod_pd.read_csv(file)
mod_df.mean()

CPU times: user 60.3 ms, sys: 30.2 ms, total: 90.5 ms
Wall time: 326 ms


Unnamed: 0    499999.5
col1               1.0
col2               2.0
col3               3.0
dtype: float64

In [32]:
%%time
files = ['/tmp/tmp_ex_07.parquet']* 3
pd_df = pd.read_parquet(files)
pd_df.mean()

CPU times: user 67.3 ms, sys: 55.6 ms, total: 123 ms
Wall time: 115 ms


col1    1.0
col2    2.0
col3    3.0
dtype: float64

In [35]:
%%time
file = '/tmp/tmp_ex_07.csv'
pd_df = pd.read_csv(file)
pd_df.mean()

CPU times: user 171 ms, sys: 20.5 ms, total: 192 ms
Wall time: 183 ms


Unnamed: 0    499999.5
col1               1.0
col2               2.0
col3               3.0
dtype: float64