In [1]:
import os

import arcticdb as adb
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
from arcticdb_ext.version_store import OutputFormat

# For this demo we use ArcticDB with an LMDB backend located at arctic_path and compare it against
# the same dataframes written as parquet files under parquet_path.
# Data consists of dummy trades on two symbols, each in a separate table.
# The DATA_PATH environment variable overrides the root directory; the default is /tmp/arrow-test.
data_path = os.getenv("DATA_PATH", "/tmp/arrow-test")
arctic_path = os.path.join(data_path, "arctic")
lib_name = "lib"
parquet_path = os.path.join(data_path, "parquet")

In [2]:
# Setting up the arctic client against the LMDB store at arctic_path
ac = adb.Arctic(f"lmdb://{arctic_path}")
# NOTE(review): the library is fetched by name here and no cell visible in this notebook writes to it;
# presumably the data was populated by a separate setup step - confirm before running on a fresh DATA_PATH.
lib = ac[lib_name]

# We have two symbols written in the library
lib.list_symbols()

['TSLA', 'AAPL']

In [3]:
# We can read the entire dataframe as arrow by specifying an output_format
# (the Arrow table is taken from the `.data` attribute of the read result)
aapl_table = lib.read("AAPL", output_format=OutputFormat.ARROW).data
# Print the dimensions and the column types of the table that came back
print("Table shape:", aapl_table.shape)
print("Table schema:", aapl_table.schema)

Table shape: (5000000, 5)
Table schema: price: double
size: int64
direction: string
trade_id: string
__index_level_0__: timestamp[ns]
-- schema metadata --
pandas: '{"index_columns": ["__index_level_0__"], "column_indexes": [{"na' + 791


In [4]:
# And we can use the pyarrow output to perform our computations (e.g. filter just the buy trades)
# `pc` is the explicitly imported pyarrow.compute module: `pa.compute` is only reachable when some
# other import happens to have loaded that submodule already, so we do not rely on it.
aapl_table.filter(pc.field("direction") == "Buy").to_pandas()

Unnamed: 0,price,size,direction,trade_id
2025-01-01 09:00:00,140.53,58,Buy,trade_AAPL_0
2025-01-01 09:00:02,108.43,31,Buy,trade_AAPL_2
2025-01-01 09:00:06,181.93,94,Buy,trade_AAPL_6
2025-01-01 09:00:07,109.39,74,Buy,trade_AAPL_7
2025-01-01 09:00:08,177.71,13,Buy,trade_AAPL_8
...,...,...,...,...
2025-02-28 05:53:14,147.96,21,Buy,trade_AAPL_4999994
2025-02-28 05:53:15,192.30,97,Buy,trade_AAPL_4999995
2025-02-28 05:53:16,137.75,52,Buy,trade_AAPL_4999996
2025-02-28 05:53:17,127.64,82,Buy,trade_AAPL_4999997


In [5]:
# ArcticDB can perform many of the filtering and processing operations while reading
# E.g. let's get just the prices of all Buy trades in a date range:
date_range = (pd.Timestamp(2025, 2, 1), pd.Timestamp(2025, 2, 2))
# lazy=True gives back a query object rather than data; the data is fetched by collect() below
lazy_df = lib.read("AAPL", date_range=date_range, columns=["price"], lazy=True)
# The filter refers to "direction" even though only "price" is requested in `columns`
lazy_df = lazy_df[lazy_df["direction"] == "Buy"]
result_arrow = lazy_df.collect(output_format=OutputFormat.ARROW).data
# Converted to pandas only for display
result_arrow.to_pandas()

Unnamed: 0,price
2025-02-01 00:00:00,170.64
2025-02-01 00:00:01,105.09
2025-02-01 00:00:03,192.30
2025-02-01 00:00:04,184.12
2025-02-01 00:00:05,126.05
...,...
2025-02-01 23:59:53,181.80
2025-02-01 23:59:56,138.28
2025-02-01 23:59:57,114.70
2025-02-01 23:59:58,163.06


In [6]:
# ArcticDB also allows you to do advanced timeseries processing like resampling
# E.g. let's get the volume bought each hour.
# Note: this cell rebinds date_range, lazy_df and result_arrow from the previous cell.
date_range = (pd.Timestamp(2025, 2, 1), pd.Timestamp(2025, 2, 28))
lazy_df = lib.read("AAPL", date_range=date_range, lazy=True)
# Keep Buy trades only, then sum `size` within each 1-hour bucket
lazy_df = lazy_df[lazy_df["direction"] == "Buy"]
lazy_df = lazy_df.resample("1h").agg({"size": "sum"})
result_arrow = lazy_df.collect(output_format=OutputFormat.ARROW).data
result_arrow.to_pandas()

Unnamed: 0,size
2025-02-01 00:00:00,95251
2025-02-01 01:00:00,97155
2025-02-01 02:00:00,99722
2025-02-01 03:00:00,95481
2025-02-01 04:00:00,98484
...,...
2025-02-27 20:00:00,97186
2025-02-27 21:00:00,95840
2025-02-27 22:00:00,98542
2025-02-27 23:00:00,100093


In [7]:
# We can also read multiple symbols in parallel
date_range = (pd.Timestamp(2025, 2, 1), pd.Timestamp(2025, 2, 5))
# One ReadRequest per symbol, both restricted to the same date range
read_requests = [
    adb.ReadRequest(symbol="AAPL", date_range=date_range),
    adb.ReadRequest(symbol="TSLA", date_range=date_range),
]
# The results are unpacked positionally, matching the order of read_requests
[aapl, tsla] = lib.read_batch(read_requests, output_format=OutputFormat.ARROW)
# Show the first rows of each result as pandas
print(aapl.data.to_pandas().head())
print(tsla.data.to_pandas().head())

                      price  size direction            trade_id
2025-02-01 00:00:00  170.64    43       Buy  trade_AAPL_2646000
2025-02-01 00:00:01  105.09    29       Buy  trade_AAPL_2646001
2025-02-01 00:00:02  172.14    84      Sell  trade_AAPL_2646002
2025-02-01 00:00:03  192.30    62       Buy  trade_AAPL_2646003
2025-02-01 00:00:04  184.12    60       Buy  trade_AAPL_2646004
                      price  size direction            trade_id
2025-02-01 00:00:00  177.59    80       Buy  trade_TSLA_2646000
2025-02-01 00:00:01  103.69    72      Sell  trade_TSLA_2646001
2025-02-01 00:00:02  119.81    39      Sell  trade_TSLA_2646002
2025-02-01 00:00:03  179.63    43      Sell  trade_TSLA_2646003
2025-02-01 00:00:04  179.90    93       Buy  trade_TSLA_2646004


In [8]:
# And a couple of benchmarks on arctic vs parquet
# TODO: This will be much better when we use full arrow support and not just the temporary bodge to read pandas and convert to pyarrow
import timeit

def read_entire_arctic(symbol):
    """Read all of `symbol` from the ArcticDB library `lib` and return its Arrow data.

    The read is requested with `OutputFormat.ARROW`; the value returned is the
    `.data` attribute of the read result.
    """
    read_result = lib.read(symbol, output_format=OutputFormat.ARROW)
    return read_result.data

def read_entire_parquet(symbol):
    """Load the whole `<symbol>.parquet` file found under `parquet_path` with `pq.read_table`."""
    file_name = f"{symbol}.parquet"
    return pq.read_table(os.path.join(parquet_path, file_name))

def read_filtered_arctic(symbol):
    """Read the Buy trades of `symbol` between 2025-01-10 and 2025-02-01 from ArcticDB.

    The date range is passed to `lib.read` and the `direction == "Buy"` filter is
    applied on the lazy result before it is collected as Arrow output.
    """
    window_start = pd.Timestamp(2025, 1, 10)
    window_end = pd.Timestamp(2025, 2, 1)
    query = lib.read(symbol, date_range=(window_start, window_end), lazy=True)
    is_buy = query["direction"] == "Buy"
    buys_only = query[is_buy]
    collected = buys_only.collect(output_format=OutputFormat.ARROW)
    return collected.data

def read_filtered_parquet(symbol):
    """Read the Buy trades of `symbol` between 2025-01-10 and 2025-02-01 from its parquet file.

    This is the parquet counterpart of `read_filtered_arctic`: the same date window
    and the same `direction == "Buy"` condition, expressed as `pq.read_table` filters
    on the `__index_level_0__` timestamp column and the `direction` column.
    """
    path = os.path.join(parquet_path, f"{symbol}.parquet")
    # A single inner list means all three conditions are AND-ed together.
    filters = [[
        ("__index_level_0__", ">=", pd.Timestamp(2025, 1, 10)),
        # "<=" rather than "<": ArcticDB's date_range is inclusive at both ends, so the end
        # bound has to be inclusive here too for both benchmarks to select the same rows.
        ("__index_level_0__", "<=", pd.Timestamp(2025, 2, 1)),
        ("direction", "==", "Buy"),
    ]]
    return pq.read_table(path, filters=filters)

In [9]:
# Time a full read of the TSLA symbol from ArcticDB with Arrow output
%timeit read_entire_arctic("TSLA")

2.99 s ± 58.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Time a full read of the TSLA parquet file with pq.read_table
%timeit read_entire_parquet("TSLA")

149 ms ± 2.79 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Time the date-range + Buy-only read of TSLA from ArcticDB
%timeit read_filtered_arctic("TSLA")

470 ms ± 19.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Time the date-range + Buy-only read of the TSLA parquet file
%timeit read_filtered_parquet("TSLA")

80.4 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
