In [1]:
import os
import arcticdb as adb
from arcticdb.options import RuntimeOptions
from arcticdb_ext.version_store import OutputFormat
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# For this demo we use ArcticDB with an LMDB backend located at arctic_path and compare it against the same dataframes written as parquet files.
# Data consists of dummy trades on two symbols, each in a separate table.
# Set the DATA_PATH environment variable to point the demo at a different data directory.
data_path = os.getenv("DATA_PATH", "/tmp/arrow-test")
arctic_path = os.path.join(data_path, "arctic")
lib_name = "lib"  # name of the ArcticDB library opened during client setup
parquet_path = os.path.join(data_path, "parquet")  # directory the <symbol>.parquet files are read from

In [2]:
# Connect to the LMDB-backed ArcticDB store and open the library with Arrow as its output format
arrow_options = RuntimeOptions(output_format=OutputFormat.ARROW)
ac = adb.Arctic(f"lmdb://{arctic_path}")
lib = ac.get_library(lib_name, runtime_options=arrow_options)

# Two symbols have already been written to the library
lib.list_symbols()

['TSLA', 'AAPL']

In [3]:
# The library was opened with Arrow as its output format, so a plain read
# returns the whole symbol as Arrow data without any extra arguments.
aapl_item = lib.read("AAPL")
aapl_table = aapl_item.data
print("Table shape:", aapl_table.shape)
print("Table schema:", aapl_table.schema)

Table shape: (5000000, 5)
Table schema: index: timestamp[ns] not null
price: double not null
size: int64 not null
direction: string not null
trade_id: string not null


In [4]:
# The Arrow output plugs straight into pyarrow for further computation,
# e.g. keeping only the buy trades.
is_buy = pa.compute.field("direction") == "Buy"
aapl_table.filter(is_buy).to_pandas()

Unnamed: 0,index,price,size,direction,trade_id
0,2025-01-01 09:00:02,160.14,85,Buy,trade_AAPL_2
1,2025-01-01 09:00:03,173.05,60,Buy,trade_AAPL_3
2,2025-01-01 09:00:04,166.29,67,Buy,trade_AAPL_4
3,2025-01-01 09:00:06,102.42,23,Buy,trade_AAPL_6
4,2025-01-01 09:00:09,147.48,33,Buy,trade_AAPL_9
...,...,...,...,...,...
2502234,2025-02-28 05:53:07,187.70,35,Buy,trade_AAPL_4999987
2502235,2025-02-28 05:53:13,198.08,29,Buy,trade_AAPL_4999993
2502236,2025-02-28 05:53:14,102.01,27,Buy,trade_AAPL_4999994
2502237,2025-02-28 05:53:17,172.93,18,Buy,trade_AAPL_4999997


In [5]:
# Much of the filtering and processing can be pushed into the read itself.
# Here: only the prices of Buy trades inside a one-hour date range.
range_start = pd.Timestamp(2025, 1, 1, 11)
range_end = pd.Timestamp(2025, 1, 1, 12)
date_range = (range_start, range_end)
lazy_df = lib.read("AAPL", date_range=date_range, columns=["price"], lazy=True)
buy_only = lazy_df["direction"] == "Buy"
lazy_df = lazy_df[buy_only]
# Nothing is read until collect() is called
result_arrow = lazy_df.collect().data
result_arrow.to_pandas()

Unnamed: 0,index,price
0,2025-01-01 11:00:01,162.90
1,2025-01-01 11:00:03,179.96
2,2025-01-01 11:00:04,192.08
3,2025-01-01 11:00:05,142.53
4,2025-01-01 11:00:06,154.06
...,...,...
1761,2025-01-01 11:59:49,113.90
1762,2025-01-01 11:59:50,179.02
1763,2025-01-01 11:59:51,117.37
1764,2025-01-01 11:59:55,126.93


In [6]:
# ArcticDB also supports advanced timeseries processing such as resampling.
# Here: the total size of Buy trades in each 10-minute bucket.
range_start = pd.Timestamp(2025, 1, 1, 11)
range_end = pd.Timestamp(2025, 1, 1, 13)
date_range = (range_start, range_end)
lazy_df = lib.read("AAPL", date_range=date_range, lazy=True)
buy_only = lazy_df["direction"] == "Buy"
lazy_df = lazy_df[buy_only]
lazy_df = lazy_df.resample("10min").agg({"size": "sum"})
result_arrow = lazy_df.collect().data
result_arrow.to_pandas()

Unnamed: 0,index,size
0,2025-01-01 11:00:00,16165
1,2025-01-01 11:10:00,15507
2,2025-01-01 11:20:00,16867
3,2025-01-01 11:30:00,15115
4,2025-01-01 11:40:00,15368
5,2025-01-01 11:50:00,17637
6,2025-01-01 12:00:00,15876
7,2025-01-01 12:10:00,17365
8,2025-01-01 12:20:00,17347
9,2025-01-01 12:30:00,16225


In [7]:
# Several symbols can be fetched in a single batched call
date_range = (pd.Timestamp(2025, 1, 1, 11), pd.Timestamp(2025, 1, 1, 13))
read_requests = [
    adb.ReadRequest(symbol=name, date_range=date_range)
    for name in ("AAPL", "TSLA")
]
# Results are unpacked in the same order as the requests
aapl, tsla = lib.read_batch(read_requests)
print(aapl.data.to_pandas().head())
print(tsla.data.to_pandas().head())

                index   price  size direction         trade_id
0 2025-01-01 11:00:00  165.06    88      Sell  trade_AAPL_7200
1 2025-01-01 11:00:01  162.90    66       Buy  trade_AAPL_7201
2 2025-01-01 11:00:02  124.39    20      Sell  trade_AAPL_7202
3 2025-01-01 11:00:03  179.96    81       Buy  trade_AAPL_7203
4 2025-01-01 11:00:04  192.08    54       Buy  trade_AAPL_7204
                index   price  size direction         trade_id
0 2025-01-01 11:00:00  124.77    13       Buy  trade_TSLA_7200
1 2025-01-01 11:00:01  135.21    43      Sell  trade_TSLA_7201
2 2025-01-01 11:00:02  102.30    43      Sell  trade_TSLA_7202
3 2025-01-01 11:00:03  148.15    50       Buy  trade_TSLA_7203
4 2025-01-01 11:00:04  110.00    86       Buy  trade_TSLA_7204


In [8]:
# And a couple of benchmarks on arctic vs parquet
import timeit

def read_entire_arctic(symbol):
    """Read all of ``symbol`` from the ArcticDB library, requesting Arrow output."""
    versioned_item = lib.read(symbol, output_format=OutputFormat.ARROW)
    return versioned_item.data

def read_entire_arctic_pandas(symbol):
    """Read all of ``symbol`` from the ArcticDB library, requesting pandas output."""
    versioned_item = lib.read(symbol, output_format=OutputFormat.PANDAS)
    return versioned_item.data

def read_entire_parquet(symbol):
    """Load the whole ``<symbol>.parquet`` file under ``parquet_path`` with pyarrow."""
    return pq.read_table(os.path.join(parquet_path, f"{symbol}.parquet"))


def read_filtered_arctic(symbol):
    """Read the Buy trades of ``symbol`` between 2025-01-10 and 2025-02-01 as Arrow.

    Both the date range and the direction filter are attached to a lazy
    read and only evaluated when ``collect`` is called.
    """
    window = (pd.Timestamp(2025, 1, 10), pd.Timestamp(2025, 2, 1))
    query = lib.read(symbol, date_range=window, lazy=True)
    buys = query[query["direction"] == "Buy"]
    return buys.collect(output_format=OutputFormat.ARROW).data

def read_filtered_arctic_pandas(symbol):
    """Read the Buy trades of ``symbol`` between 2025-01-10 and 2025-02-01 as pandas.

    Both the date range and the direction filter are attached to a lazy
    read and only evaluated when ``collect`` is called.
    """
    window = (pd.Timestamp(2025, 1, 10), pd.Timestamp(2025, 2, 1))
    query = lib.read(symbol, date_range=window, lazy=True)
    buys = query[query["direction"] == "Buy"]
    return buys.collect(output_format=OutputFormat.PANDAS).data

def read_filtered_parquet(symbol):
    """Read the Buy trades of ``symbol`` between 2025-01-10 and 2025-02-01 from parquet.

    This is the parquet counterpart of ``read_filtered_arctic``. ArcticDB's
    ``date_range`` is inclusive on both ends, so the upper bound here uses
    ``<=`` (not ``<``) to make both benchmarks select the same rows.

    Args:
        symbol: Name of the symbol; ``<symbol>.parquet`` is read from
            ``parquet_path``.

    Returns:
        The table returned by ``pq.read_table`` with the filters applied.
    """
    path = os.path.join(parquet_path, f"{symbol}.parquet")
    # "__index_level_0__" is presumably the column the unnamed pandas index
    # was written to -- verify against how the parquet files were produced.
    # The inner list is a conjunction: all three predicates must hold.
    filters = [[
        ("__index_level_0__", ">=", pd.Timestamp(2025, 1, 10)),
        ("__index_level_0__", "<=", pd.Timestamp(2025, 2, 1)),
        ("direction", "==", "Buy"),
    ]]
    return pq.read_table(path, filters=filters)

In [9]:
# Benchmark: full-symbol read through ArcticDB with Arrow output
%timeit read_entire_arctic("TSLA")

373 ms ± 30.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Benchmark: full-symbol read through ArcticDB with pandas output
%timeit read_entire_arctic_pandas("TSLA")

770 ms ± 52.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Benchmark: full-file read of the parquet copy with pyarrow
%timeit read_entire_parquet("TSLA")

253 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Benchmark: date-range + Buy filter through ArcticDB with Arrow output
%timeit read_filtered_arctic("TSLA")

80 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# Benchmark: date-range + Buy filter through ArcticDB with pandas output
%timeit read_filtered_arctic_pandas("TSLA")

110 ms ± 18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Benchmark: date-range + Buy filter applied by pyarrow while reading the parquet copy
%timeit read_filtered_parquet("TSLA")

100 ms ± 2.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
