In [1]:
import pathlib

import duckdb
import pyarrow.dataset as ds
from deltalake import DeltaTable

# DuckDB Read Delta

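This section shows bad, better, and best ways to read a Delta table into DuckDB: materializing the whole table as a PyArrow table before filtering (bad), filtering while reading it with PyArrow (better), and handing DuckDB a lazy PyArrow dataset so the filter can be applied during the scan (best).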

In [2]:
# Open the benchmark Delta table stored under the current user's home directory.
# (The `1e9` in the directory name presumably denotes the row count -- TODO confirm.)
delta_path = f"{pathlib.Path.home()}/data/delta/G1_1e9_1e2_0_0"
table = DeltaTable(delta_path)

In [3]:
%%time

# Baseline: read the entire Delta table into an in-memory PyArrow table and wrap it
# in a DuckDB relation. No filter is applied, so this times the full load by itself.
quack = duckdb.arrow(table.to_pyarrow_table())

CPU times: user 50.7 s, sys: 17.9 s, total: 1min 8s
Wall time: 17.1 s


### Bad

In [3]:
%%time

quack = duckdb.arrow(table.to_pyarrow_table())
quack.filter("id1 = 'id016' and v2 > 10")

CPU times: user 50.5 s, sys: 18.1 s, total: 1min 8s
Wall time: 17.1 s


┌─────────┬─────────┬──────────────┬───────┬───────┬─────────┬───────┬───────┬───────────┐
│   id1   │   id2   │     id3      │  id4  │  id5  │   id6   │  v1   │  v2   │    v3     │
│ varchar │ varchar │   varchar    │ int32 │ int32 │  int32  │ int32 │ int32 │  double   │
├─────────┼─────────┼──────────────┼───────┼───────┼─────────┼───────┼───────┼───────────┤
│ id016   │ id054   │ id0002309114 │    62 │    95 │ 7180859 │     4 │    13 │  7.750173 │
│ id016   │ id044   │ id0003968533 │    63 │    98 │ 2356363 │     4 │    14 │  3.942417 │
│ id016   │ id034   │ id0001082839 │    58 │    73 │ 8039808 │     5 │    12 │ 76.820135 │
│ id016   │ id037   │ id0006298446 │    29 │    34 │ 2173400 │     2 │    13 │ 68.078028 │
│ id016   │ id034   │ id0008791534 │    76 │    92 │ 6022714 │     1 │    12 │ 76.331411 │
│ id016   │ id002   │ id0009927251 │    34 │    97 │ 1126082 │     1 │    11 │ 51.147419 │
│ id016   │ id049   │ id0008934288 │    96 │     8 │ 3574132 │     5 │    14 │ 70.233415 │

### Better 

In [8]:
%%time

table = table.to_pyarrow_table(
    filter=((ds.field("id1") == "id016") & (ds.field("v2") > 10))
)
quack = duckdb.arrow(table)
quack

TypeError: to_pyarrow_table() got an unexpected keyword argument 'filter'

### Best

In [4]:
%%time

dataset = table.to_pyarrow_dataset()
quack = duckdb.arrow(dataset)
quack.filter("id1 = 'id016' and v2 > 10")

CPU times: user 7.53 ms, sys: 4.86 ms, total: 12.4 ms
Wall time: 10.1 ms


┌─────────┬─────────┬──────────────┬───────┬───────┬─────────┬───────┬───────┬───────────┐
│   id1   │   id2   │     id3      │  id4  │  id5  │   id6   │  v1   │  v2   │    v3     │
│ varchar │ varchar │   varchar    │ int32 │ int32 │  int32  │ int32 │ int32 │  double   │
├─────────┼─────────┼──────────────┼───────┼───────┼─────────┼───────┼───────┼───────────┤
│ id016   │ id054   │ id0002309114 │    62 │    95 │ 7180859 │     4 │    13 │  7.750173 │
│ id016   │ id044   │ id0003968533 │    63 │    98 │ 2356363 │     4 │    14 │  3.942417 │
│ id016   │ id034   │ id0001082839 │    58 │    73 │ 8039808 │     5 │    12 │ 76.820135 │
│ id016   │ id037   │ id0006298446 │    29 │    34 │ 2173400 │     2 │    13 │ 68.078028 │
│ id016   │ id034   │ id0008791534 │    76 │    92 │ 6022714 │     1 │    12 │ 76.331411 │
│ id016   │ id002   │ id0009927251 │    34 │    97 │ 1126082 │     1 │    11 │ 51.147419 │
│ id016   │ id049   │ id0008934288 │    96 │     8 │ 3574132 │     5 │    14 │ 70.233415 │

In [5]:
# Untimed run of the dataset-based read: build the dataset, wrap it in a DuckDB
# relation, and display the filtered rows.
dataset = table.to_pyarrow_dataset()
quack = duckdb.arrow(dataset)
matching_rows = "id1 = 'id016' and v2 > 10"
quack.filter(matching_rows)

┌─────────┬─────────┬──────────────┬───────┬───────┬─────────┬───────┬───────┬───────────┐
│   id1   │   id2   │     id3      │  id4  │  id5  │   id6   │  v1   │  v2   │    v3     │
│ varchar │ varchar │   varchar    │ int32 │ int32 │  int32  │ int32 │ int32 │  double   │
├─────────┼─────────┼──────────────┼───────┼───────┼─────────┼───────┼───────┼───────────┤
│ id016   │ id054   │ id0002309114 │    62 │    95 │ 7180859 │     4 │    13 │  7.750173 │
│ id016   │ id044   │ id0003968533 │    63 │    98 │ 2356363 │     4 │    14 │  3.942417 │
│ id016   │ id034   │ id0001082839 │    58 │    73 │ 8039808 │     5 │    12 │ 76.820135 │
│ id016   │ id037   │ id0006298446 │    29 │    34 │ 2173400 │     2 │    13 │ 68.078028 │
│ id016   │ id034   │ id0008791534 │    76 │    92 │ 6022714 │     1 │    12 │ 76.331411 │
│ id016   │ id002   │ id0009927251 │    34 │    97 │ 1126082 │     1 │    11 │ 51.147419 │
│ id016   │ id049   │ id0008934288 │    96 │     8 │ 3574132 │     5 │    14 │ 70.233415 │

## Query times: PyArrow table vs PyArrow dataset (example not working properly yet)

In [2]:
# Open the two benchmark Delta tables used for the query-time comparison.
# Both live under ~/data/delta; only the directory name differs.
path_1e8 = f"{pathlib.Path.home()}/data/delta/G1_1e8_1e2_0_0"
table_1e8 = DeltaTable(path_1e8)

path_1e9 = f"{pathlib.Path.home()}/data/delta/G1_1e9_1e2_0_0"
table_1e9 = DeltaTable(path_1e9)

In [10]:
%%time

# Read the whole 1e9 Delta table into a PyArrow table and wrap it in a DuckDB relation.
# Despite its name, `pyarrow_table` is the DuckDB relation, not the PyArrow table;
# the SQL in the following cell refers to it by this variable name.
pyarrow_table = duckdb.arrow(table_1e9.to_pyarrow_table())


CPU times: user 50.8 s, sys: 19.5 s, total: 1min 10s
Wall time: 19.2 s


In [None]:
%%time

res = duckdb.sql('SELECT id1, sum(v1) AS v1 from pyarrow_table GROUP BY id1 limit 10')
res

CPU times: user 649 µs, sys: 1.32 ms, total: 1.97 ms
Wall time: 3.29 ms


In [None]:
%%time

pyarrow_dataset = duckdb.arrow(table_1e9.to_pyarrow_dataset())

In [None]:
%%time

res = duckdb.sql('SELECT id1, sum(v1) AS v1 from pyarrow_dataset GROUP BY id1 limit 10')
res

CPU times: user 780 µs, sys: 113 µs, total: 893 µs
Wall time: 900 µs
