# DuckDB
> SQL em pandas e comparações (dask, datatable)

- toc: true 
- badges: true
- comments: true
- categories: [demo, sql, pandas, tabelas, benchmarks]
- image: https://cdn.mindovermachines.com/wp-content/uploads/2019/03/rubber-ducky-2-800x400.jpg

# Introdução
- query opt: https://duckdb.org/2021/05/14/sql-on-pandas.html
- pandas trend: https://insights.stackoverflow.com/trends?tags=pandas%2Cnumpy%2Cdask%2Cspacy%2Ctensorflow%2Cpytorch%2Cscikit-learn%2Cpyspark%2Cpostgresql
- duckdb caches stuff
- pandas-sql 

In [42]:
# hide
!pip install duckdb pandas 'dask[dataframe]' datatable polars pandasql > /dev/null

In [15]:
# hide
!wget -q https://github.com/cwida/duckdb-data/releases/download/v1.0/lineitemsf1.snappy.parquet
!wget -q https://github.com/cwida/duckdb-data/releases/download/v1.0/orders.parquet

In [120]:
# hide
import dask.dataframe as dd
import datatable as dt
import duckdb
import polars as pl
from datatable import by, f, g, join

In [64]:
lineitem = duckdb.query("SELECT * FROM 'lineitemsf1.snappy.parquet'").to_df()
orders = duckdb.query("SELECT * FROM 'orders.parquet'").to_df()

lineitem.head()

Unnamed: 0,l_orderkey,l_partkey,l_suppkey,l_linenumber,l_quantity,l_extendedprice,l_discount,l_tax,l_returnflag,l_linestatus,l_shipdate,l_commitdate,l_receiptdate,l_shipinstruct,l_shipmode,l_comment
0,1,155190,7706,1,17,21168.23,0.04,0.02,N,O,1996-03-13,1996-02-12,1996-03-22,DELIVER IN PERSON,TRUCK,egular courts above the
1,1,67310,7311,2,36,45983.16,0.09,0.06,N,O,1996-04-12,1996-02-28,1996-04-20,TAKE BACK RETURN,MAIL,ly final dependencies: slyly bold
2,1,63700,3701,3,8,13309.6,0.1,0.02,N,O,1996-01-29,1996-03-05,1996-01-31,TAKE BACK RETURN,REG AIR,"riously. regular, express dep"
3,1,2132,4633,4,28,28955.64,0.09,0.06,N,O,1996-04-21,1996-03-30,1996-05-16,NONE,AIR,lites. fluffily even de
4,1,24027,1534,5,24,22824.48,0.1,0.04,N,O,1996-03-30,1996-03-14,1996-04-01,NONE,FOB,pending foxes. slyly re


In [262]:
%%timeit
duckdb.query(
"""
SELECT l_returnflag,
       l_linestatus,
       SUM(l_extendedprice),
       MIN(l_extendedprice),
       MAX(l_extendedprice),
       AVG(l_extendedprice),
       SUM(l_quantity),
       MIN(l_quantity),
       MAX(l_quantity),
       AVG(l_quantity)
FROM lineitem AS lineitem
JOIN orders AS orders ON (l_orderkey=o_orderkey)
WHERE l_shipdate <= DATE '1998-09-02'
  AND o_orderpriority IN ('1-URGENT', '2-HIGH')
GROUP BY l_returnflag,
         l_linestatus
"""
).to_df()

627 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [150]:
%%timeit
lineitem.merge(orders, left_on="l_orderkey", right_on="o_orderkey").pipe(
    lambda df: df.copy()[
        (df["l_shipdate"] < "1998-09-02")
        & (df["o_orderpriority"].isin(("1-URGENT", "2-HIGH")))
    ]
).groupby(["l_returnflag", "l_linestatus"]).agg(
    {
        "l_extendedprice": ["sum", "min", "max", "mean"],
        "l_quantity": ["sum", "min", "max", "mean"],
    }
)

CPU times: user 8.84 s, sys: 5.3 s, total: 14.1 s
Wall time: 15.6 s


Unnamed: 0_level_0,Unnamed: 1_level_0,l_extendedprice,l_extendedprice,l_extendedprice,l_extendedprice,l_quantity,l_quantity,l_quantity,l_quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,sum,min,max,mean
l_returnflag,l_linestatus,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,F,22677200000.0,904.0,104949.5,38303.426664,15123892,1,50,25.545346
N,F,596530200.0,920.0,104049.0,38461.006847,397729,1,50,25.643391
N,O,44734060000.0,901.0,104749.5,38267.695102,29826993,1,50,25.515466
R,F,22683100000.0,906.0,104899.5,38240.140532,15126938,1,50,25.501645


In [151]:
# collapse
# dask
dd_lineitem = dd.from_pandas(lineitem, npartitions=1)
dd_orders = dd.from_pandas(orders, npartitions=1)

In [152]:
%%timeit
dd_lineitem.merge(dd_orders, left_on="l_orderkey", right_on="o_orderkey").pipe(
    lambda df: df.copy()[
        (df["l_shipdate"] < "1998-09-02")
        & (df["o_orderpriority"].isin(("1-URGENT", "2-HIGH")))
    ]
).groupby(["l_returnflag", "l_linestatus"]).agg(
    {
        "l_extendedprice": ["sum", "min", "max", "mean"],
        "l_quantity": ["sum", "min", "max", "mean"],
    }
).compute()

CPU times: user 6.56 s, sys: 1.63 s, total: 8.2 s
Wall time: 8.23 s


Unnamed: 0_level_0,Unnamed: 1_level_0,l_extendedprice,l_extendedprice,l_extendedprice,l_extendedprice,l_quantity,l_quantity,l_quantity,l_quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max,mean,sum,min,max,mean
l_returnflag,l_linestatus,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,F,22677200000.0,904.0,104949.5,38303.426664,15123892,1,50,25.545346
N,F,596530200.0,920.0,104049.0,38461.006847,397729,1,50,25.643391
N,O,44734060000.0,901.0,104749.5,38267.695102,29826993,1,50,25.515466
R,F,22683100000.0,906.0,104899.5,38240.140532,15126938,1,50,25.501645


In [179]:
# collapse
# datatable
dt_lineitem = dt.Frame(lineitem)
dt_orders = dt.Frame(orders)

In [180]:
%%timeit
# preparação
dt_lineitem.names = {"l_orderkey": "orderkey"}
dt_orders.names = {"o_orderkey": "orderkey"}

# dt_lineitem.key = "orderkey"  # não podemos usar como `key` existem valores repetidos
dt_orders.key = "orderkey"

# executando a query
dt_lineitem[
    (g.o_orderpriority == "1-URGENT") | (g.o_orderpriority == "2-HIGH"),
    :,
    join(dt_orders),
][
    :,
    {
        "sum(l_extendedprice)": dt.sum(f.l_extendedprice),
        "min(l_extendedprice)": dt.min(f.l_extendedprice),
        "max(l_extendedprice)": dt.max(f.l_extendedprice),
        "mean(l_extendedprice)": dt.mean(f.l_extendedprice),
        "sum(l_quantity)": dt.sum(f.l_quantity),
        "min(l_quantity)": dt.min(f.l_quantity),
        "max(l_quantity)": dt.max(f.l_quantity),
        "mean(l_quantity)": dt.mean(f.l_quantity),
    },
    by(f.l_returnflag, f.l_linestatus),
]

CPU times: user 6.79 s, sys: 271 ms, total: 7.06 s
Wall time: 1.17 s


Unnamed: 0_level_0,l_returnflag,l_linestatus,sum(l_extendedprice),min(l_extendedprice),max(l_extendedprice),mean(l_extendedprice),sum(l_quantity),min(l_quantity),max(l_quantity),mean(l_quantity)
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪
0,A,F,22677200000.0,904,104950,38303.4,15123892,1,50,25.5453
1,N,F,596530000.0,920,104049,38461.0,397729,1,50,25.6434
2,N,O,46045600000.0,901,104750,38264.7,30703038,1,50,25.5147
3,R,F,22683100000.0,906,104900,38240.1,15126938,1,50,25.5016


In [259]:
# collapse
# polars
pl_lineitem = pl.DataFrame(lineitem)
pl_orders = pl.DataFrame(orders)

In [263]:
%%timeit
pl_lineitem.join(pl_orders, left_on="l_orderkey", right_on="o_orderkey").filter(
    (pl.col("l_shipdate") < "1998-09-02")
    & (
        (pl.col("o_orderpriority") == "1-URGENT")
        | (pl.col("o_orderpriority") == "2-HIGH")
    )
).groupby(["l_returnflag", "l_linestatus"]).agg(
    {
        "l_extendedprice": ["sum", "min", "max", "mean"],
        "l_quantity": ["sum", "min", "max", "mean"],
    }
)

3.05 s ± 1.23 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [271]:
# hide
# using lazy eval
import time

pl_lineitem = pl_lineitem.lazy()
pl_orders = pl_orders.lazy()

In [275]:
# hide
start = time.monotonic()

# lazy dataframe
_df = (
    pl_lineitem.join(pl_orders, left_on="l_orderkey", right_on="o_orderkey")
    .filter(
        (pl.col("l_shipdate") < "1998-09-02")
        & (
            (pl.col("o_orderpriority") == "1-URGENT")
            | (pl.col("o_orderpriority") == "2-HIGH")
        )
    )
    # could not delay collection - error (not finding `o_orderkey` column)
    .collect()
)
# eager dataframe
_df = (_df
    .groupby(["l_returnflag", "l_linestatus"])
    .agg(
        [
            pl.sum("l_extendedprice"),
            pl.min("l_extendedprice"),
            pl.max("l_extendedprice"),
            pl.mean("l_extendedprice"),
            pl.sum("l_quantity"),
            pl.min("l_quantity"),
            pl.max("l_quantity"),
            pl.mean("l_quantity"),
        ],
    )
)

print(f"Wall time: {time.monotonic() - start:.2f}s")
_df

Wall time: 5.74s


l_returnflag,l_linestatus,l_extendedprice_sum,l_extendedprice_min,l_extendedprice_max,l_extendedprice_mean,l_quantity_sum,l_quantity_min,l_quantity_max,l_quantity_mean
str,str,f64,f64,f64,f64,i64,i64,i64,f64
"""A""","""F""",22677199025.699192,904,104949.5,38303.42666419926,15123892,1,50,25.54534567707304
"""N""","""F""",596530216.2000039,920,104049.0,38461.0068471956,397729,1,50,25.64339136041264
"""N""","""O""",44734055416.80963,901,104749.5,38267.695101622725,29826993,1,50,25.515466087014545
"""R""","""F""",22683095360.31932,906,104899.5,38240.14053242183,15126938,1,50,25.501644539975555


In [None]:
!pip install pandasql

In [267]:
# hide_output
from pandasql import sqldf

start = time.monotonic()

_df = sqldf(
"""
SELECT l_returnflag,
       l_linestatus,
       SUM(l_extendedprice),
       MIN(l_extendedprice),
       MAX(l_extendedprice),
       AVG(l_extendedprice),
       SUM(l_quantity),
       MIN(l_quantity),
       MAX(l_quantity),
       AVG(l_quantity)
FROM lineitem AS lineitem
JOIN orders AS orders ON (l_orderkey=o_orderkey)
WHERE l_shipdate <= DATE '1998-09-02'
  AND o_orderpriority IN ('1-URGENT', '2-HIGH')
GROUP BY l_returnflag,
         l_linestatus
""",
    globals(),
)

print(f"Wall time: {time.monotonic() - start:.2f}s")
_df

PandaSQLException: (sqlite3.OperationalError) near "'1998-09-02'": syntax error
[SQL: 
SELECT l_returnflag,
       l_linestatus,
       SUM(l_extendedprice),
       MIN(l_extendedprice),
       MAX(l_extendedprice),
       AVG(l_extendedprice),
       SUM(l_quantity),
       MIN(l_quantity),
       MAX(l_quantity),
       AVG(l_quantity)
FROM lineitem AS lineitem
JOIN orders AS orders ON (l_orderkey=o_orderkey)
WHERE l_shipdate <= DATE '1998-09-02'
  AND o_orderpriority IN ('1-URGENT', '2-HIGH')
GROUP BY l_returnflag,
         l_linestatus
]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [276]:
!rm lineitemsf1.snappy.parquet
!rm orders.parquet