# Converting csv to parquet

This notebook converts a large CSV file to Parquet with either Polars or DuckDB, optionally running a query (filter, column selection, sort) between reading and writing.

## Generate a 2.2 GB CSV file

In [None]:
import duckdb  # 1.2.1
import numpy as np  # 1.26.4
import polars as pl  # 1.25.2

# Row count of the synthetic table (the CSV written below is about 2.2 GB).
n = 100_000_000

# Seeded generator, so every run produces the same file.
rng = np.random.default_rng(42)

# The columns are drawn in the order X, Y, Z from the shared generator;
# keep that order, because it determines the generated values.
df = pl.DataFrame(
    data=dict(
        X=rng.choice(["a", "b", "c"], n),
        Y=rng.uniform(0, 1, n),
        Z=rng.choice([1, 2, 3, 4, 5], n),
    )
)

df.write_csv(file="data.csv")

## Polars (in lazy mode)

In [None]:
# Native API
# A bare expression in the middle of a cell is evaluated and then discarded,
# so print the thread-pool size to actually record it alongside the run.
print(pl.thread_pool_size())  # POLARS_MAX_THREADS = 8
(
    pl.scan_csv("data.csv")  # lazy scan of the generated CSV
    .filter(pl.col("X") == "a")  # keep only the rows of group "a"
    .drop("X")  # constant after the filter, so not worth storing
    .sort(["Y", "Z"])
    .sink_parquet("data.parquet", row_group_size=100_000)  # "zstd" compression
)

In [None]:
# Same filter / projection / sort, this time written as a SQL query
# (author's observation: slower!?). The query refers to the scanned frame
# under the table name "self".
query = "SELECT Y, Z FROM self WHERE X == 'a' ORDER BY Y, Z"
lazy_csv = pl.scan_csv("data.csv")
lazy_csv.sql(query).sink_parquet("data.parquet", row_group_size=100_000)

In [14]:
# Sanity check: display the first five rows of the Parquet file.
preview = pl.scan_parquet("data.parquet").head(5)
preview.collect()

Y,Z
f64,i64
3.7796e-08,4
5.0273e-08,5
5.7652e-08,4
8.0578e-08,3
8.1598e-08,4


## DuckDB

In [None]:
# Open a DuckDB connection capped at 8 threads and 4 GB of memory.
duckdb_config = {"threads": 8, "memory_limit": "4GB"}
con = duckdb.connect(config=duckdb_config)

# One COPY statement: filter, project and sort the CSV, then write the
# result to a zstd-compressed Parquet file.
copy_csv_to_parquet = """
    COPY (
        SELECT Y, Z
        FROM 'data.csv'
        WHERE X == 'a'
        ORDER BY Y, Z
    ) TO 'data.parquet' (FORMAT parquet, COMPRESSION zstd, ROW_GROUP_SIZE 100_000)
    """
con.sql(copy_csv_to_parquet)

In [28]:
# Sanity check: display the first five rows of the Parquet file.
preview_query = "SELECT * FROM 'data.parquet' LIMIT 5"
con.sql(preview_query)

┌────────────────────────┬───────┐
│           Y            │   Z   │
│         double         │ int64 │
├────────────────────────┼───────┤
│  3.779571322581887e-08 │     4 │
│ 5.0273087692787044e-08 │     5 │
│   5.76523543349694e-08 │     4 │
│  8.057776434977626e-08 │     3 │
│  8.159834352650108e-08 │     4 │
└────────────────────────┴───────┘

## Resulting file

In all cases, the resulting Parquet file has about the same size, around 170 MB. Most of that comes from the `Y` column, whose double-precision values are expensive to store.