# DuckDB Python Client Basics

In [20]:
import duckdb
import pandas as pd
import numpy as np
import polars as pl
import pyarrow as pa
import os.path

duckdb.__version__

'0.10.0'

# Python API

# Basic API Usage

In [21]:
# Run a query through the module-level duckdb API and print the result table.
answer_rel = duckdb.sql("SELECT 42")
answer_rel.show()

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [22]:
# Keep the relation in a Python variable ...
relation_1 = duckdb.sql("SELECT 42 AS i")

# ... and refer to that variable by name from a later query.
doubled_rel = duckdb.sql("SELECT i * 2 AS k FROM relation_1")
doubled_rel.show()

┌───────┐
│   k   │
│ int32 │
├───────┤
│    84 │
└───────┘



# Data Input

In [23]:
ls

Historic_Districts_20240426.csv  duckdb-python-sql.ipynb
duckdb-python-basics.ipynb


In [24]:
# duckdb.read_csv(): load the CSV, stating the date format explicitly
# (month/day/four-digit year).
districts_rel = duckdb.read_csv("Historic_Districts_20240426.csv", date_format="%m/%d/%Y")
districts_rel

┌──────────────────────┬─────────┬───────────┬──────────┬───┬────────────┬────────────┬───────────────┬───────────────┐
│       the_geom       │ BOROUGH │ LP_NUMBER │ CURRENT_ │ … │  DESDATE   │  CALDATE   │  Shape_Leng   │  Shape_Area   │
│       varchar        │ varchar │  varchar  │ varchar  │   │    date    │    date    │    double     │    double     │
├──────────────────────┼─────────┼───────────┼──────────┼───┼────────────┼────────────┼───────────────┼───────────────┤
│ MULTIPOLYGON (((-7…  │ QN      │ LP-02040  │ Yes      │ … │ 1999-06-29 │ 1999-04-13 │ 11203.1997224 │ 4067523.25047 │
│ MULTIPOLYGON (((-7…  │ MN      │ LP-01985  │ Yes      │ … │ 1998-05-05 │ NULL       │ 410.860306645 │ 10652.4239136 │
│ MULTIPOLYGON (((-7…  │ MN      │ LP-01901  │ Yes      │ … │ 1993-02-25 │ NULL       │ 4995.47146205 │ 1069203.49477 │
│ MULTIPOLYGON (((-7…  │ BK      │ LP-00989  │ Yes      │ … │ 1978-07-11 │ NULL       │ 1240.77043794 │  78546.856215 │
│ MULTIPOLYGON (((-7…  │ BK      │ LP-02

My preferred method - `duckdb.sql()`

In [25]:
# duckdb.sql(SELECT)
# Query the CSV file directly from SQL. The path is written as a quoted string
# literal so it is read as a file name rather than parsed as a dotted
# identifier; the quoted form also keeps working for paths that contain
# directories, hyphens, or spaces.
duckdb.sql(
    """
    SELECT *
    FROM 'Historic_Districts_20240426.csv'
    LIMIT 10
    """
)

┌──────────────────────┬─────────┬───────────┬──────────┬───┬────────────┬────────────┬───────────────┬───────────────┐
│       the_geom       │ BOROUGH │ LP_NUMBER │ CURRENT_ │ … │  DESDATE   │  CALDATE   │  Shape_Leng   │  Shape_Area   │
│       varchar        │ varchar │  varchar  │ varchar  │   │    date    │    date    │    double     │    double     │
├──────────────────────┼─────────┼───────────┼──────────┼───┼────────────┼────────────┼───────────────┼───────────────┤
│ MULTIPOLYGON (((-7…  │ QN      │ LP-02040  │ Yes      │ … │ 1999-06-29 │ 1999-04-13 │ 11203.1997224 │ 4067523.25047 │
│ MULTIPOLYGON (((-7…  │ MN      │ LP-01985  │ Yes      │ … │ 1998-05-05 │ NULL       │ 410.860306645 │ 10652.4239136 │
│ MULTIPOLYGON (((-7…  │ MN      │ LP-01901  │ Yes      │ … │ 1993-02-25 │ NULL       │ 4995.47146205 │ 1069203.49477 │
│ MULTIPOLYGON (((-7…  │ BK      │ LP-00989  │ Yes      │ … │ 1978-07-11 │ NULL       │ 1240.77043794 │  78546.856215 │
│ MULTIPOLYGON (((-7…  │ BK      │ LP-02

# DataFrames

In [26]:
ls

Historic_Districts_20240426.csv  duckdb-python-sql.ipynb
duckdb-python-basics.ipynb


In [27]:
# pandas: build a one-row DataFrame, then query it from SQL by its variable name
df = pd.DataFrame(data={"a": [42]})

pandas_rel = duckdb.sql("SELECT * FROM df")
pandas_rel

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

In [28]:
# polars
# Use a dedicated name instead of re-binding `df`, so each variable always
# refers to one kind of object regardless of which cells have been run.
polars_df = pl.DataFrame({"a": [42]})

duckdb.sql("SELECT * FROM polars_df")

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

In [29]:
# pyarrow
# Use a dedicated name instead of re-binding `df`: this object is an Arrow
# table, not a DataFrame, and a distinct name keeps the cell independent of
# execution order.
arrow_table = pa.Table.from_pydict({"a": [42]})

duckdb.sql("SELECT * FROM arrow_table")

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

# Result Conversion

In [30]:
# Run the same query once per fetch method and print the Python type returned.
conversion_query = "SELECT 42"

print(type(duckdb.sql(conversion_query).fetchall()))    # Python objects
print(type(duckdb.sql(conversion_query).fetchnumpy()))  # NumPy arrays (printed type is dict)
print(type(duckdb.sql(conversion_query).df()))          # Pandas DataFrame
print(type(duckdb.sql(conversion_query).pl()))          # Polars DataFrame
print(type(duckdb.sql(conversion_query).arrow()))       # Arrow Table

<class 'list'>
<class 'dict'>
<class 'pandas.core.frame.DataFrame'>
<class 'polars.dataframe.frame.DataFrame'>
<class 'pyarrow.lib.Table'>


In [31]:
# Example: Pandas DataFrame from a query with no column alias
unaliased_df = duckdb.sql("SELECT 42").df()
unaliased_df

Unnamed: 0,42
0,42


In [32]:
# Example: Pandas DataFrame whose column is named by the SQL alias `x`
aliased_df = duckdb.sql("SELECT 42 AS x").df()
aliased_df

Unnamed: 0,x
0,42


# Writing Data to Disk

In [33]:
export_query = "SELECT 42"

# Relation method: write the query result to a Parquet file
duckdb.sql(export_query).write_parquet("out.parquet")

# Relation method: write the query result to a CSV file
duckdb.sql(export_query).write_csv("out.csv")

# SQL statement: COPY the query result to a Parquet file
# (same target path as the write_parquet call above)
duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")

In [34]:
ls

Historic_Districts_20240426.csv  out.csv
duckdb-python-basics.ipynb       out.parquet
duckdb-python-sql.ipynb


# Connection Options

## Using an In-Memory Database

In [35]:
# connect() with no arguments: the in-memory database case of this section
con = duckdb.connect()

# Queries issued on the connection object run against that database
in_memory_rel = con.sql("SELECT 42 AS x")
in_memory_rel.show()

┌───────┐
│   x   │
│ int32 │
├───────┤
│    42 │
└───────┘



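A connection to a persistent database can be created by passing a file path to the `connect` function; calling `connect()` with no arguments, as above, uses an in-memory database instead.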

## Persistent Storage

In [36]:
# Passing a file path to connect() opens a database stored in that file
con = duckdb.connect("file.db")

# CREATE OR REPLACE keeps this cell re-runnable: file.db persists between
# runs, so a plain CREATE TABLE fails once the table already exists, and
# replacing the table avoids accumulating duplicate rows on each run.
con.sql("CREATE OR REPLACE TABLE integers (i INTEGER)")
con.sql("INSERT INTO integers VALUES (42)")

con.sql("SELECT * FROM integers").show()

┌───────┐
│   i   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [37]:
# create a connection to a file called 'file-one.db'
con = duckdb.connect("file-one.db")

# create a table and load data into it
# (CREATE OR REPLACE so that re-running against the existing file does not fail)
con.sql("CREATE OR REPLACE TABLE test (i INTEGER)")
con.sql("INSERT INTO test VALUES (42)")

# query the table
con.table("test").show()

# explicitly close the connection
con.close()
# Note: connections are also closed implicitly when they go out of scope

┌───────┐
│   i   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [38]:
ls

Historic_Districts_20240426.csv  file.db
duckdb-python-basics.ipynb       out.csv
duckdb-python-sql.ipynb          out.parquet
file-one.db


In [39]:
# Open 'file-two.db' inside a context manager so the connection is closed
# when the block exits.
with duckdb.connect("file-two.db") as con:
    # CREATE OR REPLACE keeps the cell re-runnable once file-two.db exists
    con.sql("CREATE OR REPLACE TABLE test (i INTEGER)")
    con.sql("INSERT INTO test VALUES (42)")
    con.table("test").show()
    # the context manager closes the connection automatically

┌───────┐
│   i   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [40]:
ls

Historic_Districts_20240426.csv  file-two.db
duckdb-python-basics.ipynb       file.db
duckdb-python-sql.ipynb          out.csv
file-one.db                      out.parquet


In [43]:
# reconnect to the file called 'file.db' and read back the table stored in it
con = duckdb.connect("file.db")

persisted_rel = con.sql("SELECT * FROM integers")
persisted_rel.show()

┌───────┐
│   i   │
│ int32 │
├───────┤
│    42 │
└───────┘

