In [1]:
import duckdb
import pandas as pd
import numpy as np
import polars as pl
import pyarrow as pa

duckdb.__version__

'0.10.0'

# Jupyter Notebooks

In [2]:
%reload_ext sql
conn = duckdb.connect(config={"allow_unsigned_extensions": "true"})
%sql conn --alias duckdb

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# SQL in Python

In [3]:
(duckdb
 .sql("SELECT 42")
 .show()
)

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [4]:
results = (
    duckdb
    .sql("SELECT 42")
    .fetchall()
)

print(type(results))
results

<class 'list'>


[(42,)]

# SQL on Pandas

In [5]:
# Create a Pandas dataframe
my_df = pd.DataFrame.from_dict({'a': [42]})

# query the Pandas DataFrame "my_df"
# Note: duckdb.sql connects to the default in-memory database connection
results = duckdb.sql("SELECT * FROM my_df").df()

results

Unnamed: 0,a
0,42


In [6]:
# Create a Pandas dataframe
my_df = pd.DataFrame.from_dict({'a': [42]})

# create the table "my_table" from the DataFrame "my_df"
# Note: duckdb.sql connects to the default in-memory database connection
duckdb.sql("CREATE TABLE my_table AS SELECT * FROM my_df")

# insert into the table "my_table" from the DataFrame "my_df"
duckdb.sql("INSERT INTO my_table SELECT * FROM my_df")

In [7]:
# as a df
my_df

Unnamed: 0,a
0,42


In [8]:
# as a relation
duckdb.sql("SELECT * FROM my_df")

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

# Querying Pandas Dataframes

In [9]:
%sql res << SELECT 'Off and flying!' AS a_duckdb_column;

print(type(res))
res

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a_duckdb_column
0,Off and flying!


In [10]:
dct = {
    "i": [1, 2, 3],
    "j": ["one", "two", "three"]
}

input_df = pd.DataFrame.from_dict(dct)

input_df

Unnamed: 0,i,j
0,1,one
1,2,two
2,3,three


In [11]:
%%sql output_df <<
SELECT sum(i)AS total_i
FROM input_df;

In [12]:
output_df

Unnamed: 0,total_i
0,6.0


# Install and Load DuckDB httpfs Extension

In [13]:
%%sql
INSTALL '../duckdb/build/release/repository/v0.10.0/osx_amd64/httpfs.duckdb_extension';
LOAD '../duckdb/build/release/repository/v0.10.0/osx_amd64/httpfs.duckdb_extension'

Unnamed: 0,Success


In [29]:
%%sql
SELECT *
FROM 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
LIMIT 10;

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1,1.72,1,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1,1.8,1,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1,4.7,1,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1,1.4,1,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1,0.8,1,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0
5,1,2024-01-01 00:54:08,2024-01-01 01:26:31,1,4.7,1,N,148,141,1,29.6,3.5,0.5,6.9,0.0,1.0,41.5,2.5,0.0
6,2,2024-01-01 00:49:44,2024-01-01 01:15:47,2,10.82,1,N,138,181,1,45.7,6.0,0.5,10.0,0.0,1.0,64.95,0.0,1.75
7,1,2024-01-01 00:30:40,2024-01-01 00:58:40,0,3.0,1,N,246,231,2,25.4,3.5,0.5,0.0,0.0,1.0,30.4,2.5,0.0
8,2,2024-01-01 00:26:01,2024-01-01 00:54:12,1,5.44,1,N,161,261,2,31.0,1.0,0.5,0.0,0.0,1.0,36.0,2.5,0.0
9,2,2024-01-01 00:28:08,2024-01-01 00:29:16,1,0.04,1,N,113,113,2,3.0,1.0,0.5,0.0,0.0,1.0,8.0,2.5,0.0


In [35]:
%%sql
SELECT *
FROM 'https://data.cityofnewyork.us/Popular_Baby_Names_20240428.csv'
LIMIT 10;

HTTPException: HTTP Error: Unable to connect to URL "https://data.cityofnewyork.us/Popular_Baby_Names_20240428.csv": 404 (Not Found)

In [None]:
<a download="Popular_Baby_Names_20240428.csv" target="_blank" rel="noreferrer" style="display: none" href="blob:https://data.cityofnewyork.us/8155f5dc-9a3f-43b6-a89d-44e13af732c4"></a>