# Intro to DuckDB 🦆 with Python 🐍

In [1]:
# Install DuckDB
!pip install duckdb==0.9.1 


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Open an in-memory DuckDB connection and enable remote file access.
import duckdb

# No database path is given, so nothing is persisted to disk by this connection.
conn = duckdb.connect()

# httpfs provides the s3:// and https:// readers used by the queries below.
# INSTALL only downloads the extension; LOAD activates it for this connection,
# so the notebook does not depend on implicit extension autoloading.
conn.sql("INSTALL httpfs")
conn.sql("LOAD httpfs")


In [None]:
# DuckDB PyPI download stats: ~20M rows (~1 GB), October 2021 to October 2022.
# Show the column names and types of the remote Parquet file.
conn.sql(
    "DESCRIBE FROM "
    "'s3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet';"
)


In [3]:
# How many download records does the remote Parquet file contain?
conn.sql(
    "SELECT COUNT(*) FROM "
    "'s3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet';"
)


┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     19557831 │
└──────────────┘

In [None]:
# Peek at the first five rows (FROM-first syntax, no SELECT needed).
conn.sql(
    "FROM "
    "'s3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet'"
    " limit 5;"
)


In [4]:
# Build a local table with the number of downloads per country code,
# sorted from most to fewest downloads and keeping at most 1000 rows.
# CREATE OR REPLACE makes the cell safe to re-run: a plain CREATE TABLE
# raises an error when `top_country_download` already exists.
conn.sql("""
CREATE OR REPLACE TABLE top_country_download AS 
SELECT country_code, COUNT(*) as download_count 
FROM 's3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet'
GROUP BY country_code 
ORDER BY download_count DESC 
LIMIT 1000;
""")


In [5]:
# Display the aggregated per-country table created in the previous cell.
conn.sql(
    "FROM top_country_download"
)


┌──────────────┬────────────────┐
│ country_code │ download_count │
│   varchar    │     int64      │
├──────────────┼────────────────┤
│ US           │       14715752 │
│ CN           │         839626 │
│ IE           │         485556 │
│ HK           │         414756 │
│ SG           │         402705 │
│ GB           │         256908 │
│ DE           │         254110 │
│ RU           │         196197 │
│ FR           │         185013 │
│ NL           │         168658 │
│ ·            │              · │
│ ·            │              · │
│ ·            │              · │
│ GA           │              3 │
│ VA           │              2 │
│ SS           │              2 │
│ KM           │              2 │
│ GQ           │              1 │
│ KP           │              1 │
│ WS           │              1 │
│ FK           │              1 │
│ TO           │              1 │
│ SB           │              1 │
├──────────────┴────────────────┤
│      213 rows (20 shown)      │
└───────────────────────────────┘

## Connecting and uploading to MotherDuck ☁️


In [6]:
# Attach this running DuckDB session to MotherDuck.
# Alternative when no connection exists yet: open one straight against
# MotherDuck with conn = duckdb.connect('md:').
conn.sql(
    "LOAD motherduck"
)
# Starts the login flow; the resulting relation is displayed as the cell output.
conn.sql(
    "PRAGMA MD_CONNECT"
)


1. Please open this link to login into your account: https://auth.motherduck.com/activate
2. Enter the following code: TJWV-XHGP


Token successfully retrieved ✅
You can store it as an environment variable to avoid having to log in again:
  $ export motherduck_token='<REDACTED>'



┌─────────┬─────────┐
│  level  │ message │
│ varchar │ varchar │
├───────────────────┤
│      0 rows       │
└───────────────────┘

In [7]:
# Copy the local table into the `my_db` MotherDuck database.
# Dropping first keeps the cell re-runnable.
for statement in (
    "DROP TABLE IF EXISTS my_db.top_country_download",
    "CREATE TABLE my_db.top_country_download AS FROM top_country_download",
):
    conn.sql(statement)


In [8]:
# List the attached databases (local and cloud), then head over to the
# MotherDuck UI to check the uploaded data!
conn.sql(
    "show databases"
)


┌───────────────────────┐
│     database_name     │
│        varchar        │
├───────────────────────┤
│ cloud_ducks           │
│ duckdb_pypi_dashboard │
│ duckdb_stats          │
│ holiday_budget        │
│ memory                │
│ my_db                 │
│ my_demo_share         │
│ sample_data           │
│ stackoverflow         │
└───────────────────────┘