# Intro to DuckDB 🦆

In [None]:
# Install DuckDB
!pip install duckdb==0.8.1


In [None]:
# Open an in-memory DuckDB connection, then install and load the httpfs
# extension, which is needed to read the s3:// Parquet file used below.
import duckdb
conn = duckdb.connect()
conn.sql('install httpfs')
# Load it explicitly rather than relying on DuckDB auto-loading it on the
# first remote access, so a missing/broken extension fails here, up front.
conn.sql('load httpfs')


In [None]:
# Dataset: DuckDB PyPI download stats, October 2021 to October 2022
# (about 20M rows, roughly 1 GB).
# Show its schema (column names and types).
describe_query = "DESCRIBE FROM 's3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet';"
conn.sql(describe_query)


In [None]:
# How many rows does the remote Parquet file contain?
row_count_query = "SELECT COUNT(*) FROM 's3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet';"
conn.sql(row_count_query)


In [None]:
# Peek at the first five rows of the dataset.
preview_query = "FROM 's3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet' limit 5;"
conn.sql(preview_query)


In [None]:
# Build a local table with the number of downloads per country,
# keeping the 1000 countries with the most downloads.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE raises
# an "already exists" error once the table was created by a previous run.
conn.sql("""
CREATE OR REPLACE TABLE top_country_download AS
SELECT country_code, COUNT(*) AS download_count
FROM 's3://us-prd-motherduck-open-datasets/duckdb_stats/pypi/duckdb_pypi.parquet'
GROUP BY country_code
ORDER BY download_count DESC
LIMIT 1000;
""")


In [None]:
# Display the per-country download counts stored in top_country_download.
top_countries = conn.sql("FROM top_country_download")
top_countries


## Connecting and uploading to MotherDuck ☁️


In [None]:
# Load the MotherDuck extension into the existing connection, then issue
# PRAGMA MD_CONNECT on it.
# NOTE(review): presumably MD_CONNECT is what attaches this session to
# MotherDuck and needs MotherDuck credentials — confirm against the MotherDuck docs.
# If DuckDB isn't launched yet, you can instead connect to MotherDuck directly
# with conn = duckdb.connect('md:').
conn.sql("LOAD motherduck")
conn.sql("PRAGMA MD_CONNECT")


In [None]:
# Copy the local table into the my_db database on MotherDuck.
# Any existing copy is dropped first so the cell can be run repeatedly.
remote_table = "my_db.top_country_download"
conn.sql(f"DROP TABLE IF EXISTS {remote_table}")
conn.sql(f"CREATE TABLE {remote_table} AS FROM top_country_download")


In [None]:
# List the databases visible to this connection to check the cloud ones,
# then head over to the MotherDuck UI to look at the uploaded data!
list_databases_sql = 'show databases'
conn.sql(list_databases_sql)
