# Exploring the Melbourne Bike Share Dataset with DuckDB

## Getting the dataset

https://melbournetestbed.opendatasoft.com/explore/dataset/melbourne-bike-share-station-readings-2011-2017/information/

In [None]:
%%bash
# Install the Python packages listed in requirements.txt.
pip install -r requirements.txt
# Fetch the dataset archive; --no-clobber skips the download when the zip is already present.
wget --no-clobber https://opendatasoft-s3.s3.amazonaws.com/downloads/archive/74id-aqj9.zip
# Extract the archive; -n never overwrites a file that already exists.
unzip -n 74id-aqj9.zip
# Move the extracted CSV into data/ under the name the later cells read from.
mkdir -p data
mv 74id-aqj9.csv data/melb_bike_share.csv

## Loading the dataset with Pandas

In [None]:
%%time
import pandas as pd

bikes_df = pd.read_csv(
    "data/melb_bike_share.csv",
    usecols=["ID", "NAME", "NBBIKES", "RUNDATE", "LAT", "LONG"],
    parse_dates=["RUNDATE"],
    date_format="%Y%m%d%H%M%S",
)

bikes_df

## Loading the dataset with DuckDB

In [None]:
%%time
import duckdb

bikes_rel = duckdb.sql(
    """
    SELECT ID, NAME, NBBIKES, RUNDATE, LAT, LONG
    FROM read_csv( 
        'data/melb_bike_share.csv', 
        types={'RUNDATE': TIMESTAMP},
        timestampformat='%Y%m%d%H%M%S'
    )
    """
)

bikes_rel

# Querying the dataset

In [None]:
%%time
monthly_bikes_rel = duckdb.sql(
    """
    SELECT 
        month(RUNDATE) AS MONTH,
        round(avg(NBBIKES), 2) AS AVG_BIKES,
    FROM bikes_rel
    WHERE year(RUNDATE) = 2017
    GROUP BY MONTH
    ORDER BY MONTH
    """
)

monthly_bikes_rel

## Pandas version of the query

In [None]:
# Pandas equivalent of the DuckDB monthly-average query: keep only the
# 2017 readings, then average NBBIKES per calendar month.
bikes_2017_df = bikes_df[bikes_df["RUNDATE"].dt.year == 2017]

avg_bikes_df = (
    bikes_2017_df
    # Group by the month of the already-filtered rows and name the key
    # MONTH so the output columns match the SQL version (MONTH, AVG_BIKES).
    .groupby(bikes_2017_df["RUNDATE"].dt.month.rename("MONTH"))["NBBIKES"]
    .mean()
    .round(2)
    # groupby sorts by the group key by default, so the rows are already
    # in month order and no extra sort is needed.
    .reset_index(name="AVG_BIKES")
)

avg_bikes_df

## Exporting our dataset back to disk

In [None]:
# Write the monthly-average relation to a CSV file under data/.
duckdb.sql("COPY monthly_bikes_rel TO 'data/monthly_avg_bikes.csv'")

In [None]:
# Same export, this time to a .parquet path.
duckdb.sql("COPY monthly_bikes_rel TO 'data/monthly_avg_bikes.parquet'")

In [None]:
# Same export, this time to a .json path.
duckdb.sql("COPY monthly_bikes_rel TO 'data/monthly_avg_bikes.json'")

Passing in options:

In [None]:
# Export with a COPY option: use '|' as the field delimiter.
# NOTE: this targets the same path as the plain CSV export earlier in this section.
duckdb.sql("COPY monthly_bikes_rel TO 'data/monthly_avg_bikes.csv' (DELIMITER '|')")

In [None]:
# Export with the ARRAY option enabled for the JSON output.
# NOTE: this targets the same path as the plain JSON export earlier in this section.
duckdb.sql("COPY monthly_bikes_rel TO 'data/monthly_avg_bikes.json' (ARRAY true)")

### Some nice SQL features

In [None]:
# COPY accepts a parenthesised query, so a subset can be exported directly:
# here at most 100,000 rows of the bike-share relation are written to CSV.
# There is no ORDER BY, so the query does not pin down which rows are taken.
duckdb.sql(
    """
    COPY (
        SELECT * FROM bikes_rel LIMIT 100000
    ) TO 'bike_share_100k.csv'
    """
)

In [None]:
# Export a sample of the relation instead of its first rows. The inner
# query uses DuckDB's FROM-first form (no SELECT) with USING SAMPLE.
# NOTE(review): no sampling seed is given, so the rows written presumably
# differ between runs — confirm if reproducibility matters.
duckdb.sql(
    """    
    COPY (
        FROM bikes_rel USING SAMPLE 100000
    ) TO 'bike_share_sample_100k.csv'
    """
)

## Python in-memory data format integration

### Exporting

In [None]:
# Export the relation as a pandas DataFrame.
bikes_rel.df()

In [None]:
# Export the relation as a Polars DataFrame.
bikes_rel.pl()

In [None]:
# Export the relation as an Apache Arrow object.
bikes_rel.arrow()

In [None]:
# Fetch the relation's data as NumPy arrays.
bikes_rel.fetchnumpy()

### Importing

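Note: the examples below are not runnable as-is; the variables they query (`pandas_df`, `polars_df`, `arrow_table`, `numpy_array`) are placeholders that are not defined in this notebook.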

In [None]:
# Illustrative only: `pandas_df` stands for a pandas DataFrame variable in
# scope; it is not defined in the visible part of this notebook.
duckdb.sql("SELECT * FROM pandas_df")

In [None]:
# Illustrative only: `polars_df` stands for a Polars DataFrame variable in
# scope; it is not defined in the visible part of this notebook.
duckdb.sql("SELECT * FROM polars_df")

In [None]:
# Illustrative only: `arrow_table` stands for an Arrow table variable in
# scope; it is not defined in the visible part of this notebook.
duckdb.sql("SELECT * FROM arrow_table")

In [None]:
# Illustrative only: `numpy_array` stands for a NumPy-backed variable in
# scope; it is not defined in the visible part of this notebook.
duckdb.sql("SELECT * FROM numpy_array")

# The DuckDB Relational API

In [None]:
import duckdb

# Relational API: read the CSV with RUNDATE typed as TIMESTAMP, then chain
# relation methods instead of writing one SQL string.
readings_rel = duckdb.read_csv(
    "data/melb_bike_share.csv",
    timestamp_format="%Y%m%d%H%M%S",
    dtype={"RUNDATE": "TIMESTAMP"},
)

# Keep the 2017 readings and count them.
readings_2017_rel = readings_rel.filter("year(RUNDATE) = 2017")
readings_2017_rel.count("*")

In [None]:
from duckdb import ColumnExpression, FunctionExpression, StarExpression

# Build the projection from expression objects rather than SQL text:
# every original column plus MONTH and YEAR derived from RUNDATE.
rundate_col = ColumnExpression("RUNDATE")
star = StarExpression()
year_col = FunctionExpression("year", rundate_col).alias("YEAR")
month_col = FunctionExpression("month", rundate_col).alias("MONTH")

readings_for_select_rel = duckdb.read_csv(
    "data/melb_bike_share.csv",
    timestamp_format="%Y%m%d%H%M%S",
    dtype={"RUNDATE": "TIMESTAMP"},
)
readings_for_select_rel.select(star, month_col, year_col)


# Ibis

In [None]:
import ibis
from ibis import _

# Interactive mode makes expressions execute and display when they are the
# last value in a cell.
ibis.options.interactive = True

# Load the CSV through Ibis; the extra keyword arguments mirror the
# read_csv options used in the DuckDB SQL cell.
bikes_tbl = ibis.read_csv(
    "data/melb_bike_share.csv",
    timestampformat="%Y%m%d%H%M%S",
    types={"RUNDATE": "TIMESTAMP"},
)

# Count the readings taken in 2017.
bikes_tbl.filter(_.RUNDATE.year() == 2017).count()

# In-memory databases vs. persistent file databases

In [None]:
# connect() with no path argument: the in-memory case named in this section's heading.
mem_conn = duckdb.connect()

In [None]:
# connect() with a file path: the file-backed database case named in this section's heading.
file_conn = duckdb.connect("bike_share.duckdb")