# Loading and exploring the Melbourne Bike Share Dataset

Dataset:

https://melbournetestbed.opendatasoft.com/explore/dataset/melbourne-bike-share-station-readings-2011-2017/information/

In [None]:
! pip install duckdb pandas

## Loading the dataset with Pandas

In [None]:
import pandas as pd

# Only these columns are read from the CSV; the rest are skipped at parse time.
bike_columns = ["ID", "NAME", "NBBIKES", "RUNDATE", "LAT", "LONG"]

# RUNDATE is parsed into datetimes using the compact timestamp format below.
df = pd.read_csv(
    "data/melb_bike_share.csv",
    usecols=bike_columns,
    parse_dates=["RUNDATE"],
    date_format="%Y%m%d%H%M%S",
)

# Bare last expression so the notebook renders the frame.
df

## Loading the dataset with DuckDB

Key observations
* compare and contrast with Pandas equivalent
* result is a relation object (separate slide on relations)

In [None]:
import duckdb

# SQL equivalent of the pandas load: the same six columns, with RUNDATE typed
# as a timestamp and parsed with an explicit format.
load_query = """
    SELECT ID, NAME, NBBIKES, RUNDATE, LAT, LONG
    FROM read_csv( 
        'data/melb_bike_share.csv', 
        types={'RUNDATE': TIMESTAMP},
        timestampformat='%Y%m%d%H%M%S'
    )
    """

# duckdb.sql returns a relation object rather than a DataFrame.
rel = duckdb.sql(load_query)

# Print at most ten rows of the relation.
rel.show(max_rows=10)

# Querying the dataset

Notable callouts:
* replacement scanning of the `rel` variable: `FROM rel` in the SQL below refers to the Python relation object of that name

In [None]:
# Average bikes per station per month for 2017. `FROM rel` refers to the
# Python variable `rel` defined earlier in the notebook.
monthly_avg_query = """
SELECT 
    NAME,
    month(RUNDATE) AS MONTH,
    avg(NBBIKES) AS AVG_BIKES,
FROM rel
WHERE year(RUNDATE) = 2017
GROUP BY MONTH, NAME
ORDER BY MONTH, NAME
"""

# Bare last expression so the resulting relation is rendered.
duckdb.sql(monthly_avg_query)

## Pandas version of the query

Notable callouts:
* more inscrutable than the SQL
* results have to be materialised at each method call

In [None]:
# Row mask for readings taken in 2017, and the month number of every reading.
# The month series keeps the name "RUNDATE", which becomes the group column.
is_2017 = df["RUNDATE"].dt.year == 2017
reading_month = df["RUNDATE"].dt.month

# Mean NBBIKES per (station, month), flattened back to columns and sorted by
# month and then station name.
avg_df = (
    df.loc[is_2017]
    .groupby(["NAME", reading_month])["NBBIKES"]
    .mean()
    .reset_index(name="AVG_BIKES")
    .sort_values(["RUNDATE", "NAME"])
)

avg_df

## Exporting our dataset back to disk
* CSV
* JSON
* Parquet

## Pandas and Polars integration

In [None]:
# Peek at the first three rows of the relation.
preview_query = "FROM rel LIMIT 3"
duckdb.sql(preview_query)

In [None]:
%%time
# Ask DuckDB for a 3-row sample of `rel`; the cell is timed by %%time.
duckdb.sql("FROM rel USING SAMPLE 3")

### Converting to Pandas and Polars DataFrames

In [None]:
%%time
# Convert the DuckDB relation into a DataFrame. Note that this rebinds `df`,
# replacing the frame that was loaded earlier with pd.read_csv.
df = rel.df()

### Querying a dataframe

In [None]:
%%time
# `df` is the Python DataFrame variable, referenced by name inside the SQL.
duckdb.sql("FROM df USING SAMPLE 3")

--------------------------

In [None]:
%%time
conn.sql(
    """
    CREATE OR REPLACE TABLE bikes AS
    FROM relation
    """
)

### JupySQL for convenient querying in Jupyter Notebooks

In [None]:
# load the JupySQL extension required for the %sql and %%sql magics
%load_ext sql 

# register the DuckDB connection `conn` with JupySQL under the alias "duckdb"
%sql conn --alias duckdb 

# configure JupySQL to return Pandas DataFrames by default
%config SqlMagic.autopandas = True 

In [None]:
%%sql station_reading_counts_df <<
-- Count readings per station, year and month. The result is stored in the
-- Python variable `station_reading_counts_df`, which the following cells
-- display and plot; previously nothing ever bound that name.
-- NOTE(review): the chart below plots NUM_READINGS by MONTH for each YEAR,
-- but this query keeps only MONTH = 1 and also groups by NAME; confirm that
-- the filter and the per-station grouping are intended.
SELECT 
    NAME,
    extract('year' FROM RUNDATE) AS YEAR,
    extract('month' FROM RUNDATE) AS MONTH,
    COUNT(*) AS NUM_READINGS,
FROM bikes
WHERE MONTH = 1
GROUP BY YEAR, MONTH, NAME
ORDER BY YEAR, MONTH

In [None]:
# Render the reading-counts frame.
# NOTE(review): this name must already be bound by an earlier cell.
station_reading_counts_df

In [None]:
import plotly.express as px 

# One line per YEAR, distinguished by both colour and marker symbol.
fig = px.line(
    station_reading_counts_df,
    x="MONTH",
    y="NUM_READINGS",
    markers=True, 
    symbol="YEAR",
    symbol_sequence=["square", "diamond", "circle"],
    color="YEAR",
    title="Records by month for each year the Melbourne Bike Share program was active",
    height=400,
)

# Enlarge the markers on every trace.
fig.update_traces(marker_size=8)

# Bare last expression so the figure is rendered.
fig

## Working with the Relational API

In [None]:
# Look up the `bikes` table as a relation, then show its describe() output.
bikes_table = conn.table("bikes")
bikes_table.describe()

In [None]:
# Keep a handle on the `bikes` table as a relation for the cells that follow.
bikes_rel = conn.table("bikes")

In [None]:
# Narrow the relation to three columns before describing it.
selected_columns = bikes_rel.project("RUNDATE", "NBBIKES", "NBEMPTYDOCKS")
selected_columns.describe()

In [None]:
# Keep only readings where LOCKED is true, then count them per station NAME.
locked_readings = bikes_rel.filter("LOCKED = true")
locked_by_station = locked_readings.value_counts("NAME")

# "2 DESC" orders by the second output column, descending
# (presumably the count column; verify against the rendered result).
locked_by_station.order("2 DESC")

# Scratch

In [None]:
%%sql
-- Number of distinct LOCATION values in the bikes table.
SELECT count(DISTINCT LOCATION) FROM bikes

In [None]:
%%sql
-- Number of distinct station NAME values in the bikes table.
SELECT count(DISTINCT NAME) FROM bikes

In [None]:
%%sql
-- One row per LOCATION group, giving the number of non-NULL LOCATION values
-- in that group. The LOCATION value itself is not part of the output.
SELECT count(LOCATION)
FROM bikes
GROUP BY LOCATION

# Ad-hoc wrangling

In [None]:
# Render the relation created in the DuckDB loading cell.
rel

In [None]:
# Write a 1,000,000-row sample of `rel` to sample_bikes.csv.
export_sample_query = "COPY (FROM rel USING SAMPLE 1000000) TO 'sample_bikes.csv'"
duckdb.sql(export_sample_query)

# In-memory databases vs persistent file databases

In [None]:
# In-memory database: spelled out explicitly, ":memory:" is what connect()
# uses when no database is given.
mem_conn = duckdb.connect(database=":memory:")

In [None]:
# File-backed database stored at bike_share.duckdb.
file_conn = duckdb.connect(database="bike_share.duckdb")