# Using the DuckDB Python API

In this notebook, we will use the DuckDB Python API to explore some of DuckDB's functionality.

In [1]:
import pandas as pd
import duckdb

In [2]:
# Directory holding the SF fire data set, plus the two file formats read below.
root_dir = "/home/pengfei/data_set/sf_fire"
csv_file_path = "/".join((root_dir, "sf_fire.csv"))
parquet_file_path = "/".join((root_dir, "sf_fire_snappy.parquet"))

## Create a duckdb instance

As mentioned in the introduction, DuckDB has two modes:
- in-memory: duckdb.connect()
- on-disk: duckdb.connect("path/to/file")

In [3]:
# Open a DuckDB connection backed by an in-memory database.
conn = duckdb.connect(database=":memory:")
# For an on-disk database, pass a file path instead; read_only=True is optional:
# conn = duckdb.connect("mydb.db", read_only=True)

## Read data with DuckDB

- read csv
- read parquet

In [7]:
# Preview the CSV: the quoted file path is used directly in the FROM clause.
query = "select * from '{}' limit 10".format(csv_file_path)

# Run the query and fetch the result as a pandas DataFrame.
df = conn.execute(query).fetchdf()

In [9]:
# Show the concrete type of the query result, then preview its first 5 rows.
print(type(df))
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Call Number,Unit ID,Incident Number,Call Type,Call Date,Watch Date,Received DtTm,Entry DtTm,Dispatch DtTm,Response DtTm,...,Call Type Group,Number of Alarms,Unit Type,Unit sequence in call dispatch,Fire Prevention District,Supervisor District,Neighborhooods - Analysis Boundaries,RowID,case_location,Analysis Neighborhoods
0,210391607,E19,21017645,Alarms,2021-02-08,2021-02-08,2021-02-08 13:00:14,2021-02-08 13:01:36,2021-02-08 13:01:40,2021-02-08 13:03:21,...,Alarm,1,ENGINE,1,8.0,7.0,Lakeshore,210391607-E19,POINT (-122.48045074945836 37.7190118676788),16.0
1,210391164,T04,21017596,Alarms,2021-02-08,2021-02-08,2021-02-08 10:54:56,2021-02-08 10:56:50,2021-02-08 10:56:57,2021-02-08 10:57:07,...,Alarm,1,TRUCK,1,3.0,6.0,Mission Bay,210391164-T04,POINT (-122.39227179213904 37.77288298280324),4.0
2,210391034,E16,21017578,Citizen Assist / Service Call,2021-02-08,2021-02-08,2021-02-08 10:18:53,2021-02-08 10:19:52,2021-02-08 10:19:58,2021-02-08 10:20:42,...,Alarm,1,ENGINE,1,4.0,2.0,Marina,210391034-E16,POINT (-122.42581353320875 37.79927566930728),13.0
3,210390767,T19,21017552,Other,2021-02-08,2021-02-08,2021-02-08 08:50:27,2021-02-08 08:54:27,2021-02-08 08:55:28,2021-02-08 08:57:51,...,Fire,1,TRUCK,9,,,,210390767-T19,POINT (-122.46239390119047 37.7049649190675),
4,210382984,B05,21017398,Alarms,2021-02-07,2021-02-07,2021-02-07 21:18:38,2021-02-07 21:20:02,2021-02-07 21:21:15,2021-02-07 21:21:15,...,Alarm,1,CHIEF,2,5.0,5.0,Lone Mountain/USF,210382984-B05,POINT (-122.45328305705388 37.77213783914884),18.0


In [10]:
# Register the DataFrame with the connection as a temporary view so that
# SQL statements can refer to it by name.
table_name = "df_view"
conn.register(table_name, df)

# DESCRIBE lists each column of the view together with its type.
query2 = f"Describe {table_name}"
schema = conn.execute(query2).df()

In [12]:
# Show up to the first 15 rows of the DESCRIBE result.
schema.head(n=15)

Unnamed: 0,column_name,column_type,null,key,default,extra
0,Call Number,VARCHAR,YES,,,
1,Unit ID,VARCHAR,YES,,,
2,Incident Number,VARCHAR,YES,,,
3,Call Type,VARCHAR,YES,,,
4,Call Date,TIMESTAMP,YES,,,
5,Watch Date,TIMESTAMP,YES,,,
6,Received DtTm,TIMESTAMP,YES,,,
7,Entry DtTm,TIMESTAMP,YES,,,
8,Dispatch DtTm,TIMESTAMP,YES,,,
9,Response DtTm,TIMESTAMP,YES,,,


In [13]:
# Preview the Parquet file: same pattern as the CSV, the quoted file path is
# used directly in the FROM clause.
read_parquet_query = "select * from '{}' limit 10".format(parquet_file_path)

# Run the query and fetch the result as a pandas DataFrame.
parquet_df = conn.execute(read_parquet_query).fetchdf()

In [14]:
# Preview the first 5 rows read from the Parquet file.
parquet_df.head(n=5)

Unnamed: 0,CallNumber,UnitID,IncidentNumber,CallType,CallDate,WatchDate,ReceivedDtTm,EntryDtTm,DispatchDtTm,ResponseDtTm,...,ALSUnit,CallTypeGroup,NumberofAlarms,UnitType,Unitsequenceincalldispatch,FirePreventionDistrict,SupervisorDistrict,NeighborhoodDistrict,Location,RowID
0,210391607,E19,21017645,Alarms,02/08/2021,02/08/2021,02/08/2021 01:00:14 PM,02/08/2021 01:01:36 PM,02/08/2021 01:01:40 PM,02/08/2021 01:03:21 PM,...,True,Alarm,1,ENGINE,1,8.0,7.0,Lakeshore,210391607-E19,POINT (-122.48045074945836 37.7190118676788)
1,210391164,T04,21017596,Alarms,02/08/2021,02/08/2021,02/08/2021 10:54:56 AM,02/08/2021 10:56:50 AM,02/08/2021 10:56:57 AM,02/08/2021 10:57:07 AM,...,False,Alarm,1,TRUCK,1,3.0,6.0,Mission Bay,210391164-T04,POINT (-122.39227179213904 37.77288298280324)
2,210391034,E16,21017578,Citizen Assist / Service Call,02/08/2021,02/08/2021,02/08/2021 10:18:53 AM,02/08/2021 10:19:52 AM,02/08/2021 10:19:58 AM,02/08/2021 10:20:42 AM,...,True,Alarm,1,ENGINE,1,4.0,2.0,Marina,210391034-E16,POINT (-122.42581353320875 37.79927566930728)
3,210390767,T19,21017552,Other,02/08/2021,02/08/2021,02/08/2021 08:50:27 AM,02/08/2021 08:54:27 AM,02/08/2021 08:55:28 AM,02/08/2021 08:57:51 AM,...,True,Fire,1,TRUCK,9,,,,210390767-T19,POINT (-122.46239390119047 37.7049649190675)
4,210382984,B05,21017398,Alarms,02/07/2021,02/07/2021,02/07/2021 09:18:38 PM,02/07/2021 09:20:02 PM,02/07/2021 09:21:15 PM,02/07/2021 09:21:15 PM,...,False,Alarm,1,CHIEF,2,5.0,5.0,Lone Mountain/USF,210382984-B05,POINT (-122.45328305705388 37.77213783914884)


## Read CSV with pandas
