In [10]:
import pandas as pd
import duckdb as dk

# Open the DuckDB connection that the SQL cells in this notebook run against.
# NOTE(review): the path starts with three slashes, unlike the relative
# 'data/...' CSV paths used further down — confirm it resolves to the intended file.
con = dk.connect(database='///data/vermont.duckdb')

In [16]:
# The connection is intentionally left open here: the cells that follow still
# run queries through `con`, and closing it at this point makes them fail on a
# top-to-bottom run. `con.close()` is called once the last cell has finished.

In [2]:
# pandas display settings: no cap on columns or rows, and a 2000-character
# display width.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.width': 2000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Load the CSV export with every column kept as text. pandas' default NA
# markers are switched off, so only empty fields are read as missing.
raw_csv_options = {
    'dtype': str,
    'keep_default_na': False,
    'na_values': [''],
    'encoding': 'utf-8',
    'engine': 'python',
}
personal_raw_df = pd.read_csv("data/MyEBirdData.csv", **raw_csv_options)

In [24]:
# Rebuild the personal_raw table from the CSV using DuckDB's own CSV reader
# (any existing personal_raw table is dropped first).
create_personal_raw_query = """--sql
DROP TABLE IF EXISTS personal_raw;
CREATE TABLE personal_raw AS
    SELECT *
    FROM read_csv('data/MyEBirdData.csv', header=true, delim=',', quote='"', null_padding=true)
;
"""

con.execute(create_personal_raw_query)

<_duckdb.DuckDBPyConnection at 0x1e0bd8d0c30>

In [14]:
# Rebuild personal_staging from personal_raw: the export's column headers are
# renamed to snake_case and the values are cast to typed columns.
# - TRY_CAST is used where a value may not convert (e.g. a non-numeric "Count"),
#   so the row gets NULL instead of aborting the statement.
# - locality_id / sampling_id keep only the trailing digits of "Location ID" /
#   "Submission ID".
# - "Time" is parsed with try_strptime rather than strptime: plain strptime
#   raises on a value that does not match '%I:%M %p' before the enclosing
#   TRY_CAST can turn it into NULL, so one malformed time would fail the build.
create_personal_staging_query = """--sql
DROP TABLE IF EXISTS personal_staging;
CREATE TABLE personal_staging AS
SELECT
    "Taxonomic Order" AS taxonomic_order,
    "Common Name" AS common_name,
    "Scientific Name" AS scientific_name,
    TRY_CAST("Count" AS INT) AS observation_count,
    "State/Province" AS state,
    "County" AS county,
    "Location" AS locality,
    TRY_CAST(REGEXP_EXTRACT("Location ID", '(\\d+)$') AS BIGINT) AS locality_id,
    "Latitude"::FLOAT AS latitude,
    "Longitude"::FLOAT AS longitude,
    TRY_CAST("Date" AS DATE) AS observation_date,
    TRY_CAST(try_strptime("Time", '%I:%M %p') AS TIME) AS time_observations_started,
    TRY_CAST(REGEXP_EXTRACT("Submission ID", '(\\d+)$') AS BIGINT) AS sampling_id,
    "Protocol" AS observation_type,
    "Duration (Min)"::INT AS duration_minutes,
    "Distance Traveled (km)"::FLOAT AS effort_distance_km,
    "Number of Observers"::INT AS number_observers,
    "All Obs Reported"::BOOLEAN AS all_species_reported
FROM personal_raw
"""

con.execute(create_personal_staging_query)

<_duckdb.DuckDBPyConnection at 0x1f2593aa5f0>

In [15]:
# Fetch every row of the staging table as a pandas DataFrame for inspection.
staging_preview_query = "SELECT * FROM personal_staging"
con.execute(staging_preview_query).fetchdf()

Unnamed: 0,taxonomic_order,common_name,scientific_name,observation_count,state,county,locality,locality_id,latitude,longitude,observation_date,time_observations_started,sampling_id,observation_type,duration_minutes,effort_distance_km,number_observers,all_species_reported
0,267,Snow Goose,Anser caerulescens,1,US-VT,Addison,"2049 Little Chicago Road, Ferrisburgh, Vermont...",41289013,44.199104,-73.283531,2025-03-16,11:17:00,218884326,eBird - Traveling Count,47,0.329,1,True
1,267,Snow Goose,Anser caerulescens,2500,US-VT,Addison,Dead Creek WMA IBA--Gage Road,715236,44.073788,-73.328957,2025-11-16,16:47:00,284838194,eBird - Traveling Count,10,0.063,4,True
2,267,Snow Goose,Anser caerulescens,235,US-VT,Addison,Dead Creek WMA IBA--Goose Viewing Area,788246,44.085289,-73.336800,2025-10-24,09:19:00,281087275,eBird - Traveling Count,48,0.418,3,True
3,267,Snow Goose,Anser caerulescens,950,US-VT,Addison,Dead Creek WMA IBA--Goose Viewing Area,788246,44.085289,-73.336800,2025-11-13,15:28:00,284317780,eBird - Stationary Count,13,,3,True
4,267,Snow Goose,Anser caerulescens,1,US-VT,Chittenden,Charlotte Ferry Landing - McNeil Cove,165266,44.299904,-73.298645,2025-03-10,13:16:00,217728059,eBird - Traveling Count,50,0.060,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9118,34607,Indigo Bunting,Passerina cyanea,2,US-CT,New Haven,Hammonasset Beach SP,298379,41.261986,-72.554825,2025-08-16,05:59:00,267607652,eBird - Traveling Count,271,4.721,1,True
9119,34607,Indigo Bunting,Passerina cyanea,4,US-NY,Ulster,Mohonk Preserve--Duck Pond Rd. and Pond,4609134,41.756973,-74.142838,2025-05-24,06:11:00,241845105,eBird - Traveling Count,84,3.203,2,True
9120,34607,Indigo Bunting,Passerina cyanea,11,US-NY,Ulster,Mohonk Preserve--Duck Pond Rd. and Pond,4609134,41.756973,-74.142838,2025-05-25,06:30:00,242379361,eBird - Traveling Count,107,3.685,1,True
9121,34607,Indigo Bunting,Passerina cyanea,3,US-VT,Windsor,Lake Runnemede / Evarts Pond - (62 acres) - Pa...,752029,43.484402,-72.393349,2025-09-21,13:16:00,274766299,eBird - Traveling Count,175,1.323,1,True


In [4]:
# drop hybrids ("x" / "hybrid"), backslash- or slash-separated pairs and "sp." records
# na=True also drops rows whose name is missing; without it a missing name
# makes the `~` negation below raise.
non_species_mask = personal_raw_df['Common Name'].str.contains(
    r'\sx\s|\bhybrid|\s\\\s|/|sp\.', case=False, na=True
)
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (SettingWithCopyWarning).
personal_raw_df = personal_raw_df[~non_species_mask].copy()

# remove subspecies classifications: cut everything from the space before "(" onwards
personal_raw_df['Common Name'] = personal_raw_df['Common Name'].str.replace(r'\s(?=\().*', '', regex=True)

# same resulting order as before, but without sorting in place
personal_raw_df = personal_raw_df.sort_values(by=['Taxonomic Order', 'Date'])

# keep first observation of each species
# Several taxonomic orders can share one common name once the subspecies suffix
# is stripped, so the earliest row has to be picked by Date alone; the stable
# sort keeps the taxonomic ordering above as the tie-breaker.
# NOTE(review): 'Date' is text here, so this relies on it sorting
# chronologically as a string (e.g. yyyy-mm-dd) — confirm against the export.
first_observations = (
    personal_raw_df
    .sort_values(by=['Date'], kind='stable')
    .drop_duplicates(subset=['Common Name'])
)
personal_staging_df = first_observations.sort_values(by=['Taxonomic Order', 'Date'])
personal_clean_df = personal_staging_df[['Common Name','Scientific Name','Taxonomic Order', 'Date', 'Location']].sort_values(by=['Date'])

In [5]:
# Rebuild the life_list table from the personal_clean_df DataFrame, which the
# query refers to by its Python variable name; column headers become
# snake_case and "Date" is cast to DATE (NULL when it does not convert).
personal_clean_df_to_duckdb_query = """
DROP TABLE IF EXISTS life_list;
CREATE TABLE life_list AS
SELECT
    "Common Name" as common_name,
    "Scientific Name" as scientific_name,
    "Taxonomic Order" as taxonomic_order,
    TRY_CAST("Date" AS DATE) as date,
    "Location" as location
FROM personal_clean_df;
"""

# Unique index: life_list may hold each scientific name only once.
create_unique_index_query = """
CREATE UNIQUE INDEX idx_life_list_scientific_name
ON life_list(scientific_name);
"""

# Run the statements in order: the table must exist before it is indexed.
for life_list_statement in (personal_clean_df_to_duckdb_query, create_unique_index_query):
    con.execute(life_list_statement)

# Done with the database for this notebook.
con.close()