In [1]:
import pandas as pd
import duckdb as dk
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Use Plotly's "notebook" renderer as the default when figures are displayed.
pio.renderers.default = "notebook"

# Open the DuckDB database used by this notebook.
# NOTE(review): the path begins with '///', which appears to resolve to the
# absolute location /data/vermont.duckdb on POSIX systems, while the CSV in this
# notebook is read from a relative 'data/' directory — confirm this is intended.
con = dk.connect('///data/vermont.duckdb')

In [2]:
# Display configuration: never truncate columns or rows, and allow very wide
# text output so wide frames are not wrapped.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 2000

In [26]:
# Load the personal eBird export with every column kept as text (dtype=str).
# Default NA tokens are disabled; only the empty string is read as missing.
personal_raw_df = pd.read_csv(
    "data/MyEBirdData.csv",
    engine='python',
    encoding='utf-8',
    dtype=str,
    na_values=[''],
    keep_default_na=False,
)

In [27]:
# Preview the first five rows of the loaded export.
personal_raw_df.head(5)

Unnamed: 0,Submission ID,Common Name,Scientific Name,Taxonomic Order,Count,State/Province,County,Location ID,Location,Latitude,Longitude,Date,Time,Protocol,Duration (Min),All Obs Reported,Distance Traveled (km),Area Covered (ha),Number of Observers,Breeding Code,Observation Details,Checklist Comments,ML Catalog Numbers
0,S218884326,Snow Goose,Anser caerulescens,267,1,US-VT,Addison,L41289013,"2049 Little Chicago Road, Ferrisburgh, Vermont...",44.1991056,-73.2835316,2025-03-16,11:17 AM,eBird - Traveling Count,47,1,0.329,,1,,Blue morph,,
1,S284838194,Snow Goose,Anser caerulescens,267,2500,US-VT,Addison,L715236,Dead Creek WMA IBA--Gage Road,44.0737892,-73.3289552,2025-11-16,04:47 PM,eBird - Traveling Count,10,1,0.063,,4,,"*high. All seen in flight, counted by 100s.",,
2,S281087275,Snow Goose,Anser caerulescens,267,235,US-VT,Addison,L788246,Dead Creek WMA IBA--Goose Viewing Area,44.0852885,-73.336798,2025-10-24,09:19 AM,eBird - Traveling Count,48,1,0.418,,3,,Exact,,
3,S284317780,Snow Goose,Anser caerulescens,267,950,US-VT,Addison,L788246,Dead Creek WMA IBA--Goose Viewing Area,44.0852885,-73.336798,2025-11-13,03:28 PM,eBird - Stationary Count,13,1,,,3,,,,
4,S217728059,Snow Goose,Anser caerulescens,267,1,US-VT,Chittenden,L165266,Charlotte Ferry Landing - McNeil Cove,44.2999029,-73.2986445,2025-03-10,01:16 PM,eBird - Traveling Count,50,1,0.06,,1,,,,


In [32]:
# Analysis scope: the 'State/Province' code and the calendar year that
# observations are filtered on.
state, year = "US-VT", 2025

In [37]:
# Build the cleaned observation table for the configured `state` and `year`:
# drop non-species taxa, strip subspecies qualifiers, filter by state/year,
# keep the analysis columns, and sort taxonomically then by date.

# drop hybrids and slashes (the pattern also removes "sp." entries).
# na=False: a missing Common Name counts as "no match"; without it the mask
# holds NaN and the `~` inversion below raises a TypeError.
non_species_mask = personal_raw_df['Common Name'].str.contains(
    r'\sx\s|\bhybrid|\s\\\s|/|sp\.', case=False, na=False
)

# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the loaded data (avoids SettingWithCopyWarning).
# NOTE: the name personal_raw_df is deliberately rebound to the filtered frame,
# exactly as this cell did before, in case other cells read it from this name.
personal_raw_df = personal_raw_df[~non_species_mask].copy()

# remove subspecies classifications: cut everything from the whitespace that
# precedes "(" to the end of the name.
personal_raw_df['Common Name'] = personal_raw_df['Common Name'].str.replace(r'\s(?=\().*', '', regex=True)

# Keep only rows for the configured state and observation year.
in_state = personal_raw_df['State/Province'] == state
in_year = pd.to_datetime(personal_raw_df['Date']).dt.year == year
personal_staging_df = personal_raw_df[in_state & in_year]

# The CSV is loaded with dtype=str, so sorting 'Taxonomic Order' as-is would be
# lexicographic ("1000" before "267"). The key sorts that column by its numeric
# value while leaving the stored strings untouched; 'Date' is passed through
# unchanged. Values that cannot be parsed as numbers sort last.
personal_clean_df = personal_staging_df[['Common Name','Scientific Name','Taxonomic Order', 'Count', 'Date', 'Time', 'Location', 'County', 'Latitude', 'Longitude', 'Protocol', 'Duration (Min)', 'Distance Traveled (km)', 'Submission ID']].sort_values(
    by=['Taxonomic Order', 'Date'],
    key=lambda column: pd.to_numeric(column, errors='coerce') if column.name == 'Taxonomic Order' else column,
)

In [54]:
# Derive one row per checklist (Submission ID) from the cleaned observations.

# Drop the per-species columns so only checklist-level fields remain.
checklists_staging_df = personal_clean_df.drop(columns=['Common Name', 'Scientific Name', 'Taxonomic Order', 'Count'])

# Keep the first row for each Submission ID and order by date. The sort is
# chained rather than done with inplace=True: sorting the drop_duplicates
# result in place is what emitted the "A value is trying to be set on a copy
# of a slice" warning.
# NOTE(review): rows are ordered by 'Date' only, so checklists from the same
# day are not ordered by 'Time'.
checklists_df = (
    checklists_staging_df
    .drop_duplicates(subset=['Submission ID'])
    .sort_values(by=['Date'])
)
checklists_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Date,Time,Location,County,Latitude,Longitude,Protocol,Duration (Min),Distance Traveled (km),Submission ID
5522,2025-01-11,01:21 PM,Sterling Pond,Lamoille,44.5559679,-72.7743816,eBird - Casual Observation,,,S208873793
378,2025-01-14,04:21 PM,Burlington Waterfront--Wastewater Treatment Plant,Chittenden,44.4718122,-73.2204008,eBird - Stationary Count,1.0,,S209307634
4455,2025-01-15,12:13 PM,The Other Laundromat,Chittenden,44.4185734,-73.1775358,eBird - Stationary Count,1.0,,S209408585
947,2025-01-16,10:21 AM,The Laundromat,Chittenden,44.4747812,-73.216012,eBird - Casual Observation,,,S209492324
6308,2025-02-02,03:33 PM,Shelburne Farms,Chittenden,44.3958699,-73.2690485,eBird - Traveling Count,22.0,0.0,S211745916
5415,2025-02-04,08:18 AM,The Laundromat,Chittenden,44.4747812,-73.216012,eBird - Casual Observation,,,S211934872
4431,2025-02-15,11:32 AM,Shelburne Point,Chittenden,44.4288766,-73.2516861,eBird - Traveling Count,36.0,0.086,S213513471
4905,2025-02-15,03:01 PM,The Other Laundromat,Chittenden,44.4185734,-73.1775358,eBird - Stationary Count,48.0,,S213592834
5788,2025-02-22,02:47 PM,The Laundromat,Chittenden,44.4747812,-73.216012,eBird - Casual Observation,,,S214885579
3378,2025-02-22,02:06 PM,Shelburne Point,Chittenden,44.4288766,-73.2516861,eBird - Stationary Count,25.0,,S214885507
