In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd

In [80]:
# Column -> dtype for every field of the eBird export that this notebook keeps.
# The keys double as the `usecols` selection, so the list of columns and the
# dtype mapping cannot drift apart.
EBD_DTYPES = {
    'GLOBAL UNIQUE IDENTIFIER': 'string',
    'LAST EDITED DATE': 'string',          # parsed to datetime below
    'TAXONOMIC ORDER': 'UInt32',
    'CATEGORY': 'category',
    'COMMON NAME': 'category',
    'SCIENTIFIC NAME': 'category',
    'OBSERVATION COUNT': 'UInt32',         # nullable: 'X' is read as missing
    'STATE': 'category',
    'COUNTY': 'category',
    'COUNTY CODE': 'category',
    'LOCALITY': 'string',
    'LOCALITY ID': 'string',
    'LOCALITY TYPE': 'category',
    'LATITUDE': 'float64',
    'LONGITUDE': 'float64',
    'OBSERVATION DATE': 'period[D]',
    'TIME OBSERVATIONS STARTED': 'string',
    'OBSERVER ID': 'string',
    'SAMPLING EVENT IDENTIFIER': 'string',
    'OBSERVATION TYPE': 'category',
    'DURATION MINUTES': 'UInt16',
    'EFFORT DISTANCE KM': 'Float32',
    'NUMBER OBSERVERS': 'UInt8',
    'ALL SPECIES REPORTED': 'boolean',
    'GROUP IDENTIFIER': 'string',
}

# Lazily read the tab-separated export in 25 MB partitions, keeping only the
# columns listed above.
df = dd.read_csv(
    'data/ebd_US-VT_smp_relJul-2025.txt',
    sep='\t',
    usecols=list(EBD_DTYPES),
    blocksize='25MB',
    # An 'X' in the count column is treated as a missing value so the column
    # can be stored as a nullable unsigned integer.
    na_values={'OBSERVATION COUNT': 'X'},
    dtype=EBD_DTYPES,
)

# Keep only the trailing run of digits of the identifier and store it as a
# nullable 64-bit integer (rows without trailing digits become <NA>).
df['GLOBAL UNIQUE IDENTIFIER'] = (
    df['GLOBAL UNIQUE IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')
)

# Parse the edit timestamps. Without an explicit format, pandas 2.x infers one
# layout from the first value and, with errors='coerce', silently turns every
# value in a different layout into NaT. 'ISO8601' accepts any ISO 8601 layout,
# with or without fractional seconds.
# NOTE(review): assumes the column mixes timestamps with and without fractional
# seconds (which would explain otherwise unexpected NaT values) -- confirm
# against the raw file.
df['LAST EDITED DATE'] = dd.to_datetime(
    df['LAST EDITED DATE'],
    format='ISO8601',
    errors='coerce',  # genuinely malformed values still become NaT
)

# Convert these columns to categoricals whose categories are known up front;
# this requires a pass over the data.
df = df.categorize(columns=['COMMON NAME', 'SCIENTIFIC NAME', 'COUNTY CODE'])

In [81]:
# Persist the cleaned observations as a Parquet dataset (index not written).
df.to_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
    write_index=False,
)

In [82]:
# Rebind `df` to the Parquet copy of the observations so the cells below read
# from the Parquet dataset rather than the original text export.
df = dd.read_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
)

In [96]:
# Widen the text repr and lift the column limit so wide frames are shown in
# full. These are global pandas options and stay in effect for later cells.
display_options = {
    'display.width': 1000,
    'display.max_columns': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Print the last ten rows as plain text.
last_rows = df.tail(10)
print(last_rows)

       GLOBAL UNIQUE IDENTIFIER           LAST EDITED DATE  TAXONOMIC ORDER CATEGORY            COMMON NAME     SCIENTIFIC NAME  OBSERVATION COUNT    STATE   COUNTY COUNTY CODE                                                           LOCALITY LOCALITY ID LOCALITY TYPE   LATITUDE  LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED OBSERVER ID SAMPLING EVENT IDENTIFIER OBSERVATION TYPE  DURATION MINUTES  EFFORT DISTANCE KM  NUMBER OBSERVERS  ALL SPECIES REPORTED GROUP IDENTIFIER
64713                3338581419 2025-07-04 18:59:43.783644            33874  species  Yellow-rumped Warbler  Setophaga coronata                  1  Vermont  Windsor   US-VT-027  Marsh-Billings-Rockefeller National Historical Park--Mansion Area     L769958             H  43.630753 -72.517950       2025-07-04                  11:22:00  obsr557970                S256849892        Traveling                83               0.904                 1                  True             <NA>
64714                335993616

In [107]:
# Keep only the observations whose locality type is 'H'.
is_hotspot = df['LOCALITY TYPE'] == 'H'
hotspot_df = df[is_hotspot]

# Preview the first rows of the filtered frame.
hotspot_df.head()

Unnamed: 0,GLOBAL UNIQUE IDENTIFIER,LAST EDITED DATE,TAXONOMIC ORDER,CATEGORY,COMMON NAME,SCIENTIFIC NAME,OBSERVATION COUNT,STATE,COUNTY,COUNTY CODE,LOCALITY,LOCALITY ID,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVATION DATE,TIME OBSERVATIONS STARTED,OBSERVER ID,SAMPLING EVENT IDENTIFIER,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM,NUMBER OBSERVERS,ALL SPECIES REPORTED,GROUP IDENTIFIER
4,1596380719,2024-03-28 11:25:32.329503,5411,species,Common Gallinule,Gallinula galeata,1.0,Vermont,Addison,US-VT-001,Little Otter Creek WMA IBA--Greenbush Rd. Access,L2296796,H,44.230855,-73.255291,1879-04-28,,obsr939641,S125262378,Historical,,,,False,
6,174974689,NaT,5411,species,Common Gallinule,Gallinula galeata,,Vermont,Rutland,US-VT-021,Lake Bomoseen/Hubbardton Marshes IBA,L165378,H,43.687836,-73.196011,1881-05-28,,obsr350731,S12471507,Incidental,,,1.0,False,
8,174974692,NaT,5411,species,Common Gallinule,Gallinula galeata,,Vermont,Rutland,US-VT-021,Lake Bomoseen/Hubbardton Marshes IBA,L165378,H,43.687836,-73.196011,1882-05-28,,obsr350731,S12471510,Incidental,,,1.0,False,
26,701556368,NaT,11997,species,Peregrine Falcon,Falco peregrinus,1.0,Vermont,Orleans,US-VT-019,Lake Willoughby - Westmore (1653 acres),L752032,H,44.744694,-72.058413,1896-06-17,,obsr27544,S51939208,Incidental,,,,False,
27,701547055,NaT,11997,species,Peregrine Falcon,Falco peregrinus,1.0,Vermont,Orleans,US-VT-019,Mt. Pisgah,L165334,H,44.7297,-72.0315,1896-06-17,,obsr27544,S51937927,Incidental,,,,False,
