In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd

In [165]:
sightings_df = dd.read_csv('data/ebd_US-VT_smp_relJul-2025.txt', 
                 sep='\t', 
                 usecols=['GLOBAL UNIQUE IDENTIFIER', 'LAST EDITED DATE', 'TAXONOMIC ORDER', 'CATEGORY', 'COMMON NAME', 'SCIENTIFIC NAME', 'OBSERVATION COUNT', 'STATE', 'COUNTY', 'COUNTY CODE', 'LOCALITY', 'LOCALITY ID', 'LOCALITY TYPE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'TIME OBSERVATIONS STARTED', 'OBSERVER ID', 'SAMPLING EVENT IDENTIFIER', 'OBSERVATION TYPE', 'DURATION MINUTES', 'EFFORT DISTANCE KM', 'NUMBER OBSERVERS', 'ALL SPECIES REPORTED', 'GROUP IDENTIFIER'],
                 blocksize=25e6,
                 na_values={'OBSERVATION COUNT': 'X'},
                 dtype={
                        'GLOBAL UNIQUE IDENTIFIER': 'string',
                        'LAST EDITED DATE': 'string',
                        'TAXONOMIC ORDER': 'UInt32',
                        'CATEGORY': 'category',
                        'COMMON NAME': 'category',
                        'SCIENTIFIC NAME': 'category',
                        'OBSERVATION COUNT': 'UInt32',
                        'STATE': 'category',
                        'COUNTY': 'category',
                        'COUNTY CODE': 'category',
                        'LOCALITY': 'string',
                        'LOCALITY ID': 'string',
                        'LOCALITY TYPE': 'category',
                        'LATITUDE': 'float64',
                        'LONGITUDE': 'float64',
                        'OBSERVATION DATE': 'period[D]',
                        'TIME OBSERVATIONS STARTED': 'string',
                        'OBSERVER ID': 'string',
                        'SAMPLING EVENT IDENTIFIER': 'string',
                        'OBSERVATION TYPE': 'category',
                        'DURATION MINUTES': 'UInt16',
                        'EFFORT DISTANCE KM': 'Float32',
                        'NUMBER OBSERVERS': 'UInt8',
                        'ALL SPECIES REPORTED': 'boolean',
                        'GROUP IDENTIFIER': 'string',

                        }
                )

sightings_df['GLOBAL UNIQUE IDENTIFIER'] = sightings_df['GLOBAL UNIQUE IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')
sightings_df['SAMPLING EVENT IDENTIFIER'] = sightings_df['SAMPLING EVENT IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')
sightings_df['LOCALITY ID'] = sightings_df['LOCALITY ID'].str.extract(r'(\d+)$')[0].astype('Int64')
sightings_df['GROUP IDENTIFIER'] = sightings_df['GROUP IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')
sightings_df['LAST EDITED DATE'] = dd.to_datetime(sightings_df['LAST EDITED DATE'], errors='coerce')
sightings_df = sightings_df.categorize(columns=['COMMON NAME', 'SCIENTIFIC NAME', 'COUNTY CODE'])

In [170]:
print(sightings_df.head())
print(sightings_df['GROUP IDENTIFIER'].count())

   GLOBAL UNIQUE IDENTIFIER           LAST EDITED DATE  TAXONOMIC ORDER CATEGORY             COMMON NAME       SCIENTIFIC NAME  OBSERVATION COUNT    STATE   COUNTY COUNTY CODE                                          LOCALITY  LOCALITY ID LOCALITY TYPE   LATITUDE  LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED  OBSERVER ID  SAMPLING EVENT IDENTIFIER OBSERVATION TYPE  DURATION MINUTES  EFFORT DISTANCE KM  NUMBER OBSERVERS  ALL SPECIES REPORTED  GROUP IDENTIFIER
0                1657117801 2023-03-16 15:49:30.066547             7969  species            Golden Eagle     Aquila chrysaetos                  1  Vermont  Rutland   US-VT-021                     Haystack Mountain, Pawlet, VT     23075098             P  43.386310 -73.181181       1863-05-12                      <NA>  obsr3650024                  131046398       Historical              <NA>                <NA>              <NA>                 False              <NA>
1                 551244913 2024-03-28 11:44:55.166136    

In [148]:
dd.to_parquet(sightings_df, 'data/VT_observations.parquet', engine="pyarrow", write_index=False)

In [149]:
sightings_df = dd.read_parquet('data/VT_observations.parquet', engine="pyarrow")

In [150]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
print(len(sightings_df))
print(sightings_df.tail(10))

10580275
       GLOBAL UNIQUE IDENTIFIER           LAST EDITED DATE  TAXONOMIC ORDER CATEGORY            COMMON NAME     SCIENTIFIC NAME  OBSERVATION COUNT    STATE   COUNTY COUNTY CODE                                           LOCALITY  LOCALITY ID LOCALITY TYPE   LATITUDE  LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED OBSERVER ID  SAMPLING EVENT IDENTIFIER OBSERVATION TYPE  DURATION MINUTES  EFFORT DISTANCE KM  NUMBER OBSERVERS  ALL SPECIES REPORTED GROUP IDENTIFIER
64713                3338581419 2025-07-04 18:59:43.783644            33874  species  Yellow-rumped Warbler  Setophaga coronata                  1  Vermont  Windsor   US-VT-027  Marsh-Billings-Rockefeller National Historical...       769958             H  43.630753 -72.517950       2025-07-04                  11:22:00  obsr557970                  256849892        Traveling                83               0.904                 1                  True             <NA>
64714                3359936160 2025-07-11 10:03:

In [157]:
complete_hotspot_sightings_df = sightings_df[
      (sightings_df['LOCALITY TYPE'] == 'H')
    & (sightings_df['CATEGORY'] == 'species')
    & (sightings_df['ALL SPECIES REPORTED'])
]
# REMOVE DUPLICATE ROWS FROM GROUP CHECKLISTS 
unique_complete_hotspot_sightings_df = complete_hotspot_sightings_df.drop_duplicates(subset=['GROUP IDENTIFIER'])

In [159]:
print(len(sightings_df))
print(len(complete_hotspot_sightings_df))
print(len(unique_complete_hotspot_sightings_df))

10580275
4341571
40808


In [123]:
HOTSPOTS = complete_hotspot_sightings_df['LOCALITY ID'].unique()
SPECIES = complete_hotspot_sightings_df['COMMON NAME'].unique()
#DAYS = pd.date_range(start=)

In [None]:
checklists_df = dd.read_csv('data/ebd_US-VT_smp_relJul-2025_sampling.txt',
                        sep='\t', 
                        blocksize=25e6,
                        usecols=['LAST EDITED DATE', 'OBSERVATION DATE', 'LOCALITY ID', 'LOCALITY TYPE', 'SAMPLING EVENT IDENTIFIER', 'OBSERVATION TYPE', 'DURATION MINUTES', 'EFFORT DISTANCE KM', 'NUMBER OBSERVERS', 'ALL SPECIES REPORTED', 'GROUP IDENTIFIER'],
                        dtype={
                            'LAST EDITED DATE': 'string',
                            'LOCALITY ID': 'string',
                            'LOCALITY TYPE': 'category',
                            'SAMPLING EVENT IDENTIFIER': 'string',
                            'OBSERVATION TYPE': 'category',
                            'DURATION MINUTES': 'UInt16',
                            'EFFORT DISTANCE KM': 'Float32',
                            'NUMBER OBSERVERS': 'UInt8',
                            'ALL SPECIES REPORTED': 'boolean',
                            'GROUP IDENTIFIER': 'string'
                            }
                        )

checklists_df['LAST EDITED DATE'] = dd.to_datetime(checklists_df['LAST EDITED DATE'], errors='coerce')
checklists_df['LOCALITY ID'] = checklists_df['LOCALITY ID'].str.extract(r'(\d+)$')[0].astype('Int64')
checklists_df['SAMPLING EVENT IDENTIFIER'] = checklists_df['SAMPLING EVENT IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')

In [131]:
checklists_df.head()

Unnamed: 0,LAST EDITED DATE,LOCALITY ID,LOCALITY TYPE,OBSERVATION DATE,SAMPLING EVENT IDENTIFIER,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM,NUMBER OBSERVERS,ALL SPECIES REPORTED,GROUP IDENTIFIER
0,2025-03-28 14:58:23.718586,11398723,P,2025-03-28,221347013,Traveling,30.0,0.483,2,True,
1,2025-07-02 16:01:44.022663,11360827,P,2025-07-02,256290564,Incidental,,,1,False,
2,2025-04-09 15:49:20.512607,11360827,P,2025-04-09,224162872,Incidental,,,1,False,
3,2025-05-26 10:18:02.301179,1140162,H,2025-05-26,242975716,Traveling,10.0,0.259,1,True,
4,2025-06-13 07:31:20.354768,1140162,H,2025-06-13,249781342,Stationary,7.0,,1,True,


In [132]:
dd.to_parquet(checklists_df, 'data/VT_checklists.parquet', engine="pyarrow", write_index=False)

In [133]:
checklists_df = dd.read_parquet('data/VT_checklists.parquet', engine="pyarrow")

In [None]:
complete_hotspot_checklists_df = checklists_df[
      (checklists_df['ALL SPECIES REPORTED'])
    & (checklists_df['LOCALITY TYPE'] == 'H')
]

In [135]:
complete_hotspot_checklists_df.head()

Unnamed: 0,LAST EDITED DATE,LOCALITY ID,LOCALITY TYPE,OBSERVATION DATE,SAMPLING EVENT IDENTIFIER,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM,NUMBER OBSERVERS,ALL SPECIES REPORTED,GROUP IDENTIFIER
3,2025-05-26 10:18:02.301179,1140162,H,2025-05-26,242975716,Traveling,10,0.259,1,True,
4,2025-06-13 07:31:20.354768,1140162,H,2025-06-13,249781342,Stationary,7,,1,True,
5,2025-05-22 15:46:54.167567,1140162,H,2025-05-22,241308832,Stationary,10,,1,True,
6,2025-06-07 13:00:07.255729,1140162,H,2025-05-19,240407756,Traveling,30,0.139,1,True,
7,2025-08-08 03:20:40.611660,1140162,H,2025-03-21,219770714,Traveling,6,0.127,1,True,


In [145]:
complete_hotspot_sightings_df[complete_hotspot_sightings_df['SAMPLING EVENT IDENTIFIER' == 'S249781342']]

KeyError: False