In [None]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import dask.dataframe as dd

In [165]:
# Column -> dtype mapping for the observations export. The same mapping also
# drives `usecols`, so a column is read if and only if it has a dtype here.
SIGHTINGS_DTYPES = {
    'GLOBAL UNIQUE IDENTIFIER': 'string',
    'LAST EDITED DATE': 'string',
    'TAXONOMIC ORDER': 'UInt32',
    'CATEGORY': 'category',
    'COMMON NAME': 'category',
    'SCIENTIFIC NAME': 'category',
    'OBSERVATION COUNT': 'UInt32',
    'STATE': 'category',
    'COUNTY': 'category',
    'COUNTY CODE': 'category',
    'LOCALITY': 'string',
    'LOCALITY ID': 'string',
    'LOCALITY TYPE': 'category',
    'LATITUDE': 'float64',
    'LONGITUDE': 'float64',
    'OBSERVATION DATE': 'period[D]',
    'TIME OBSERVATIONS STARTED': 'string',
    'OBSERVER ID': 'string',
    'SAMPLING EVENT IDENTIFIER': 'string',
    'OBSERVATION TYPE': 'category',
    'DURATION MINUTES': 'UInt16',
    'EFFORT DISTANCE KM': 'Float32',
    'NUMBER OBSERVERS': 'UInt8',
    'ALL SPECIES REPORTED': 'boolean',
    'GROUP IDENTIFIER': 'string',
}

# Read the tab-separated observations file with dask.
# An 'X' in OBSERVATION COUNT is parsed as missing so the column fits the
# nullable UInt32 dtype.
sightings_df = dd.read_csv(
    'data/ebd_US-VT_smp_relJul-2025.txt',
    sep='\t',
    usecols=list(SIGHTINGS_DTYPES),
    blocksize=25e6,
    na_values={'OBSERVATION COUNT': 'X'},
    dtype=SIGHTINGS_DTYPES,
)

# Identifier columns arrive as strings; keep only the trailing run of digits
# and store it as a nullable integer (rows without trailing digits become <NA>).
for id_column in (
    'GLOBAL UNIQUE IDENTIFIER',
    'SAMPLING EVENT IDENTIFIER',
    'LOCALITY ID',
    'GROUP IDENTIFIER',
):
    trailing_digits = sightings_df[id_column].str.extract(r'(\d+)$')[0]
    sightings_df[id_column] = trailing_digits.astype('Int64')

# Unparseable timestamps are coerced to NaT instead of raising.
sightings_df['LAST EDITED DATE'] = dd.to_datetime(
    sightings_df['LAST EDITED DATE'],
    errors='coerce',
)

# Run dask's categorize on the name/code columns.
sightings_df = sightings_df.categorize(
    columns=['COMMON NAME', 'SCIENTIFIC NAME', 'COUNTY CODE'],
)

In [148]:
# Write the cleaned observations to parquet (index not written) so later
# cells can reload them without re-reading the raw text export.
sightings_df.to_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
    write_index=False,
)

In [3]:
# Reload the observations from the parquet dataset written above.
sightings_df = dd.read_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
)

In [12]:
# Show every column and use a wide console so printed frames are not truncated.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Keep species-level rows from complete checklists at locality type 'H'
# (hotspots, going by the variable naming).
sighting_is_hotspot = sightings_df['LOCALITY TYPE'] == 'H'
sighting_is_species = sightings_df['CATEGORY'] == 'species'
sighting_is_complete = sightings_df['ALL SPECIES REPORTED']
complete_hotspot_sightings_df = sightings_df[
    sighting_is_hotspot & sighting_is_species & sighting_is_complete
]

# Optional sanity checks (uncomment to run):
# print("Num sightings: ", len(complete_hotspot_sightings_df))
# print("Num group sightings: ", complete_hotspot_sightings_df['GROUP IDENTIFIER'].count().compute())
# print("Num solo sightings: ", complete_hotspot_sightings_df['GROUP IDENTIFIER'].isna().sum().compute())

# REMOVE DUPLICATE ROWS FROM GROUP CHECKLISTS
#
# A single drop_duplicates(subset=['GROUP IDENTIFIER'], split_every=False) over
# the whole frame did not give the intended result (TODO: investigate;
# presumably because it keeps one row per group regardless of species and
# treats all missing identifiers as duplicates of each other).
# Workaround: split on whether a group identifier is present, de-duplicate
# only the grouped rows per (group, species), then stitch the halves together.
sighting_has_group = complete_hotspot_sightings_df['GROUP IDENTIFIER'].notnull()

solo_sightings_df = complete_hotspot_sightings_df[~sighting_has_group]
group_sightings_df = (
    complete_hotspot_sightings_df[sighting_has_group]
    .drop_duplicates(subset=['GROUP IDENTIFIER', 'COMMON NAME'])
)

unique_complete_hotspot_sightings_df = dd.concat(
    [solo_sightings_df, group_sightings_df]
)
# print("Num unique sightings: ", len(unique_complete_hotspot_sightings_df))
# print("Num solo sightings: ", unique_complete_hotspot_sightings_df['GROUP IDENTIFIER'].isna().sum().compute())

In [13]:
# Distinct locality ids and species names among the de-duplicated sightings.
# Nothing is computed here; call len(...) or .compute() on them to materialise.
HOTSPOTS, SPECIES = (
    unique_complete_hotspot_sightings_df[column_name].unique()
    for column_name in ('LOCALITY ID', 'COMMON NAME')
)
# TODO: define DAYS, e.g. with pd.date_range(start=...)
# print(len(HOTSPOTS))
# print(len(SPECIES))

In [9]:
# Read the tab-separated sampling (checklist) file with dask, keeping only
# the effort/location columns needed later.
# NOTE(review): 'OBSERVATION DATE' is read but has no explicit dtype here,
# whereas the observations load parses it as 'period[D]' -- confirm whether
# the two should match.
checklists_df = dd.read_csv('data/ebd_US-VT_smp_relJul-2025_sampling.txt',
                        sep='\t', 
                        blocksize=25e6,
                        usecols=['LAST EDITED DATE', 'OBSERVATION DATE', 'LOCALITY ID', 'LOCALITY TYPE', 'SAMPLING EVENT IDENTIFIER', 'OBSERVATION TYPE', 'DURATION MINUTES', 'EFFORT DISTANCE KM', 'NUMBER OBSERVERS', 'ALL SPECIES REPORTED', 'GROUP IDENTIFIER'],
                        dtype={
                            'LAST EDITED DATE': 'string',
                            'LOCALITY ID': 'string',
                            'LOCALITY TYPE': 'category',
                            'SAMPLING EVENT IDENTIFIER': 'string',
                            'OBSERVATION TYPE': 'category',
                            'DURATION MINUTES': 'UInt16',
                            'EFFORT DISTANCE KM': 'Float32',
                            'NUMBER OBSERVERS': 'UInt8',
                            'ALL SPECIES REPORTED': 'boolean',
                            'GROUP IDENTIFIER': 'string'
                            }
                        )

# Unparseable timestamps are coerced to NaT instead of raising.
checklists_df['LAST EDITED DATE'] = dd.to_datetime(checklists_df['LAST EDITED DATE'], errors='coerce')

# Identifier columns arrive as strings; keep only the trailing run of digits
# and store it as a nullable integer (rows without trailing digits become <NA>).
checklists_df['LOCALITY ID'] = checklists_df['LOCALITY ID'].str.extract(r'(\d+)$')[0].astype('Int64')
checklists_df['SAMPLING EVENT IDENTIFIER'] = checklists_df['SAMPLING EVENT IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')
# Fix: this line previously assigned to sightings_df (copy-paste slip), which
# left the checklist group ids unconverted and touched the observations frame.
checklists_df['GROUP IDENTIFIER'] = checklists_df['GROUP IDENTIFIER'].str.extract(r'(\d+)$')[0].astype('Int64')


In [15]:
# Write the cleaned checklists to parquet (index not written) so later cells
# can reload them without re-reading the raw text export.
checklists_df.to_parquet(
    'data/VT_checklists.parquet',
    engine="pyarrow",
    write_index=False,
)

In [7]:
# Reload the checklists from the parquet dataset written above.
checklists_df = dd.read_parquet(
    'data/VT_checklists.parquet',
    engine="pyarrow",
)

In [8]:
# Keep complete checklists submitted at locality type 'H' (hotspots, going by
# the variable naming).
checklist_is_complete = checklists_df['ALL SPECIES REPORTED']
checklist_is_hotspot = checklists_df['LOCALITY TYPE'] == 'H'
complete_hotspot_checklists_df = checklists_df[
    checklist_is_complete & checklist_is_hotspot
]

# Rows sharing a group identifier are collapsed to a single row per group;
# rows without a group identifier are kept unchanged.
checklist_has_group = complete_hotspot_checklists_df['GROUP IDENTIFIER'].notnull()

solo_checklists_df = complete_hotspot_checklists_df[~checklist_has_group]
group_checklists_df = complete_hotspot_checklists_df[checklist_has_group]
unique_group_checklists_df = group_checklists_df.drop_duplicates(
    subset=['GROUP IDENTIFIER']
)

unique_complete_hotspot_checklists_df = dd.concat(
    [solo_checklists_df, unique_group_checklists_df]
)

In [9]:
# Preview the first five de-duplicated hotspot checklists.
unique_complete_hotspot_checklists_df.head(n=5)

Unnamed: 0,LAST EDITED DATE,LOCALITY ID,LOCALITY TYPE,OBSERVATION DATE,SAMPLING EVENT IDENTIFIER,OBSERVATION TYPE,DURATION MINUTES,EFFORT DISTANCE KM,NUMBER OBSERVERS,ALL SPECIES REPORTED,GROUP IDENTIFIER
3,2025-05-26 10:18:02.301179,1140162,H,2025-05-26,242975716,Traveling,10,0.259,1,True,
4,2025-06-13 07:31:20.354768,1140162,H,2025-06-13,249781342,Stationary,7,,1,True,
5,2025-05-22 15:46:54.167567,1140162,H,2025-05-22,241308832,Stationary,10,,1,True,
6,2025-06-07 13:00:07.255729,1140162,H,2025-05-19,240407756,Traveling,30,0.139,1,True,
7,2025-08-08 03:20:40.611660,1140162,H,2025-03-21,219770714,Traveling,6,0.127,1,True,


In [None]:
# Spot-check: print every sighting row recorded for sampling event 242975716
# (the first checklist in the preview above).
print(
    unique_complete_hotspot_sightings_df[
        unique_complete_hotspot_sightings_df['SAMPLING EVENT IDENTIFIER'] == 242975716
    ].compute()
)

       GLOBAL UNIQUE IDENTIFIER           LAST EDITED DATE  TAXONOMIC ORDER CATEGORY          COMMON NAME       SCIENTIFIC NAME  OBSERVATION COUNT    STATE    COUNTY COUNTY CODE             LOCALITY  LOCALITY ID LOCALITY TYPE   LATITUDE  LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED  OBSERVER ID  SAMPLING EVENT IDENTIFIER OBSERVATION TYPE  DURATION MINUTES  EFFORT DISTANCE KM  NUMBER OBSERVERS  ALL SPECIES REPORTED GROUP IDENTIFIER
5946                 3213453148 2025-05-26 10:18:02.301179            32338  species   American Goldfinch        Spinus tristis                  2  Vermont  Franklin   US-VT-011  Fairfield Swamp WMA      1140162             H  44.795556 -72.996275       2025-05-26                  10:06:00  obsr2046614                  242975716        Traveling                10               0.259                 1                  True             <NA>
10676                3213453147 2025-05-26 10:18:02.301179              330  species         Canada Goose     Bran