In [3]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import dask.dataframe as dd

# Notebook display settings: never elide columns and allow very wide rows,
# so previews of the many-column frames used below are not truncated.
pd.options.display.max_columns = None
pd.options.display.width = 2000

In [None]:
# Column -> dtype for the subset of the observation export that is kept.
# The same mapping supplies `usecols`, so every column is declared once.
sightings_dtypes = {
    'GLOBAL UNIQUE IDENTIFIER': 'string',
    'LAST EDITED DATE': 'string',
    'TAXONOMIC ORDER': 'UInt32',
    'CATEGORY': 'category',
    'COMMON NAME': 'category',
    'SCIENTIFIC NAME': 'category',
    'OBSERVATION COUNT': 'UInt32',
    'STATE': 'category',
    'COUNTY': 'category',
    'COUNTY CODE': 'category',
    'LOCALITY': 'string',
    'LOCALITY ID': 'string',
    'LOCALITY TYPE': 'category',
    'LATITUDE': 'float64',
    'LONGITUDE': 'float64',
    'OBSERVATION DATE': 'period[D]',
    'TIME OBSERVATIONS STARTED': 'string',
    'OBSERVER ID': 'string',
    'SAMPLING EVENT IDENTIFIER': 'string',
    'OBSERVATION TYPE': 'category',
    'DURATION MINUTES': 'UInt16',
    'EFFORT DISTANCE KM': 'Float32',
    'NUMBER OBSERVERS': 'UInt8',
    'ALL SPECIES REPORTED': 'boolean',
    'GROUP IDENTIFIER': 'string',
}

# Lazily read the tab-separated export in blocks of 25e6 bytes. An
# OBSERVATION COUNT of 'X' is read as missing so the column can be stored
# with a nullable unsigned-integer dtype.
sightings_df = dd.read_csv(
    'data/ebd_US-VT_smp_relJul-2025.txt',
    sep='\t',
    usecols=list(sightings_dtypes),
    blocksize=25e6,
    na_values={'OBSERVATION COUNT': 'X'},
    dtype=sightings_dtypes,
)

# Identifiers are read as strings; keep only their trailing run of digits
# and store it as a nullable integer (values without trailing digits
# become <NA>).
for id_column in (
    'GLOBAL UNIQUE IDENTIFIER',
    'SAMPLING EVENT IDENTIFIER',
    'LOCALITY ID',
    'GROUP IDENTIFIER',
):
    sightings_df[id_column] = (
        sightings_df[id_column].str.extract(r'(\d+)$')[0].astype('Int64')
    )

# Parse the edit timestamp; values that cannot be parsed become NaT.
sightings_df['LAST EDITED DATE'] = dd.to_datetime(
    sightings_df['LAST EDITED DATE'],
    errors='coerce',
)

# Convert these three columns to categoricals via dask's `categorize`
# (presumably so their category sets are known across all partitions).
sightings_df = sightings_df.categorize(
    columns=['COMMON NAME', 'SCIENTIFIC NAME', 'COUNTY CODE']
)

In [None]:
# Persist the cleaned sightings as a parquet dataset (index not written) so
# later cells can reload it instead of re-parsing the CSV export.
sightings_df.to_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
    write_index=False,
)

In [4]:
# Reload the cleaned sightings from the parquet dataset written above.
sightings_df = dd.read_parquet(
    'data/VT_observations.parquet',
    engine="pyarrow",
)

In [10]:
# Keep rows that are (a) at a locality of type 'H', (b) of category
# 'species', and (c) from checklists flagged ALL SPECIES REPORTED.
is_hotspot = sightings_df['LOCALITY TYPE'] == 'H'
is_species = sightings_df['CATEGORY'] == 'species'
is_complete = sightings_df['ALL SPECIES REPORTED']
complete_hotspot_sightings_df = sightings_df[is_hotspot & is_species & is_complete]

# Optional sanity checks (each one triggers a full computation):
# print("Num sightings: ", len(complete_hotspot_sightings_df))
# print("Num group sightings: ", complete_hotspot_sightings_df['GROUP IDENTIFIER'].count().compute())
# print("Num solo sightings: ", complete_hotspot_sightings_df['GROUP IDENTIFIER'].isna().sum().compute())

# --- Remove duplicate rows that come from shared (group) checklists ---
#
# NOTE(review): an earlier single-call attempt was
#   complete_hotspot_sightings_df.drop_duplicates(subset=['GROUP IDENTIFIER'], split_every=False)
# It de-duplicated on GROUP IDENTIFIER alone. Missing keys compare equal in
# drop_duplicates, and the subset omitted COMMON NAME, so it presumably
# collapsed the solo rows and each group's species list -- confirm before
# deleting this note. The split below avoids both problems.

# Rows without a group identifier are kept untouched.
has_group_id = complete_hotspot_sightings_df['GROUP IDENTIFIER'].notnull()
solo_sightings_df = complete_hotspot_sightings_df[~has_group_id]

# Rows with a group identifier: order by sampling event, then keep one row
# per (group, species) pair.
group_sightings_df = (
    complete_hotspot_sightings_df[has_group_id]
    .sort_values('SAMPLING EVENT IDENTIFIER')
    .drop_duplicates(subset=['GROUP IDENTIFIER', 'COMMON NAME'])
)

# Recombine the untouched solo rows with the de-duplicated group rows.
unique_complete_hotspot_sightings_df = dd.concat([solo_sightings_df, group_sightings_df])

# Optional sanity checks on the combined frame:
# print("Num solo sightings: ", unique_complete_hotspot_sightings_df['GROUP IDENTIFIER'].isna().sum().compute())
# print("Num species: ", unique_complete_hotspot_sightings_df['COMMON NAME'].nunique().compute())
# print("Num hotspots: ", unique_complete_hotspot_sightings_df['LOCALITY ID'].nunique().compute())

In [11]:
# Number of distinct sampling events left after de-duplication
# (this triggers a full computation of the lazy frame).
n_sampling_events = (
    unique_complete_hotspot_sightings_df['SAMPLING EVENT IDENTIFIER']
    .nunique()
    .compute()
)
print(n_sampling_events)

249494


In [None]:
# Column -> dtype for the subset of the sampling (checklist) export that is
# kept. The same mapping supplies `usecols`, so every column is declared once.
checklists_dtypes = {
    'LAST EDITED DATE': 'string',
    'OBSERVATION DATE': 'period[D]',
    'LOCALITY': 'string',
    'LOCALITY ID': 'string',
    'LOCALITY TYPE': 'category',
    'SAMPLING EVENT IDENTIFIER': 'string',
    'OBSERVATION TYPE': 'category',
    'DURATION MINUTES': 'UInt16',
    'EFFORT DISTANCE KM': 'Float32',
    'NUMBER OBSERVERS': 'UInt8',
    'ALL SPECIES REPORTED': 'boolean',
    'GROUP IDENTIFIER': 'string',
}

# Lazily read the tab-separated sampling file in blocks of 25e6 bytes.
checklists_df = dd.read_csv(
    'data/ebd_US-VT_smp_relJul-2025_sampling.txt',
    sep='\t',
    blocksize=25e6,
    usecols=list(checklists_dtypes),
    dtype=checklists_dtypes,
)

# Parse the edit timestamp; values that cannot be parsed become NaT.
checklists_df['LAST EDITED DATE'] = dd.to_datetime(
    checklists_df['LAST EDITED DATE'],
    errors='coerce',
)

# Keep only the trailing run of digits of each identifier and store it as a
# nullable integer, matching how the sightings frame stores the same IDs.
# GROUP IDENTIFIER is converted on checklists_df here: the previous version
# of this cell applied that step to sightings_df by mistake, leaving the
# checklist group IDs as raw strings.
for id_column in ('LOCALITY ID', 'SAMPLING EVENT IDENTIFIER', 'GROUP IDENTIFIER'):
    checklists_df[id_column] = (
        checklists_df[id_column].str.extract(r'(\d+)$')[0].astype('Int64')
    )


In [None]:
# Persist the cleaned checklists as a parquet dataset (index not written) so
# later cells can reload it instead of re-parsing the CSV export.
checklists_df.to_parquet(
    'data/VT_checklists.parquet',
    engine="pyarrow",
    write_index=False,
)

In [4]:
# Reload the cleaned checklists from the parquet dataset written above.
checklists_df = dd.read_parquet(
    'data/VT_checklists.parquet',
    engine="pyarrow",
)

In [5]:
# Keep checklists flagged ALL SPECIES REPORTED at localities of type 'H'.
reports_all_species = checklists_df['ALL SPECIES REPORTED']
at_hotspot = checklists_df['LOCALITY TYPE'] == 'H'
complete_hotspot_checklists_df = checklists_df[reports_all_species & at_hotspot]

# Checklists without a group identifier are kept untouched.
in_group = complete_hotspot_checklists_df['GROUP IDENTIFIER'].notnull()
solo_checklists_df = complete_hotspot_checklists_df[~in_group]

# Checklists sharing a group identifier are ordered by sampling event and
# reduced to a single row per group.
group_checklists_df = (
    complete_hotspot_checklists_df[in_group]
    .sort_values('SAMPLING EVENT IDENTIFIER')
)
unique_group_checklists_df = group_checklists_df.drop_duplicates(subset=['GROUP IDENTIFIER'])

# Recombine the solo checklists with the one-per-group checklists.
unique_complete_hotspot_checklists_df = dd.concat(
    [solo_checklists_df, unique_group_checklists_df]
)

In [None]:
# Both aggregations are keyed by locality and observation date.
site_day_keys = ['LOCALITY ID', 'OBSERVATION DATE']

# Number of de-duplicated complete checklists per locality per day.
checklists_per_site_day = (
    unique_complete_hotspot_checklists_df
    .groupby(site_day_keys, observed=True)
    .size()
)
total_checklists_per_day_df = (
    checklists_per_site_day.rename('TOTAL CHECKLISTS').reset_index()
)

# Per locality, day and species: number of distinct sampling events that
# contain a row for that species.
events_per_site_day_species = (
    unique_complete_hotspot_sightings_df
    .groupby(site_day_keys + ['COMMON NAME'], observed=True)
    ['SAMPLING EVENT IDENTIFIER']
    .nunique()
)
total_sightings_per_day_df = (
    events_per_site_day_species.rename('TOTAL SIGHTINGS').reset_index()
)

# Attach the daily checklist total to every species row. This is a left
# join, so species rows are kept even when no checklist total matches.
abundance_df = total_sightings_per_day_df.merge(
    total_checklists_per_day_df,
    how='left',
    on=site_day_keys,
)

In [None]:
# Row counts of the two daily aggregates, checklists first then sightings
# (each len() triggers a computation).
for daily_counts_df in (total_checklists_per_day_df, total_sightings_per_day_df):
    print(len(daily_counts_df))

In [None]:
# How often each daily checklist total occurs among the merged rows.
checklist_total_counts = abundance_df['TOTAL CHECKLISTS'].value_counts().compute()
print(checklist_total_counts)

In [None]:
# Normalise two column dtypes in a single step:
#   * COMMON NAME      -> categorical with known categories
#   * TOTAL CHECKLISTS -> plain int64
# NOTE(review): int64 cannot hold missing values, so this cast relies on
# every row having found a checklist total in the preceding left merge.
abundance_df = abundance_df.assign(**{
    'COMMON NAME': abundance_df['COMMON NAME'].cat.as_known(),
    'TOTAL CHECKLISTS': abundance_df['TOTAL CHECKLISTS'].astype('int64'),
})

In [None]:
# ABUNDANCE: TOTAL SIGHTINGS divided by TOTAL CHECKLISTS for the same
# locality and day. DAY OF YEAR: ordinal day of the observation date.
sightings_per_checklist = abundance_df['TOTAL SIGHTINGS'] / abundance_df['TOTAL CHECKLISTS']
observation_day_of_year = abundance_df['OBSERVATION DATE'].dt.dayofyear
abundance_df['ABUNDANCE'] = sightings_per_checklist
abundance_df['DAY OF YEAR'] = observation_day_of_year

# Average ABUNDANCE over all rows that share a locality, species and day
# of year.
abundance_by_site_species_day = abundance_df.groupby(
    ['LOCALITY ID', 'COMMON NAME', 'DAY OF YEAR'],
    observed=True,
)['ABUNDANCE']
mean_abundance_df = (
    abundance_by_site_species_day.mean().rename('MEAN ABUNDANCE').reset_index()
)

In [None]:
# Write the per-day abundance table, then the day-of-year means, as parquet
# datasets (index not written).
for result_df, destination in (
    (abundance_df, 'data/VT_abundance.parquet'),
    (mean_abundance_df, 'data/VT_mean_abundance.parquet'),
):
    result_df.to_parquet(destination, engine="pyarrow", write_index=False)