In [1]:
import pandas as pd
import duckdb as dk
import numpy as np

# Open the DuckDB database file that the rest of the notebook writes to.
# NOTE(review): the path begins with three slashes, so it is not relative
# like the "data/..." CSV path read later in this notebook -- confirm that
# this location is intended.
con = dk.connect(database='///data/vermont.duckdb')

In [2]:
# Display settings for DataFrame output: no cap on the number of columns
# or rows shown (None), and a display width of 2000.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 2000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Read the personal observation export (presumably an eBird "My Data"
# download, judging by the file name). Every column is loaded as text, the
# default NA markers are disabled, and only empty cells are treated as
# missing values.
csv_read_options = {
    'dtype': str,
    'keep_default_na': False,
    'na_values': [''],
    'encoding': 'utf-8',
    'engine': 'python',
}
personal_raw_df = pd.read_csv("data/MyEBirdData.csv", **csv_read_options)

In [4]:
# Build the life list: one row per species, taken from its earliest sighting.

# Drop hybrids (" x ", "hybrid"), slash / backslash pairs and "sp." entries.
# na=False makes a missing Common Name count as "no match", so the mask is
# always boolean and the `~` negation below cannot fail on NaN values
# (empty cells are loaded as missing by the CSV reader).
is_non_species = personal_raw_df['Common Name'].str.contains(
    r'\sx\s|\bhybrid|\s\\\s|/|sp\.', case=False, na=False
)
# .copy() detaches the filtered rows so the column assignment below writes
# to an independent frame instead of a slice of the loaded data.
personal_raw_df = personal_raw_df[~is_non_species].copy()

# Remove subspecies classifications: cut everything from the whitespace that
# precedes the first "(" to the end of the name.
personal_raw_df['Common Name'] = personal_raw_df['Common Name'].str.replace(
    r'\s(?=\().*', '', regex=True
)

# Keep the first observation of each species. Date must be the primary sort
# key: once the subspecies suffix is stripped, rows sharing a Common Name
# can carry different Taxonomic Order values, and sorting by Taxonomic Order
# first would keep the lowest taxonomic order rather than the earliest date.
# Taxonomic Order only breaks ties between rows on the same date.
# NOTE(review): Date is text here; this assumes ISO "YYYY-MM-DD" values so
# that string order equals chronological order -- TODO confirm.
personal_raw_df = personal_raw_df.sort_values(by=['Date', 'Taxonomic Order'])
personal_staging_df = personal_raw_df.drop_duplicates(subset=['Common Name'])

# Final life list: only the columns exported downstream, ordered by date.
personal_clean_df = personal_staging_df[
    ['Common Name', 'Scientific Name', 'Taxonomic Order', 'Date', 'Location']
].sort_values(by=['Date'])

In [5]:
# SQL that rebuilds the `life_list` table from scratch. The FROM clause names
# `personal_clean_df`; presumably DuckDB resolves that to the pandas frame of
# the same name in this notebook. The quoted CSV-style column names are
# renamed to snake_case, and "Date" goes through TRY_CAST to DATE.
personal_clean_df_to_duckdb_query = """
DROP TABLE IF EXISTS life_list;
CREATE TABLE life_list AS
SELECT
    "Common Name" as common_name,
    "Scientific Name" as scientific_name,
    "Taxonomic Order" as taxonomic_order,
    TRY_CAST("Date" AS DATE) as date,
    "Location" as location
FROM personal_clean_df;
"""

# SQL that adds a unique index on scientific_name to the rebuilt table.
create_unique_index_query = """
CREATE UNIQUE INDEX idx_life_list_scientific_name
ON life_list(scientific_name);
"""

# Run the table rebuild first, then the index creation, in that order.
for sql_statement in (personal_clean_df_to_duckdb_query, create_unique_index_query):
    con.execute(sql_statement)

# Close the connection; the connection cell must be re-run before any
# further queries can be issued.
con.close()