In [47]:
import pandas as pd
import numpy as np

import os

In [78]:
files = os.listdir('data')
files.sort()
files = files[1:-1]
files

['ebd_US-TX-013_relDec-2023.txt',
 'ebd_US-TX-019_relDec-2023.txt',
 'ebd_US-TX-029_relDec-2023.txt',
 'ebd_US-TX-091_relDec-2023.txt',
 'ebd_US-TX-171_relDec-2023.txt',
 'ebd_US-TX-187_relDec-2023.txt',
 'ebd_US-TX-259_relDec-2023.txt',
 'ebd_US-TX-265_relDec-2023.txt',
 'ebd_US-TX-325_relDec-2023.txt',
 'ebd_US-TX-493_relDec-2023.txt']

In [94]:
def clean_ebird(df_init):
    
    #make it python friendly
    df_init.columns = df_init.columns.str.lower().str.replace('\W+','_',regex=True)

    #remove unnecessary columns
    df_init = df_init.drop(columns=['global_unique_identifier','last_edited_date','taxon_concept_id',
                                    'scientific_name','subspecies_common_name','subspecies_scientific_name',
                                    'exotic_code','breeding_code','breeding_category','behavior_code',
                                    'age_sex','country','country_code','state_code','state',
                                    'county_code','iba_code','bcr_code', 'locality_id', 'locality_type',
                                    'protocol_code','protocol_type', 'project_code','effort_area_ha',
                                    'has_media','approved','reviewed','reason','trip_comments',
                                    'species_comments','unnamed_49','usfws_code','atlas_block',])

    #replace all Xs with 1s, to count the bird once
    df_init.observation_count = np.where(df_init.observation_count == 'X', 1, df_init.observation_count)
    df_init.observation_count = df_init.observation_count.astype(int)

    #find all dupe checklists
    #sort them by observation count
    #drop the duplicates, except teh first instance
    df_keep_dups = df_init [df_init.group_identifier.notnull()].sort_values('observation_count', 
                                                   ascending=False).drop_duplicates(subset=['taxonomic_order',
                                                                                            'group_identifier'])
    #remove all duplicate checklists completely
    df_no_dups = df_init [df_init.group_identifier.isnull()]


    #combine the no dups df with the first dups only df
    df = pd.concat([df_no_dups, df_keep_dups])

    #drop group identifier column, as its no longer needed
    df = df.drop('group_identifier', axis=1)

    #remove everything before 2002
    df = df [df.observation_date >= '2002-01-01']

    return df

In [95]:
clean_dfs = []

for file in files:
    print(file)
    df = pd.read_csv('data/' + file, sep='\t')
    clean_df = clean_ebird(df)
    clean_dfs.append(clean_df)

ebd_US-TX-013_relDec-2023.txt
ebd_US-TX-019_relDec-2023.txt


  df = pd.read_csv('data/' + file, sep='\t')


ebd_US-TX-029_relDec-2023.txt


  df = pd.read_csv('data/' + file, sep='\t')


ebd_US-TX-091_relDec-2023.txt
ebd_US-TX-171_relDec-2023.txt
ebd_US-TX-187_relDec-2023.txt


  df = pd.read_csv('data/' + file, sep='\t')


ebd_US-TX-259_relDec-2023.txt


  df = pd.read_csv('data/' + file, sep='\t')


ebd_US-TX-265_relDec-2023.txt
ebd_US-TX-325_relDec-2023.txt
ebd_US-TX-493_relDec-2023.txt


In [103]:
dff = pd.concat(clean_dfs)
dff = dff.reset_index(drop=True)

In [104]:
dff.head()

Unnamed: 0,taxonomic_order,category,common_name,observation_count,county,locality,latitude,longitude,observation_date,time_observations_started,observer_id,sampling_event_identifier,duration_minutes,effort_distance_km,number_observers,all_species_reported
0,11791,species,American Kestrel,1,Atascosa,Peeler Tank,28.73534,-98.435998,2002-01-05,,obsr88598,S5554748,,,,1
1,233,species,Black-bellied Whistling-Duck,12,Atascosa,Peeler Tank,28.73534,-98.435998,2002-01-05,,obsr88598,S5554748,,,,1
2,33307,species,Brown-headed Cowbird,1,Atascosa,Atascosa Co.--CR411/412 FM99 area,28.710839,-98.21434,2002-01-11,14:35:00,obsr21142,S17226909,50.0,1.609,2.0,1
3,33325,species,Brewer's Blackbird,1,Atascosa,Atascosa Co.--CR411/412 FM99 area,28.710839,-98.21434,2002-01-11,14:35:00,obsr21142,S17226909,50.0,1.609,2.0,1
4,8416,species,Barn Owl,1,Atascosa,Peeler Tank,28.73534,-98.435998,2002-01-05,,obsr88598,S5554748,,,,1


In [105]:
dff.shape

(3626211, 16)

In [106]:
dff.county.value_counts()

Bexar        1765735
Guadalupe     396980
Kendall       372021
Comal         311381
Kerr          283787
Bandera       232628
Wilson         77269
Medina         75427
Gillespie      72553
Atascosa       38430
Name: county, dtype: int64

In [108]:
dff.to_csv('ebird_all_BAS.csv')