In [46]:
import pandas as pd

In [47]:
# Only use a subset of the columns (don't use observation count)
#   NOTE: this plan says 7 columns, but the load cell below reads 5 - TODO confirm
# Filter for US birds
# Rename columns
# Drop nulls
# Clean up badly named birds
# Convert date column to a datetime object
# Extract month from date
# Map month column to season
# Combine county and state

## Clean up for regions
# Rename columns
# Strip leading whitespace from State
# Drop numbers from regionName
# Drop state from CountyName
# Map Alaska county names
# Capitalize "city" in St. Louis and Richmond (original note ended with "works" - TODO clarify)
# Drop "Parish" and "County" from CountyName
# Combine county and state

# Merge the dfs
# Create pivot table of rarities
# Build county-region mapping dict
# Build state-county mapping dict
# Build lists of birds, seasons, and states


In [48]:
# Load the first 10,000 rows of the tab-separated bird data, keeping only
# the five columns the cleaning steps below work with.
df = pd.read_csv(
    "~/Desktop/repos/Bird-Check/bird_data.csv",
    sep='\t',
    nrows=10000,
    usecols=['COMMON NAME', 'COUNTRY', 'STATE', 'COUNTY', 'OBSERVATION DATE'],
)

# Sanity check: (rows, columns) of the loaded sample.
print(df.shape)

# Preview the raw rows.
df.head()

(10000, 5)


Unnamed: 0,COMMON NAME,COUNTRY,STATE,COUNTY,OBSERVATION DATE
0,Magnolia Warbler,United States,Illinois,Cook,1995-08-27
1,White-rumped Sandpiper,Canada,Quebec,Manicouagan,1993-11-07
2,Common Scoter,Sweden,Hallands län [SE-13],,1998-02-21
3,Ring-billed Gull,Canada,Manitoba,South Interlake,1985-04-14
4,Red-winged Blackbird,Canada,Manitoba,South Interlake,1986-09-01


In [49]:
def us_bird_filter(df):
    """Return only the rows whose ``country`` column equals 'United States'.

    Expects the renamed (lower-case) ``country`` column; the input frame is
    not modified.
    """
    us_only = "country == 'United States'"
    return df.query(us_only)

def bird_column_renamer(df):
    """Return a copy of ``df`` with the raw column headers renamed.

    The upper-case source headers are mapped to the short lower-case names
    used by the rest of the cleaning functions. Columns not listed in the
    mapping are left as they are.
    """
    short_names = {
        'COUNTRY': 'country',
        'STATE': 'state',
        'COUNTY': 'county',
        'COMMON NAME': 'name',
        'OBSERVATION DATE': 'observ_date',
    }
    return df.rename(columns=short_names)


def bad_name_cleaner(df):
    """Drop rows whose ``name`` is not a clean, single-species name.

    A name is considered bad when it contains any of the markers ``"sp."``,
    ``"("`` or ``"/"``. Rows whose name is missing or not a string are
    dropped as well, instead of raising a ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with clean names; columns and
        index labels of the surviving rows are unchanged.
    """
    bad_markers = ("sp.", "(", "/")

    def is_clean(name):
        # Non-string values (e.g. NaN for a missing name) cannot be searched
        # for markers, so they are treated as bad names.
        return isinstance(name, str) and not any(marker in name for marker in bad_markers)

    # Build the mask as a standalone Series so no helper column is ever
    # written into the caller's frame.
    keep = df['name'].map(is_clean).astype(bool)

    # .copy() hands back an independent frame, so later column assignments
    # on the result do not trigger SettingWithCopyWarning.
    return df[keep].copy()

def get_season(df):
    """Replace the ``observ_date`` column with a ``season`` column.

    The observation date is parsed, its month extracted, and the month
    mapped to a season: Dec-Feb -> 'Winter', Mar-May -> 'Spring',
    Jun-Aug -> 'Summer', Sep-Nov -> 'Fall'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``observ_date`` column that ``pd.to_datetime`` can
        parse. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``observ_date`` (and without any ``month``
        column) and with ``season`` added. Rows whose date is missing get
        a missing season rather than being labelled 'Fall'.
    """
    # Month number -> season name.
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }

    # infer_datetime_format is deprecated in pandas 2.x (format inference is
    # now the default), so it is no longer passed.
    months = pd.to_datetime(df['observ_date']).dt.month

    # A dict lookup leaves unparseable/missing months as NaN; a catch-all
    # else-branch would silently turn them into 'Fall'.
    seasons = months.map(season_by_month)

    # Work on derived Series and build a new frame so the caller's frame
    # keeps its original observ_date values and gains no scratch columns.
    return (
        df.drop(columns=['observ_date', 'month'], errors='ignore')
          .assign(season=seasons)
    )


def county_state_merger(df):
    """Add a ``county_state`` column of the form ``"<county>,<state>"``.

    The column is written into ``df`` itself, and that same frame is
    returned.
    """
    county_with_sep = df['county'] + ','
    df['county_state'] = county_with_sep + df['state']
    return df

In [None]:
def region_column_renamer(df):
    """Placeholder for the region column renaming step.

    Not implemented yet: returns ``df`` unchanged.
    """
    # TODO: implement the column renaming for the regions frame.
    return df

def clean_cols(df):
    """Placeholder for cleaning the state, county and region columns.

    Not implemented yet: returns ``df`` unchanged.
    """
    # TODO: per the notes in clean_regions, this should strip leading
    # whitespace from the state column, drop numbers from the region, and
    # drop the state from the county.
    return df

def county_fixer(df):
    """Placeholder for the county-name fixing step.

    Not implemented yet: returns ``df`` unchanged.
    """
    # TODO: implement the county-name fixes.
    return df

In [50]:
def clean_regions(df):
    """Run the region-cleaning steps on a copy of ``df`` and return the result.

    The caller's frame is copied first, then passed in order through
    ``region_column_renamer``, ``clean_cols``, ``county_fixer`` and
    ``county_state_merger``.
    """
    return (
        df.copy()                      # never touch the caller's frame
        .pipe(region_column_renamer)   # rename columns
        # clean up state, county, and region columns: strip leading
        # whitespace from state, drop numbers from region, drop state
        # from county
        .pipe(clean_cols)
        .pipe(county_fixer)            # fix county names
        .pipe(county_state_merger)     # combine county + state
    )

In [51]:
def clean_bird_chunks(df):
    """Clean one chunk of raw bird observations.

    Steps, in order: rename the raw columns, keep only United States rows,
    drop rows with no county, drop badly named birds, replace the
    observation date with a season, and add the combined ``county_state``
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw observations with the original upper-case column headers.

    Returns
    -------
    pandas.DataFrame
        The cleaned observations.
    """
    # rename columns
    df = bird_column_renamer(df)

    # filter for us birds
    df = us_bird_filter(df)

    # Drop rows with a missing county. dropna returns a new frame rather
    # than modifying df, so the result has to be assigned back; without the
    # assignment the null-county rows stay in the data.
    df = df.dropna(subset=['county'])

    # clean badly named birds
    df = bad_name_cleaner(df)

    # get season thru date: convert date to datetime, extract month,
    # map to a season
    df = get_season(df)

    # combine county + state
    df = county_state_merger(df)

    return df

In [52]:
# Replace the raw sample with its cleaned version.
# NOTE: this overwrites the raw `df`, so the cell is not re-runnable on its
# own - the cleaned frame no longer has `observ_date`, and a second run fails.
# Re-run the load cell first.
df = clean_bird_chunks(df)

In [53]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,name,country,state,county,season,county_state
0,Magnolia Warbler,United States,Illinois,Cook,Summer,"Cook,Illinois"
6,Greater Yellowlegs,United States,Texas,Aransas,Spring,"Aransas,Texas"
12,White-crowned Sparrow,United States,Arizona,Cochise,Fall,"Cochise,Arizona"
13,Green-winged Teal,United States,Idaho,Ada,Winter,"Ada,Idaho"
14,Yellow-rumped Warbler,United States,Idaho,Ada,Winter,"Ada,Idaho"


In [54]:
# (rows, columns) remaining after cleaning.
df.shape

(5463, 6)