In [1]:
import pandas as pd
import numpy as np

## Data Preparation

Steps:
- read in 200K lines of original csv with just **['COMMON NAME', 'COUNTRY', 'STATE', 'COUNTY', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'OBSERVATION COUNT']** columns
- rename columns for easier working
- replace 'X' with 1 in **observ_count** column
- filter just birds in 'United States'
- drop rows with NaN in **county**
- convert **OBSERVATION DATE** to datetime
- extract month and year from **OBSERVATION DATE** into their own columns
- **'season'** column from month
- **county_state** column to merge on, no space
- load region excel file
- strip leading whitespace from **State**
- Alaska has strange county names; map them to match the birds county names
- drop 'county' from **CountyName**
- merge on county
- **counts** Series with percentage of each bird
- **total_rarity** column mapped from **counts**
- **regional_rarity** from counts split by region
- **seasonal_rarity** from counts by region and season
- **rarity_label** if any of the three comes out as rare

In [2]:
# Step 1: Read in the first 200K lines, with 8 columns
df = pd.read_csv('C:\\Users\\ajaco\\Downloads\\ebd_relJan-2020.txt', sep='\t', nrows=200000, usecols=['COMMON NAME', 'COUNTRY', 'STATE', 'COUNTY', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'OBSERVATION COUNT'])

print(df.shape)

df.head()

(200000, 8)


Unnamed: 0,COMMON NAME,OBSERVATION COUNT,COUNTRY,STATE,COUNTY,LATITUDE,LONGITUDE,OBSERVATION DATE
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27
1,White-rumped Sandpiper,4,Canada,Quebec,Manicouagan,49.21667,-68.15,1993-11-07
2,Common Scoter,1,Sweden,Hallands län [SE-13],,57.065084,12.243579,1998-02-21
3,Ring-billed Gull,15,Canada,Manitoba,South Interlake,50.193256,-97.137935,1985-04-14
4,Red-winged Blackbird,500,Canada,Manitoba,South Interlake,50.193256,-97.137935,1986-09-01


In [3]:
df.isnull().sum()

COMMON NAME              0
OBSERVATION COUNT        0
COUNTRY                  0
STATE                    0
COUNTY               29261
LATITUDE                 0
LONGITUDE                0
OBSERVATION DATE         0
dtype: int64

In [4]:
# Rename columns for ease of use
df.rename(columns={
    'COMMON NAME': 'name',
    'OBSERVATION COUNT': 'observ_count',
    'COUNTRY': 'country',
    'STATE': 'state',
    'COUNTY': 'county',
    'LATITUDE': 'latitude',
    'LONGITUDE': 'longitude',
    'OBSERVATION DATE': 'observ_date'
}, inplace=True)

In [5]:
df.head()

Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27
1,White-rumped Sandpiper,4,Canada,Quebec,Manicouagan,49.21667,-68.15,1993-11-07
2,Common Scoter,1,Sweden,Hallands län [SE-13],,57.065084,12.243579,1998-02-21
3,Ring-billed Gull,15,Canada,Manitoba,South Interlake,50.193256,-97.137935,1985-04-14
4,Red-winged Blackbird,500,Canada,Manitoba,South Interlake,50.193256,-97.137935,1986-09-01


In [6]:
# Filter for just US birds
us_birds = df.query("country == 'United States'")

print(us_birds.shape)
us_birds.head()

(105294, 8)


Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27
6,Greater Yellowlegs,X,United States,Texas,Aransas,28.240392,-96.818819,1986-04-06
12,White-crowned Sparrow,X,United States,Arizona,Cochise,31.898164,-109.115932,1998-11-27
13,Green-winged Teal,11,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18
14,Yellow-rumped Warbler,5,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18


In [7]:
#217 birds are missing 
us_birds.isnull().sum()

name              0
observ_count      0
country           0
state             0
county          217
latitude          0
longitude         0
observ_date       0
dtype: int64

In [8]:
# Drop missing column values
us_birds.dropna(subset=['county'], inplace=True)

us_birds.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


name            0
observ_count    0
country         0
state           0
county          0
latitude        0
longitude       0
observ_date     0
dtype: int64

In [9]:
# Replace 'X' in 'observ_count' with 1
us_birds['observ_count'] = us_birds['observ_count'].apply(lambda x: 1 if x == 'X' else x)

us_birds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27
6,Greater Yellowlegs,1,United States,Texas,Aransas,28.240392,-96.818819,1986-04-06
12,White-crowned Sparrow,1,United States,Arizona,Cochise,31.898164,-109.115932,1998-11-27
13,Green-winged Teal,11,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18
14,Yellow-rumped Warbler,5,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18


In [10]:
# Convert 'observ_date' to datetime and extract year and month
us_birds.observ_date = pd.to_datetime(us_birds['observ_date'], infer_datetime_format=True)
us_birds['year'] = us_birds.observ_date.dt.year
us_birds['month'] = us_birds.observ_date.dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [11]:
us_birds.columns

Index(['name', 'observ_count', 'country', 'state', 'county', 'latitude',
       'longitude', 'observ_date', 'year', 'month'],
      dtype='object')

In [12]:
us_birds.dtypes

name                    object
observ_count            object
country                 object
state                   object
county                  object
latitude               float64
longitude              float64
observ_date     datetime64[ns]
year                     int64
month                    int64
dtype: object

In [13]:
us_birds.isnull().sum()

name            0
observ_count    0
country         0
state           0
county          0
latitude        0
longitude       0
observ_date     0
year            0
month           0
dtype: int64

In [14]:
# 'season' column from month values
def month_to_season(x):
    if x in [12, 1, 2]:
        return 'Winter'
    elif x in [3, 4, 5]:
        return 'Spring'
    elif x in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

us_birds['season'] = us_birds['month'].apply(month_to_season)
us_birds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date,year,month,season
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27,1995,8,Summer
6,Greater Yellowlegs,1,United States,Texas,Aransas,28.240392,-96.818819,1986-04-06,1986,4,Spring
12,White-crowned Sparrow,1,United States,Arizona,Cochise,31.898164,-109.115932,1998-11-27,1998,11,Fall
13,Green-winged Teal,11,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18,1982,12,Winter
14,Yellow-rumped Warbler,5,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18,1982,12,Winter


In [15]:
us_birds.season.value_counts()

Spring    39610
Summer    25783
Fall      22449
Winter    17235
Name: season, dtype: int64

In [16]:
us_birds['county_state'] = us_birds['county'] + us_birds['state']

us_birds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date,year,month,season,county_state
0,Magnolia Warbler,2,United States,Illinois,Cook,41.775629,-87.583273,1995-08-27,1995,8,Summer,CookIllinois
6,Greater Yellowlegs,1,United States,Texas,Aransas,28.240392,-96.818819,1986-04-06,1986,4,Spring,AransasTexas
12,White-crowned Sparrow,1,United States,Arizona,Cochise,31.898164,-109.115932,1998-11-27,1998,11,Fall,CochiseArizona
13,Green-winged Teal,11,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18,1982,12,Winter,AdaIdaho
14,Yellow-rumped Warbler,5,United States,Idaho,Ada,43.609793,-116.206427,1982-12-18,1982,12,Winter,AdaIdaho


In [17]:
# Checking county name data
us_birds.query("county == 'Los Angeles'")

Unnamed: 0,name,observ_count,country,state,county,latitude,longitude,observ_date,year,month,season,county_state
500,Hutton's Vireo,1,United States,California,Los Angeles,34.217551,-118.162250,1982-05-30,1982,5,Spring,Los AngelesCalifornia
1467,Glaucous-winged Gull,5,United States,California,Los Angeles,34.005131,-118.807549,1994-12-18,1994,12,Winter,Los AngelesCalifornia
1816,Lesser Goldfinch,10,United States,California,Los Angeles,34.005131,-118.807549,1980-01-15,1980,1,Winter,Los AngelesCalifornia
1817,American Goldfinch,30,United States,California,Los Angeles,34.005131,-118.807549,1980-01-15,1980,1,Winter,Los AngelesCalifornia
2291,Rock Pigeon,15,United States,California,Los Angeles,34.005131,-118.807549,1982-05-29,1982,5,Spring,Los AngelesCalifornia
...,...,...,...,...,...,...,...,...,...,...,...,...
198434,White-headed Woodpecker,1,United States,California,Los Angeles,34.326745,-118.004714,1995-11-24,1995,11,Fall,Los AngelesCalifornia
198586,Hooded Oriole,1,United States,California,Los Angeles,34.056010,-118.246332,1997-05-02,1997,5,Spring,Los AngelesCalifornia
198706,Mourning Dove,8,United States,California,Los Angeles,34.082415,-118.503149,1978-05-29,1978,5,Spring,Los AngelesCalifornia
199581,Townsend's Warbler,2,United States,California,Los Angeles,34.056010,-118.246332,1996-11-26,1996,11,Fall,Los AngelesCalifornia


In [59]:
# Read in regions df
regions = pd.read_excel("C:\\Users\\ajaco\\Desktop\\repos\\noreallyimfine\\ebird-project\\URAmericaMapCountyList.xlsx", skiprows=3)

print(regions.shape)

regions.head()

(3142, 11)


Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
0,Alabama,"Autauga County, Alabama",1001,1,01 Deep South,3,3 Rural/Semi-Rural,55416,925.973699,1.913169,100
1,Alabama,"Baldwin County, Alabama",1003,3,03 Gulf Coast,3,3 Rural/Semi-Rural,208563,475.934591,0.983336,100
2,Alabama,"Barbour County, Alabama",1005,1,01 Deep South,3,3 Rural/Semi-Rural,25965,139.162914,0.287527,100
3,Alabama,"Bibb County, Alabama",1007,2,02 Appohzarka,3,3 Rural/Semi-Rural,22643,51.047742,0.105471,100
4,Alabama,"Blount County, Alabama",1009,2,02 Appohzarka,3,3 Rural/Semi-Rural,57704,97.358358,0.201154,100


In [51]:
regions.query("State == ' Alaska'")

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
67,Alaska,"Aleutians East Borough, Alaska",2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100
68,Alaska,"Aleutians West Census Area, Alaska",2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100
69,Alaska,"Anchorage Municipality, Alaska",2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100
70,Alaska,"Bethel Census Area, Alaska",2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100
71,Alaska,"Bristol Bay Borough, Alaska",2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100
72,Alaska,"Denali Borough, Alaska",2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100
73,Alaska,"Dillingham Census Area, Alaska",2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100
74,Alaska,"Fairbanks North Star Borough, Alaska",2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100
75,Alaska,"Haines Borough, Alaska",2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100
76,Alaska,"Hoonah-Angoon Census Area, Alaska",2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100


In [52]:
regions.State.unique()

array([' Alabama', ' Alaska', ' Arizona', ' Arkansas', ' California',
       ' Colorado', ' Connecticut', ' Delaware', ' District of Columbia',
       ' Florida', ' Georgia', ' Hawaii', ' Idaho', ' Illinois',
       ' Indiana', ' Iowa', ' Kansas', ' Kentucky', ' Louisiana',
       ' Maine', ' Maryland', ' Massachusetts', ' Michigan', ' Minnesota',
       ' Mississippi', ' Missouri', ' Montana', ' Nebraska', ' Nevada',
       ' New Hampshire', ' New Jersey', ' New Mexico', ' New York',
       ' North Carolina', ' North Dakota', ' Ohio', ' Oklahoma',
       ' Oregon', ' Pennsylvania', ' Rhode Island', ' South Carolina',
       ' South Dakota', ' Tennessee', ' Texas', ' Utah', ' Vermont',
       ' Virginia', ' Washington', ' West Virginia', ' Wisconsin',
       ' Wyoming'], dtype=object)

In [60]:
# Strip leading whitespace
regions.State = regions.State.str.strip()

regions.State.unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [63]:
# Split state name off county
regions['CountyName'] = regions['CountyName'].apply(lambda x: x.split(',')[0])

regions.head()

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
0,Alabama,Autauga County,1001,1,01 Deep South,3,3 Rural/Semi-Rural,55416,925.973699,1.913169,100
1,Alabama,Baldwin County,1003,3,03 Gulf Coast,3,3 Rural/Semi-Rural,208563,475.934591,0.983336,100
2,Alabama,Barbour County,1005,1,01 Deep South,3,3 Rural/Semi-Rural,25965,139.162914,0.287527,100
3,Alabama,Bibb County,1007,2,02 Appohzarka,3,3 Rural/Semi-Rural,22643,51.047742,0.105471,100
4,Alabama,Blount County,1009,2,02 Appohzarka,3,3 Rural/Semi-Rural,57704,97.358358,0.201154,100


In [75]:
# Fix ALaska counties
alaska = regions.query("State == 'Alaska'")
alaska

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
67,Alaska,Aleutians East Borough,2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100
68,Alaska,Aleutians West Census Area,2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100
69,Alaska,Anchorage Municipality,2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100
70,Alaska,Bethel Census Area,2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100
71,Alaska,Bristol Bay Borough,2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100
72,Alaska,Denali Borough,2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100
73,Alaska,Dillingham Census Area,2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100
74,Alaska,Fairbanks North Star Borough,2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100
75,Alaska,Haines Borough,2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100
76,Alaska,Hoonah-Angoon Census Area,2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100


In [67]:
alaska_birds = us_birds.query("state == 'Alaska'")
alaska_birds.shape

(1585, 12)

In [70]:
alaska_birds.county.unique().tolist()

['Aleutians West',
 'Skagway-Hoonah-Angoon',
 'Wrangell-Petersburg',
 'Nome',
 'Ketchikan Gateway',
 'Kenai Peninsula',
 'Denali',
 'Anchorage',
 'Kodiak Island',
 'Fairbanks North Star',
 'Valdez-Cordova',
 'Dillingham',
 'Haines',
 'Juneau',
 'Northwest Arctic',
 'Yukon-Koyukuk',
 'Lake and Peninsula',
 'North Slope',
 'Yakutat',
 'Prince of Wales-Outer Ketchikan',
 'Matanuska-Susitna',
 'Bristol Bay',
 'Sitka',
 'Southeast Fairbanks',
 'Aleutians East',
 'Kusilvak',
 'Bethel']

In [76]:
county_dict = {
    'Aleutians East Borough': 'Aleutians East',
    'Aleutians West Census Area': 'Aleutians West',
    'Anchorage Municipality': 'Anchorage',
    'Bethel Census Area': 'Bethel',
    'Bristol Bay Borough': 'Bristol Bay',
    'Denali Borough': 'Denali',
    'Dillingham Census Area': 'Dillingham',
    'Fairbanks North Star Borough': 'Fairbanks North Star',
    'Haines Borough': 'Haines',
    'Hoonah-Angoon Census Area': 'Skagway-Hoonah-Angoon',
    'Juneau City and Borough': 'Juneau',
    'Kenai Peninsula Borough': 'Kenai Peninsula',
    'Ketchikan Gateway Borough': 'Ketchikan Gateway',
    'Kodiak Island Borough': 'Kodiak Island',
    'Kusilvak Census Area': 'Kusilvak',
    'Lake and Peninsula Borough': 'Lake and Peninsula',
    'Matanuska-Susitna Borough': 'Matanuska-Susitna',
    'Nome Census Area': 'Nome',
    'North Slope Borough': 'North Slope',
    'Northwest Arctic Borough': 'Northwest Arctic',
    'Petersburg Borough': 'Petersburg Borough',
    'Prince of Wales-Hyder Census Area': 'Prince of Wales-Outer Ketchikan',
    'Sitka City and Borough': 'Sitka',
    'Southeast Fairbanks Census Area': 'Southeast Fairbanks',
    'Valdez-Cordova Census Area': 'Valdez-Cordova',
    'Yakutat City and Borough': 'Yakutat',
    'Yukon-Koyukuk Census Area': 'Yukon-Koyukuk'
}


alaska['CountyName'] = alaska['CountyName'].apply(lambda x: county_dict[x] if x in county_dict.keys() else x)
alaska

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
67,Alaska,Aleutians East,2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100
68,Alaska,Aleutians West,2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100
69,Alaska,Anchorage,2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100
70,Alaska,Bethel,2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100
71,Alaska,Bristol Bay,2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100
72,Alaska,Denali,2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100
73,Alaska,Dillingham,2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100
74,Alaska,Fairbanks North Star,2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100
75,Alaska,Haines,2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100
76,Alaska,Skagway-Hoonah-Angoon,2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100


In [78]:
regions.CountyName.nunique()

1877

In [80]:
# Change Alaska county names to match birds counties
county_dict = {
    'Aleutians East Borough': 'Aleutians East',
    'Aleutians West Census Area': 'Aleutians West',
    'Anchorage Municipality': 'Anchorage',
    'Bethel Census Area': 'Bethel',
    'Bristol Bay Borough': 'Bristol Bay',
    'Denali Borough': 'Denali',
    'Dillingham Census Area': 'Dillingham',
    'Fairbanks North Star Borough': 'Fairbanks North Star',
    'Haines Borough': 'Haines',
    'Hoonah-Angoon Census Area': 'Skagway-Hoonah-Angoon',
    'Juneau City and Borough': 'Juneau',
    'Kenai Peninsula Borough': 'Kenai Peninsula',
    'Ketchikan Gateway Borough': 'Ketchikan Gateway',
    'Kodiak Island Borough': 'Kodiak Island',
    'Kusilvak Census Area': 'Kusilvak',
    'Lake and Peninsula Borough': 'Lake and Peninsula',
    'Matanuska-Susitna Borough': 'Matanuska-Susitna',
    'Nome Census Area': 'Nome',
    'North Slope Borough': 'North Slope',
    'Northwest Arctic Borough': 'Northwest Arctic',
    'Petersburg Borough': 'Petersburg Borough',
    'Prince of Wales-Hyder Census Area': 'Prince of Wales-Outer Ketchikan',
    'Sitka City and Borough': 'Sitka',
    'Southeast Fairbanks Census Area': 'Southeast Fairbanks',
    'Valdez-Cordova Census Area': 'Valdez-Cordova',
    'Yakutat City and Borough': 'Yakutat',
    'Yukon-Koyukuk Census Area': 'Yukon-Koyukuk'
}

regions['CountyName'] = regions['CountyName'].apply(lambda x: county_dict[x] if x in county_dict.keys() else x)
regions.CountyName.nunique()

1877

In [81]:
regions[regions['State'] == 'Alaska']

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
67,Alaska,Aleutians East,2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100
68,Alaska,Aleutians West,2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100
69,Alaska,Anchorage,2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100
70,Alaska,Bethel,2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100
71,Alaska,Bristol Bay,2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100
72,Alaska,Denali,2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100
73,Alaska,Dillingham,2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100
74,Alaska,Fairbanks North Star,2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100
75,Alaska,Haines,2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100
76,Alaska,Skagway-Hoonah-Angoon,2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100


In [82]:
# Split off 'county' from name
regions['CountyName'] = regions['CountyName'].apply(lambda x: x if 'County' not in x else ' '.join(x.split()[:-1]))

regions.head()

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
0,Alabama,Autauga,1001,1,01 Deep South,3,3 Rural/Semi-Rural,55416,925.973699,1.913169,100
1,Alabama,Baldwin,1003,3,03 Gulf Coast,3,3 Rural/Semi-Rural,208563,475.934591,0.983336,100
2,Alabama,Barbour,1005,1,01 Deep South,3,3 Rural/Semi-Rural,25965,139.162914,0.287527,100
3,Alabama,Bibb,1007,2,02 Appohzarka,3,3 Rural/Semi-Rural,22643,51.047742,0.105471,100
4,Alabama,Blount,1009,2,02 Appohzarka,3,3 Rural/Semi-Rural,57704,97.358358,0.201154,100


In [83]:
regions.query("State == 'Alaska'")

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
67,Alaska,Aleutians East,2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100
68,Alaska,Aleutians West,2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100
69,Alaska,Anchorage,2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100
70,Alaska,Bethel,2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100
71,Alaska,Bristol Bay,2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100
72,Alaska,Denali,2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100
73,Alaska,Dillingham,2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100
74,Alaska,Fairbanks North Star,2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100
75,Alaska,Haines,2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100
76,Alaska,Skagway-Hoonah-Angoon,2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100


In [84]:
regions.query("CountyName == 'Los Angeles'")

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg
204,California,Los Angeles,6037,13,13 West Coast,1,1 Urban,10137915,13535.741916,27.966409,100


In [85]:
regions['county_state'] = regions['CountyName'] + regions.State

In [86]:
regions.head()

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg,county_state
0,Alabama,Autauga,1001,1,01 Deep South,3,3 Rural/Semi-Rural,55416,925.973699,1.913169,100,AutaugaAlabama
1,Alabama,Baldwin,1003,3,03 Gulf Coast,3,3 Rural/Semi-Rural,208563,475.934591,0.983336,100,BaldwinAlabama
2,Alabama,Barbour,1005,1,01 Deep South,3,3 Rural/Semi-Rural,25965,139.162914,0.287527,100,BarbourAlabama
3,Alabama,Bibb,1007,2,02 Appohzarka,3,3 Rural/Semi-Rural,22643,51.047742,0.105471,100,BibbAlabama
4,Alabama,Blount,1009,2,02 Appohzarka,3,3 Rural/Semi-Rural,57704,97.358358,0.201154,100,BlountAlabama


In [87]:
us_birds.shape

(105077, 12)

In [88]:
us_birds.merge(regions).shape

(103138, 23)

Up to merging step. losing 3500 rows on merge, must figure out why

- Alasks was the first culprit, figured that out but still losing ~2K rows
- Lo

In [89]:
reg_set = set(regions.county_state.tolist())
us_set = set(us_birds.county_state.tolist())

extra = []
for key in us_set:
    if key not in reg_set:
        extra.append(key)

In [90]:
extra

['AscensionLouisiana',
 'ManassasVirginia',
 'AlexandriaVirginia',
 'WashingtonLouisiana',
 'St. CharlesLouisiana',
 'Wrangell-PetersburgAlaska',
 'St. BernardLouisiana',
 'SuffolkVirginia',
 'Dona AnaNew Mexico',
 'East Baton RougeLouisiana',
 'VernonLouisiana',
 'TerrebonneLouisiana',
 'St. Louis CityMissouri',
 'CaddoLouisiana',
 'RapidesLouisiana',
 'BienvilleLouisiana',
 'IbervilleLouisiana',
 'MadisonLouisiana',
 'VermilionLouisiana',
 'St. John the BaptistLouisiana',
 'HopewellVirginia',
 'Jefferson DavisLouisiana',
 'Virginia BeachVirginia',
 'EvangelineLouisiana',
 'LafourcheLouisiana',
 'SalemVirginia',
 'West Baton RougeLouisiana',
 'MorehouseLouisiana',
 'WilliamsburgVirginia',
 'AllenLouisiana',
 'PlaqueminesLouisiana',
 'St. JamesLouisiana',
 'JeffersonLouisiana',
 'Newport NewsVirginia',
 'RadfordVirginia',
 'St. TammanyLouisiana',
 'OuachitaLouisiana',
 'De SotoLouisiana',
 'LivingstonLouisiana',
 'AvoyellesLouisiana',
 'HamptonVirginia',
 'East FelicianaLouisiana',
 'C

In [40]:
regions.query("State == 'Alaska'")

Unnamed: 0,State,CountyName,CountyFIPS,Region,RegionName,DensityCat,DensityCatName,Population2016,Density2016_SqMi,Density2016_PFbF,DensityAggreg,county_state
67,Alaska,A l e u t i a n s E a s t B o r o u g h,2013,11,11 Mountain West,3,3 Rural/Semi-Rural,3296,0.472503,0.000976,100,A l e u t i a n s E a s t B o r o u g hAlaska
68,Alaska,A l e u t i a n s W e s t C e n s u s A ...,2016,11,11 Mountain West,3,3 Rural/Semi-Rural,5647,3.614802,0.007469,100,A l e u t i a n s W e s t C e n s u s A ...
69,Alaska,A n c h o r a g e M u n i c i p a l i t y,2020,11,11 Mountain West,1,1 Urban,298192,3843.022384,7.940129,100,A n c h o r a g e M u n i c i p a l i t yAlaska
70,Alaska,B e t h e l C e n s u s A r e a,2050,11,11 Mountain West,3,3 Rural/Semi-Rural,17968,10.62591,0.021954,100,B e t h e l C e n s u s A r e aAlaska
71,Alaska,B r i s t o l B a y B o r o u g h,2060,11,11 Mountain West,3,3 Rural/Semi-Rural,898,1.869604,0.003863,100,B r i s t o l B a y B o r o u g hAlaska
72,Alaska,D e n a l i B o r o u g h,2068,11,11 Mountain West,3,3 Rural/Semi-Rural,1953,0.166698,0.000344,100,D e n a l i B o r o u g hAlaska
73,Alaska,D i l l i n g h a m C e n s u s A r e a,2070,11,11 Mountain West,3,3 Rural/Semi-Rural,4954,6.823104,0.014097,100,D i l l i n g h a m C e n s u s A r e aAlaska
74,Alaska,F a i r b a n k s N o r t h S t a r B o ...,2090,11,11 Mountain West,3,3 Rural/Semi-Rural,100605,1087.797823,2.247516,100,F a i r b a n k s N o r t h S t a r B o ...
75,Alaska,H a i n e s B o r o u g h,2100,14,14 Northwest,3,3 Rural/Semi-Rural,2496,1.08956,0.002251,100,H a i n e s B o r o u g hAlaska
76,Alaska,H o o n a h - A n g o o n C e n s u s A r e a,2105,14,14 Northwest,3,3 Rural/Semi-Rural,2078,1.702029,0.003517,100,H o o n a h - A n g o o n C e n s u s A r ...
