In [1]:
import hashlib, re, time
import numpy as np
import pandas as pd

time_start = time.time()

raw_csv = pd.read_csv('raw_data.csv')
max_colname_len = max(
    [len(col) for col in raw_csv.columns.values]
)
for col in raw_csv.columns.values:
    print('{:{}}<{}>'.format(
        col, max_colname_len+2, raw_csv[col].dtype)
    )

Bird Species  <object>
Distance Bin  <object>
Date          <object>
Time          <object>
Coordinates   <object>
Location      <object>
Habitat Type  <object>
Time Period   <object>
Group Name    <object>
Remarks       <object>


In [2]:
num_locations = len(set(raw_csv['Location']))
num_coords = len(set(raw_csv['Coordinates']))
print('Number of Unique Locations: {}'.format(num_locations))
print('Number of Unique Coordinates: {}'.format(num_coords))

Number of Unique Locations: 207
Number of Unique Coordinates: 270


**First of all, we need to be able to clearly distinguish survey sites.** As from the cell directly above, we cannot reliably use either the number of unique locations or coordinates in order to do this.

Instead, we will use both. Going through the list from the first to last row, _adjacent_ rows with identical locations **or** coordinate values will be grouped together as one survey site. Each survey site is identifed with a hash generated from the cocatenated string of its first row's location plus coordinate and group name.

In [3]:
csv_unique_sites = pd.DataFrame()
old_row = None
current_hash = None

def get_loc_hash(row):
    '''@returns {str} Location Hash'''
    combined_in_bytes = (
        row['Coordinates'] + row['Location'] + row['Group Name']
    ).encode()
    loc_hash = hashlib.md5(combined_in_bytes).hexdigest()
    return loc_hash

def get_new_row(row, loc_hash):
    '''@returns {dict} Dictionary containing values for new row'''
    new_row = dict(row)
    new_row.pop('Date')
    new_row['Location Hash'] = loc_hash
    return new_row

def is_same_site(row1, row2):
    if (row1 is None or row2 is None):
        return False
    else:
        is_same_coords = row1['Coordinates'] == row2['Coordinates']
        is_same_loc = row1['Location'] == row2['Location']
        return is_same_coords or is_same_loc

for _, row in raw_csv.iterrows():
    if not is_same_site(old_row, row):
        current_hash = get_loc_hash(row)
    csv_unique_sites = csv_unique_sites.append(
        get_new_row(row, current_hash),
        ignore_index=True)
    old_row = row

In [4]:
num_locations = len(set(csv_unique_sites['Location Hash']))
print('Number of Unique Location Hashes: {}'.format(num_locations))

Number of Unique Location Hashes: 191


In [5]:
csv_unique_sites.head(3)

Unnamed: 0,Bird Species,Coordinates,Distance Bin,Group Name,Habitat Type,Location,Location Hash,Remarks,Time,Time Period
0,Black-naped Oriole,Latitude: 1.29587 | Longitude: 103.770656,2,Asian Koel,Urban,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,,09 35,09 32 - 09 42
1,Black-naped Oriole,Latitude: 1.29587 | Longitude: 103.770656,3,Asian Koel,Urban,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,,09 40,09 32 - 09 42
2,Black-naped Oriole,Latitude: 1.29587 | Longitude: 103.770656,Heard,Asian Koel,Urban,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,,09 41,09 32 - 09 42


In [6]:
set(csv_unique_sites['Bird Species'])

{'"Chweet chweet" then 4 secs pause, repeat',
 '"Parking car sound" bird',
 '-',
 'Asian Glossy Starling',
 'Asian Koel',
 'Asian Koele',
 'Bird of prey(?)',
 'Bird that makes a low-pitched quack',
 'Bird that sounds like monkey',
 'Black-capped Kingfisher',
 'Black-naped Oriole',
 'Black-naped Oriole ',
 'Black-naped oriole (?)',
 'Brahminy Kite',
 'Brahminy Kite ',
 'Brahminy kite',
 'Cinnamon-headed Green Pigeon',
 'Collared Kingfisher',
 'Collared Kingfisher ',
 'Collared Kingfisher x2',
 'Crimson Sunbird',
 'Crow',
 'Eagle',
 'Eagle ',
 'Eurasian Tree Sparrow',
 'Eurasian Tree Sparrow ',
 'Eurasian Tree Sparrow x3',
 'Flame-breasted sunbird',
 'Jarvan Myna',
 'Javan Myna',
 'Javan Myna ',
 'Javan Myna (x2)',
 'Javan Myna x4',
 'Javan Mynah',
 'Javan myna',
 'Jungle Crow',
 'Kingfisher',
 'Koel',
 'Large black bird',
 'Large grey/black bird, suspected pigeon',
 'Large-billed Crow',
 'None ',
 'Olive-backed Sunbird',
 'Olive-backed Sunbird ',
 'Olive-backed Sunbird (?)',
 'Olive-bac

Next, we want to standardise the formatting of each column, from the right to left. **Starting with the species names, we...**
1. Expand rows with multiple observations, e.g. Javan Myna x4
2. Replace observations with no clear IDs with NaN values
3. Standardise trailing white-spaces, spellings, and capitalisations

In [7]:
better_names = pd.DataFrame()
contains_digits = re.compile('\d')
bad_idents = re.compile(
    '(\`|[Un]nknown|[Ss]uspected|\?|None|^\-$|'
    'sound|chweet|with| w |that|Large black bird|'
    '[Ss]potted [Pp]igeon)'
)

def process_row(row):
    '''@returns {tuple} <repeats: int, row: dict>'''
    row = dict(row)  # none-destructive
    # first, check if this row contains multiple sightings
    if contains_digits.search(row['Bird Species']):
        print('Has multiple: {}'.format(row['Bird Species']))
        repeats = contains_digits.search(row['Bird Species']).group(0)
        repeats = int(repeats)
        row['Bird Species'] = row['Bird Species'].strip()
        last_whitespace_i = row['Bird Species'].rfind(' ')
        row['Bird Species'] = row['Bird Species'][:last_whitespace_i]
    else:
        repeats = 1
    # then, standardise the species naming
    row['Bird Species'] = row['Bird Species'].strip().lower().capitalize()
    # next, check for bad IDs
    if bad_idents.search(row['Bird Species']):
        print('Is bad ID: {}'.format(row['Bird Species']))
        row['Bird Species'] = np.nan
    return (repeats, dict(row))
        
for _, row in csv_unique_sites.iterrows():
    repeats, new_row = process_row(row)
    for _ in range(repeats):
        better_names = better_names.append(new_row, ignore_index=True)

Is bad ID: Spotted pigeon
Is bad ID: None
Is bad ID: Pigeon(?)
Is bad ID: Olive-backed sunbird (?)
Is bad ID: Swiflet (?)
Is bad ID: `
Is bad ID: Sunbird with red crown
Is bad ID: Bird that sounds like monkey
Is bad ID: Bird that sounds like monkey
Is bad ID: Suspected javan myna
Has multiple: Swiflet (x3)
Has multiple: Swiftlet (x2)
Has multiple: Swiftlets (x8)
Has multiple: Swiftlets (x3)
Has multiple: Swiftlets (x2)
Has multiple: Swiftlets (x2)
Has multiple: Swiftlets (x2)
Has multiple: Swiftlet (x2)
Has multiple: Swiftlet (x4)
Has multiple: Javan Myna (x2)
Has multiple: Swiflet (x2)
Has multiple: Swiftlet (x2)
Is bad ID: Spotted pigeon
Is bad ID: Spotted pigeon
Is bad ID: Unknown (might be swiftlet)
Is bad ID: Unknown (might be swiftlet)
Is bad ID: Unknown (might be swiftlet)
Is bad ID: Bird of prey(?)
Is bad ID: Unknown
Is bad ID: Unknown
Has multiple: Swiflet x10
Has multiple: Collared Kingfisher x2
Is bad ID: Unknown bird of prey
Has multiple: Swiflet x2
Has multiple: Swiflet x6

In [8]:
sorted(
    [str(x) for x in list(set(better_names['Bird Species']))]
)  # str conversion because nan is a float, not comparable with str

['Asian glossy starling',
 'Asian koel',
 'Asian koele',
 'Black-capped kingfisher',
 'Black-naped oriole',
 'Brahminy kite',
 'Cinnamon-headed green pigeon',
 'Collared kingfisher',
 'Crimson sunbird',
 'Crow',
 'Eagle',
 'Eurasian tree sparrow',
 'Flame-breasted sunbird',
 'Jarvan myna',
 'Javan myna',
 'Javan mynah',
 'Jungle crow',
 'Kingfisher',
 'Koel',
 'Large-billed crow',
 'Olive-backed sunbird',
 'Olive-winged bulbol',
 'Olve-backed sunbird',
 'Paddyfield pipit',
 'Pink neck green pegion',
 'Pink neck green pigeon',
 'Pink-necked green pigeon',
 'Red junglefowl',
 'Rock pigeon',
 'Rocked pigeon',
 'Scarlet backed flowerpecker',
 'Scarlet-backed flowerpecker',
 'Spotted dove',
 'Stripped tit-babler',
 'Swallow',
 'Swiflet',
 'Swiflets',
 'Swiftlet',
 'Swiftlets',
 'White-crested laughingthrush',
 'Yellow-vented bulbul',
 'Zebra dove',
 'nan']

Now, we standardise spellings...
```
Asian koele -> Asian koel
Crow -> Jungle crow
Javan mynah -> Javan myna
Koel -> Asian koel
Large-billed crow -> Jungle crow
Olive-winged bulbol -> Olive-winged bulbul
Olve-backed sunbird -> Olive-backed sunbird
Pink neck green pegion -> Pink-necked green pigeon
Pink neck green pigeon -> Pink-necked green pigeon
Rocked pigeon -> Rock pigeon
Swiflet -> Swiftlet
Swiflets -> Swiftlet
Swiftlets -> Swiftlet
```

In [9]:
renames = {
    'Asian koel': re.compile('(Asian koele|^Koel$)'),
    'Jungle crow': re.compile('(^Crow$|Large-billed crow)'),
    'Javan myna': re.compile('Javan mynah'),
    'Olive-winged bulbul': re.compile('Olive-winged bulbol'),
    'Olive-backed sunbird': re.compile('Olve-backed sunbird'),
    'Pink-necked green pigeon': re.compile('Pink neck green p[ei]g[ie]on'),
    'Rock pigeon': re.compile('Rocked pigeon'),
    'Swiftlet': re.compile('(Swiflet[s]|Swiftlets)')
}

for _, row in better_names.iterrows():
    for proper_name, has_bad_name in renames.items():
        try:
            if has_bad_name.search(row['Bird Species']):
                print('Caught: {} -> {}'.format(
                    row['Bird Species'], proper_name))
                row['Bird Species'] = proper_name
        except TypeError:
            pass  # because of nan values

Caught: Swiftlets -> Swiftlet
Caught: Rocked pigeon -> Rock pigeon
Caught: Rocked pigeon -> Rock pigeon
Caught: Rocked pigeon -> Rock pigeon
Caught: Rocked pigeon -> Rock pigeon
Caught: Crow -> Jungle crow
Caught: Large-billed crow -> Jungle crow
Caught: Swiflets -> Swiftlet
Caught: Large-billed crow -> Jungle crow
Caught: Crow -> Jungle crow
Caught: Koel -> Asian koel
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Pink neck green pegion -> Pink-necked green pigeon
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Swiftlets -> Swiftlet
Caught: Asian koele -> Asian 

In [10]:
sorted(
    [str(x) for x in list(set(better_names['Bird Species']))]
)  # str conversion because nan is a float, not comparable with str

['Asian glossy starling',
 'Asian koel',
 'Black-capped kingfisher',
 'Black-naped oriole',
 'Brahminy kite',
 'Cinnamon-headed green pigeon',
 'Collared kingfisher',
 'Crimson sunbird',
 'Eagle',
 'Eurasian tree sparrow',
 'Flame-breasted sunbird',
 'Jarvan myna',
 'Javan myna',
 'Jungle crow',
 'Kingfisher',
 'Olive-backed sunbird',
 'Olive-winged bulbul',
 'Paddyfield pipit',
 'Pink-necked green pigeon',
 'Red junglefowl',
 'Rock pigeon',
 'Scarlet backed flowerpecker',
 'Scarlet-backed flowerpecker',
 'Spotted dove',
 'Stripped tit-babler',
 'Swallow',
 'Swiflet',
 'Swiftlet',
 'White-crested laughingthrush',
 'Yellow-vented bulbul',
 'Zebra dove',
 'nan']

In [11]:
set(better_names['Distance Bin'])

{'-',
 nan,
 'Flyby (4)',
 'heard',
 'Did not land',
 'Heard ',
 'flyby',
 '4',
 'In Flight',
 'In flight',
 'Heard',
 '2',
 'in flight',
 '1',
 '3',
 'flyby(3)',
 'Flyby'}

Next, we want to discount all birds in flight or heard but not seen. These observations will have a distance bin of `nan`

In [12]:
bad_distance = re.compile(
    '([Hh]eard|[Ff]lyby|[Ff]light|Did not land|^\-$)'
)

for _, row in better_names.iterrows():
    try:
        if bad_distance.search(row['Distance Bin']):
            row['Distance Bin'] = np.nan
    except TypeError:
        pass  # once again, nan is a float and cannot be searched

In [13]:
set(better_names['Distance Bin'])

{nan, '4', '1', '3', '2'}

In [14]:
set(better_names['Time'])

{'07 45',
 '07 50',
 '0759',
 '08 00',
 '08 06',
 '08 11',
 '08 12',
 '08 14',
 '08 18',
 '08 19',
 '08 22',
 '08 23',
 '08 28',
 '08 29',
 '08 30',
 '08 33',
 '08 37',
 '08 46',
 '08 59',
 '0801',
 '0802',
 '0803',
 '0804',
 '0805',
 '0806',
 '0807',
 '0808',
 '0810',
 '0811',
 '0812',
 '0814',
 '0816',
 '0817',
 '0819',
 '0820',
 '0823',
 '0824',
 '0825',
 '0826',
 '0827',
 '0828',
 '0829',
 '0831',
 '0832',
 '0833',
 '0835',
 '0836',
 '0837',
 '0838',
 '0839',
 '0840',
 '0841',
 '0842',
 '0843',
 '0844',
 '0845',
 '0846',
 '0847',
 '0848',
 '0849',
 '0850',
 '0852',
 '0854',
 '0858',
 '0859',
 '09 00',
 '09 01',
 '09 02',
 '09 04',
 '09 06',
 '09 08',
 '09 09',
 '09 12',
 '09 15',
 '09 19',
 '09 21',
 '09 28',
 '09 31',
 '09 32',
 '09 35',
 '09 36',
 '09 38',
 '09 40',
 '09 41',
 '09 46',
 '09 48',
 '09 51',
 '09 53',
 '09 54',
 '09 55',
 '0900',
 '0901',
 '0902',
 '0903',
 '0904',
 '0905',
 '0906',
 '0907',
 '0908',
 '0909',
 '0910',
 '0911',
 '0912',
 '0914',
 '0915',
 '0916',
 '0

Now, we want to standardise the times

In [15]:
is_dd_space_dd = re.compile('^\d\d \d\d$')
def fix_dd_space_dd(timestr):
    return timestr.replace(' ', '')

is_dd_colon_dd = re.compile('^\d\d:\d\d$')
def fix_dd_colon_dd(timestr):
    return timestr.replace(':', '')

is_d_dot_d = re.compile('^\d\.\d$')
def fix_d_dot_d(timestr):
    timestr = '0{}0'.format(timestr)
    return timestr.replace('.', '')

is_d_colon_dd = re.compile('^\d:\d\d$')
def fix_d_colon_dd(timestr):
    timestr = '0{}'.format(timestr)
    return fix_dd_colon_dd(timestr)

is_d_dot_dd = re.compile('^\d\.\d\d$')
def fix_d_dot_dd(timestr):
    timestr = '0{}'.format(timestr)
    return timestr.replace('.', '')

is_ddd = re.compile('^\d\d\d$')
def fix_ddd(timestr):
    return '0{}'.format(timestr)

is_d = re.compile('^\d$')
def fix_d(timestr):
    return '0{}00'.format(timestr)

time_fixes = {
    is_dd_space_dd: fix_dd_space_dd,
    is_dd_colon_dd: fix_dd_colon_dd,
    is_d_dot_d:     fix_d_dot_d,
    is_d_colon_dd:  fix_d_colon_dd,
    is_d_dot_dd:    fix_d_dot_dd,
    is_ddd:         fix_ddd,
    is_d:           fix_d
}

for _, row in better_names.iterrows():
    for bad_time_re, fixer in time_fixes.items():
        try:
            if bad_time_re.search(row['Time']):
                print('{} -> {}'.format(row['Time'], fixer(row['Time'])))
                row['Time'] = fixer(row['Time'])
        except TypeError:
            pass  # ignore nan values, as usual

09 35 -> 0935
09 40 -> 0940
09 41 -> 0941
09 36 -> 0936
10 07 -> 1007
10 09 -> 1009
10 10 -> 1010
10 36 -> 1036
10 36 -> 1036
10 42 -> 1042
10 38 -> 1038
10 44 -> 1044
10 39 -> 1039
10 39 -> 1039
10 40 -> 1040
10 35 -> 1035
10 55 -> 1055
10 55 -> 1055
11 00 -> 1100
10 56 -> 1056
10 57 -> 1057
11 01 -> 1101
11 02 -> 1102
11 04 -> 1104
10 56 -> 1056
10 58 -> 1058
10 59 -> 1059
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 04 -> 1104
11 05 -> 1105
10 57 -> 1057
11 02 -> 1102
11 03 -> 1103
11 29 -> 1129
11 26 -> 1126
11 34 -> 1134
9:25 -> 0925
9:25 -> 0925
9:25 -> 0925
9:25 -> 0925
9:25 -> 0925
9:25 -> 0925
9:28 -> 0928
9:30 -> 0930
9:30 -> 0930
9:31 -> 0931
9:34 -> 0934
9:35 -> 0935
9:28 -> 0928
9:25 -> 0925
9:32 -> 0932
9:44 -> 0944
9:45 -> 0945
9:47 -> 0947
9:49 -> 0949
9:52 -> 0952
9:49 -> 0949

09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 21 -> 0921
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
09 35 -> 0935
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
10 05 -> 1005
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
07 45 -> 0745
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
08 00 -> 0800
8.25 -> 0825
8.25 -> 0825
8.25 -> 0825
8.25 -> 0825
8.25 -> 08

In [16]:
set(better_names['Time'])

{'0740',
 '0745',
 '0748',
 '0750',
 '0759',
 '0800',
 '0801',
 '0802',
 '0803',
 '0804',
 '0805',
 '0806',
 '0807',
 '0808',
 '0809',
 '0810',
 '0811',
 '0812',
 '0813',
 '0814',
 '0816',
 '0817',
 '0818',
 '0819',
 '0820',
 '0821',
 '0822',
 '0823',
 '0824',
 '0825',
 '0826',
 '0827',
 '0828',
 '0829',
 '0830',
 '0831',
 '0832',
 '0833',
 '0834',
 '0835',
 '0836',
 '0837',
 '0838',
 '0839',
 '0840',
 '0841',
 '0842',
 '0843',
 '0844',
 '0845',
 '0846',
 '0847',
 '0848',
 '0849',
 '0850',
 '0851',
 '0852',
 '0853',
 '0854',
 '0855',
 '0856',
 '0857',
 '0858',
 '0859',
 '0900',
 '0901',
 '0902',
 '0903',
 '0904',
 '0905',
 '0906',
 '0907',
 '0908',
 '0909',
 '0910',
 '0911',
 '0912',
 '0913',
 '0914',
 '0915',
 '0916',
 '0917',
 '0918',
 '0919',
 '0920',
 '0921',
 '0922',
 '0923',
 '0924',
 '0925',
 '0926',
 '0927',
 '0928',
 '0929',
 '0930',
 '0931',
 '0932',
 '0933',
 '0934',
 '0935',
 '0936',
 '0937',
 '0938',
 '0939',
 '0940',
 '0941',
 '0942',
 '0943',
 '0944',
 '0945',
 '0946',
 

Next up are the coordinates, which will be a bit challenging...

To start, we standardise the coordinates and locations of each unique survey site (identified by it's location hash, generated earlier). 

This fixes typos, and the cases where a user drag-copied location and coordinate cells, causing google sheets to increment the last character if it is a number.

In [17]:
num_locations = len(set(better_names['Location']))
num_coords = len(set(better_names['Coordinates']))
num_unique_locations = len(set(better_names['Location Hash']))
print('Number of Unique Locations: {}'.format(num_locations))
print('Number of Unique Coordinates: {}'.format(num_coords))
print('Number of Unique Location Hashes: {}'.format(num_unique_locations))

Number of Unique Locations: 207
Number of Unique Coordinates: 270
Number of Unique Location Hashes: 191


In [18]:
old_pointer = {'hash': '', 'coord': '', 'loc': ''}

for _, row in better_names.iterrows():
    if row['Location Hash'] != old_pointer['hash']:
        old_pointer['hash'] = row['Location Hash']
        old_pointer['coord'] = row['Coordinates']
        old_pointer['loc'] = row['Location']
    else:
        row['Coordinates'] = old_pointer['coord']
        row['Location'] = old_pointer['loc']

In [19]:
num_locations = len(set(better_names['Location']))
num_coords = len(set(better_names['Coordinates']))
num_unique_locations = len(set(better_names['Location Hash']))
print('Number of Unique Locations: {}'.format(num_locations))
print('Number of Unique Coordinates: {}'.format(num_coords))
print('Number of Unique Location Hashes: {}'.format(num_unique_locations))

Number of Unique Locations: 189
Number of Unique Coordinates: 190
Number of Unique Location Hashes: 191


It seems there are two duplicate coordinates, and two pairs of duplicate locations (or a triplet of identical locations). Let's figure which those are.

In [20]:
from collections import Counter

old_pointer = {'hash': '', 'coord': '', 'loc': ''}
locations = []
coords = []

for _, row in better_names.iterrows():
    if row['Location Hash'] != old_pointer['hash']:
        # add each new location to list
        locations.append(row['Location'])
        coords.append(row['Coordinates'])
        # reset the old_pointer
        old_pointer['hash'] = row['Location Hash']
        old_pointer['loc'] = row['Location']
        old_pointer['coord'] = row['Coordinates']
        
location_counts = Counter(locations)
coord_counts = Counter(coords)

for loc, reps in location_counts.items():
    if reps > 1:
        print('Location \'{}\' occured {} times'.format(
            loc, reps
        ))

for coord, reps in coord_counts.items():
    if reps > 1:
        print('Coordinate {} occured {} times'.format(
            coord, reps
        ))

Location 'Ridge View Residential College' occured 2 times
Location 'Prince George's Park Residences' occured 2 times
Coordinate 1°18'14"N 103°46'25"E occured 2 times


Repeat of PGP and RVRC locations are acceptable. 

The coordinate duplicates are for town green. These two survey sites were remarked: _"Town Green data points were very close to one another - there might be potential recounting of bird species."_

Now, to standardise the coordinates. We need to,

1. Remove verbose labellings e.g. 'Lat', 'Lon'
2. Convert DMS notation to Decimal degrees

In [21]:
set(better_names['Coordinates'])

{'1.291351, 103.776201',
 '1.291773, 103.781429',
 '1.292301, 103.779346',
 '1.29237, 103.77613',
 '1.29267, 103.78143',
 '1.293138, 103.778175',
 '1.2934, 103.78172',
 '1.293506,103.780347',
 '1.293629, 103.776953',
 '1.293647,103.771414',
 '1.293939,103.772468',
 '1.294235,103.7701660',
 '1.294779,103.775093',
 '1.295536,103.782199',
 '1.295884,103.780312',
 '1.29594, 103.77472',
 '1.296379,103.781756',
 '1.296686,103.778048',
 '1.297098,103.782353',
 '1.297178,103.773651',
 '1.297279,103.779146',
 '1.297714,103.780041',
 '1.298667,103.778781',
 '1.298967,103.778660',
 '1.30034, 103.77172',
 '1.30116, 103.77497',
 '1.30143, 103.77119',
 '1.3019, 103.77453',
 '1.303562, 103.774278',
 '1.303789, 103.775625',
 '1.303894, 103.775190',
 '1.30413, 103.77473',
 '1.304449, 103.774417',
 '1.304757,103.773307',
 '1.304759,103.77329',
 '1.304951, 103.771645',
 '1.305037, 103.772685',
 '1.305117,103.773674',
 '1.305254, 103.774441',
 '1.305516,103.774155',
 '1.305587,103.773146',
 '1.305734, 103

In [22]:
is_word = re.compile('[a-zA-Z]{2,}')

for _, row in better_names.iterrows():
    coord = row['Coordinates']
    coord = is_word.sub('', coord)
    coord = coord.replace('|', ',')
    coord = coord.replace(':', '')
    coord = coord.replace(' ', '')
    row['Coordinates'] = coord

In [23]:
set(better_names['Coordinates'])

{'1.29076,103.78033',
 '1.291,103.777',
 '1.291,103.781',
 '1.291351,103.776201',
 '1.291773,103.781429',
 '1.29223583,103.77424277',
 '1.292301,103.779346',
 '1.29237,103.77613',
 '1.292586,103.773483',
 '1.29267,103.78143',
 '1.29289434,103.77311567',
 '1.293,103.773',
 '1.293138,103.778175',
 '1.2934,103.78172',
 '1.293506,103.780347',
 '1.293629,103.776953',
 '1.293647,103.771414',
 '1.29369,103.77657',
 '1.293939,103.772468',
 '1.294,103.777',
 '1.294235,103.7701660',
 '1.294425,103.774844',
 '1.294768,103.784733',
 '1.294779,103.775093',
 '1.295,103.770',
 '1.295536,103.782199',
 '1.295547,103.77208',
 '1.29570838,103.7696633',
 '1.29587,103.770656',
 '1.295884,103.780312',
 '1.29594,103.77472',
 '1.296,103.780',
 '1.296379,103.781756',
 '1.296441,103.778897',
 '1.2964774923,103.7740066',
 '1.296686,103.778048',
 '1.29687935,103.7725442',
 '1.29698,103.77145',
 '1.297,103.778',
 '1.297,103.781',
 '1.297,103.783',
 '1.297098,103.782353',
 '1.29711,103.77396',
 '1.297178,103.773651

Now,

$$\mathrm{D_{dec}} = 
    \mathrm{D} + \frac{\mathrm{M}}{60} + \frac{\mathrm{S}}{3600}$$
    
Note that some seconds are recorded to 0 d.p. precision, others to 1 d.p.

$$ 1'' = 0.0002\bar{7}^{\ \circ} $$
$$ 0.1'' = 0.00002\bar{7}^{\ \circ} \mathrm{(6\ d.p.)}$$

Therefore, we round off to 6 d.p., _if_ the decimal degrees exceeds 6 d.p.

In [24]:
def get_dms(dms_coords):
    digits = re.compile('\d+\.?\d*')
    return tuple(map(float, digits.findall(dms_coords)))

def get_decimal(dms_xs):
    lat_d, lat_m, lat_s = dms_xs[:3]
    lon_d, lon_m, lon_s = dms_xs[3:]
    lat = lat_d + lat_m/60 + lat_s/3600
    lon = lon_d + lon_m/60 + lon_s/3600
    lat, lon = round(lat, 6), round(lon, 6)
    return '{},{}'.format(lat, lon)

for _, row in better_names.iterrows():
    if 'N' in row['Coordinates'] or 'E' in row['Coordinates']:
        dms = get_dms(row['Coordinates'])
        decimal = get_decimal(dms)
        print('{} -> {}'.format(row['Coordinates'], decimal))
        row['Coordinates'] = decimal

1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°17'40"N103°46'53"E -> 1.294444,103.781389
1°18'9"N103°46'17"E -> 1.3025,103.771389
1°18'9"N103°46'17"E -> 1.3025,103.771389
1°18'14"N103°46'25"E -> 1.303889,103.773611
1°18'14"N103°46'25"E -> 1.303889,103.773611
1°18'14"N103°46'25"E -> 1.303889,103.773611
1°18'14"N103°46'25"E -> 1.303889,103.773611
1°18'14"N103°46'25"E -> 1.303889,103.773611
1°17'49"N103°46'55"E -> 1.296944,103.781944
1°17'49"N103°46'55"E -> 1.296944,103.781944
1°17'49"N103°46'55"E -> 1.296944,103.7

Finally, to wrap up the coordinate clean-up, we split lat and lon into two separate columns.

In [25]:
latlon_df = pd.DataFrame()

for _, row in better_names.iterrows():
    row = dict(row)
    coords = row.pop('Coordinates')
    lat, lon = coords.split(',')
    row['Latitude'] = lat
    row['Longitude'] = lon
    latlon_df = latlon_df.append(row, ignore_index=True)

In [26]:
latlon_df.head(3)

Unnamed: 0,Bird Species,Distance Bin,Group Name,Habitat Type,Latitude,Location,Location Hash,Longitude,Remarks,Time,Time Period
0,Black-naped oriole,2.0,Asian Koel,Urban,1.29587,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,103.770656,,935,09 32 - 09 42
1,Black-naped oriole,3.0,Asian Koel,Urban,1.29587,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,103.770656,,940,09 32 - 09 42
2,Black-naped oriole,,Asian Koel,Urban,1.29587,8 Kent Ridge Drive (Just outside Ventus),8b577534acf524e9fac700ce1bb8753a,103.770656,,941,09 32 - 09 42


In [27]:
tuple(latlon_df.columns.values)

('Bird Species',
 'Distance Bin',
 'Group Name',
 'Habitat Type',
 'Latitude',
 'Location',
 'Location Hash',
 'Longitude',
 'Remarks',
 'Time',
 'Time Period')

In [28]:
set(latlon_df['Habitat Type'])

{'field',
 'Covered walkway',
 nan,
 'Urban (Pavement)',
 'Angsana tree canopy',
 'Urban (Under shade of building)',
 'On town green',
 'Urban (Tree)',
 'urban',
 'NIL(in flight)',
 'Urban (Rooftop)',
 'forest',
 'Road',
 'Urban (With tall trees around)',
 'Tree in front of Pizza Hut',
 'Urban (Small tree)',
 'NIL (heard)',
 'Tree (5m high in tree)',
 'NIL (heard only)',
 'Tree (30m high in tree)',
 'Pavement alongside shrubs',
 'Forest',
 'Tree (10m high in tree)',
 'Urban (Indoor carpark)',
 'Urban (Building; 3rd Storey)',
 'Tree (20m high in tree)',
 'Tree next to pavement',
 'garden',
 'Urban (Top of lamppost)',
 'Grass',
 'Urban (In Drain)',
 'Field ',
 'Shrubs',
 'Building opposite bus stop',
 'Tree',
 'urban with some vegetation',
 'Trees right outside Pizza Hut',
 'Garden',
 'Tree (8m high in tree)',
 'NIL (flight)',
 'Tree along pavement',
 'NIL(Iin flight)',
 'Tree opposite road',
 'NIL (in flight)',
 'Poolside',
 'Urban',
 'On town green closer to signboard',
 'Under tree',


In [29]:
for _, row in latlon_df.iterrows():
    try:
        if 'NIL' in row['Habitat Type']:
            print('Habitat Type: {}; Distance Bin: {}'.format(
                row['Habitat Type'], row['Distance Bin']
            ))
    except TypeError:
        pass

Habitat Type: NIL (flight); Distance Bin: 1
Habitat Type: NIL (flight); Distance Bin: 1
Habitat Type: NIL (flight); Distance Bin: 1
Habitat Type: NIL (flight); Distance Bin: 3
Habitat Type: NIL (flight); Distance Bin: 2
Habitat Type: NIL (in flight); Distance Bin: 4
Habitat Type: NIL (in flight); Distance Bin: 4
Habitat Type: NIL (in flight); Distance Bin: 1
Habitat Type: NIL(in flight); Distance Bin: 2
Habitat Type: NIL(in flight); Distance Bin: 2
Habitat Type: NIL (heard); Distance Bin: 2
Habitat Type: NIL (heard); Distance Bin: 2
Habitat Type: NIL(Iin flight); Distance Bin: 2
Habitat Type: NIL (in flight); Distance Bin: 3
Habitat Type: NIL (heard only); Distance Bin: 2
Habitat Type: NIL (in flight); Distance Bin: 2


Next up we have the 'Habitat Type' column. This column is largely useless, though some groups have recorded 'inflight' or 'heard but not seen' data here---while leaving legitimate values in the distance bin.

Since we are using `nan` values in the distance bin to signify inflights and seen-but-not-heards, we need to do a bit of modifying. Afterwards, the habitat column can be ignored.

In [30]:
for _, row in latlon_df.iterrows():
    try:
        if 'NIL' in row['Habitat Type']:
            row['Distance Bin'] = np.nan
    except TypeError:
        pass  # if row['Habitat Type'] == np.nan

In [31]:
# check to see the changes...

for _, row in latlon_df.iterrows():
    try:
        if 'NIL' in row['Habitat Type']:
            print('Habitat Type: {}; Distance Bin: {}'.format(
                row['Habitat Type'], row['Distance Bin']
            ))
    except TypeError:
        pass

Habitat Type: NIL (flight); Distance Bin: nan
Habitat Type: NIL (flight); Distance Bin: nan
Habitat Type: NIL (flight); Distance Bin: nan
Habitat Type: NIL (flight); Distance Bin: nan
Habitat Type: NIL (flight); Distance Bin: nan
Habitat Type: NIL (in flight); Distance Bin: nan
Habitat Type: NIL (in flight); Distance Bin: nan
Habitat Type: NIL (in flight); Distance Bin: nan
Habitat Type: NIL(in flight); Distance Bin: nan
Habitat Type: NIL(in flight); Distance Bin: nan
Habitat Type: NIL (heard); Distance Bin: nan
Habitat Type: NIL (heard); Distance Bin: nan
Habitat Type: NIL(Iin flight); Distance Bin: nan
Habitat Type: NIL (in flight); Distance Bin: nan
Habitat Type: NIL (heard only); Distance Bin: nan
Habitat Type: NIL (in flight); Distance Bin: nan


Similarly, we inspect the remarks to see if we might need to modify any rows based on the information within.

In [32]:
for _, row in latlon_df.iterrows():
    # np.nan is of type float
    if type(row['Remarks']) is not float:
        print(row)

Bird Species                                 Asian koel
Distance Bin                                          4
Group Name                                   Asian Koel
Habitat Type                                      Urban
Latitude                                       1.295547
Location         11 Arts Link (Outside AS5 facing CELC)
Location Hash          bf8575577c1f6d925e9b5aa22dbf4d9d
Longitude                                     103.77208
Remarks                                          Female
Time                                               1009
Time Period                               10 05 - 10 15
Name: 5, dtype: object
Bird Species                                              Swiftlet
Distance Bin                                                   NaN
Group Name                                              Asian Koel
Habitat Type                                                 Urban
Latitude                                                  1.292586
Location         15 Kent R

Name: 505, dtype: object
Bird Species                   Olive-backed sunbird
Distance Bin                                      2
Group Name                             Ashy Minivet
Habitat Type                Tree (10m high in tree)
Latitude                                    1.29625
Location          Beside bend along Science Drive 4
Location Hash      7e8a6ec7d3b6212d87ee0fd29331492d
Longitude                                103.778333
Remarks          moving among branches of same tree
Time                                           0955
Time Period                               0951-1001
Name: 506, dtype: object
Bird Species                   Olive-backed sunbird
Distance Bin                                      2
Group Name                             Ashy Minivet
Habitat Type                 Tree (5m high in tree)
Latitude                                    1.29625
Location          Beside bend along Science Drive 4
Location Hash      7e8a6ec7d3b6212d87ee0fd29331492d
Longitude     

Name: 660, dtype: object
Bird Species                                  Olive-backed sunbird
Distance Bin                                                   NaN
Group Name                                        Spotted Wood Owl
Habitat Type                                           Garden/Park
Latitude                                                  1.307031
Location                      In the middle of the Yale-NUS garden
Location Hash                     0de35806adcdaeb4f1505149ff8787e4
Longitude                                               103.772207
Remarks          180 degree vision only, blocked by Yale-NUS bu...
Time                                                          0841
Time Period                                              0834-0844
Name: 661, dtype: object
Bird Species                              Pink-necked green pigeon
Distance Bin                                                     2
Group Name                                        Spotted Wood Owl
Habitat Type

Name: 941, dtype: object
Bird Species                                           Jungle crow
Distance Bin                                                     1
Group Name                                           Brahminy Kite
Habitat Type                        Urban (With tall trees around)
Latitude                                                  1.293629
Location                         Prince George's Park, At the bend
Location Hash                     5746498679240216827612e094ada637
Longitude                                               103.776953
Remarks          Construction Site opposite, with backlighting ...
Time                                                          0941
Time Period                                            0940 - 0950
Name: 942, dtype: object
Bird Species                                            Javan myna
Distance Bin                                                     2
Group Name                                           Brahminy Kite
Habitat Type

Name: 1220, dtype: object
Bird Species                               Javan myna
Distance Bin                                        1
Group Name                         Coppersmith Barbet
Habitat Type                                    Urban
Latitude                                      1.29963
Location         Engineering Drive 1, T-Lab (Carpark)
Location Hash        501cd9cde5829330324e3528d2bba51d
Longitude                                   103.77212
Remarks                     Half blocked by buildings
Time                                             0740
Time Period                               7.40 - 7.50
Name: 1221, dtype: object
Bird Species                     Yellow-vented bulbul
Distance Bin                                        2
Group Name                         Coppersmith Barbet
Habitat Type                                    Urban
Latitude                                      1.29963
Location         Engineering Drive 1, T-Lab (Carpark)
Location Hash        501cd9cde

Visual inspection of the remarks column shows that some heard-but-not-seen and bad identifications have not had their distance bins or bird species columns respectively set to `nan`. Namely, they are:

```
Sound heard but not observed @ b39d42986a259bde849d4bd0ef74c4df  
Heard sound but not observed, 180 degree view @ 332d8684a20391d12ae81d9d8e0aa1c9  
Identity cannot be confirmed, but features see... @ c79289b16fa0a087f2ef63771d8dfe73
```

In [33]:
for _, row in latlon_df.iterrows():
    if (type(row['Remarks']) == str and (
        (
            'Sound heard but not observed' in row['Remarks'] or
            'Heard sound but not observed' in row['Remarks']
        ))):
        print('Caught: {}'.format(row['Remarks']))
        row['Distance Bin'] = np.nan
    elif (type(row['Remarks']) == str and
         'Identity cannot be confirmed' in row['Remarks']):
        row['Bird Species'] == np.nan
        print('Caught: {}'.format(row['Remarks']))

Caught: Sound heard but not observed
Caught: Heard sound but not observed, 180 degree view
Caught: Identity cannot be confirmed, but features seem to match closely with this species, found in tree
Caught: Identity cannot be confirmed, but features seem to match closely with this species, found in tree
Caught: Identity cannot be confirmed, but features seem to match closely with this species, found in tree


Lastly, we standardise the time period. Since all surveys are 10 minutes long, we can truncate the later bound of the time periods.

In [34]:
set(latlon_df['Time Period'])

{'07 45 - 07 55',
 '07 50 - 08 00',
 '0758-0808',
 '08 00 - 08 10',
 '08 06 - 08 16',
 '08 22 - 08 32',
 '08 37 - 08 47',
 '0805-0815',
 '0805-0816',
 '0807-0817',
 '0811 - 0821',
 '0817-0827',
 '0818-0828',
 '0821-0831',
 '0825-0835',
 '0828 - 0838',
 '0834-0844',
 '0835-0845',
 '0840-0850',
 '0842-0852',
 '0846 - 0856',
 '0848-0858',
 '0850-0900',
 '0853-0903',
 '0859 - 0909',
 '09 00 - 09 10',
 '09 21 - 09 31',
 '09 32 - 09 42',
 '09 35 - 09 45',
 '09.33-09.43',
 '0900-0910',
 '0901-0911',
 '0903 - 0912',
 '0905 - 0915',
 '0906-0916',
 '0910-0920',
 '0912 - 1012',
 '0913-0923',
 '0914-0924',
 '0915-0925',
 '0917-0927',
 '0920 - 0930',
 '0922-0932',
 '0924 - 0934',
 '0925-0935',
 '0928 - 0938',
 '0931-0941',
 '0932-0942',
 '0934-0944',
 '0935-0945',
 '0936-0946',
 '0937-0947',
 '0938-0948',
 '0939 - 0949',
 '0940 - 0950',
 '0940-0950',
 '0949-0959',
 '0951-1001',
 '0953-1003',
 '0956-1006',
 '0956-1007',
 '09:25-09:35',
 '09:27-09:37',
 '09:30 - 09:40',
 '09:34-09:44',
 '09:46 - 09:5

In [35]:
# we can reuse the functions from time_fixes from when
#     we sanitised the column 'Time' earlier. Hoever,
#     there is a new format HH.MM to include.
time_fixes[re.compile('^\d\d\.\d\d$')] = (
    lambda timestr: timestr.replace('.', '')
)

for _, row in latlon_df.iterrows():
    time_ = row['Time Period'].split('-')[0].strip()
    # we can reuse the functions earlier from the column 'Time'
    for bad_time_re, fixer in time_fixes.items():
        if bad_time_re.search(time_):
            time_ = fixer(time_)
    row['Time Period'] = time_

In [36]:
set(latlon_df['Time Period'])

{'0740',
 '0744',
 '0745',
 '0750',
 '0758',
 '0800',
 '0802',
 '0805',
 '0806',
 '0807',
 '0810',
 '0811',
 '0814',
 '0817',
 '0818',
 '0821',
 '0822',
 '0825',
 '0828',
 '0834',
 '0835',
 '0837',
 '0840',
 '0842',
 '0844',
 '0845',
 '0846',
 '0847',
 '0848',
 '0850',
 '0853',
 '0859',
 '0900',
 '0901',
 '0903',
 '0905',
 '0906',
 '0908',
 '0910',
 '0912',
 '0913',
 '0914',
 '0915',
 '0917',
 '0919',
 '0920',
 '0921',
 '0922',
 '0924',
 '0925',
 '0927',
 '0928',
 '0930',
 '0931',
 '0932',
 '0933',
 '0934',
 '0935',
 '0936',
 '0937',
 '0938',
 '0939',
 '0940',
 '0944',
 '0945',
 '0946',
 '0949',
 '0950',
 '0951',
 '0953',
 '0956',
 '1000',
 '1001',
 '1003',
 '1005',
 '1007',
 '1011',
 '1012',
 '1013',
 '1014',
 '1016',
 '1018',
 '1019',
 '1020',
 '1021',
 '1022',
 '1027',
 '1028',
 '1029',
 '1030',
 '1031',
 '1033',
 '1035',
 '1036',
 '1037',
 '1039',
 '1040',
 '1042',
 '1044',
 '1045',
 '1047',
 '1048',
 '1050',
 '1052',
 '1053',
 '1055',
 '1057',
 '1058',
 '1059',
 '1100',
 '1102',
 

In [37]:
for name in latlon_df.columns.values:
    print(set(latlon_df[name]))

{nan, 'Yellow-vented bulbul', 'Flame-breasted sunbird', 'Scarlet backed flowerpecker', 'Paddyfield pipit', 'Black-naped oriole', 'Cinnamon-headed green pigeon', 'Kingfisher', 'Crimson sunbird', 'Collared kingfisher', 'Stripped tit-babler', 'Jungle crow', 'Olive-backed sunbird', 'Eagle', 'Jarvan myna', 'Scarlet-backed flowerpecker', 'Olive-winged bulbul', 'Swiflet', 'Spotted dove', 'Swiftlet', 'Red junglefowl', 'Swallow', 'Javan myna', 'Rock pigeon', 'Eurasian tree sparrow', 'White-crested laughingthrush', 'Pink-necked green pigeon', 'Asian glossy starling', 'Brahminy kite', 'Black-capped kingfisher', 'Zebra dove', 'Asian koel'}
{nan, '4', '1', '3', '2'}
{'Collared Kingfisher', 'Crimson Sunbird', 'Pacific Swallow', 'Coppersmith Barbet', 'Asian Koel', 'Spotted Wood Owl', 'Brahminy Kite', 'Banded Woodpecker ', 'Spotted Dove', 'Ashy Minivet'}
{'field', 'Covered walkway', nan, 'Urban (Pavement)', 'Angsana tree canopy', 'Urban (Under shade of building)', 'On town green', 'Urban (Tree)', 'urb

Before we save to csv, reorder the columns for presentation's sake

In [38]:
latlon_df.columns.tolist()

['Bird Species',
 'Distance Bin',
 'Group Name',
 'Habitat Type',
 'Latitude',
 'Location',
 'Location Hash',
 'Longitude',
 'Remarks',
 'Time',
 'Time Period']

In [39]:
latlon_df = latlon_df[[
    'Bird Species',
    'Distance Bin',
    'Time',
    'Location Hash',
    'Latitude',
    'Longitude',
    'Time Period',
    'Group Name',
    'Location',
    'Habitat Type',
    'Remarks'
]]

In [40]:
latlon_df.to_csv('data.csv')
print('Time taken: {:.1f}s'.format(time.time() - time_start))

Time taken: 23.8s
