In [1]:
import re
import pandas as pd
df = pd.read_csv('data_raw.csv')
tuple(df.columns.values)

('Bird Species',
 'Distance Bin',
 'Date',
 'Time',
 'Coordinates',
 'Location',
 'Habitat Type',
 'Time Period',
 'Group Name',
 'Remarks')

# Standardise Location
Use the 'Location' column instead of 'Coordinates' because there's actually less variation and typos in that column. Besides, accuracy of coordinates entered by some groups are not good enough to use.

In [2]:
# There is one individual in one group that added in misc comments into their location column
# Manually target those rows to standardise
target_coords = (
    '1.303894, 103.775190', 
    '1.305994, 103.773311', 
    '1.306042, 103.774040', 
    '1.306989, 103.773073')

lagging_pointer = {'coords': '', 'loc': ''}  # tuple< current coord, current location >
for _, row in df.iterrows():
    current_pointer = {'coords': row[4], 'loc': row[5]}
    if current_pointer['coords'] in target_coords:
        if current_pointer['coords'] == lagging_pointer['coords']:
            row[5] = lagging_pointer['loc']
    lagging_pointer = {'coords': row[4], 'loc': row[5]}

In [3]:
df.iloc[965:1000]

Unnamed: 0,Bird Species,Distance Bin,Date,Time,Coordinates,Location,Habitat Type,Time Period,Group Name,Remarks
965,Javan Myna,2,1 Mar,09 08,"1.305037, 103.772685",Edge of Town Green near Pizza Hut,On town green,0828 - 0838,Ashy Minivet,
966,Javan Myna,2,1 Mar,09 08,"1.305037, 103.772685",Edge of Town Green near Pizza Hut,On town green closer to signboard,0828 - 0838,Ashy Minivet,
967,Javan Myna,2,1 Mar,09 09,"1.305037, 103.772685",Edge of Town Green near Pizza Hut,NIL(in flight),0828 - 0838,Ashy Minivet,
968,Javan Myna,1,1 Mar,09 12,"1.303894, 103.775190",Road/Parking space behind Create Tower,Road,0912 - 1012,Ashy Minivet,
969,Yellow-vented Bulbul,2,1 Mar,09 12,"1.303894, 103.775190",Road/Parking space behind Create Tower,NIL (heard),0912 - 1012,Ashy Minivet,Heard
970,Javan Myna,1,1 Mar,09 15,"1.303894, 103.775190",Road/Parking space behind Create Tower,Pavement,0912 - 1012,Ashy Minivet,
971,Javan Myna,2,1 Mar,09 19,"1.303894, 103.775190",Road/Parking space behind Create Tower,Tree,0912 - 1012,Ashy Minivet,
972,Javan Myna,1,2 Mar,09 28,"1.305994, 103.773311","30 College Ave E, Singapore 138599",Pavement,0928 - 0938,Ashy Minivet,
973,Yellow-vented Bulbul,2,2 Mar,09 31,"1.305994, 103.773311","30 College Ave E, Singapore 138599",Tree,0928 - 0938,Ashy Minivet,
974,Olive-backed Sunbird,2,2 Mar,09 32,"1.305994, 103.773311","30 College Ave E, Singapore 138599",NIL (heard),0928 - 0938,Ashy Minivet,Heard


In [4]:
# Remove trailing whitespaces
for _, row in df.iterrows():
    row[5] = row[5].strip()

In [5]:
# Fix the one guy who entered '...opposite Engineering E4' to E13 because they dragged the cell
for _, row in df.iterrows():
    if 'In the middle of YIH and CLB bus stop, opposite Engineering E' in row[5]:
        row[5] = 'In the middle of YIH and CLB bus stop, opposite Engineering E4'

# Check group sites

In [6]:
print('Number of unique locations: {}'.format(
    len(set(df['Location'])))) # if this is not 200, it is bad!

asian_koel = df[(df['Group Name'] == 'Asian Koel')]
ashy_minivet = df[(df['Group Name'] == 'Ashy Minivet')]
banded_woodpecker = df[(df['Group Name'] == 'Banded Woodpecker ')]
brahminy_kite = df[(df['Group Name'] == 'Brahminy Kite')]
collared_kingfisher = df[(df['Group Name'] == 'Collared Kingfisher')]
coppersmith_barbet = df[(df['Group Name'] == 'Coppersmith Barbet')]
crimson_sunbird = df[(df['Group Name'] == 'Crimson Sunbird')]
pacific_swallow = df[(df['Group Name'] == 'Pacific Swallow')]
spotted_dove = df[(df['Group Name'] == 'Spotted Dove')]
spotted_wood_owl = df[(df['Group Name'] == 'Spotted Wood Owl')]

groups = (asian_koel, ashy_minivet, banded_woodpecker,
          brahminy_kite, collared_kingfisher, coppersmith_barbet,
          crimson_sunbird, pacific_swallow, spotted_dove, spotted_wood_owl)

print('Number of rows: {}'.format(len(df)))
for group_num, group in enumerate(groups):
    print('Number of rows by group {}: {}'.format(group_num+1, len(group)))
    print('                locations: {}'.format(len(set(group['Location']))))

Number of unique locations: 195
Number of rows: 1722
Number of rows by group 1: 125
                locations: 20
Number of rows by group 2: 99
                locations: 20
Number of rows by group 3: 173
                locations: 19
Number of rows by group 4: 189
                locations: 20
Number of rows by group 5: 132
                locations: 18
Number of rows by group 6: 327
                locations: 20
Number of rows by group 7: 170
                locations: 20
Number of rows by group 8: 217
                locations: 20
Number of rows by group 9: 150
                locations: 20
Number of rows by group 10: 140
                locations: 20


# Remove unidentified and untracked
If the birds are not of the following species:
1. Javan myna
2. Yellow-vented bulbul
3. Rock pigeon
4. Olive-backed sunbird
5. Black-naped Oriole

Remove them from the dataframe. Additionally, fix typos, standardise capitalisation and hyphenisations, trailing whitespaces, so that we can programmatically differentiate between species well.

Also separate rows counting more than one observation, e.g. 'Javan Myna x4'

In [7]:
set(df['Bird Species'])

{'"Chweet chweet" then 4 secs pause, repeat',
 '"Parking car sound" bird',
 '-',
 'Asian Glossy Starling',
 'Asian Koel',
 'Asian Koele',
 'Bird of prey(?)',
 'Bird that makes a low-pitched quack',
 'Bird that sounds like monkey',
 'Black-capped Kingfisher',
 'Black-naped Oriole',
 'Black-naped Oriole ',
 'Black-naped oriole (?)',
 'Brahminy Kite',
 'Brahminy Kite ',
 'Brahminy kite',
 'Cinnamon-headed Green Pigeon',
 'Collared Kingfisher',
 'Collared Kingfisher ',
 'Collared Kingfisher x2',
 'Crimson Sunbird',
 'Crow',
 'Eagle',
 'Eagle ',
 'Eurasian Tree Sparrow',
 'Eurasian Tree Sparrow ',
 'Eurasian Tree Sparrow x3',
 'Flame-breasted sunbird',
 'Jarvan Myna',
 'Javan Myna',
 'Javan Myna ',
 'Javan Myna (x2)',
 'Javan Myna x4',
 'Javan Mynah',
 'Javan myna',
 'Jungle Crow',
 'Kingfisher',
 'Koel',
 'Large black bird',
 'Large grey/black bird, suspected pigeon',
 'Large-billed Crow',
 'None ',
 'Olive-backed Sunbird',
 'Olive-backed Sunbird ',
 'Olive-backed Sunbird (?)',
 'Olive-bac

In [8]:
myna_re = re.compile('[Mm]yna')
oriole_re = re.compile('[Oo]riole')
sunbird_re = re.compile('Ol[i]?ve-backed')
pigeon_re = re.compile('Rock')
bulbul_re = re.compile('[Yy]ellow-[Vv]ented')

tracked_only_df = pd.DataFrame()

for _, row in df.iterrows():
    # check if row represents > 1 observation (use later)
    has_digit = re.search('\d', row[0])
    # standardise species common names
    if myna_re.search(row[0]):
        row[0] = 'Javan myna'
    elif oriole_re.search(row[0]):
        row[0] = 'Blacked-naped oriole'
    elif sunbird_re.search(row[0]):
        row[0] = 'Olive-backed sunbird'
    elif pigeon_re.search(row[0]):
        row[0] = 'Rock pigeon'
    elif bulbul_re.search(row[0]):
        row[0] = 'Yellow-vented bulbul'
    else:
        # if not the above, goto next iteration
        continue
    # check for rows representing > 1 observations
    if has_digit:
        repeats = int(has_digit.group(0))
        for _ in range(repeats):
            tracked_only_df = tracked_only_df.append(row)
    else:
        tracked_only_df = tracked_only_df.append(row)

In [9]:
set(tracked_only_df['Bird Species'])

{'Blacked-naped oriole',
 'Javan myna',
 'Olive-backed sunbird',
 'Rock pigeon',
 'Yellow-vented bulbul'}

In [10]:
len(tracked_only_df)  # 1360, or 1364

1364

In [11]:
tuple(tracked_only_df.columns.values)  # !! indices changed

('Bird Species',
 'Coordinates',
 'Date',
 'Distance Bin',
 'Group Name',
 'Habitat Type',
 'Location',
 'Remarks',
 'Time',
 'Time Period')

# Remove heard-not-seen and fly-bys

In [12]:
set(tracked_only_df['Distance Bin'])
# there is a single observation with no distance bin
# it was probably accidentally deleted
# interpolating, we assign this a distance bin of 1

{'4',
 '3',
 'in flight',
 nan,
 '1',
 'flyby',
 'Heard ',
 '2',
 'Heard',
 'In Flight',
 'Flyby',
 'In flight',
 'heard',
 'Did not land'}

In [13]:
# also note that a group has noted flyby or heard-not-seen
# in the remarks column
set(tracked_only_df['Remarks'])

{nan,
 'Loud sound by leaf blower',
 'Female',
 'moving among branches of same tree; location has high vehicular traffic',
 'Found perched on building, 180 degree view',
 'x7 of them (4 were greyish, 3 were darker brown/black in colour, unsure if same species/diff gender)',
 'Heard',
 'These two bulbuls might have been recounted/moved here from the Cinnamon College data point.',
 'in flight; construction works',
 'Half blocked by buildings',
 'Sunlight was very strong which hindered vision',
 'Next to construction, very noisy',
 'moving from tree to tree',
 '180 degree vision only, blocked by RC4 building behind',
 'Noisy and barely any sky visibility as it is flanked by two tall buildings',
 'Surrounded by tall buildings, no trees around',
 'Town Green data points were very close to one another - there might be potential recounting of bird species.',
 'in flight',
 'near rubbish bin',
 'flew off about 3min after spotted',
 'moving from tree to tree; construction works',
 'Loud lawnmov

In [20]:
heard_and_inflight_re = re.compile('([Ff]light|[Ff]lyby|Did not land|[Hh]eard)')

tracked_inloc_df = pd.DataFrame()

for _, row in tracked_only_df.iterrows():
    if pd.isnull(row[3]):
        print('found the null!')
        row[3] = '1'
    if (not heard_and_inflight_re.search(str(row[3]))
    and not heard_and_inflight_re.search(str(row[7]))):
        tracked_inloc_df = tracked_inloc_df.append(row)

found the null!


In [21]:
len(tracked_inloc_df)

999

In [23]:
set(tracked_inloc_df['Distance Bin'])

{'1', '2', '3', '4'}