In [1]:
from math import pi as PI
import pandas as pd
df = pd.read_csv('data.csv', dtype=str)

In [2]:
tuple(df.columns.values)

('Unnamed: 0',
 'Bird Species',
 'Coordinates',
 'Date',
 'Distance Bin',
 'Group Name',
 'Habitat Type',
 'Location',
 'Remarks',
 'Time',
 'Time Period')

# Simplify data
by removing useless columns
- `Unnamed: 0` is a side effect column from preprocess
- `Coordinates` is too varied in formatting and accuracy to use
- `Date` is not useful information, because all within a few days
- `Remarks` does not provide additional useful information

In [3]:
df.drop(['Unnamed: 0', 'Coordinates', 'Date', 'Remarks'],
    axis=1, inplace=True)

In [4]:
tuple(df.columns.values)

('Bird Species',
 'Distance Bin',
 'Group Name',
 'Habitat Type',
 'Location',
 'Time',
 'Time Period')

In [5]:
df.iloc[0]['Location']

'8 Kent Ridge Drive (Just outside Ventus)'

In [6]:
# identify unique locs by concat group name & loc
lagging_pointer = '{}---{}'.format(
    df.iloc[0]['Group Name'], df.iloc[0]['Location'])
current_pointer = ''
last_time_period = ''

species_counter = {
    'Black-naped oriole': 0,
    'Javan myna': 0,
    'Olive-backed sunbird': 0,
    'Rock pigeon': 0,
    'Yellow-vented bulbul': 0
}

count_per_site_df = pd.DataFrame()

for _, row in df.iterrows():
    current_pointer = '{}---{}'.format(row[2], row[4])
    if current_pointer == lagging_pointer:
        # still on the same site
        species_counter[row[0]] += 1
    else:
        # current_pointer is on a new site
        # first, save the information
        for species, count in species_counter.items():
            count_per_site_df = count_per_site_df.append(
                {
                    'Bird Species': species,
                    'Count': count,
                    'Density': count / (PI * 0.25 ** 2),
                    'Location': lagging_pointer,
                    'Time Period': last_time_period
                }
                , ignore_index=True
            )
        # reset the species_counter...
        species_counter = {
            'Black-naped oriole': 0,
            'Javan myna': 0,
            'Olive-backed sunbird': 0,
            'Rock pigeon': 0,
            'Yellow-vented bulbul': 0
        }
        # then, count current_pointer
        species_counter[row[0]] += 1
    # move lagging_pointer up
    lagging_pointer = current_pointer
    last_time_period = row[6]

# clean up the last site
for species, count in species_counter.items():
    count_per_site_df = count_per_site_df.append(
        {
            'Bird Species': species,
            'Count': count,
            'Density': count / (PI * 0.25 ** 2),
            'Location': lagging_pointer,
            'Time Period': last_time_period
        }, ignore_index=True)

In [7]:
count_per_site_df

Unnamed: 0,Bird Species,Count,Density,Location,Time Period
0,Rock pigeon,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
1,Olive-backed sunbird,1.0,5.092958,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
2,Javan myna,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
3,Yellow-vented bulbul,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
4,Black-naped oriole,2.0,10.185916,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
5,Rock pigeon,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
6,Olive-backed sunbird,1.0,5.092958,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
7,Javan myna,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
8,Yellow-vented bulbul,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
9,Black-naped oriole,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005


In [8]:
for i, row in df.iterrows():
    df.loc[i, 'Location'] = row[2] + '---' + row[4]

In [9]:
print('len(count_per_site_df): {}\nlen(df): {}'.format(
    len(count_per_site_df), len(df)))
print('len(set(count_per_site_df[\'Location\'])): {}'.format(
    len(set(count_per_site_df['Location']))))
print('len(set(df[\'Location\'])): {}'.format(
    len(set(df['Location']))))

len(count_per_site_df): 895
len(df): 999
len(set(count_per_site_df['Location'])): 179
len(set(df['Location'])): 179


In [10]:
empty_sites = 200 - len(set(count_per_site_df['Location']))

for i in range(empty_sites):
    for species in species_counter.keys():
        count_per_site_df = count_per_site_df.append(
            {
                'Bird Species': species,
                'Count': 0,
                'Density': 0,
                'Location': 'empty{}'.format(i+1),
                'Time Period': None
            }, ignore_index=True
        )

In [11]:
print('len(count_per_site_df): {}\nlen(df): {}'.format(
    len(count_per_site_df), len(df)))
print('len(set(count_per_site_df[\'Location\'])): {}'.format(
    len(set(count_per_site_df['Location']))))
print('len(set(df[\'Location\'])): {}'.format(
    len(set(df['Location']))))

len(count_per_site_df): 1000
len(df): 999
len(set(count_per_site_df['Location'])): 200
len(set(df['Location'])): 179


In [12]:
count_per_site_df

Unnamed: 0,Bird Species,Count,Density,Location,Time Period
0,Rock pigeon,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
1,Olive-backed sunbird,1.0,5.092958,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
2,Javan myna,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
3,Yellow-vented bulbul,0.0,0.000000,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
4,Black-naped oriole,2.0,10.185916,Asian Koel---8 Kent Ridge Drive (Just outside ...,0932
5,Rock pigeon,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
6,Olive-backed sunbird,1.0,5.092958,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
7,Javan myna,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
8,Yellow-vented bulbul,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005
9,Black-naped oriole,0.0,0.000000,Asian Koel---11 Arts Link (Outside AS5 facing ...,1005


In [13]:
count_per_site_df.Count = count_per_site_df.Count.astype(int)

In [14]:
count_per_site_df.to_csv('counts_per_site.csv')