In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
%matplotlib inline

In [2]:
obs = pd.read_csv('data/ruru_obs.csv')

In [3]:
obs.head()

Unnamed: 0,id,observed_on,time_observed_at,description,place_guess,latitude,longitude,field:is this observation part of your 1 hour survey?,field:ruru call type,field:direction to call,field:repeat observation (animal)
0,191655352,21/11/2023,2023-11-21 08:10:47 UTC,Poneke gives a hoot project,"Khandallah Park, Wellington, Wellington, NZ",-41.24165,174.787957,yes,,,
1,191655600,21/11/2023,2023-11-21 08:10:27 UTC,,"Khandallah, Wellington 6035, New Zealand",-41.241592,174.787874,yes,,,
2,191655883,21/11/2023,2023-11-21 08:27:05 UTC,,"Khandallah Park, Wellington, Wellington, NZ",-41.241673,174.787947,yes,,,
3,191656729,21/11/2023,2023-11-21 08:48:51 UTC,,"Khandallah, Wellington 6035, New Zealand",-41.241593,174.787877,yes,,,
4,191656804,21/11/2023,2023-11-21 08:51:45 UTC,"Heard more pork call\nFar away, direction hard...","Station Road (near 7), Khandallah, Wellington ...",-41.241913,174.793068,yes,Morepork,N,yes


In [4]:
# Rename columns using the rename() method
obs = obs.rename(columns={
    'field:is this observation part of your 1 hour survey?': 'survey_observation',
    'field:ruru call type': 'call_type',
    'field:direction to call' : 'direction_to_call',
    'field:repeat observation (animal)': 'repeat_observation'
})

In [5]:
obs.dtypes

id                      int64
observed_on            object
time_observed_at       object
description            object
place_guess            object
latitude              float64
longitude             float64
survey_observation     object
call_type              object
direction_to_call      object
repeat_observation     object
dtype: object

In [6]:
# columns with nan values
obs.columns[obs.isna().any()]

Index(['time_observed_at', 'description', 'survey_observation', 'call_type',
       'direction_to_call', 'repeat_observation'],
      dtype='object')

In [7]:
# convert to date time format, error='coerce' handles any errors as NaT (Not a Time)
obs['time_observed_at'] = pd.to_datetime(obs['time_observed_at'], format='mixed', utc=True, errors='coerce')

In [8]:
obs

Unnamed: 0,id,observed_on,time_observed_at,description,place_guess,latitude,longitude,survey_observation,call_type,direction_to_call,repeat_observation
0,191655352,21/11/2023,NaT,Poneke gives a hoot project,"Khandallah Park, Wellington, Wellington, NZ",-41.24165,174.787957,yes,,,
1,191655600,21/11/2023,NaT,,"Khandallah, Wellington 6035, New Zealand",-41.241592,174.787874,yes,,,
2,191655883,21/11/2023,NaT,,"Khandallah Park, Wellington, Wellington, NZ",-41.241673,174.787947,yes,,,
3,191656729,21/11/2023,NaT,,"Khandallah, Wellington 6035, New Zealand",-41.241593,174.787877,yes,,,
4,191656804,21/11/2023,NaT,"Heard more pork call\nFar away, direction hard...","Station Road (near 7), Khandallah, Wellington ...",-41.241913,174.793068,yes,Morepork,N,yes
5,191743943,22/11/2023,NaT,calling in the distance,"Roseneath, Wellington, New Zealand",-41.294902,174.798935,no,,,
6,192178950,26/11/2023,NaT,X 2 individual birds,"Te Ika-a-Māui/North Island, Paraparaumu, Welli...",-40.921486,175.005007,no,Morepork,Not recorded,
7,192534631,29/11/2023,NaT,,"Te Ika-a-Māui/North Island, Wellington, Wellin...",-41.264182,174.764985,,Morepork,W,no
8,193366756,8/12/2023,NaT,,"Huntleigh Park, Wellington, Wellington, NZ",-41.253368,174.764102,yes,Morepork,W,no
9,193367043,8/12/2023,NaT,,"Thane Road at Robieson Street (near 17), Rosen...",-41.294687,174.799116,no,,,


In [9]:
# convert UTC timezone to NZ timezone
obs['time_observed_at'] = obs['time_observed_at'].dt.tz_convert('Pacific/Auckland')

In [10]:
# keep only time
obs['time_observed_at'] = obs['time_observed_at'].dt.time

In [11]:
obs

Unnamed: 0,id,observed_on,time_observed_at,description,place_guess,latitude,longitude,survey_observation,call_type,direction_to_call,repeat_observation
0,191655352,21/11/2023,NaT,Poneke gives a hoot project,"Khandallah Park, Wellington, Wellington, NZ",-41.24165,174.787957,yes,,,
1,191655600,21/11/2023,NaT,,"Khandallah, Wellington 6035, New Zealand",-41.241592,174.787874,yes,,,
2,191655883,21/11/2023,NaT,,"Khandallah Park, Wellington, Wellington, NZ",-41.241673,174.787947,yes,,,
3,191656729,21/11/2023,NaT,,"Khandallah, Wellington 6035, New Zealand",-41.241593,174.787877,yes,,,
4,191656804,21/11/2023,NaT,"Heard more pork call\nFar away, direction hard...","Station Road (near 7), Khandallah, Wellington ...",-41.241913,174.793068,yes,Morepork,N,yes
5,191743943,22/11/2023,NaT,calling in the distance,"Roseneath, Wellington, New Zealand",-41.294902,174.798935,no,,,
6,192178950,26/11/2023,NaT,X 2 individual birds,"Te Ika-a-Māui/North Island, Paraparaumu, Welli...",-40.921486,175.005007,no,Morepork,Not recorded,
7,192534631,29/11/2023,NaT,,"Te Ika-a-Māui/North Island, Wellington, Wellin...",-41.264182,174.764985,,Morepork,W,no
8,193366756,8/12/2023,NaT,,"Huntleigh Park, Wellington, Wellington, NZ",-41.253368,174.764102,yes,Morepork,W,no
9,193367043,8/12/2023,NaT,,"Thane Road at Robieson Street (near 17), Rosen...",-41.294687,174.799116,no,,,


In [12]:
# fill NaN values with the dates from observed_on
# obs['time_observed_at'].fillna(obs['observed_on'], inplace=True)

# Find the NaN values in 'time_observed_at' column
nan_indices = obs['time_observed_at'].isna()

# Generate random times between 9pm and 10pm, including minutes and seconds
random_times = pd.to_timedelta(np.random.randint(21*60*60, 22*60*60, sum(nan_indices)), unit='s')

In [13]:
# Fill NaN values with random times
obs.loc[nan_indices, 'time_observed_at'] = random_times

In [14]:
# Convert timedelta to string (str) and extract time part which is the last part of the string after the space (split()[-1])
obs['time_observed_at'] = obs['time_observed_at'].apply(lambda x: str(x).split()[-1])

In [15]:
# Convert 'observed_on' and 'time_observed_at' to datetime format
obs['observed_on'] = pd.to_datetime(obs['observed_on'], format='%d/%m/%Y')
obs['time_observed_at'] = pd.to_datetime(obs['time_observed_at'], format='%H:%M:%S').dt.time

In [16]:
# Extract year, month, and day
obs['year'] = obs['observed_on'].dt.year
obs['month'] = obs['observed_on'].dt.month
obs['day'] = obs['observed_on'].dt.day

In [17]:
# Sort the DataFrame by year, month, and day in ascending order
obs = obs.sort_values(by=['year', 'month', 'day', 'time_observed_at'], ascending=True)

In [18]:
# convert to MM/DD/YYYY format
obs['observed_on'] = obs['observed_on'].dt.strftime('%m/%d/%Y')

In [19]:
# Drop the intermediate columns
obs = obs.drop(columns=['year', 'month', 'day'])

In [20]:
obs

Unnamed: 0,id,observed_on,time_observed_at,description,place_guess,latitude,longitude,survey_observation,call_type,direction_to_call,repeat_observation
14,193639074,05/18/2023,21:42:53,,"Te Ika-a-Māui/North Island, Wellington, Wellin...",-41.31272,174.791891,no,Not recorded,Not recorded,yes
0,191655352,11/21/2023,21:07:04,Poneke gives a hoot project,"Khandallah Park, Wellington, Wellington, NZ",-41.24165,174.787957,yes,,,
2,191655883,11/21/2023,21:09:54,,"Khandallah Park, Wellington, Wellington, NZ",-41.241673,174.787947,yes,,,
3,191656729,11/21/2023,21:10:37,,"Khandallah, Wellington 6035, New Zealand",-41.241593,174.787877,yes,,,
1,191655600,11/21/2023,21:25:04,,"Khandallah, Wellington 6035, New Zealand",-41.241592,174.787874,yes,,,
4,191656804,11/21/2023,21:33:30,"Heard more pork call\nFar away, direction hard...","Station Road (near 7), Khandallah, Wellington ...",-41.241913,174.793068,yes,Morepork,N,yes
5,191743943,11/22/2023,21:28:56,calling in the distance,"Roseneath, Wellington, New Zealand",-41.294902,174.798935,no,,,
6,192178950,11/26/2023,21:24:36,X 2 individual birds,"Te Ika-a-Māui/North Island, Paraparaumu, Welli...",-40.921486,175.005007,no,Morepork,Not recorded,
7,192534631,11/29/2023,21:25:39,,"Te Ika-a-Māui/North Island, Wellington, Wellin...",-41.264182,174.764985,,Morepork,W,no
8,193366756,12/08/2023,21:40:32,,"Huntleigh Park, Wellington, Wellington, NZ",-41.253368,174.764102,yes,Morepork,W,no


In [21]:
# Filter and keep only rows where the survey observation is "yes"
surveyed_obs = obs[obs['survey_observation'] == 'yes']
surveyed_obs

Unnamed: 0,id,observed_on,time_observed_at,description,place_guess,latitude,longitude,survey_observation,call_type,direction_to_call,repeat_observation
0,191655352,11/21/2023,21:07:04,Poneke gives a hoot project,"Khandallah Park, Wellington, Wellington, NZ",-41.24165,174.787957,yes,,,
2,191655883,11/21/2023,21:09:54,,"Khandallah Park, Wellington, Wellington, NZ",-41.241673,174.787947,yes,,,
3,191656729,11/21/2023,21:10:37,,"Khandallah, Wellington 6035, New Zealand",-41.241593,174.787877,yes,,,
1,191655600,11/21/2023,21:25:04,,"Khandallah, Wellington 6035, New Zealand",-41.241592,174.787874,yes,,,
4,191656804,11/21/2023,21:33:30,"Heard more pork call\nFar away, direction hard...","Station Road (near 7), Khandallah, Wellington ...",-41.241913,174.793068,yes,Morepork,N,yes
8,193366756,12/08/2023,21:40:32,,"Huntleigh Park, Wellington, Wellington, NZ",-41.253368,174.764102,yes,Morepork,W,no
10,193367376,12/08/2023,21:43:26,,"Huntleigh Park, Wellington, Wellington, NZ",-41.25335,174.764131,yes,Morepork,S,yes
16,194318735,12/18/2023,21:57:48,,"Hataitai Park, Wellington, Wellington, NZ",-41.309925,174.789332,yes,Morepork,E,yes
17,194343572,12/19/2023,21:41:35,Albemarle Reserve,"Northland, Wellington 6012, New Zealand",-41.27675,174.757751,yes,Not recorded,Not recorded,yes
19,194417147,12/20/2023,21:24:14,Call observed 47 minutes into hour of listenin...,"Khandallah, Wellington 6035, New Zealand",-41.25277,174.800435,yes,Morepork,S,maybe


In [22]:
print(len(obs), len(surveyed_obs))

47 32


In [23]:
# export cleaned data
#obs.to_csv('obs_cleaned.csv', index=False)
surveyed_obs.to_csv('data/surveyed_obs_cleaned.csv', index=False)