In [1]:
import sys
import datetime as dt
import pandas as pd
import numpy as np
import h5py

import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from envir import config

In [2]:
# read in datasets
spl = pd.read_csv(config.dataFol+'clean_spl/clean_spl.csv')
spl['sonyc_sensor_id'] = spl['sonyc_sensor_id'].str[10:22]
nexrad = pd.read_csv(config.dataFol+'clean_nexrad/clean_nexrad.csv')
yamnet = pd.read_csv(config.dataFol+'clean_yamnet/clean_yamnet.csv',
                     names = ['sonyc_sensor_id', 'timestamp', 'count_of_positive_predictions', 'pct_positive_predictions'])
weather = pd.read_csv(config.dataFol+'clean_noaa/clean_weather.csv')

In [3]:
# create a timestamp index
df = pd.DataFrame({'timestamp': pd.date_range(start="2017-01-01", end="2020-05-31", freq='H')})
df['month'] = pd.DatetimeIndex(df['timestamp']).month
df = df[(df.month >=3) & (df.month <= 5)].reset_index() # filter for only the months of march through may
df.drop(columns = ['index', 'month'], inplace=True)
df['timestamp'] = df['timestamp'].astype('str').str[0:16]

In [4]:
df.head()

Unnamed: 0,timestamp
0,2017-03-01 00:00
1,2017-03-01 01:00
2,2017-03-01 02:00
3,2017-03-01 03:00
4,2017-03-01 04:00


In [5]:
# merge spl data to the time index
# seems like the coverage for available spl data is by the minute so no need to fill forward
sensor = spl.sonyc_sensor_id.unique().tolist()
ind_spl = []
for s in sensor:
    temp_spl = spl[spl.sonyc_sensor_id == s].copy()
    temp_spl['timestamp'] = temp_spl.timestamp.astype('str').str[0:13]+':00'
    temp_spl = temp_spl.groupby(['timestamp', 'sonyc_sensor_id', 'sonyc_sensor_name']) \
        .mean().reset_index()
    temp_spl = temp_spl.set_index('timestamp')
    ind_spl.append(df.merge(temp_spl, how='left', on='timestamp'))
spl = pd.concat(ind_spl)

In [6]:
spl.dropna()

Unnamed: 0,timestamp,sonyc_sensor_id,sonyc_sensor_name,dBAS_lin_mean,l1,l10,l5,l90
0,2017-03-01 00:00,b827eb815321,19 Washington Square North,62.712239,68.846158,65.941533,66.922908,56.441534
1,2017-03-01 01:00,b827eb815321,19 Washington Square North,63.787942,70.100764,66.749350,67.969359,57.898450
2,2017-03-01 02:00,b827eb815321,19 Washington Square North,61.531418,67.735310,64.605150,65.804025,56.298967
3,2017-03-01 03:00,b827eb815321,19 Washington Square North,59.935650,65.939944,63.204317,64.173442,55.170867
4,2017-03-01 04:00,b827eb815321,19 Washington Square North,60.911971,67.575702,64.035050,65.309650,55.093917
...,...,...,...,...,...,...,...,...
8804,2020-05-30 20:00,b827eb1685c7,Shimkin Reading Room,66.118863,72.242632,67.994250,69.306917,62.725901
8805,2020-05-30 21:00,b827eb1685c7,Shimkin Reading Room,69.078037,74.700570,71.268150,72.507534,65.495634
8806,2020-05-30 22:00,b827eb1685c7,Shimkin Reading Room,67.539580,73.814456,69.447217,70.949709,62.989150
8807,2020-05-30 23:00,b827eb1685c7,Shimkin Reading Room,64.593642,71.922603,66.719649,68.527350,60.398450


In [7]:
# clean up radar data from the 2 sites and merge to timeindex with fill forward
nexrad['timestamp'] = nexrad.timestamp.astype('str').str[0:13]+':00'
nexrad = nexrad.groupby('timestamp').mean().reset_index()
colnames = nexrad.columns.tolist()[1:]
newcolnames=["{}_{}".format('avg', i) for i in colnames]
nexrad.rename(columns=dict(zip(colnames, newcolnames)), inplace=True)
nexrad = df.merge(nexrad, how='left', on='timestamp')
nexrad.fillna(method='ffill', limit=3, inplace=True)
nexrad.fillna(method='bfill', limit=3, inplace=True)

In [8]:
nexrad.dropna()

Unnamed: 0,timestamp,avg_mtr_#/km/h,avg_mt_#/km,avg_height_m
0,2017-03-01 00:00,59.271393,35833.470506,245.631511
1,2017-03-01 01:00,12.862799,35017.393097,537.770985
2,2017-03-01 02:00,9.296641,35025.928772,639.823196
3,2017-03-01 03:00,6.836857,35479.194658,628.437721
4,2017-03-01 04:00,9.013211,35043.044021,499.123438
...,...,...,...,...
8804,2020-05-30 20:00,105.570058,448580.663588,147.959201
8805,2020-05-30 21:00,105.570058,448580.663588,147.959201
8806,2020-05-30 22:00,105.570058,448580.663588,147.959201
8807,2020-05-30 23:00,64.071785,457965.399417,137.550646


In [9]:
ind_yamnet = []
for s in sensor:
    temp_yamnet = df.merge(yamnet[yamnet.sonyc_sensor_id==s], how='left', on='timestamp')
    ind_yamnet.append(temp_yamnet)
yamnet = pd.concat(ind_yamnet)

In [10]:
yamnet.dropna()

Unnamed: 0,timestamp,sonyc_sensor_id,count_of_positive_predictions,pct_positive_predictions
0,2017-03-01 00:00,b827eb815321,0.0,0.0
1,2017-03-01 01:00,b827eb815321,0.0,0.0
2,2017-03-01 02:00,b827eb815321,0.0,0.0
3,2017-03-01 03:00,b827eb815321,0.0,0.0
4,2017-03-01 04:00,b827eb815321,0.0,0.0
...,...,...,...,...
8366,2020-05-08 21:00,b827eb1685c7,0.0,0.0
8367,2020-05-08 21:00,b827eb1685c7,0.0,0.0
8368,2020-05-08 22:00,b827eb1685c7,0.0,0.0
8369,2020-05-08 22:00,b827eb1685c7,0.0,0.0


In [11]:
weather = df.merge(weather, how='left', on='timestamp')

In [12]:
weather.head()

Unnamed: 0,timestamp,temp_celcius,dewp_celcius,rh_percentage,wind_dir,wind_speed_mph,sea_level_pressure_mb,precipitation_mm,visibility_miles,gust_mph,peak_wind_gust_mph
0,2017-03-01 00:00,10.0,10.0,100.0,140.0,9.2,1021.0,0.0,0.25,,
1,2017-03-01 01:00,8.89,8.89,100.0,30.0,4.6,1020.1,0.0,1.5,,
2,2017-03-01 02:00,10.61,10.61,100.0,160.0,3.45,1018.5,0.0,10.0,,
3,2017-03-01 03:00,11.11,10.0,92.86,170.0,10.35,1017.7,0.0,10.0,,
4,2017-03-01 04:00,10.61,10.0,96.0,180.0,12.65,1016.4,0.0,10.0,,


In [13]:
df = df.merge(spl, how='left', on='timestamp')
df = df.merge(yamnet, how='left', on=['timestamp', 'sonyc_sensor_id'])
df = df.merge(nexrad, how='left', on='timestamp')
df = df.merge(weather, how='left', on='timestamp')

In [14]:
df.shape

(84894, 23)

In [15]:
df.columns

Index(['timestamp', 'sonyc_sensor_id', 'sonyc_sensor_name', 'dBAS_lin_mean',
       'l1', 'l10', 'l5', 'l90', 'count_of_positive_predictions',
       'pct_positive_predictions', 'avg_mtr_#/km/h', 'avg_mt_#/km',
       'avg_height_m', 'temp_celcius', 'dewp_celcius', 'rh_percentage',
       'wind_dir', 'wind_speed_mph', 'sea_level_pressure_mb',
       'precipitation_mm', 'visibility_miles', 'gust_mph',
       'peak_wind_gust_mph'],
      dtype='object')

In [16]:
df.describe()

Unnamed: 0,dBAS_lin_mean,l1,l10,l5,l90,count_of_positive_predictions,pct_positive_predictions,avg_mtr_#/km/h,avg_mt_#/km,avg_height_m,temp_celcius,dewp_celcius,rh_percentage,wind_dir,wind_speed_mph,sea_level_pressure_mb,precipitation_mm,visibility_miles,gust_mph,peak_wind_gust_mph
count,42070.0,42070.0,42070.0,42070.0,42070.0,21208.0,21208.0,68229.0,68229.0,67972.0,84767.0,84767.0,84751.0,83920.0,84727.0,84703.0,76619.0,84767.0,17151.0,11326.0
mean,62.392565,68.771251,64.568859,66.097445,58.109643,0.046916,0.020091,281.795512,228130.556899,395.109123,10.829976,4.71498,69.940838,185.513227,12.313335,1016.559045,0.172691,8.84354,29.611276,35.256547
std,4.331114,5.012012,4.451976,4.565623,4.037505,0.460252,0.051773,1004.777831,189675.30817,284.402476,6.302805,8.546773,23.305034,106.905478,6.411839,8.157287,0.800153,2.696053,6.164652,5.34473
min,51.201847,53.65115,52.199133,52.812983,44.926833,0.0,0.0,0.0,1.427082,100.0,-8.89,-25.0,12.36,0.0,0.0,984.1,0.0,0.0,16.1,29.9
25%,59.286493,65.490411,61.398766,62.975906,55.232096,0.0,0.0,13.324093,75976.420194,198.892235,6.72,-1.11,50.71,100.0,8.05,1011.4,0.0,10.0,25.3,31.05
50%,62.063274,68.795751,64.417958,66.052396,57.673342,0.0,0.0,33.806346,164238.843855,315.715327,10.61,6.11,72.81,180.0,11.5,1016.7,0.0,10.0,28.75,33.35
75%,65.049746,71.92603,67.367379,68.931152,60.320588,0.0,0.01667,111.5022,368998.891584,524.939444,15.0,11.72,92.43,290.0,16.1,1021.7,0.0,10.0,33.35,37.95
max,87.530438,94.367235,88.079666,90.526901,86.849617,19.0,0.81967,21623.973685,886602.803696,4424.088705,32.78,21.11,100.0,360.0,44.85,1040.6,14.73,10.0,57.5,66.7


In [17]:
df['sonyc_sensor_name'].unique()

array(['19 Washington Square North', nan, 'Silver lab', 'Juan Carlos',
       'Shimkin Reading Room', '4 Washington Square North',
       'Kimmel Center'], dtype=object)

#### One Hot the Sensors

In [18]:
df['s_19WashingtonSquareNorth'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == '19 Washington Square North' else 0)
#df['s_silverLab'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == 'Silver lab' else 0)
df['s_juanCarlos'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == 'Juan Carlos' else 0)
df['s_shimkinReadingRoom'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == 'Shimkin Reading Room' else 0)
df['s_4WashingtonSquareNorth'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == '4 Washington Square North' else 0)
df['s_kimmelCenter'] = df['sonyc_sensor_name'].apply(lambda x: 1 if x == 'Kimmel Center' else 0)

In [19]:
df.rename(columns={'timestamp':'timestamp_utc'}, inplace=True)

In [20]:
df.columns

Index(['timestamp_utc', 'sonyc_sensor_id', 'sonyc_sensor_name',
       'dBAS_lin_mean', 'l1', 'l10', 'l5', 'l90',
       'count_of_positive_predictions', 'pct_positive_predictions',
       'avg_mtr_#/km/h', 'avg_mt_#/km', 'avg_height_m', 'temp_celcius',
       'dewp_celcius', 'rh_percentage', 'wind_dir', 'wind_speed_mph',
       'sea_level_pressure_mb', 'precipitation_mm', 'visibility_miles',
       'gust_mph', 'peak_wind_gust_mph', 's_19WashingtonSquareNorth',
       's_juanCarlos', 's_shimkinReadingRoom', 's_4WashingtonSquareNorth',
       's_kimmelCenter'],
      dtype='object')

In [21]:
df.to_csv(config.dataFol+'all_cleaned_wsp_sonyc_birds_by_hour.csv', index=False)

In [22]:
df.dropna()['sonyc_sensor_name'].unique()

array(['19 Washington Square North', 'Juan Carlos',
       'Shimkin Reading Room', '4 Washington Square North',
       'Kimmel Center'], dtype=object)