In [1]:
import sys
import datetime as dt
import pandas as pd
import numpy as np
import h5py
import dask.dataframe as dd
import dask.array as da

import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from envir import config

In [2]:
# read in datasets
spl = pd.read_csv(config.dataFol+'clean_spl/clean_spl.csv')
spl['sonyc_sensor_id'] = spl['sonyc_sensor_id'].str[10:22]
nexrad = pd.read_csv(config.dataFol+'clean_nexrad/clean_nexrad.csv')
yamnet = pd.read_csv(config.dataFol+'clean_yamnet/clean_yamnet.csv',
                     names = ['sonyc_sensor_id', 'timestamp', 'count_of_predictions_over_50', 'average_prediction'])
noaa = pd.read_csv(config.dataFol+'clean_noaa/clean_noaa.csv')

In [3]:
# create a timestamp index
df = pd.DataFrame({'timestamp': pd.date_range(start="2017-01-01", end="2020-05-31", freq='T')})
df['month'] = pd.DatetimeIndex(df['timestamp']).month
df = df[(df.month >=3) & (df.month <= 5)].reset_index() # filter for only the months of march through may
df.drop(columns = ['index', 'month'], inplace=True)
df['timestamp'] = df['timestamp'].astype('str').str[0:16]

In [4]:
df.head()

Unnamed: 0,timestamp
0,2017-03-01 00:00
1,2017-03-01 00:01
2,2017-03-01 00:02
3,2017-03-01 00:03
4,2017-03-01 00:04


In [5]:
# merge spl data to the time index
# seems like the coverage for available spl data is by the minute so no need to fill forward
sensor = spl.sonyc_sensor_id.unique().tolist()
ind_spl = []
for s in sensor:
    ind_spl.append(df.merge(spl[spl.sonyc_sensor_id==s], how='left', on='timestamp'))
spl = pd.concat(ind_spl)

In [6]:
spl.dropna()

Unnamed: 0,timestamp,sonyc_sensor_id,sonyc_sensor_name,dBAS_lin_mean
20777,2017-03-15 10:17,b827eb905497,Kimmel Center,69.283400
20778,2017-03-15 10:18,b827eb905497,Kimmel Center,76.952240
20779,2017-03-15 10:19,b827eb905497,Kimmel Center,73.372570
20780,2017-03-15 10:20,b827eb905497,Kimmel Center,71.376250
20781,2017-03-15 10:21,b827eb905497,Kimmel Center,72.206830
...,...,...,...,...
791870,2020-05-30 23:56,b827eb0fedda,Juan Carlos,58.455940
791871,2020-05-30 23:57,b827eb0fedda,Juan Carlos,57.976265
791872,2020-05-30 23:58,b827eb0fedda,Juan Carlos,58.577415
791873,2020-05-30 23:59,b827eb0fedda,Juan Carlos,58.600876


In [7]:
# clean up radar data from the 2 sites and merge to timeindex with fill forward
nexrad = nexrad.groupby('timestamp').mean().reset_index()
colnames = nexrad.columns.tolist()[1:]
newcolnames=["{}_{}".format('avg', i) for i in colnames]
nexrad.rename(columns=dict(zip(colnames, newcolnames)), inplace=True)
nexrad = df.merge(nexrad, how='left', on='timestamp')
nexrad.fillna(method='ffill', limit=3, inplace=True)
nexrad.fillna(method='bfill', limit=3, inplace=True)

In [8]:
nexrad.dropna()

Unnamed: 0,timestamp,avg_mtr,avg_mt,avg_height,avg_airspeed,avg_heading,avg_airspeed_u,avg_airspeed_v
0,2017-03-01 00:00,5.917225,44334.097307,277.105735,3.823076,186.065650,-0.706602,-3.643550
1,2017-03-01 00:01,5.917225,44334.097307,277.105735,3.823076,186.065650,-0.706602,-3.643550
2,2017-03-01 00:02,95.663130,25566.371530,162.629644,2.167413,-70.369662,-1.766382,0.424816
3,2017-03-01 00:03,95.663130,25566.371530,162.629644,2.167413,-70.369662,-1.766382,0.424816
4,2017-03-01 00:04,95.663130,25566.371530,162.629644,2.167413,-70.369662,-1.766382,0.424816
...,...,...,...,...,...,...,...,...
396566,2019-05-31 09:26,221.819050,807988.428116,445.013048,4.174607,44.954754,2.520911,2.825846
396567,2019-05-31 09:27,221.819050,807988.428116,445.013048,4.174607,44.954754,2.520911,2.825846
396568,2019-05-31 09:28,221.819050,807988.428116,445.013048,4.174607,44.954754,2.520911,2.825846
396569,2019-05-31 09:29,221.819050,807988.428116,445.013048,4.174607,44.954754,2.520911,2.825846


In [9]:
ind_yamnet = []
for s in sensor:
    temp_yamnet = df.merge(yamnet[yamnet.sonyc_sensor_id==s], how='left', on='timestamp')
    temp_yamnet.fillna(method='ffill', limit=3, inplace=True)
    temp_yamnet.fillna(method='bfill', limit=3, inplace=True)
    ind_yamnet.append(temp_yamnet)
yamnet = pd.concat(ind_yamnet)

In [10]:
yamnet.dropna()

Unnamed: 0,timestamp,sonyc_sensor_id,count_of_predictions_over_50,average_prediction
20844,2017-03-15 11:24,b827eb905497,1.0,0.625783
20845,2017-03-15 11:25,b827eb905497,1.0,0.625783
20846,2017-03-15 11:26,b827eb905497,1.0,0.625783
20847,2017-03-15 11:27,b827eb905497,1.0,0.625783
20848,2017-03-15 11:28,b827eb905497,1.0,0.625783
...,...,...,...,...
431789,2020-03-24 18:57,b827eb0fedda,1.0,0.603433
431790,2020-03-24 18:58,b827eb0fedda,1.0,0.603433
431791,2020-03-24 18:59,b827eb0fedda,1.0,0.603433
431792,2020-03-24 19:00,b827eb0fedda,1.0,0.603433


In [11]:
noaa = df.merge(noaa, how='left', on='timestamp')
noaa.fillna(method='ffill', inplace=True)

In [12]:
noaa.head()

Unnamed: 0,timestamp,prcp,snow,awnd,tmax,tmin
0,2017-03-01 00:00,0.0,0.0,,,
1,2017-03-01 00:01,0.0,0.0,,,
2,2017-03-01 00:02,0.0,0.0,,,
3,2017-03-01 00:03,0.0,0.0,,,
4,2017-03-01 00:04,0.0,0.0,,,


In [13]:
df = df.merge(spl, how='left', on='timestamp')
df = df.merge(yamnet, how='left', on=['timestamp', 'sonyc_sensor_id'])
df = df.merge(nexrad, how='left', on='timestamp')
df = df.merge(noaa, how='left', on='timestamp')

In [14]:
df.shape

(7719882, 18)

In [15]:
df.columns

Index(['timestamp', 'sonyc_sensor_id', 'sonyc_sensor_name', 'dBAS_lin_mean',
       'count_of_predictions_over_50', 'average_prediction', 'avg_mtr',
       'avg_mt', 'avg_height', 'avg_airspeed', 'avg_heading', 'avg_airspeed_u',
       'avg_airspeed_v', 'prcp', 'snow', 'awnd', 'tmax', 'tmin'],
      dtype='object')

In [16]:
df.describe()

Unnamed: 0,dBAS_lin_mean,count_of_predictions_over_50,average_prediction,avg_mtr,avg_mt,avg_height,avg_airspeed,avg_heading,avg_airspeed_u,avg_airspeed_v,prcp,snow,awnd,tmax,tmin
count,3788026.0,188105.0,188105.0,2600788.0,2600788.0,2600788.0,2600788.0,2600788.0,2600788.0,2600788.0,7719882.0,7719882.0,7638022.0,7638022.0,7638022.0
mean,62.63783,2.499508,0.608464,426.4618,220134.6,493.9517,5.509743,58.3663,1.146321,2.390052,0.1625002,0.0879048,5.798184,54.94174,41.90562
std,5.351225,2.47974,0.081013,1405.834,193610.4,284.7426,3.343937,66.24486,3.602357,3.978321,0.362165,0.6153192,2.79212,12.97469,10.01958
min,44.70492,1.0,0.500006,0.0,25566.37,115.725,0.0,-85.21002,-73.15356,-62.88553,0.0,0.0,0.89,29.0,14.0
25%,58.92521,1.0,0.546923,10.10677,84141.61,260.5537,3.182131,12.54544,-0.9150214,-0.1450217,0.0,0.0,3.8,43.0,35.0
50%,62.1089,2.0,0.593091,35.31654,145139.9,433.2725,4.873088,47.24176,0.9498256,2.053523,0.0,0.0,5.82,52.0,41.0
75%,65.63741,3.0,0.650277,208.7594,281540.8,664.0571,7.374653,99.00711,3.224946,4.938206,0.13,0.0,8.5,66.0,49.0
max,103.6288,19.0,0.989875,30555.7,889043.4,2246.075,80.6657,267.5055,66.90149,75.10923,2.85,7.0,10.29,85.0,64.0


In [18]:
df.dropna()

Unnamed: 0,timestamp,sonyc_sensor_id,sonyc_sensor_name,dBAS_lin_mean,count_of_predictions_over_50,average_prediction,avg_mtr,avg_mt,avg_height,avg_airspeed,avg_heading,avg_airspeed_u,avg_airspeed_v,prcp,snow,awnd,tmax,tmin
84570,2017-03-04 02:15,b827eb1685c7,Shimkin Reading Room,62.448280,1.0,0.500168,3.827990,26615.883628,677.577651,4.035907,-47.940615,-3.455276,1.882161,0.0,0.0,8.95,30.0,17.0
84571,2017-03-04 02:15,b827eb1685c7,Shimkin Reading Room,61.750122,1.0,0.500168,3.827990,26615.883628,677.577651,4.035907,-47.940615,-3.455276,1.882161,0.0,0.0,8.95,30.0,17.0
84588,2017-03-04 02:16,b827eb1685c7,Shimkin Reading Room,79.349724,1.0,0.500168,3.827990,26615.883628,677.577651,4.035907,-47.940615,-3.455276,1.882161,0.0,0.0,8.95,30.0,17.0
84589,2017-03-04 02:16,b827eb1685c7,Shimkin Reading Room,65.003200,1.0,0.500168,3.827990,26615.883628,677.577651,4.035907,-47.940615,-3.455276,1.882161,0.0,0.0,8.95,30.0,17.0
84606,2017-03-04 02:17,b827eb1685c7,Shimkin Reading Room,63.331158,1.0,0.500168,3.827990,26615.883628,677.577651,4.035907,-47.940615,-3.455276,1.882161,0.0,0.0,8.95,30.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4647159,2019-03-24 10:55,b827eb0fedda,Juan Carlos,63.350700,12.0,0.703187,81.773388,85240.904459,262.844529,9.150045,-43.909698,-6.298935,6.496996,0.0,0.0,2.01,46.0,32.0
4647166,2019-03-24 10:56,b827eb815321,19 Washington Square North,59.360382,1.0,0.549274,81.773388,85240.904459,262.844529,9.150045,-43.909698,-6.298935,6.496996,0.0,0.0,2.01,46.0,32.0
4647171,2019-03-24 10:56,b827eb0fedda,Juan Carlos,63.147263,12.0,0.703187,81.773388,85240.904459,262.844529,9.150045,-43.909698,-6.298935,6.496996,0.0,0.0,2.01,46.0,32.0
4647178,2019-03-24 10:57,b827eb815321,19 Washington Square North,60.480965,1.0,0.549274,81.773388,85240.904459,262.844529,9.150045,-43.909698,-6.298935,6.496996,0.0,0.0,2.01,46.0,32.0


In [None]:
import seaborn as sns; sns.set(style="ticks", color_codes=True)

g = sns.pairplot(df.dropna())
