forked from sprinkler/rainmachine-developer-resources
-
Notifications
You must be signed in to change notification settings - Fork 0
/
noaa-observed-parser.py
243 lines (198 loc) · 8.79 KB
/
noaa-observed-parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
from RMParserFramework.rmParser import RMParser # Mandatory include for parser definition
from RMUtilsFramework.rmLogging import log # Optional include for logging
from RMUtilsFramework.rmTimeUtils import * #rmNowDateTime, rmGetStartOfDay, rmCurrentDayTimestamp, rmDeltaDayFromTimestamp, rmCurrentTimestamp
import json
from datetime import datetime
from HTMLParser import HTMLParser
import re
class NoaaTableParser(HTMLParser):
#Initializing lists
tableList = []
tableRef = None
rowRef = None
colRef = None
#HTML Parser Methods
def handle_starttag(self, startTag, attrs):
if startTag == 'table':
self.tableList.append([])
self.tableRef = self.tableList[-1]
elif startTag == 'tr':
self.tableRef.append([])
self.rowRef = self.tableRef[-1]
elif startTag == 'th':
#This may span cols, so add additional columns to bridge the span
numCols = 1
for tup in attrs:
if tup[0] == 'colspan':
numCols = int(tup[1])
break
self.rowRef += ['']*numCols
self.colRef = self.rowRef
elif startTag == 'td':
#This may span cols, so add additional columns to bridge the span
numCols = 1
for tup in attrs:
if tup[0] == 'colspan':
numCols = int(tup[1])
break
self.rowRef += ['']*numCols
self.colRef = self.rowRef
else:
pass
def handle_endtag(self, endTag):
if endTag == 'table':
self.tableRef = None
elif endTag == 'tr':
self.rowRef = None
elif endTag == 'th':
self.colRef = None
elif endTag == 'td':
self.colRef = None
else:
pass
def handle_data(self, data):
if self.colRef:
self.colRef[-1]+= data.strip()
def handle_startendtag(self,startendTag, attrs):
pass
def handle_comment(self,data):
pass
def get_table(self):
return self.tableList
class NoaaObsParser(RMParser):
parserName = "NOAA Observations" # Your parser name
parserDescription = "NOAA Observations Data" # A short description of your parser
parserForecast = False # True if parser provides future forecast data
parserHistorical = True # True if parser also provides historical data (only actual observed data)
parserInterval = 6 * 3600 # Your parser running interval in seconds, data will only be mixed in hourly intervals
parserDebug = False
parserEnabled = False
params = {
"_stationURL" : "https://forecast.weather.gov/data/obhistory/KBDU.html",
"stationID" : 'KBDU',
"dailyAccum": True,
"_lastRain" : '',
"_lastRainTS": '',
"_lastTS": ''
}
def perform(self): # The function that will be executed must have this name
if self.params['stationID']:
stationID = self.params['stationID']
else:
log.error("*** No Station ID")
self.lastKnownError = "No Station ID"
#NOAA has a bunch of different feeds for the weather data, but only obhistory has rainfall
# A full listing of stations/urls can be found here: https://forecast.weather.gov/xml/current_obs/index.xml
stationURL = "https://forecast.weather.gov/data/obhistory/" + stationID + ".html"
self.params['_stationURL'] = stationURL
# downloading data from a URL convenience function since other python libraries can be used
self.getObservations(stationURL)
if self.parserDebug:
log.debug(self.result)
def __parse_time(self, date, time):
try:
log_day = int(date)
log_time = [int(x) for x in time.split(':')]
except:
return None
if self.params['dailyAccum']:
tsToday = rmCurrentDayTimestamp()
tsYesterDay = rmDeltaDayFromTimestamp(tsToday, -1)
today = rmTimestampToDate(tsToday)
yester = rmTimestampToDate(tsYesterDay)
#Only update rain for today and yesterday as a daily average
if(today.day == log_day):
log_date = today
elif(yester.day == log_day):
log_date = yester
else:
return None
else:
curTime = rmNowDateTime()
# The date format logged is very annoying... it's just the day of the month.
# Since the log is only 1 week long, we can't have a date bigger than today unless it's from the previous month.
month = curTime.month
day = curTime.day
year = curTime.year
#Don't short circuit for just today since NOAA only updates HTML every few hours and we may miss rain
# in the late evening
#if day != log_day:
# return None
if day - log_day < 0:
month -= 1
if month == 0:
month = 12
year -= 1
#Construct a new date with the month/day/time information
# Rainmachine addValue rounds to the hour, so drop the minutes and we'll accumulate outside
log_date = datetime(year, month, log_day, log_time[0])
#convert to our unix timestamp for logging
# This is the same method RM uses in rmGetStartOfDay() which matches all the timestamp styles
timestamp = int(log_date.strftime("%s"))
return timestamp
def __parse_precip(self, hr1, hr3, hr6):
try:
rain = float(hr1) * 25.4 #i n to mm
return rain
except:
return 0
def getObservations(self,stationURL):
d = self.openURL(stationURL)
parser = NoaaTableParser()
if d is None:
log.error("*** Failed to fetch URL")
self.lastKnownError = "Failed to Fetch"
return False
try:
parser.feed(d.read().decode('utf-8'))
tree = parser.get_table()
except:
log.error("*** Failed to parse response!")
self.lastKnownError = "Failed to Parse"
return False
# It's very likely that all stations generate the same table layout, but just quickly search for 'Date' to find
# the correct table
tableIdx = None
for tIdx, table in enumerate(tree):
if len(table) and len(table[0]) and 'Date' == table[0][0]:
tableIdx = tIdx
if tableIdx==None:
log.error("*** No information found in response!")
self.lastKnownError = "Retrying hourly data retrieval"
return False
# Dynamically build a list of column names for indexing
weatherObs = tree[tableIdx]
fieldIdx = {}
for cIdx, col in enumerate(weatherObs[0]):
colStr = re.split('[^a-zA-Z]',col)
fieldIdx[colStr[0]] = cIdx
# Walk our table and build a list of tuples to insert
# Skip the first 3 and last 3 rows since that's the headers
headerLen = len(weatherObs[0])
rainData = {}
for row in weatherObs[3:-3]:
if len(row) == headerLen:
time = self.__parse_time(row[fieldIdx['Date']],row[fieldIdx['Time']])
if time:
# Need to see how the precipitation is logged for 1/3/6 hr, when rows are ~28min apart
# Update: It looks like each row is just the rain (inches) for the current interval, so while we query today, just insert the rain
# at the matching timestamp and let the rainmachine parser sum it up for the entire day.
rain = self.__parse_precip(row[fieldIdx['Precipitation'] - 2], row[fieldIdx['Precipitation'] - 1],
row[fieldIdx['Precipitation']])
if time in rainData:
rainData[time] += rain
else:
rainData[time] = rain
for k, v in sorted(rainData.items()):
self.addValue(RMParser.dataType.RAIN, k, v)
if v > 0:
self.params['_lastRain'] = v
self.params['_lastRainTS'] = k
self.params['_lastTS'] = k
#log.info("times:%s (%d) Rain:%f"%(rmTimestampToDateAsString(k), k, v))
# Reset lastKnownError from a previous function call
self.lastKnownError = ""
pass
if __name__ == "__main__":
p = NoaaObsParser()
p.perform()