In [107]:
# -*- coding: utf-8 -*-
import urllib
import urllib.request as request
import re
import html
import sys, os
import pickle
sys.path.append("C:\\Program Files\\Anaconda3\\envs\\tensorflow\\lib\\site-packages")
import tweepy

from collections import deque

# Page that scrape() downloads and parses for the latest readings.
urlstr = "https://uk-air.defra.gov.uk/latest/currentlevels?view=site#L"
# Shortened link appended to every status built by compose().
shorturlstr = "https://goo.gl/ZpELjS"

# Link to the WHO guideline document quoted in the "above guide" tweets.
urlWHO = "http://apps.who.int/iris/bitstream/10665/69477/1/WHO_SDE_PHE_OEH_06.02_eng.pdf"

# Bytes pattern used by scrape() to locate the table row of the monitoring site.
sitename = b'Liverpool'

# Unit label shown in tweets: micro sign + "gm" + superscript "-3".
mgm3 = '\u03BCgm\u207B\u00B3'
# Display names of the pollutants (subscript digits); they double as dict keys
# in every reading, including the readings already pickled to disk.
# NOTE(review): PM100 renders as "PM" + subscript "100"; presumably PM10 is
# meant - confirm before renaming, since stored readings use this key.
O3, NO2, SO2, PM25, PM100 = "O\u2083", "NO\u2082", "SO\u2082", "PM\u2082\u2085", "PM\u2081\u2080\u2080"
# Guide values that compareWHO() compares each reading against.
# NOTE(review): the SO2 value (20) is paired with a '10m' mean in meansWHO -
# verify this pairing against the cited WHO document.
guides = {O3:100, NO2:200, SO2:20, PM25:25, PM100:50} # source: http://apps.who.int/iris/bitstream/10665/69477/1/WHO_SDE_PHE_OEH_06.02_eng.pdf
# Averaging period quoted for each WHO guide value in the tweets.
meansWHO = {O3:'8h', NO2:'1h', SO2:'10m', PM25:'24h', PM100:'24h'}
# Averaging period quoted for the DEFRA data; composeAboveTweet() adds a note
# when it differs from the WHO one.
meansDEFRA = {O3:'8h', NO2:'1h', SO2:'max 15m', PM25:'24h', PM100:'24h'}


def tweet(status, replyto=None):
    """Post *status* through tweepy, optionally as a reply to *replyto*.

    Args:
        status: Text to post. A falsy value (``None`` or ``""``) is ignored.
        replyto: Optional status object returned by an earlier call; its
            ``id`` attribute is passed as ``in_reply_to_status_id``.

    Returns:
        Whatever ``api.update_status`` returns on success, or ``None`` when
        *status* is falsy or the API call raised.
    """
    if not status:
        return None

    # NOTE: pickle must only ever be used on trusted data; apikeys.bin is
    # expected to be a local file written by the owner of this script.
    with open("apikeys.bin", "rb") as keyfile:
        consumer_key, consumer_secret, access_token, access_token_secret = pickle.load(keyfile)

    urls = re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        status)
    print("urls = ", urls)

    # Take the URL texts out of the status for the length check; every URL is
    # counted as 23 characters regardless of its real length.
    rstat = status
    for u in urls:
        rstat = rstat.replace(u, '')
    nchars = len(rstat) + 23 * len(urls)
    if nchars > 140:
        # Only a warning: the post is still attempted and any rejection is
        # reported by the except clause below.
        print("Tweet too long")

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    try:
        if replyto:
            stat = api.update_status(status=status, in_reply_to_status_id=replyto.id)
        else:
            stat = api.update_status(status=status)
    except Exception as e:
        # Best effort: a failed post must not abort the rest of the run.
        print(e)
        stat = None
    return stat
    
def compose(day, clock, reading):
    """Build the multi-line status text for one reading.

    The first line holds the day, the clock time and the unit label, followed
    by one "name: value index" line per pollutant (sorted by name) and finally
    the short link.

    Args:
        day: Date string of the reading.
        clock: Time string of the reading.
        reading: Mapping of pollutant name to a ``(value, index_text)`` pair.

    Returns:
        The status text with lines joined by newlines.
    """
    header = "%s, %s (%s)" % (day, clock, mgm3)
    body = [
        "%s: %.0f %s" % (name, reading[name][0], reading[name][1])
        for name in sorted(reading)
    ]
    footer = "%s" % shorturlstr
    return '\n'.join([header] + body + [footer])

def _hour_index(day, clock):
    """Return an absolute hour count for a 'dd/mm/yyyy' day and 'hh:mm' clock."""
    from datetime import date

    dd, mm, yyyy = (int(part) for part in day.split('/'))
    # Only the hour part of the clock is used; adding it to the ordinal day
    # keeps an hour value of 24 well defined.
    return date(yyyy, mm, dd).toordinal() * 24 + int(clock[:clock.index(':')])


def composeAboveTweet(day, clock, above, origtweetstat):
    """Build the follow-up texts for pollutants currently above their guide.

    A text is produced for a pollutant only when its most recent exceedance
    (the first entry of its list) is the reading identified by *day* and
    *clock*. The text states for how many consecutive hours, counted
    backwards from that reading, the guide value has been exceeded.

    Hours are compared on an absolute scale derived from the full date, so a
    run of exceedances is still counted correctly across month and year
    boundaries.

    Args:
        day: Date of the current reading, formatted 'dd/mm/yyyy'.
        clock: Time of the current reading, formatted 'hh:mm'.
        above: Mapping of pollutant name to a list of ``(day, clock, value)``
            tuples, newest first, as returned by ``compareWHO``.
        origtweetstat: Unused; kept for interface compatibility.

    Returns:
        A list of status texts, one per pollutant whose latest exceedance is
        the current reading.
    """
    status = []
    now = _hour_index(day, clock)
    for k in above:
        entries = above[k]
        if not entries:
            continue
        lday, lclock, _ = entries[0]
        if lday != day or lclock != clock:
            continue  # latest exceedance is not the current reading

        # Count consecutive hours above, walking back from the current reading.
        nhours = 1
        for lday, lclock, _ in entries[1:]:
            if lday == day and lclock == clock:
                continue  # skip duplicate entries
            if now - _hour_index(lday, lclock) == nhours:
                nhours += 1
            else:
                break

        stat = ["@lpoolcouncil @DefraUKAir @LiverpoolFoE: %s %dh above @WHO guide (%.0f%s %s-mean %s) #airpollution #liverpool" %
                (k, nhours, guides[k], mgm3, meansWHO[k], urlWHO)]
        if meansWHO[k] != meansDEFRA[k]:
            stat.append("(Note #DEFRA data is %s mean)" % meansDEFRA[k])
        status.append('\n'.join(stat))
    return status
        


def scrape():
    """Download the levels page and extract the latest reading for the site.

    The page at ``urlstr`` is fetched, the table row containing ``sitename``
    is located, and the date, time and five pollutant cells are parsed out of
    it.

    Returns:
        A ``(day, clock, reading)`` tuple where ``day`` and ``clock`` are
        strings taken from the row's date cell and ``reading`` maps each
        pollutant name to a ``(value, index_text)`` pair.

    Raises:
        ValueError: If the row, the date cell or the expected five data cells
            cannot be found, or a cell does not have the expected layout.
    """
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(urlstr) as f:
        r = f.read()

    g = re.search(b".*<tr>.*(%s.*?)</tr>" % sitename, r, re.DOTALL)
    if g is None:
        raise ValueError("no table row found for site %r" % sitename)
    row = g.group(1)

    # date and time
    dategroups = re.search(rb".*<td>(.*?)<br.*?>(.*?)</td>", row, re.DOTALL)
    if dategroups is None:
        raise ValueError("no date/time cell found for site %r" % sitename)
    day = dategroups.group(1).decode("utf-8")
    clock = dategroups.group(2).decode("utf-8")

    # data
    cols = re.findall(rb"<span.*?>(.*?)</span>", row, re.DOTALL)
    units = [O3, NO2, SO2, PM25, PM100]
    # Explicit check rather than assert: asserts vanish under -O and zip()
    # below would then silently pair values with the wrong pollutants.
    if len(cols) != len(units):
        raise ValueError("expected %d data cells, found %d" % (len(units), len(cols)))

    datanums = []
    for v in cols:
        # The number is followed either by a plain space or by '&nbsp;'.
        if b' ' in v:
            value = float(v[:v.index(b' ')])
        else:
            value = float(v[:v.index(b'&')])
        nv = v.replace(b'&nbsp;', b' ')
        index_match = re.match(rb".*?(\(.*?\))", nv)
        if index_match is None:
            raise ValueError("no index text found in cell %r" % v)
        datanums.append((value, index_match.group(1).decode("utf-8")))

    reading = dict(zip(units, datanums))
    return day, clock, reading

def loadReadings():
    """Load the stored reading history from ``allreadings.bin``.

    Returns:
        The unpickled object stored in the file, or an empty ``deque`` when
        the file does not exist.
    """
    fall = "allreadings.bin"
    if not os.path.isfile(fall):
        return deque()
    # NOTE: pickle must only be used on trusted data; this file is expected
    # to have been written by pickleReadings().
    with open(fall, "rb") as fh:
        return pickle.load(fh)

def pickleReadings(allreading):
    """Write *allreading* to ``allreadings.bin`` with pickle.

    Args:
        allreading: The reading history to store.
    """
    fall = "allreadings.bin"
    # Dump the argument itself (not a same-looking module global) and make
    # sure the file is flushed and closed.
    with open(fall, "wb") as fh:
        pickle.dump(allreading, fh)
    
def compareWHO(allreadings):
    """Collect every stored value that lies above its guide value.

    Args:
        allreadings: Iterable of ``(day, clock, reading)`` tuples, where
            ``reading`` maps pollutant name to a ``(value, index_text)`` pair.

    Returns:
        A dict mapping pollutant name to a list of ``(day, clock, value)``
        tuples, in the order the readings were iterated. Pollutants that
        never exceed their guide are absent.
    """
    exceeded = {}
    for day, clock, reading in allreadings:
        for pollutant, limit in guides.items():
            level = reading[pollutant][0]
            if level > limit:
                exceeded.setdefault(pollutant, []).append((day, clock, level))
    return exceeded


# Set to True to post a single test tweet instead of scraping.
debug = False

if debug:
    stat = tweet("TTEESSTT")
    print(stat)
    #tweet("In reply to: TEST3", stat)

else:
    allreadings = loadReadings()

    # Remove duplicate entries (could have come in while debugging). The
    # first occurrence is kept so the newest-first order, which the code
    # below relies on, is preserved.
    deduped = deque()
    for r in allreadings:
        if r not in deduped:
            deduped.append(r)
    allreadings = deduped
    for r in allreadings:
        print(r)

    # On the very first run there is no history yet; treat any scraped
    # reading as new instead of failing on allreadings[0].
    if allreadings:
        lastday, lastclock, lastreading = allreadings[0]
    else:
        lastday = lastclock = lastreading = None

    day, clock, reading = scrape()
    if (day, clock) != (lastday, lastclock):
        status = compose(day, clock, reading)
        rtweet = tweet(status)

        allreadings.appendleft((day, clock, reading))
        pickleReadings(allreadings)

        # compare with WHO recommendations
        r = compareWHO(allreadings)
        if r:
            stats = composeAboveTweet(day, clock, r, rtweet)
            for s in stats:
                tweet(s, replyto=rtweet)
    else:
        print("Reading already known")




('27/01/2017', '16:00', {'O₃': (19.0, '(1 Low)'), 'SO₂': (7.0, '(1 Low)'), 'NO₂': (34.0, '(1 Low)'), 'PM₁₀₀': (53.0, '(4 Moderate)'), 'PM₂₅': (48.0, '(6 Moderate)')})
('27/01/2017', '15:00', {'O₃': (17.0, '(1 Low)'), 'PM₁₀₀': (52.0, '(4 Moderate)'), 'NO₂': (31.0, '(1 Low)'), 'SO₂': (6.0, '(1 Low)'), 'PM₂₅': (47.0, '(5 Moderate)')})
('27/01/2017', '14:00', {'O₃': (16.0, '(1 Low)'), 'PM₁₀₀': (51.0, '(4 Moderate)'), 'NO₂': (31.0, '(1 Low)'), 'SO₂': (6.0, '(1 Low)'), 'PM₂₅': (47.0, '(5 Moderate)')})
('27/01/2017', '13:00', {'O₃': (14.0, '(1 Low)'), 'PM₁₀₀': (51.0, '(4 Moderate)'), 'NO₂': (33.0, '(1 Low)'), 'SO₂': (6.0, '(1 Low)'), 'PM₂₅': (46.0, '(5 Moderate)')})
('27/01/2017', '12:00', {'O₃': (14.0, '(1 Low)'), 'PM₁₀₀': (50.0, '(3 Low)'), 'NO₂': (28.0, '(1 Low)'), 'SO₂': (5.0, '(1 Low)'), 'PM₂₅': (46.0, '(5 Moderate)')})
('27/01/2017', '11:00', {'O₃': (13.0, '(1 Low)'), 'PM₁₀₀': (49.0, '(3 Low)'), 'NO₂': (26.0, '(1 Low)'), 'SO₂': (6.0, '(1 Low)'), 'PM₂₅': (45.0, '(5 Moderate)')})
('27/01/