In [1]:
import time
import datetime
import json
import hashlib
import collections

In [24]:
import requests
import pandas as pd
import dateutil
import pytz
import pymongo

In [6]:
# URL of the shuttle API queried by get().
ENDPOINT = 'https://maps.northwestern.edu/api/shuttles'
# Chicago timezone; parse() uses it to localize naive datetimes before
# converting them to epoch seconds.
TZ_CHI = pytz.timezone('America/Chicago')

def get(timeout=10):
    """Fetch the current bus list from the shuttle API.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The value stored under the ``'bus'`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200.
    requests.RequestException
        On connection problems or when ``timeout`` expires.
    KeyError
        If the JSON payload has no ``'bus'`` key.
    """
    response = requests.get(
        ENDPOINT,
        headers={'Accept': 'application/json'},
        timeout=timeout,
    )
    # An explicit check instead of `assert`: assertions are stripped when
    # Python runs with -O, which would let error responses through.
    if response.status_code != 200:
        raise requests.HTTPError(
            'unexpected status {} from {}'.format(response.status_code, ENDPOINT),
            response=response,
        )
    return response.json()['bus']
    

def rowhash(row):
    """Return the hex MD5 digest of `row` serialized as canonical JSON.

    Keys are sorted and no padding whitespace is emitted, so two mappings
    with the same content give the same digest regardless of key order.
    """
    # The two-character string unpacks into (item_separator, key_separator),
    # i.e. ',' between items and ':' between key and value.
    canonical = json.dumps(row, sort_keys=True, separators=',:')
    digest = hashlib.md5(canonical.encode('utf-8'))
    return digest.hexdigest()
    

def parse(raw):
    """Flatten the raw bus records from the shuttle feed into plain dicts.

    Parameters
    ----------
    raw : iterable of dict
        Bus records as returned by ``get()``. Each record must carry a
        ``bus_id`` and a ``geocode`` mapping with ``lat``, ``lon``,
        ``route_id``, ``route_no`` and ``lastUpdate``.

    Returns
    -------
    list of dict
        One dict per bus with the keys ``id``, ``lat``, ``lon`` (rounded to
        5 decimals), ``route_id``, ``route_no``, ``type`` (``'NU'`` or
        ``'CTA'``), ``updated`` (integer epoch seconds) and ``hash``
        (``rowhash`` of all the other fields). ``last_stop`` is present only
        when the feed supplies a truthy ``lastStop``; ``direction`` is
        present only for ``'NU'`` rows.

    Raises
    ------
    KeyError
        If a required field is missing from a record.
    ValueError
        If a coordinate or timestamp cannot be parsed.
    """
    data = []
    for bus in raw:
        geocode = bus['geocode']
        datum = {
            'id': bus['bus_id'],
            'lat': round(float(geocode['lat']), 5),
            'lon': round(float(geocode['lon']), 5),
            'route_id': geocode['route_id'],
            'route_no': geocode['route_no'],
        }

        # geocode['speed'] is deliberately not captured: the feed usually
        # leaves it empty.

        # Only an explicit "cta" tag marks a CTA bus; a missing, null or
        # different type is treated as 'NU'.
        kind = geocode.get('type') or ''
        datum['type'] = 'CTA' if kind.lower().strip() == 'cta' else 'NU'

        # CTA data seems to omit this
        last_stop = geocode.get('lastStop')
        if last_stop:
            datum['last_stop'] = last_stop

        if datum['type'] == 'NU':
            datum['direction'] = geocode['direction']
            # lastUpdate is read as a POSIX timestamp (the same reading the
            # previous fromtimestamp() call applied), so it is used as-is.
            # Going through a naive local datetime and re-localizing it as
            # Chicago time gave results that depended on the host timezone.
            # NOTE(review): assumes the feed sends true epoch seconds — confirm.
            updated = float(geocode['lastUpdate'])
        else:
            # CTA rows carry a naive wall-clock string, interpreted as
            # Chicago local time.
            t = datetime.datetime.strptime(geocode['lastUpdate'], '%Y%m%d %H:%M')
            updated = TZ_CHI.localize(t).timestamp()

        datum['updated'] = int(updated)

        # Hash last, so it fingerprints every other field of the row.
        datum['hash'] = rowhash(datum)

        data.append(datum)

    return data


In [22]:
# Fetch the live feed, parse it and display the rows ordered by their
# 'updated' timestamp.
df = pd.DataFrame(parse(get())).sort_values(by='updated')
df

Unnamed: 0,direction,hash,id,last_stop,lat,lon,route_id,route_no,type,updated
8,N,87ae31e72d0d44b9792811129f9afc33,29823,359.0,42.01804,-87.67297,63,63,NU,1471480680
14,,574e6d300d5e3bd0e46aa21a8497e920,1856,,42.01947,-87.68281,201,4318,CTA,1471480860
13,,5198b425028d51a03a30dadd7898e919,1766,,42.06433,-87.70753,201,4318,CTA,1471480860
9,N,f826ed8b145d74c8c798049ec614503b,29838,473.0,42.06436,-87.70843,63,63,NU,1471480860
15,,9ca9896ab95fa9e01dbb46c544ade12d,1961,,42.01804,-87.67297,201,4319,CTA,1471480860
0,N,01a641fdf517b3c55e5eda4cec5ce7dd,29888,362.0,42.01949,-87.68356,63,63,NU,1471480860
2,N,6f14bdd9024fcc5f23299957f9156e48,29783,481.0,42.06419,-87.68632,63,63,NU,1471480860
16,,00c02741c13b41c0b66fa306c4842482,1883,,42.06423,-87.68704,201,4319,CTA,1471480860
6,S,9d38b749fa51381390000787f3e146f7,29707,264.0,41.68899,-87.60284,52,52,NU,1471480903
1,E,93824eac895e6139cbc46d12d1aa9a34,29967,264.0,41.59683,-87.47148,52,52,NU,1471480903


In [23]:
T = 10  # base polling interval, seconds
Tmax = 5 * 60  # upper bound for the backed-off interval, seconds
N_POLLS = 10  # number of polling iterations to run
KNOWN_MAX = 2048  # how many recent row hashes to remember

# Consecutive polls that returned an empty feed; drives the backoff below.
no_results = 0

# Hashes of rows already seen. The deque is bounded: pushing on the left
# silently drops the oldest hashes from the right once it is full.
known = collections.deque([], KNOWN_MAX)

for _ in range(N_POLLS):

    # Exponential backoff while the feed is empty, capped at Tmax.
    loop_time = min(T * (2 ** min(10, no_results)), Tmax)
    time.sleep(loop_time)

    raw = get()

    if not raw:
        no_results += 1
        continue
    no_results = 0

    # parse() already stores each row's fingerprint under 'hash'; reuse it
    # instead of hashing every row two more times.
    data = [r for r in parse(raw) if r['hash'] not in known]
    known.extendleft(r['hash'] for r in data)

    # Number of rows not seen in earlier polls.
    print(len(data))
    

17
11
13
11
11
6
8
9
9
10


In [67]:
# Read the connection settings ('user', 'pass', 'server') from a local file
# so that no credentials are stored in the notebook. The `with` block makes
# sure the file handle is closed again.
with open('credentials.json') as fh:
    credentials = json.load(fh)

client = pymongo.MongoClient('mongodb://{user}:{pass}@{server}/'.format(**credentials))
# Collection `nubus` in the `transit` database.
db = client['transit']
nubus = db['nubus']

In [86]:
# Number of documents in the `nubus` collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) — confirm against the installed version.
nubus.count()

80

In [78]:
# Load every document of the `nubus` collection into a DataFrame for inspection.
pd.DataFrame(list(nubus.find()))

Unnamed: 0,_id,direction,id,last_stop,lat,lon,route_id,route_no,type,updated
0,00962e4780c932b38bf603ca,N,29888,366.0,42.04781,-87.67884,63,63,NU,1471482780
1,6d0148648a7407b5861e3363,N,29783,332.0,42.06416,-87.68331,63,63,NU,1471482900
2,f63656784b7f82bf73882824,NW,29725,153.0,42.05555,-87.67709,ICB,48,NU,1471482920
3,84e62316c2f542d1897fe9db,N,29823,440.0,42.0645,-87.71831,63,63,NU,1471482900
4,366aec48323bf1ca3d4622f9,N,29838,485.0,42.01865,-87.67604,63,63,NU,1471482600
5,32da694f8bea343ead4683a4,W,29732,262.0,41.89507,-87.62053,SHOPNRIDE,45,NU,1471482926
6,d7639791afc2323171c296f1,,1883,,42.06416,-87.68331,201,4318,CTA,1471482900
7,ff7986c2c24f496d613410b1,,1856,,42.05068,-87.67743,201,4319,CTA,1471482840
8,3fa3074896defdccbb14b8be,,1961,,42.0645,-87.71831,201,4319,CTA,1471482900
9,d9293f8c420c3224b032453e,N,29725,153.0,42.05612,-87.67705,ICB,48,NU,1471482940


In [68]:
# Collection `test` in the `test` database; the bulk upsert further down
# writes to it.
test = client['test']['test']

In [69]:
# Number of documents in the `test` collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) — confirm against the installed version.
test.count()

67

In [64]:
# Fetch and parse a fresh snapshot of the feed; the bulk upsert below iterates
# over `data`.
data = parse(get())

In [65]:
# Insert every parsed row that is not stored yet. The first 24 hex characters
# of the row hash serve as `_id`, and `$setOnInsert` only writes when the
# upsert creates the document, so already-stored rows are left untouched.
ops = []
for doc in data:
    d = doc.copy()
    d['_id'] = d.pop('hash')[:24]
    ops.append(pymongo.UpdateOne({'_id': d['_id']}, {'$setOnInsert': d}, upsert=True))

# bulk_write is the public bulk API; ordered=False lets the server continue
# past individual failures. `bulk_api_result` is the raw result dict
# (nMatched, nUpserted, upserted, ...).
bresult = test.bulk_write(ops, ordered=False).bulk_api_result
bresult

{'nInserted': 0,
 'nMatched': 5,
 'nModified': 0,
 'nRemoved': 0,
 'nUpserted': 5,
 'upserted': [{'_id': '786b2619afa0c7bfebddcbbd', 'index': 2},
  {'_id': '0d5fc35d72384e65695845ba', 'index': 4},
  {'_id': 'bfa1bb3a1f2508bf12d9bea7', 'index': 5},
  {'_id': '2e123fa5d8f5f37cb360373a', 'index': 6},
  {'_id': '42a08310e8f4936a2a3be398', 'index': 9}],
 'writeConcernErrors': [],
 'writeErrors': []}

In [66]:
# Count the documents in the `test` collection again.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) — confirm against the installed version.
test.count()

67