In [10]:
### producer code to transform and push data
import re
import json
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
from datetime import date

# A saved snapshot of the stop-events page is parsed here. The live fetch is
# kept for reference (presumably the origin of the snapshot -- confirm):
#   html = urlopen("http://www.psudataeng.com:8000/getStopEvents")
with open('stopEvents_2022-05-15.html') as html:
    soup = BeautifulSoup(html, 'lxml')

# Collect one 9-digit trip id per <h3> heading, in document order.
all_h3s = soup.find_all('h3')

all_ids = []
for h3 in all_h3s:
    # get_text() always yields a str, whereas .string is None when the heading
    # has nested markup, which would make re.search raise a TypeError.
    heading = h3.get_text()
    match = re.search(r'\d{9}', heading)
    if match is None:
        raise ValueError(f"No 9-digit trip id found in heading: {heading!r}")
    all_ids.append(match.group())

# The i-th heading is paired with the i-th <table>; fail with a clear message
# (instead of a bare IndexError) when a table has no heading to pair with.
all_tables = soup.find_all('table')
if len(all_tables) > len(all_ids):
    raise ValueError(
        f"Found {len(all_tables)} tables but only {len(all_ids)} trip ids"
    )

# Build one record per table from fixed <td> positions.
# .string is used on purpose for the cells: switching to get_text() could
# change what is stored for empty cells and therefore what later cells see.
trip_data = []
for trip_id, table in zip(all_ids, all_tables):
    cells = table.find_all('td')
    trip_data.append({
        "trip_id": trip_id,
        "route_id": cells[3].string,
        "vehicle_id": cells[0].string,
        "service_key": cells[5].string,
        "direction": cells[4].string,
    })

print(len(trip_data))

636


In [11]:
# Persist the scraped records to a JSON file stamped with today's date, then
# read the file back and load the records into a DataFrame.
FILE_DATE = date.today().strftime("%Y-%m-%d")
FILE_NAME = f"se-{FILE_DATE}.json"

# Write: serialise the whole list in one go.
with open(FILE_NAME, 'w') as file:
    file.write(json.dumps(trip_data))

# Read: parse the file contents back into Python objects.
with open(FILE_NAME) as file:
    data = json.loads(file.read())

df = pd.DataFrame(data)
df

Unnamed: 0,trip_id,route_id,vehicle_id,service_key,direction
0,170612619,6,2270,U,0
1,170612634,6,2270,U,0
2,170612649,6,2270,U,1
3,170612662,6,2270,U,1
4,170612670,6,2270,U,0
...,...,...,...,...,...
631,170623254,50,6009,U,1
632,170623282,50,6010,U,
633,170626155,60,4018,U,
634,170628104,37,4032,U,


In [12]:
### consumer code to validate stop_events data
# Trip ids are converted to integers and must fall in the 9-digit range
# (both bounds inclusive).
assertion0 = "All trip_ids are 9 digit integers"
df['trip_id'] = df['trip_id'].astype(int)
trip_ids = df['trip_id']
has_nine_digits = (trip_ids >= 100000000) & (trip_ids <= 999999999)
assert has_nine_digits.all(), f"FAILED {assertion0}"
print(f"PASSED {assertion0}")

PASSED All trip_ids are 9 digit integers


In [13]:
assertion1 = "All route_id are non-null, non-negative integers"
# Check for missing values explicitly so a null surfaces as this assertion
# failing rather than as a conversion error raised by astype(int).
assert df['route_id'].notna().all(), f"FAILED {assertion1}"
df['route_id'] = df['route_id'].astype(int)
# Compare element-wise first, then reduce. The previous form,
# `series.all() >= 0`, compared a single boolean with 0 and so always passed.
assert (df['route_id'] >= 0).all(), f"FAILED {assertion1}"
print(f"PASSED {assertion1}")

PASSED All route_id are non-null, non-negative integers


In [14]:
assertion2 = "All vehicle_id are non-null, non-negative integers"
# Check for missing values explicitly so a null surfaces as this assertion
# failing rather than as a conversion error raised by astype(int).
assert df['vehicle_id'].notna().all(), f"FAILED {assertion2}"
df['vehicle_id'] = df['vehicle_id'].astype(int)
# Compare element-wise first, then reduce. The previous form,
# `series.all() >= 0`, compared a single boolean with 0 and so always passed.
assert (df['vehicle_id'] >= 0).all(), f"FAILED {assertion2}"
print(f"PASSED {assertion2}")

PASSED All vehicle_id are non-null, non-negative integers


In [15]:
# Every service_key must be one of the three allowed single-letter codes.
assertion3 = "All service_key fields are filled with values of either W, S, or U"
allowed = ['W', 'S', 'U']
service_key_ok = df['service_key'].isin(allowed)
assert service_key_ok.all(), f"FAILED {assertion3}"
print(f"PASSED {assertion3}")

PASSED All service_key fields are filled with values of either W, S, or U


In [16]:
# Rows containing any null value are removed from df (in place) before the
# check, so the assertion only covers rows that survive that filter.
assertion4 = "All directions are either 0 or 1"
allowed = ['0', '1']
df.dropna(axis=0, how='any', inplace=True)
direction_ok = df['direction'].isin(allowed)
assert direction_ok.all(), f"FAILED {assertion4}"
print(f"PASSED {assertion4}")

PASSED All directions are either 0 or 1


In [17]:
### consumer code to transform stop_events data
# Replace the raw codes with readable labels. Codes missing from a lookup
# table become None.
SERVICE_KEY_LABELS = {'W': 'Weekday', 'S': 'Saturday', 'U': 'Sunday'}
DIRECTION_LABELS = {'0': 'Out', '1': 'Back'}

df['service_key'] = df['service_key'].apply(lambda code: SERVICE_KEY_LABELS.get(code))
df['direction'] = df['direction'].apply(lambda code: DIRECTION_LABELS.get(code))
df

Unnamed: 0,trip_id,route_id,vehicle_id,service_key,direction
0,170612619,6,2270,Sunday,Out
1,170612634,6,2270,Sunday,Out
2,170612649,6,2270,Sunday,Back
3,170612662,6,2270,Sunday,Back
4,170612670,6,2270,Sunday,Out
...,...,...,...,...,...
613,170616552,60,4007,Sunday,Out
617,170616613,60,4007,Sunday,Out
620,170621558,60,4038,Sunday,Out
621,170621802,60,4038,Sunday,Back


In [20]:
# Extract the YYYY-MM-DD date embedded in a stop-events file path.
i = 'stop_events/stopEvents_2022-05-15.html'
imatch = re.compile(r'\d\d\d\d-\d\d-\d\d').search(i).group()
print(imatch)

2022-05-15
