# Tree

Because of the hirechical nature of the area API we'll define a tree like structure to make it easier to navigatae.

In [1]:
import Base: parent

In [2]:
using Requests, Cascadia,Gumbo, AbstractTrees,ProgressMeter,StatsBase

In [3]:
using AbstractTrees
import AbstractTrees: children, printnode
import Base: start, done, next

In [4]:
#Used to contain areas, these can be districts, sub-disricts
immutable Area 
    name::String
    code::String
    level::Int
    subarea::Vector
end

In [5]:
#used to contain a location, specifically bus stops
immutable Location
    name::String
    lat::Float64
    lon::Float64
end

In [6]:
start(area::Area) = start(area.subarea)

function next(area::Area,i)
    (v,i) = next(area.subarea,i)
end

done(area::Area,i) = done(area.subarea,i)

children(area::Area) = area.subarea

printnode(io::IO, area::Area) = Base.print_with_color(:blue, io, area.name)
printnode(io::IO, location::Location) = Base.print_with_color(:green, io, location.name)
# printnode(io::IO, f::File) = print(io, basename(f.path))

printnode (generic function with 5 methods)

In [7]:
#Helper function to search area tree
function find_area(areas,re)
    
    """
    Can be used to search and area tree with a regualr expression
    returns the first match
    """
    
    for area in PostOrderDFS(areas)
        if typeof(area) == Area
            if ismatch(re,area.name)
                return area
            end
        end
    end
    print("Area Not Found")
end

find_area (generic function with 1 method)

## HTML Helper Functions

This section contains helper functions for traversing and searching  html.

In [8]:
url = "https://mobile.nwstbus.com.hk/nwp3/getlocation.php"

"https://mobile.nwstbus.com.hk/nwp3/getlocation.php"

In [9]:
function parent(node::HTMLNode, n=1)
    
    """
    Returns the parent of a html node, optional takes argument n
    which can be used to specify home many parents to go up
    
    """
    
    parent = node
    for i in 1:n
        parent = parent.parent
    end
    return parent
end
        

parent (generic function with 11 methods)

In [10]:
function innertext(node)
    
    """
    Extract inner text from node.
    """
    
    s = ""
    for ele in Leaves(node)
        if typeof(ele) == HTMLText
            s *= text(ele)
        elseif typeof(ele) ==  HTMLElement{:br}
            s *= "\n"
        end
    end
    
    return s
end
        
        

innertext (generic function with 1 method)

# ETA API

The first set is to get all of the sublocations for the main areas, because of the hirechial nature of them it makes sense to use a tree to store this information.

In [11]:
function extract_lis(html_node)
    
    """
    This function extracts the li elements that contain
    the area/locational information.
    """
    
    lis = matchall(sel"li",html_node);
    l = []
    for li in lis
        if haskey(li.attributes,"onclick")
            push!(l,li.attributes["onclick"])
        end
    end
    return l
end

extract_lis (generic function with 1 method)

In [12]:
function parse_area_code(str)
    
    """
    Parses an area ai returning the name and code
    """
    
    group = match(r"'([^']+)'",str)[1]
    key = split(group,"|")[1]
    code = split(group,"|")[2]
    return key => code
end

parse_area_code (generic function with 1 method)

In [13]:
function add_areas!(top_area,level)
    
    """
    Takes a tree and a level specification and adds all areas to tree
    """
    
    data = Dict(
        "l"=> level,
        "t"=>"s",
        "lang"=>"1", 
    )
    
    for area in Leaves(top_area)
        data["k"] = area.code
        res = post(url,data=data)
        html = res |> String |> parsehtml
        try
            for (name,code) in parse_area_code.(extract_lis(html.root))
                push!(area.subarea, Area(name,code,level,[]))
            end
        end
    end
    return top_area
end

add_areas! (generic function with 1 method)

In [14]:
function parse_location(str)
    group = match(r"'([^']+)'",str)[1]
    name = match(r"[^|]+",group).match
    lat = split(group,"|")[end-2]
    lon = split(group,"|")[end-1]
    lat = parse(lat)
    lon = parse(lon)
    return Location(name,lat,lon)
end

parse_location (generic function with 1 method)

In [15]:
function add_locations!(area)
    
    data = Dict(
        "l"=> 4,
        "t"=>"s",
        "lang"=>"1", 
    )
    
    if typeof(area) == Area
        try
            data["k"] = area.code
            res = post(url,data=data)
            html = res |> String |> parsehtml
            for li in extract_lis(html.root)
                push!(area.subarea ,parse_location(li))
            end
        end
    end
end
    
    

add_locations! (generic function with 1 method)

In [16]:
#Root Node 
ALL = Area("All","ALL",1,[])

#Manually add first main areas
areas = [
    "Kowllon" => "KL",
    "Hong Kong" => "HK",
    "New Territories" => "NT",
]

for (name,area_code) in areas
    push!(ALL.subarea, Area(name,area_code,2,[]))
end

#Add ditrict and sub-districs
ALL = add_areas!(ALL,2); #district
ALL = add_areas!(ALL,3); # sub-distric

#Add locations
for area in Leaves(ALL)
    add_locations!(area)
end

In [20]:
#Blue represents an area, and green represnets a location
# print_tree(ALL)

In [220]:
#Can search tree for areas using a regular expression
hung_hom = find_area(ALL,r"Hung Hom");
print_tree(hung_hom)

[34mHung Hom[39m
├─ [32mArea:Hung Hom[39m
├─ [32mFortune Metropolis[39m
├─ [32mHarbourview Horizons[39m
├─ [32mHK Polytechnic University Student Halls of Residence (Hung Hom)[39m
├─ [32mHung Hom Bay Centre[39m
├─ [32mHung Hom Complex (Public Library)[39m
├─ [32mHung Hom Government Primary School[39m
├─ [32mHung Hom Station[39m
├─ [32mHunghom Commercial Centre / Hunghom Square[39m
├─ [32mKa Wai Chuen[39m
├─ [32mRoyal Peninsula[39m
├─ [32mSKH Holy Carpenter Secondary School[39m
├─ [32mValley Road / Ho Man Tin Station Exit B1[39m
└─ [32mVicinity of Gillies Avenue South, Wuhu Street & Baker Street[39m


## ETA API

Now that we have all of the locations we can use them to query the ETA API, we'll define two functions, one to fetch the html and another to parse it.

In [64]:
function fetch_route_details(start::Location,finish::Location, time= now() + Dates.Minute(5))
    
    data = Dict(
        "slat"=> start.lat,
        "slon"=> start.lon,
        "elat"=>finish.lat,
        "elon"=>finish.lon,
        "loc" =>"$(start.name), $(finish.name)",
        "leg"=>2,
        "ws"=>1.3,
        "t"=> Dates.format(time, "yyyy-mm-dd HH:MM"),
        "l"=>1,
        "m1"=>"T"
    )
    
    route_url = "https://mobile.nwstbus.com.hk/nwp3/ppsearch_p3.php?" 
    res = post(route_url,data=data)
end

fetch_route_details (generic function with 2 methods)

In [65]:
function parse_route_details(html_doc::HTMLDocument)

    """
    Will parse html returning, all routes between two stops, price and estimated
    time. If the returned array is empty it means that two bus stops are within
    walking distance.
    """
    html = html_doc.root
    tables = matchall(sel"#routelist2 > table",html)
    routes = []
    for table in tables
        
        buses = innertext.(matchall(sel".routenocell",table))
        n = length(buses) - 1
        prices = innertext.(matchall(sel"td:matches(\$\d)",table)[end-n:end])
        time = innertext(matchall(sel"td:matches(Estimated.*)",table))
    
        d = Dict(
            "buses" => buses,
            "prices"=> prices,
            "est_time"=> time
        )
        
        push!(routes,d)
    end
    return routes
end
    
    

parse_route_details (generic function with 1 method)

In [41]:
#Get all bus stops in hk
bus_stops = collect(Leaves(ALL));
print("There are $(length(bus_stops)) bus stops.")

There are 2593 bus stops.

In [42]:
#Pick Two bus stops at random
rand_stop_a = sample(bus_stops)
rand_stop_b = sample(bus_stops)
println(rand_stop_a)
println(rand_stop_b)

Location("Bank of China Centre", 22.3177386046, 114.158879069)
Location("Lake Silver", 22.428524343561, 114.24404361247)


In [43]:
#Fetch best Routes
res = fetch_route_details(rand_stop_a,rand_stop_b)
html = res |> String |> parsehtml;
best_routes = parse_route_details(html)

1-element Array{Any,1}:
 Dict{String,Any}(Pair{String,Any}("prices", String["\$9.3", "\$25.6"]),Pair{String,Any}("buses", String["905", "N680"]),Pair{String,Any}("est_time", "Estimated\n114 Min"))

In [33]:
#Alterantively we could look at bus stops for an area
hung_hom = find_area(ALL,r"Hung Hom");
kowloon_city = find_area(ALL,r"Kowloon City");

In [80]:
print_tree(hung_hom)

[34mHung Hom[39m
├─ [32mArea:Hung Hom[39m
├─ [32mFortune Metropolis[39m
├─ [32mHarbourview Horizons[39m
├─ [32mHK Polytechnic University Student Halls of Residence (Hung Hom)[39m
├─ [32mHung Hom Bay Centre[39m
├─ [32mHung Hom Complex (Public Library)[39m
├─ [32mHung Hom Government Primary School[39m
├─ [32mHung Hom Station[39m
├─ [32mHunghom Commercial Centre / Hunghom Square[39m
├─ [32mKa Wai Chuen[39m
├─ [32mRoyal Peninsula[39m
├─ [32mSKH Holy Carpenter Secondary School[39m
├─ [32mValley Road / Ho Man Tin Station Exit B1[39m
└─ [32mVicinity of Gillies Avenue South, Wuhu Street & Baker Street[39m


In [81]:
print_tree(kowloon_city)

[34mKowloon City[39m
├─ [32mArea:Kowloon City[39m
├─ [32mArgyle Street (near Olympic Garden)[39m
├─ [32mArgyle Street Playground[39m
├─ [32mChun Seen Mei Chuen[39m
├─ [32mKowloon City Municipal Services Building (Sports Centre / Public Library)[39m
├─ [32mKowloon City Plaza[39m
├─ [32mKowloon City Post Office[39m
├─ [32mKowloon Walled City Park / Carpenter Road Park[39m
├─ [32mNga Tsin Wai Road (near Prince Edward Road East)[39m
├─ [32mOlympic Garden[39m
├─ [32mPrince Edward Road West (near Olympic Garden)[39m
├─ [32mRegal Oriental Hotel[39m
├─ [32mSung Wong Toi Garden[39m
└─ [32mVicinity of Sa Po Road, Tak Ku Ling Road & Nga Tsin Long Road[39m


In [83]:
#Lets pick two bus stops
a = hung_hom.subarea[2]
b = kowloon_city.subarea[5]
println(a)
println(b)

Location("Fortune Metropolis", 22.303672369191, 114.18345667531)
Location("Kowloon City Municipal Services Building (Sports Centre / Public Library)", 22.3294357446, 114.1890004604)


In [84]:
res = fetch_route_details(a,b)
html = res |> String |> parsehtml;
parse_route_details(html)

7-element Array{Any,1}:
 Dict{String,Any}(Pair{String,Any}("prices", String["\$5.7"]),Pair{String,Any}("buses", String["111"]),Pair{String,Any}("est_time", "Estimated\n25 Min")) 
 Dict{String,Any}(Pair{String,Any}("prices", String["\$5.7"]),Pair{String,Any}("buses", String["107"]),Pair{String,Any}("est_time", "Estimated\n27 Min")) 
 Dict{String,Any}(Pair{String,Any}("prices", String["\$5.7"]),Pair{String,Any}("buses", String["101"]),Pair{String,Any}("est_time", "Estimated\n28 Min")) 
 Dict{String,Any}(Pair{String,Any}("prices", String["\$7.5"]),Pair{String,Any}("buses", String["796X"]),Pair{String,Any}("est_time", "Estimated\n28 Min"))
 Dict{String,Any}(Pair{String,Any}("prices", String["\$5.7"]),Pair{String,Any}("buses", String["116"]),Pair{String,Any}("est_time", "Estimated\n30 Min")) 
 Dict{String,Any}(Pair{String,Any}("prices", String["\$5.7"]),Pair{String,Any}("buses", String["113"]),Pair{String,Any}("est_time", "Estimated\n33 Min")) 
 Dict{String,Any}(Pair{String,Any}("prices", S

# Bus Routes

It's usefull to know all of the buses and there routes.

In [122]:
function fetch_bus_numbers()

    d = Dict(
        "rtype"=>"X",
        "skey"=>"Input Route No.",
        "l"=>1,
        "sysid"=>52
    )



    routesearch_url = "https://mobile.nwstbus.com.hk/nwp3/routesearch.php?"
    post(routesearch_url,data=d)
end

fetch_bus_numbers (generic function with 1 method)

In [128]:
function parse_bus_number(node::HTMLElement)
    
    n =  parent(node,2)
    res = innertext.(matchall(sel"tbody > tr",n)[2:end])
    Dict(
        "bus_no"=>res[1],
        "to"=>res[2],
        "details"=>res[3],
    )
    
end
    
    
    

parse_bus_number (generic function with 1 method)

In [129]:
res = fetch_bus_numbers()
html = res |> String |> parsehtml;

In [130]:
bus_numbers = matchall(sel".routenocell",html.root);

In [127]:
#Bus info, contains forward and backward routes
bus_nos = parse_bus_number.(bus_number)

532-element Array{Dict{String,String},1}:
 Dict("bus_no"=>"1","to"=>"To: Felix Villas","times"=>"Citybus")                                                                   
 Dict("bus_no"=>"1","to"=>"To: Happy Valley (Upper)","times"=>"Citybus")                                                           
 Dict("bus_no"=>"1P","to"=>"To: Central","times"=>"Citybus, Monday to Friday only")                                                
 Dict("bus_no"=>"1P","to"=>"To: Happy Valley (Upper)","times"=>"Citybus, Monday to Friday only")                                   
 Dict("bus_no"=>"2","to"=>"To: Grand Promenade","times"=>"NWFB")                                                                   
 Dict("bus_no"=>"2","to"=>"To: Central (Macau Ferry)","times"=>"NWFB")                                                             
 Dict("bus_no"=>"2A","to"=>"To: Wan Chai North","times"=>"NWFB")                                                                   
 Dict("bus_no"=>"2A","to"=>"To: Yi

In [131]:
#Unique buses ,its exxactly half len of above since some drive ciruclar routes
unique([ bus["bus_no"] for bus in bus_nos])

317-element Array{String,1}:
 "1"   
 "1P"  
 "2"   
 "2A"  
 "2R"  
 "2X"  
 "3A"  
 "4"   
 "4X"  
 "5B"  
 "5X"  
 "6"   
 "6A"  
 ⋮     
 "NA12"
 "NA21"
 "NA29"
 "R8"  
 "R11" 
 "R22" 
 "S1"  
 "S52" 
 "S52P"
 "S56" 
 "X1"  
 "X962"