# COLLECT DISTRICT COORDINATE DATA

In [1]:
import requests
import json
import time
from tqdm import tqdm_notebook as tqdm
from collections import OrderedDict
import numpy as np
import pandas as pd

In [2]:
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

In [3]:
overpass_url = "http://overpass-api.de/api/interpreter"

## FUNCTIONS

In [4]:
def write_roman(num):
    """Return ``num`` written as a Roman numeral.

    Used to build the district names ("I", "II", ...) for the queries.
    The conversion is greedy: for each symbol value, from largest to
    smallest, as many copies as fit are emitted and subtracted.
    """
    # (value, symbol) pairs, largest first, subtractive forms included
    symbol_table = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
    )

    pieces = []
    remaining = num
    for value, symbol in symbol_table:
        copies = remaining // value
        pieces.append(symbol * copies)
        remaining -= value * copies
        # nothing left to encode
        if remaining <= 0:
            break

    return "".join(pieces)

def json_loader(file):
    """Load the JSON document stored at path ``file`` and return the parsed object."""
    # Parse from the handle owned by the context manager so that it is closed
    # on exit; opening the path a second time would leave that handle dangling.
    with open(file, 'r') as fp:
        return json.load(fp)

def get_overpass_query_from_listofIDs(type_, list_of_ids):
    """Build an Overpass QL query that fetches the given ids of one element type.

    ``type_`` is the Overpass element keyword (node, way, rel) and
    ``list_of_ids`` the object ids to place in the union. The result is
    requested as JSON with ``out center``.
    """
    union_members = ";".join(f"{type_}({id_})" for id_ in list_of_ids)
    header = "\n    [out:json];\n    "
    footer = ";\n    out center;\n    "
    return header + "(" + union_members + ";)" + footer

def get_border_of_district(ker, nodes_resp, nodes_resp_byways):
    """Chain the border ways of district ``ker`` into one ordered list of points.

    Parameters:
        ker: district key used to index ``nodes_resp`` and ``nodes_resp_byways``.
        nodes_resp: mapping district -> list of node dicts (passed on to the
            fallback search).
        nodes_resp_byways: mapping district -> {way id: list of node dicts}.

    Returns:
        A list of dicts (one per point, in border order), each carrying a
        ``name`` of the form ``"<ker>. kerulet"``; the ``tags``, ``type`` and
        ``dist`` columns are dropped when present.
    """
    ways_of_district = nodes_resp_byways[ker]

    # get first way's coordinates in correct order
    firstway = list(ways_of_district.keys())[0]
    sorted_df = sort_points(ways_of_district[firstway])

    # we should not look for continuation in ways done
    ways_done = [firstway]
    id_to_find = sorted_df.iloc[-1]['id']

    # find the next way to continue until every way has been consumed
    while len(ways_done) != len(ways_of_district):
        next_way = find_next_way(ker, nodes_resp_byways, ways_done, id_to_find)
        if next_way is None:
            # no remaining way shares the last node: fall back to the nearest one
            next_way = find_next_way_alt(ker, nodes_resp, nodes_resp_byways, ways_done, id_to_find, sorted_df)
        sorted_df_new = sort_points(ways_of_district[next_way], id_to_find)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        sorted_df = pd.concat([sorted_df, sorted_df_new], sort=False)
        ways_done.append(next_way)
        id_to_find = sorted_df.iloc[-1]['id']

    sorted_df['name'] = f'{ker}. kerulet'
    records = sorted_df.reset_index(drop=True).drop(['tags', 'type', 'dist'], axis=1, errors='ignore')
    return json.loads(records.to_json(orient='records'))

# Default reference point for get_dist. The defaults in the signature below
# are bound to these values once, when the function is defined.
lat = 0
lon = 0


def get_dist(row, lat = lat, lon = lon):
    """Return the straight-line distance between ``row`` and (``lat``, ``lon``).

    ``row`` must support ``row['lat']`` and ``row['lon']``; the result is in
    the same units as the coordinates.
    """
    delta_lat = row['lat'] - lat
    delta_lon = row['lon'] - lon
    return (delta_lat ** 2 + delta_lon ** 2) ** 0.5

def sort_points(df, id_to_start=None):
    """Order points into a path by repeatedly hopping to the closest remaining point.

    Parameters:
        df: data accepted by ``pd.DataFrame`` with ``id``, ``lat`` and ``lon``
            columns (one row per point).
        id_to_start: id of the point the path should start from. When falsy,
            the path starts from the first point after sorting by
            (``lon``, ``lat``).

    Returns:
        A DataFrame with the points in visiting order. Rows after the first
        also carry the ``dist`` they had to the previously visited point.
    """
    if id_to_start:
        # move the requested starting point to the first row
        elso_df = pd.DataFrame(df)
        elso_df['ordering'] = np.where(elso_df['id'] == id_to_start, 0, 1)
        elso_df = elso_df.sort_values('ordering').reset_index(drop=True)
        elso_df = elso_df.drop('ordering', axis=1)
    else:
        elso_df = pd.DataFrame(df).sort_values(by=['lon', 'lat']).reset_index(drop=True)

    # the first row seeds the path and is removed from the pool
    sorted_df = pd.DataFrame(elso_df.loc[0]).T
    lat = elso_df.loc[0, 'lat']
    lon = elso_df.loc[0, 'lon']
    elso_df.drop(0, inplace=True)

    while len(elso_df) > 0:
        # distance of every remaining point from the current end of the path
        elso_df['dist'] = elso_df.apply(lambda x: get_dist(x, lat, lon), axis=1)
        elso_df = elso_df.sort_values(by='dist')
        nearest = pd.DataFrame(elso_df.loc[elso_df.index[0]]).T
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        sorted_df = pd.concat([sorted_df, nearest], sort=True)
        lat = sorted_df.iloc[-1]['lat']
        lon = sorted_df.iloc[-1]['lon']
        elso_df.drop(elso_df.index[0], inplace=True)

    return sorted_df

def find_next_way(ker, nodes_resp_byways, ways_done, id_to_find):
    """Return the first unused way of district ``ker`` that contains node ``id_to_find``.

    Ways listed in ``ways_done`` are skipped. Returns ``None`` when no
    remaining way contains the node.
    """
    for way, way_nodes in nodes_resp_byways[ker].items():
        if way in ways_done:
            continue
        if any(node['id'] == id_to_find for node in way_nodes):
            return way
    return None
        
def find_next_way_alt(ker, nodes_resp, nodes_resp_byways, ways_done, id_to_find, sorted_df):
    """Fallback search: pick the unused way owning the unvisited node nearest the path end.

    Parameters:
        ker: district key.
        nodes_resp: mapping district -> list of node dicts.
        nodes_resp_byways: mapping district -> {way id: list of node dicts}.
        ways_done: way ids already consumed.
        id_to_find: not used here; kept so the signature mirrors find_next_way.
        sorted_df: DataFrame of the points ordered so far; its last row is the
            current end of the path.

    Returns:
        The id of the first way, not in ``ways_done``, that contains the
        closest not-yet-visited node.

    Raises:
        IndexError: if every way containing that node is already in ``ways_done``.
    """
    # find potential next points; build the visited-id lookup once instead of per node
    visited_ids = set(sorted_df['id'].tolist())
    available_points = [f for f in nodes_resp[ker] if f['id'] not in visited_ids]

    # calculate the closest to the current end of the path (read the last row once)
    last_point = sorted_df.iloc[-1]
    last_lat = last_point['lat']
    last_lon = last_point['lon']
    temp_df = pd.DataFrame(available_points)
    temp_df['dist'] = temp_df.apply(lambda x: get_dist(x, last_lat, last_lon), axis=1)
    next_id = temp_df.sort_values('dist').reset_index(drop=True).loc[0, 'id']

    # return way with the closest id to continue
    return [f for f in get_wayids_for_nodeid(next_id, ker, nodes_resp_byways) if f not in ways_done][0]

def json_to_file(my_json, filename):
    """Serialize ``my_json`` as JSON into ``filename``, replacing any existing content."""
    with open(filename, 'w') as out_file:
        json.dump(my_json, out_file)
        
def get_wayids_for_nodeid(nodeid, ker, nodes_resp_byways):
    """List the way ids of district ``ker`` whose nodes include ``nodeid``."""
    matching_ways = []
    for way_id, way_nodes in nodes_resp_byways[ker].items():
        ids_on_way = [node['id'] for node in way_nodes]
        if nodeid in ids_on_way:
            matching_ways.append(way_id)
    return matching_ways

## DISTRICTS

### GET WAYS BORDERING DISTRICTS

In [6]:
# Roman-numeral names of the 23 districts, computed once and reused below
district_names = [write_roman(f) for f in range(1, 24)]
responses = {ker: None for ker in district_names}

for ker in district_names:
    print(f"Downloading: {ker}. kerület")
    # look the district relation up by name inside the HU country area
    overpass_query = f"""
    [out:json];
    area["ISO3166-1"="HU"][admin_level=2];
    (
     rel["name"="{ker}. kerület"](area);
    );
    out center;
    """

    response = requests.get(overpass_url, params={'data': overpass_query}, headers={'User-Agent': user_agent})
    try:
        responses[ker] = response.json()
    except ValueError:
        # A body that is not valid JSON makes .json() raise a ValueError
        # subclass. Retry once; catching only this keeps Ctrl-C and other
        # unrelated errors from being swallowed.
        print('Error, try again...')
        response = requests.get(overpass_url, params={'data': overpass_query}, headers={'User-Agent': user_agent})
        responses[ker] = response.json()
    print('Success!')

Downloading: I. kerület
Success!
Downloading: II. kerület
Success!
Downloading: III. kerület
Success!
Downloading: IV. kerület
Error, try again...
Success!
Downloading: V. kerület
Success!
Downloading: VI. kerület
Success!
Downloading: VII. kerület
Success!
Downloading: VIII. kerület
Success!
Downloading: IX. kerület
Success!
Downloading: X. kerület
Success!
Downloading: XI. kerület
Success!
Downloading: XII. kerület
Success!
Downloading: XIII. kerület
Success!
Downloading: XIV. kerület
Success!
Downloading: XV. kerület
Success!
Downloading: XVI. kerület
Success!
Downloading: XVII. kerület
Success!
Downloading: XVIII. kerület
Success!
Downloading: XIX. kerület
Success!
Downloading: XX. kerület
Success!
Downloading: XXI. kerület
Success!
Downloading: XXII. kerület
Success!
Downloading: XXIII. kerület
Success!


### GET WAYS AND POINTS CONSTRUCTING THE DISTRICT BORDERS

In [7]:
# keep only the refs of the members that have the 'outer' role in the first
# element of each district's response
outer_ways = {
    ker: [
        member['ref']
        for member in district_resp['elements'][0]['members']
        if member['role'] == 'outer'
    ]
    for ker, district_resp in responses.items()
}

In [8]:
# Download the full way objects (with their node id lists) for every district
ways_resp = {ker: None for ker in outer_ways.keys()}
for ker in outer_ways.keys():
    print(f"Downloading ways for {ker}. district...")
    # one union query listing every outer way of the district
    overpass_query = f"""
        [out:json];
         """+ "("+";".join([f"way({wayid})" for wayid in outer_ways[ker]])+";)" + """;
        out center;
        """
    response = requests.get(overpass_url, params={'data': overpass_query})
    # pause between districts
    time.sleep(1)
    try:
        response_json = response.json()
    except ValueError:
        # Body was not valid JSON: retry once, this time sending the browser
        # User-Agent. Only ValueError is caught so that Ctrl-C and unrelated
        # errors still propagate.
        response = requests.get(overpass_url, params={'data': overpass_query}, headers={'User-Agent': user_agent})
        response_json = response.json()
    ways_resp[ker] = response_json
    print('Success!')

Downloading ways for I. district...
Success!
Downloading ways for II. district...
Success!
Downloading ways for III. district...
Success!
Downloading ways for IV. district...
Success!
Downloading ways for V. district...
Success!
Downloading ways for VI. district...
Success!
Downloading ways for VII. district...
Success!
Downloading ways for VIII. district...
Success!
Downloading ways for IX. district...
Success!
Downloading ways for X. district...
Success!
Downloading ways for XI. district...
Success!
Downloading ways for XII. district...
Success!
Downloading ways for XIII. district...
Success!
Downloading ways for XIV. district...
Success!
Downloading ways for XV. district...
Success!
Downloading ways for XVI. district...
Success!
Downloading ways for XVII. district...
Success!
Downloading ways for XVIII. district...
Success!
Downloading ways for XIX. district...
Success!
Downloading ways for XX. district...
Success!
Downloading ways for XXI. district...
Success!
Downloading ways for 

### GET NODES BY DISTRICT

In [9]:
# node ids to query: per district, way id -> the node ids listed on that way
nodes_border = {
    ker: {way['id']: way['nodes'] for way in ways_resp[ker]['elements']}
    for ker in outer_ways
}
# per district, the node ids of all its ways concatenated (no de-duplication)
nodes_byker = {
    ker: [node_id for way_nodes in nodes_border[ker].values() for node_id in way_nodes]
    for ker in outer_ways
}

In [10]:
# Download the node objects (coordinates) of every district; each district
# ends up with a list of raw Overpass responses.
nodes_resp = {ker: [] for ker in nodes_border.keys()}
for ker in nodes_resp.keys():
    print(f"Downloading nodes for {ker}. district...")
    if len(nodes_byker[ker]) > 300:
        # Query in slices of at most 300 ids. Stepping with range(0, len, 300)
        # never yields an empty trailing slice, so an id count that is an
        # exact multiple of 300 does not produce an empty query.
        overpass_queries = [get_overpass_query_from_listofIDs('node', nodes_byker[ker][start:start + 300])
                            for start in range(0, len(nodes_byker[ker]), 300)]
        # Kept under its own name so the `responses` dict holding the district
        # relations is not overwritten.
        node_responses = [requests.get(overpass_url, params={'data': overpass_query}) for overpass_query in overpass_queries]
        try:
            response_jsons = [response.json() for response in node_responses]
        except ValueError:
            # at least one body was not valid JSON: re-run the whole batch once
            node_responses = [requests.get(overpass_url, params={'data': overpass_query}) for overpass_query in overpass_queries]
            response_jsons = [response.json() for response in node_responses]
        nodes_resp[ker].extend(response_jsons)
        print('Success!')

    else:
        overpass_query = get_overpass_query_from_listofIDs('node', nodes_byker[ker])
        response = requests.get(overpass_url, params={'data': overpass_query})
        time.sleep(1)
        try:
            response_json = response.json()
        except ValueError:
            # body was not valid JSON: retry once with the browser User-Agent
            response = requests.get(overpass_url, params={'data': overpass_query}, headers={'User-Agent': user_agent})
            response_json = response.json()
        nodes_resp[ker].append(response_json)
        print('Success!')

Downloading nodes for I. district...
Success!
Downloading nodes for II. district...
Success!
Downloading nodes for III. district...
Success!
Downloading nodes for IV. district...
Success!
Downloading nodes for V. district...
Success!
Downloading nodes for VI. district...
Success!
Downloading nodes for VII. district...
Success!
Downloading nodes for VIII. district...
Success!
Downloading nodes for IX. district...
Success!
Downloading nodes for X. district...
Success!
Downloading nodes for XI. district...
Success!
Downloading nodes for XII. district...
Success!
Downloading nodes for XIII. district...
Success!
Downloading nodes for XIV. district...
Success!
Downloading nodes for XV. district...
Success!
Downloading nodes for XVI. district...
Success!
Downloading nodes for XVII. district...
Success!
Downloading nodes for XVIII. district...
Success!
Downloading nodes for XIX. district...
Success!
Downloading nodes for XX. district...
Success!
Downloading nodes for XXI. district...
Success!


#### MAP NODES TO WAYS

In [11]:
# merge the 'elements' lists of each district's responses into one flat list
nodes_resp = {
    ker: [element for resp in district_resps for element in resp['elements']]
    for ker, district_resps in nodes_resp.items()
}

In [12]:
# For every district, group the downloaded node dicts by the way they belong
# to. Nodes keep the order they have in nodes_resp[ker].
nodes_resp_byways = {}
for ker in nodes_resp.keys():
    nodes_resp_byways[ker] = {}
    for way, way_node_ids in nodes_border[ker].items():
        # set lookup instead of scanning the way's id list for every node
        id_lookup = set(way_node_ids)
        nodes_resp_byways[ker][way] = [f for f in nodes_resp[ker] if f['id'] in id_lookup]

#### SORT NODES USING THE ALGORITHM

In [17]:
# build the ordered border of every district, with a progress bar
sorted_nodes = {}
for ker in tqdm(nodes_resp.keys()):
    sorted_nodes[ker] = get_border_of_district(ker, nodes_resp, nodes_resp_byways)

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))

In [19]:
# concatenate the per-district record lists into one flat list
all_coords = [
    record
    for district_records in sorted_nodes.values()
    for record in district_records
]

#### WRITE TO FILE

In [21]:
# dump the combined coordinate list as JSON via the helper defined above
json_to_file(all_coords, '../data/district_coords.txt')