# COLLECT DISTRICT COORDINATE DATA

In [1]:
import requests
import json
import time
from tqdm import tqdm_notebook as tqdm
from collections import OrderedDict
import numpy as np
import pandas as pd

In [2]:
# Browser-like User-Agent string; it is sent as a request header by the district relation download further down.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

In [3]:
# Overpass API interpreter endpoint; every requests.get call in this notebook targets this URL.
overpass_url = "http://overpass-api.de/api/interpreter"

## FUNCTIONS

In [4]:
def write_roman(num):
    """Return the Roman-numeral spelling of ``num`` (used to build district names).

    The numeral is assembled greedily from the largest symbol value down to 1.
    ``0`` yields an empty string.
    """
    # Symbol table in descending value order; the subtractive pairs (CM, IV, ...)
    # are listed explicitly so the greedy walk emits them directly.
    numerals = OrderedDict([
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"),
    ])

    def roman_num(num):
        # Emit each symbol as many times as its value fits into the remainder.
        for value, symbol in numerals.items():
            count = num // value
            yield symbol * count
            num -= value * count
            if num <= 0:
                break

    return "".join(roman_num(num))

def json_loader(file):
    """Load the JSON document stored at path ``file`` and return the parsed object.

    Args:
        file: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (dict, list, ...).
    """
    # Parse from the handle owned by the ``with`` block so it is closed on exit;
    # previously a second handle was opened for parsing and never closed.
    with open(file, 'r') as fp:
        return json.load(fp)

def get_overpass_query_from_listofIDs(type_, list_of_ids):
    """Build an Overpass QL query that fetches the listed objects by id.

    ``type_`` is the element kind used in each statement (node, way, rel) and
    ``list_of_ids`` the ids to request; the statements are wrapped in one
    union and returned with ``out center``.
    """
    header = "\n    [out:json];\n    "
    footer = ";\n    out center;\n    "
    # One ``type(id)`` statement per id, separated by semicolons.
    statements = ";".join(f"{type_}({id_})" for id_ in list_of_ids)
    return header + "(" + statements + ";)" + footer

def get_border_of_district(ker, nodes_resp, nodes_resp_byways):
    """Assemble the border nodes of one district into a single ordered list.

    The first way listed for the district is ordered with ``sort_points``; then,
    until every way has been used, the way continuing from the last placed node
    is looked up (``find_next_way``, falling back to ``find_next_way_alt``),
    ordered starting from that node and attached to the result.

    Args:
        ker: district key used to index ``nodes_resp`` and ``nodes_resp_byways``.
        nodes_resp: mapping district -> list of node dicts (only forwarded to
            ``find_next_way_alt``).
        nodes_resp_byways: mapping district -> {way id: list of node dicts}.

    Returns:
        A list of dict records (one per placed node, in traversal order) with a
        ``name`` field added and the ``tags``, ``type`` and ``dist`` columns
        removed when present.
    """
    district_ways = nodes_resp_byways[ker]

    # get first ways coordinates in correct order
    firstway = list(district_ways.keys())[0]
    sorted_df = sort_points(district_ways[firstway])

    # we should not look for continuation in ways done
    ways_done = [firstway]
    id_to_find = sorted_df.iloc[-1]['id']

    # find the next way to continue
    while len(ways_done) != len(district_ways.keys()):
        next_way = find_next_way(ker, nodes_resp_byways, ways_done, id_to_find)
        if next_way is None:
            # no unused way contains the last node: continue from the closest unused point
            next_way = find_next_way_alt(ker, nodes_resp, nodes_resp_byways, ways_done, id_to_find, sorted_df)
        sorted_df_new = sort_points(district_ways[next_way], id_to_find)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in equivalent
        sorted_df = pd.concat([sorted_df, sorted_df_new], sort=False)
        ways_done.append(next_way)
        id_to_find = sorted_df.iloc[-1]['id']

    sorted_df['name'] = f'{ker}. kerulet'
    records_df = sorted_df.reset_index(drop=True).drop(['tags', 'type', 'dist'], axis = 1, errors='ignore')
    return json.loads(records_df.to_json(orient='records'))

# Module-level origin; captured as the default reference point of get_dist.
lat = 0
lon = 0

def get_dist(row, lat = lat, lon = lon):
    """Return the straight-line distance between ``row`` and a reference point.

    ``row`` must offer ``'lat'`` and ``'lon'`` items; the reference point is
    ``(lat, lon)``. The raw coordinate values are used as they are (plain
    Euclidean distance, no geographic projection).
    """
    delta_lat = row['lat'] - lat
    delta_lon = row['lon'] - lon
    return (delta_lat ** 2 + delta_lon ** 2) ** 0.5

def sort_points(df, id_to_start=None):
    """Order points into a chain by repeatedly jumping to the closest unused one.

    Args:
        df: data accepted by ``pd.DataFrame`` (e.g. a list of node dicts) with
            ``id``, ``lat`` and ``lon`` columns.
        id_to_start: id of the point the chain should start from. When omitted,
            the chain starts at the point with the smallest ``(lon, lat)``.

    Returns:
        A DataFrame with one row per input point in visiting order. When more
        than one point is given it also carries a ``dist`` column (distance to
        the previously visited point, NaN for the first row).
    """
    remaining = pd.DataFrame(df)
    if id_to_start is not None:
        # Move the requested start point to the front of the frame.
        remaining['ordering'] = np.where(remaining['id'] == id_to_start, 0, 1)
        remaining = remaining.sort_values('ordering').reset_index(drop=True)
        remaining = remaining.drop('ordering', axis = 1)
    else:
        remaining = remaining.sort_values(by = ['lon', 'lat']).reset_index(drop = True)

    first = remaining.loc[0]
    # Visited rows are collected and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and growing a frame row by
    # row copies it on every step.
    visited = [pd.DataFrame(first).T]
    lat, lon = first['lat'], first['lon']
    remaining = remaining.drop(0)

    while len(remaining) > 0:
        # Euclidean distance of every unused point to the last visited one
        # (same formula as get_dist, computed column-wise).
        remaining['dist'] = ((remaining['lat'] - lat) ** 2 + (remaining['lon'] - lon) ** 2) ** 0.5
        remaining = remaining.sort_values(by = 'dist')
        nearest = remaining.iloc[0]
        visited.append(pd.DataFrame(nearest).T)
        lat, lon = nearest['lat'], nearest['lon']
        remaining = remaining.drop(remaining.index[0])

    if len(visited) == 1:
        return visited[0]
    return pd.concat(visited, sort = True)

def find_next_way(ker, nodes_resp_byways, ways_done, id_to_find):
    """Return the first unused way of district ``ker`` containing node ``id_to_find``.

    Ways listed in ``ways_done`` are skipped. ``None`` is returned when no
    remaining way contains the node.
    """
    for way_id, way_nodes in nodes_resp_byways[ker].items():
        if way_id in ways_done:
            continue
        if any(node['id'] == id_to_find for node in way_nodes):
            return way_id
    return None
        
def find_next_way_alt(ker, nodes_resp, nodes_resp_byways, ways_done, id_to_find, sorted_df):
    """Fallback continuation: pick the unused way owning the closest unplaced node.

    Args:
        ker: district key used to index ``nodes_resp`` and ``nodes_resp_byways``.
        nodes_resp: mapping district -> list of node dicts.
        nodes_resp_byways: mapping district -> {way id: list of node dicts}.
        ways_done: way ids that are already part of the border.
        id_to_find: unused here; kept so the signature parallels ``find_next_way``.
        sorted_df: DataFrame of the nodes placed so far (``id``, ``lat``, ``lon``).

    Returns:
        The id of the first way that contains the closest unplaced node and is
        not in ``ways_done``.

    Raises:
        IndexError: if every way containing that node is already in ``ways_done``.
    """
    # find potential next points
    # (ids already placed are collected once instead of once per candidate)
    placed_ids = set(sorted_df['id'].tolist())
    available_points = [f for f in nodes_resp[ker] if f['id'] not in placed_ids]

    # calculate the closest to the last placed point
    last_point = sorted_df.iloc[-1]
    last_lat, last_lon = last_point['lat'], last_point['lon']
    temp_df = pd.DataFrame(available_points)
    temp_df['dist'] = temp_df.apply(lambda x: get_dist(x, last_lat, last_lon), axis = 1)
    next_id = temp_df.sort_values('dist').reset_index(drop=True).loc[0, 'id']

    # return way with the closest id to continue
    return [f for f in get_wayids_for_nodeid(next_id, ker, nodes_resp_byways) if not f in ways_done][0]

def json_to_file(my_json, filename):
    """Serialize ``my_json`` as JSON into ``filename``, replacing any existing content."""
    with open(filename, 'w') as out_file:
        json.dump(my_json, out_file)
        
def get_wayids_for_nodeid(nodeid, ker, nodes_resp_byways):
    """List the ways of district ``ker`` whose node lists contain ``nodeid``.

    The ids are returned in the iteration order of ``nodes_resp_byways[ker]``;
    the list is empty when no way contains the node.
    """
    matching_ways = []
    for way_id, way_nodes in nodes_resp_byways[ker].items():
        if nodeid in (node['id'] for node in way_nodes):
            matching_ways.append(way_id)
    return matching_ways

## DISTRICTS

### GET WAYS BORDERING DISTRICTS

In [5]:
# Download one Overpass relation response per district (Roman numerals I..XXIII).
district_keys = [write_roman(f) for f in range(1,24)]
responses = {ker: None for ker in district_keys}

# Upper bound on attempts per district: without it a persistently failing
# endpoint keeps the cell retrying forever.
max_district_tries = 10

for ker in district_keys:
    print(f"Downloading: {ker}. kerület")
    overpass_query = f"""
    [out:json];
    area["ISO3166-1"="HU"][admin_level=2];
    (
     rel["name"="{ker}. kerület"](area);
    );
    out center;
    """
    for try_num in range(1, max_district_tries + 1):
        print(f'Try {try_num}')
        try:
            # timeout keeps a stalled connection from hanging the notebook
            response = requests.get(overpass_url, params={'data': overpass_query},
                                    headers={'User-Agent': user_agent}, timeout=180)
        except requests.exceptions.RequestException as err:
            # network-level failure: report it and treat it like a failed attempt
            print(f'Request failed: {err}')
        else:
            if response.status_code == 200:
                responses[ker] = response.json()
                print('Success!')
                break
        # linear back-off before the next attempt
        time.sleep(try_num)
    else:
        raise RuntimeError(f"Could not download {ker}. kerület after {max_district_tries} tries")

Downloading: I. kerület
Try 1
Success!
Downloading: II. kerület
Try 1
Try 2
Success!
Downloading: III. kerület
Try 1
Try 2
Success!
Downloading: IV. kerület
Try 1
Try 2
Try 3
Success!
Downloading: V. kerület
Try 1
Success!
Downloading: VI. kerület
Try 1
Try 2
Try 3
Success!
Downloading: VII. kerület
Try 1
Try 2
Success!
Downloading: VIII. kerület
Try 1
Success!
Downloading: IX. kerület
Try 1
Try 2
Try 3
Try 4
Success!
Downloading: X. kerület
Try 1
Success!
Downloading: XI. kerület
Try 1
Try 2
Try 3
Try 4
Success!
Downloading: XII. kerület
Try 1
Success!
Downloading: XIII. kerület
Try 1
Try 2
Try 3
Try 4
Success!
Downloading: XIV. kerület
Try 1
Success!
Downloading: XV. kerület
Try 1
Try 2
Try 3
Try 4
Success!
Downloading: XVI. kerület
Try 1
Success!
Downloading: XVII. kerület
Try 1
Try 2
Try 3
Success!
Downloading: XVIII. kerület
Try 1
Success!
Downloading: XIX. kerület
Try 1
Try 2
Try 3
Success!
Downloading: XX. kerület
Try 1
Success!
Downloading: XXI. kerület
Try 1
Try 2
Try 3
Try 4


### GET WAYS AND POINTS CONSTRUCTING THE DISTRICT BORDERS

In [6]:
# Keep only what is needed from each district response: the refs of the
# members whose role is 'outer' in the first returned element.
outer_ways = {
    ker: [member['ref']
          for member in district_response['elements'][0]['members']
          if member['role'] == 'outer']
    for ker, district_response in responses.items()
}

In [7]:
# Placeholder per district; filled with the way download results in the next step.
ways_resp = {district: None for district in outer_ways}

In [9]:
# Download the way elements of every district border.
# Upper bound on attempts per district: without it a persistently failing
# endpoint keeps the cell retrying forever.
max_way_tries = 10

for ker in outer_ways.keys():
    print(f"Downloading ways for {ker}. district...")
    if ways_resp[ker]:
        # already downloaded by an earlier run of this cell
        continue

    # same id-list query shape as used for the node downloads
    overpass_query = get_overpass_query_from_listofIDs('way', outer_ways[ker])

    for try_num in range(1, max_way_tries + 1):
        print(f'Try {try_num}')
        try:
            # timeout keeps a stalled connection from hanging the notebook
            response = requests.get(overpass_url, params={'data': overpass_query}, timeout=180)
        except requests.exceptions.RequestException as err:
            # network-level failure: report it and treat it like a failed attempt
            print(f'Request failed: {err}')
        else:
            if response.status_code == 200:
                ways_resp[ker] = response.json()
                print('Success!')
                break
        # linear back-off before the next attempt
        time.sleep(try_num)
    else:
        raise RuntimeError(f"Could not download ways for {ker}. district after {max_way_tries} tries")

Downloading ways for I. district...
Downloading ways for II. district...
Try 1
Try 2
Success!
Downloading ways for III. district...
Try 1
Success!
Downloading ways for IV. district...
Try 1
Try 2
Success!
Downloading ways for V. district...
Try 1
Success!
Downloading ways for VI. district...
Try 1
Success!
Downloading ways for VII. district...
Try 1
Success!
Downloading ways for VIII. district...
Try 1
Success!
Downloading ways for IX. district...
Try 1
Success!
Downloading ways for X. district...
Try 1
Success!
Downloading ways for XI. district...
Try 1
Success!
Downloading ways for XII. district...
Try 1
Success!
Downloading ways for XIII. district...
Try 1
Success!
Downloading ways for XIV. district...
Try 1
Try 2
Success!
Downloading ways for XV. district...
Try 1
Success!
Downloading ways for XVI. district...
Try 1
Success!
Downloading ways for XVII. district...
Try 1
Success!
Downloading ways for XVIII. district...
Try 1
Success!
Downloading ways for XIX. district...
Try 1
Succes

### GET NODES BY DISTRICT

In [43]:
# node ids to query
# per district: way id -> node id list of that way
nodes_border = {
    ker: {way['id']: way['nodes'] for way in ways_resp[ker]['elements']}
    for ker in outer_ways.keys()
}
# per district: the node ids of all its ways, concatenated in way order
nodes_byker = {
    ker: [node_id for way_nodes in nodes_border[ker].values() for node_id in way_nodes]
    for ker in outer_ways.keys()
}

In [44]:
# One (initially empty) list of node query responses per district.
nodes_resp = {district: [] for district in nodes_border}

In [45]:
# Download the node elements of every district, at most `nodes_per_query`
# ids per request. A district is stored only after all of its segments succeeded,
# so a failed district stays empty and is retried when the cell is run again.
nodes_per_query = 300
max_node_tries = 4

for ker in nodes_resp.keys():
    print(f"Downloading nodes for {ker}. district...")
    if nodes_resp[ker]:
        print("continuing, as it is ready.")
        continue

    node_ids = nodes_byker[ker]
    # Stepping through the list never produces an empty trailing chunk, which the
    # previous int(len/300) arithmetic did for exact multiples of the chunk size.
    id_chunks = [node_ids[start:start + nodes_per_query]
                 for start in range(0, len(node_ids), nodes_per_query)]
    if len(id_chunks) > 1:
        print("The task to be done in multiple segments...")
        print("-------------")

    # Local name on purpose: the district download stores its results in
    # `responses`, which must not be overwritten here.
    segment_jsons = []
    for n, id_chunk in enumerate(id_chunks):
        print(f"Segment {n+1} of {len(id_chunks)}:")
        overpass_query = get_overpass_query_from_listofIDs('node', id_chunk)
        segment_json = None
        for i in range(max_node_tries):
            print(f'Try {i}')
            try:
                # timeout keeps a stalled connection from hanging the notebook
                response = requests.get(overpass_url, params={'data': overpass_query}, timeout=180)
                if response.status_code == 200:
                    segment_json = response.json()
            except (requests.exceptions.RequestException, ValueError) as err:
                # network failure or a body that is not valid JSON: count as a failed try
                print(f"Request failed: {err}")
            if segment_json is not None:
                print("Segment successful!")
                print("-------------")
                break
            time.sleep(2)
        if segment_json is None:
            raise ValueError(f"Could not obtain all nodes for ker {ker}")
        segment_jsons.append(segment_json)

    nodes_resp[ker] = segment_jsons
    print('Success!')
    print(f"District {ker} ready")
    print("-------------")
    print("-------------")

Downloading nodes for I. district...
The task to be done in multiple segments...
-------------
Segment 1 of 2:
Try 0
Segment successful!
-------------
Segment 2 of 2:
Try 0
Segment successful!
-------------
Success!
District I ready
-------------
-------------
Downloading nodes for II. district...
The task to be done in multiple segments...
-------------
Segment 1 of 3:
Try 0
Try 1
Try 2
Try 3
Segment successful!
-------------
Segment 2 of 3:
Try 0
Segment successful!
-------------
Segment 3 of 3:
Try 0
Try 1
Try 2
Try 3
Segment successful!
-------------
Success!
District II ready
-------------
-------------
Downloading nodes for III. district...
The task to be done in multiple segments...
-------------
Segment 1 of 2:
Try 0
Segment successful!
-------------
Segment 2 of 2:
Try 0
Try 1
Try 2
Try 3
Segment successful!
-------------
Success!
District III ready
-------------
-------------
Downloading nodes for IV. district...
Try 0
Success!
District IV ready
-------------
-------------
Do

#### MAP NODES TO WAYS

In [46]:
# Flatten the per-segment responses of each district into one list of node elements.
# NOTE: this rebinds nodes_resp, so the cell expects the raw responses as input.
nodes_resp = {
    ker: [element for segment in segments for element in segment['elements']]
    for ker, segments in nodes_resp.items()
}

In [47]:
# Group each district's node elements by the way they belong to.
# The node ids of a way are turned into a set once, so every membership test is
# a hash lookup instead of a scan of the way's id list for each node.
nodes_resp_byways = {
    ker: {
        way: [node for node in nodes_resp[ker] if node['id'] in way_node_ids]
        for way, way_node_ids in ((way, set(ids)) for way, ids in nodes_border[ker].items())
    }
    for ker in nodes_resp.keys()
}

#### SORT NODES USING THE ALGORITHM

In [50]:
# Build the ordered border records of every district (progress bar over districts).
sorted_nodes = {
    ker: get_border_of_district(ker, nodes_resp, nodes_resp_byways)
    for ker in tqdm(nodes_resp.keys())
}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/23 [00:00<?, ?it/s]

In [51]:
# Concatenate the per-district record lists into one flat list, in district order.
all_coords = []
for ker, district_coords in sorted_nodes.items():
    all_coords.extend(district_coords)

#### WRITE TO FILE

In [53]:
# Persist the collected coordinates via the notebook's JSON writer helper.
json_to_file(all_coords, 'data/district_coords.txt')