In [134]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm

# Load data

In [124]:
df_train = pd.read_csv('data/train.csv')
df_train.head()

Unnamed: 0,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir,rerata_kecepatan
0,2020-02-01 01:00:00+00:00,691007296,21390008,1425033102,29.126
1,2020-02-01 01:00:00+00:00,47010584,1677092762,579493410,46.576
2,2020-02-01 01:00:00+00:00,22932408,26486694,1930267566,36.587
3,2020-02-01 01:00:00+00:00,142479648,1111592522,3775231113,34.063
4,2020-02-01 01:00:00+00:00,8504977,5940503398,5940503394,38.336


In [125]:
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,id,waktu_setempat,id_jalan,id_titik_mulai,id_titik_akhir
0,0,2020-02-23 00:00:00+00:00,4004732,32046542,6454026544
1,1,2020-02-23 00:00:00+00:00,182210371,1314925464,1314925496
2,2,2020-02-23 00:00:00+00:00,22932408,1482086782,26481020
3,3,2020-02-23 00:00:00+00:00,182210371,3892883,267337489
4,4,2020-02-23 00:00:00+00:00,66924592,266041030,2592978110


# OSM API

In [126]:
from OSMPythonTools.api import Api

In [127]:
unique_jalan_train = df_train['id_jalan'].unique()
unique_jalan_test = df_test['id_jalan'].unique()
unique_jalan = np.concatenate((unique_jalan_train, unique_jalan_test), axis=0)
unique_jalan = np.unique(unique_jalan, axis=0)

In [128]:
unique_mulai_train = df_train['id_titik_mulai'].unique()
unique_mulai_test = df_test['id_titik_mulai'].unique()
unique_akhir_train = df_train['id_titik_akhir'].unique()
unique_akhir_test = df_test['id_titik_akhir'].unique()
unique_mulai = np.concatenate((unique_mulai_train, unique_mulai_test), axis=0)
unique_mulai = np.unique(unique_mulai, axis=0)
unique_akhir = np.concatenate((unique_akhir_train, unique_akhir_test), axis=0)
unique_akhir = np.unique(unique_akhir, axis=0)

In [140]:
# Testing result
api = Api()
way = api.query('way/66924592')
way.tags()

{'bicycle': 'yes',
 'class:bicycle:commute': '1',
 'cycleway': 'share_busway',
 'foot': 'yes',
 'highway': 'trunk',
 'horse': 'yes',
 'lit': 'yes',
 'maxspeed': '30 mph',
 'name': 'Brixton Road',
 'oneway': 'no',
 'operator': 'Transport for London',
 'ref': 'A23',
 'sidewalk': 'both',
 'surface': 'asphalt'}

In [130]:
result_jalan = {'id_jalan': unique_jalan,
                'api_result': []}
api = Api()
# new_col_jalan = ['lanes', 'lanes:forward', 'lit', 'maxspeed']
new_col_jalan = ['cycleway', 'highway','lanes', 'lanes:forward', 'lit', 'maxspeed', 'name', 'operator', 'ref', 'sidewalk', 'surface', 'turn:lanes:forward']
for id_jalan in unique_jalan:
    way = api.query(f'way/{id_jalan}')
    api_result = {} 
    for col in new_col_jalan:
        api_result[col] = way.tag(col)
    result_jalan['api_result'].append(api_result)


In [131]:
result_jalan['api_result']

[{'cycleway': None,
  'highway': 'primary',
  'lanes': None,
  'lanes:forward': None,
  'lit': 'yes',
  'maxspeed': '30 mph',
  'name': 'High Road',
  'operator': None,
  'ref': 'A1000',
  'sidewalk': None,
  'surface': None,
  'turn:lanes:forward': None},
 {'cycleway': None,
  'highway': 'trunk',
  'lanes': '3',
  'lanes:forward': '1',
  'lit': 'yes',
  'maxspeed': '30 mph',
  'name': 'Roehampton Lane',
  'operator': 'Transport for London',
  'ref': 'A306',
  'sidewalk': None,
  'surface': None,
  'turn:lanes:forward': None},
 {'cycleway': None,
  'highway': 'trunk',
  'lanes': None,
  'lanes:forward': None,
  'lit': 'yes',
  'maxspeed': '30 mph',
  'name': 'Upper Richmond Road',
  'operator': 'Transport for London',
  'ref': 'A205',
  'sidewalk': 'both',
  'surface': None,
  'turn:lanes:forward': None},
 {'cycleway': None,
  'highway': 'trunk',
  'lanes': '2',
  'lanes:forward': None,
  'lit': 'yes',
  'maxspeed': '30 mph',
  'name': 'Upper Richmond Road West',
  'operator': 'Transpo

In [33]:
# Testing result
node = api.query('node/21390008') # idx=0, id_titik mulai
print(node.lat())
print(node.lon())

51.434928
-0.1611764


Latest Latitude: 51.4502291
Latest Longitude: -0.1225601


In [50]:
import xml.etree.ElementTree as ET
import requests

def manual_extract(node):
    url = f'https://www.openstreetmap.org/api/0.6/node/{node}/history'
    r = requests.get(url)
    root = ET.fromstring(r.text)

    # Find all nodes with the specified id
    nodes = root.findall(f'.//node[@id="{node}"]')

    # Initialize variables to hold latitude and longitude
    latest_lat = None
    latest_lon = None

    # Iterate through the nodes in reverse order (latest version first)
    for node in reversed(nodes):
        lat = node.get('lat')
        lon = node.get('lon')
        
        if lat and lon:
            latest_lat = lat
            latest_lon = lon
            break  # Stop when the first valid lat and lon are found

    # If no valid lat and lon are found in the latest version, use the previous version's data
    if latest_lat is None or latest_lon is None:
        for node in reversed(nodes):
            lat = node.get('lat')
            lon = node.get('lon')
            if lat and lon:
                latest_lat = lat
                latest_lon = lon
                break  # Stop when the first valid lat and lon are found
    return latest_lat, latest_lon

    

In [51]:
result_mulai = {'id_titik_mulai': unique_mulai,
                'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_mulai:
    try:
        node = api.query(f'node/{id_titik}')
    except:
        lat, lon = manual_extract(id_titik)
    
    api_result = {'lat': node.lat(),
                  'lon': node.lon()} 
    result_mulai['api_result'].append(api_result)


[api] downloading data: node/227758


The requested data could not be downloaded. HTTP Error 410: Gone
Traceback (most recent call last):
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/site-packages/OSMPythonTools/internal/cacheObject.py", line 95, in __query
    response = urllib.request.urlopen(request)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 557, in error
    result = self._call_chain(*args)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 749, in http

In [52]:
result_mulai['api_result']

[{'lat': 51.5356116, 'lon': -0.1470438},
 {'lat': 51.5276777, 'lon': -0.1443142},
 {'lat': 51.5256574, 'lon': -0.1442502},
 {'lat': 51.6398113, 'lon': -0.1802294},
 {'lat': 51.6390703, 'lon': -0.1798086},
 {'lat': 51.6357453, 'lon': -0.1767753},
 {'lat': 51.633793, 'lon': -0.1759105},
 {'lat': 51.6326856, 'lon': -0.1756549},
 {'lat': 51.6276102, 'lon': -0.1748049},
 {'lat': 51.6265808, 'lon': -0.1755074},
 {'lat': 51.6265402, 'lon': -0.1755367},
 {'lat': 51.6258079, 'lon': -0.1759228},
 {'lat': 51.6235299, 'lon': -0.1764571},
 {'lat': 51.6226864, 'lon': -0.1765291},
 {'lat': 51.6221812, 'lon': -0.1765521},
 {'lat': 51.621787, 'lon': -0.1765761},
 {'lat': 51.6209294, 'lon': -0.1766224},
 {'lat': 51.6201206, 'lon': -0.1766651},
 {'lat': 51.6192148, 'lon': -0.1767097},
 {'lat': 51.6183077, 'lon': -0.1767455},
 {'lat': 51.6173225, 'lon': -0.1767883},
 {'lat': 51.5892032, 'lon': -0.1998053},
 {'lat': 51.5822067, 'lon': -0.1991985},
 {'lat': 51.5803873, 'lon': -0.1984375},
 {'lat': 51.638376

In [53]:
result_akhir = {'id_titik_akhir': unique_akhir,
                'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_akhir:
    try:
        node = api.query(f'node/{id_titik}')
    except:
        lat, lon = manual_extract(id_titik)
    
    api_result = {'lat': node.lat(),
                  'lon': node.lon()} 
    result_akhir['api_result'].append(api_result)


[api] downloading data: node/227758


The requested data could not be downloaded. HTTP Error 410: Gone
Traceback (most recent call last):
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/site-packages/OSMPythonTools/internal/cacheObject.py", line 95, in __query
    response = urllib.request.urlopen(request)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 557, in error
    result = self._call_chain(*args)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/Users/mdaniyalk/miniforge3/lib/python3.10/urllib/request.py", line 749, in http

In [66]:
result_akhir['api_result'][0]['lat']

51.5356116

# MapQuest API

In [55]:
unique_mulai_akhir_train = np.unique(df_train[['id_titik_mulai', 'id_titik_akhir']].values, axis=0)
unique_mulai_akhir_test = np.unique(df_test[['id_titik_mulai', 'id_titik_akhir']].values, axis=0)
unique_mulai_akhir = np.concatenate((unique_mulai_akhir_train, unique_mulai_akhir_test), axis=0)
unique_mulai_akhir = np.unique(unique_mulai_akhir, axis=0)


In [105]:
np.array([21390008, 1425033102]) in unique_mulai_akhir

True

In [60]:
import json

def distance(start_lat, start_lon, end_lat, end_lon):
    key = 'FVgErOkhYoJmjdsUrldVi9nkrCrGKuWm'
    url = f'https://www.mapquestapi.com/directions/v2/route?key={key}&from={start_lat},{start_lon}&to={end_lat},{end_lon}'
    r = requests.get(url)
    data = r.json()
    dist = data['route']['distance']
    return dist

In [89]:
distance(51.4651822,-0.2552399,51.4652044,-0.2545952)

0.0311

In [74]:
np.where(result_akhir['id_titik_akhir'] == id_titik[1])[0].sum()

316

In [119]:
result_distance = {'id_titik_mulai_akhir': unique_mulai_akhir,
                'api_result': []}
api = Api()
new_col_jalan = ['lat', 'lon']
for id_titik in unique_mulai_akhir:
    idx_mulai = np.where(result_mulai['id_titik_mulai'] == id_titik[0])[0].sum()
    idx_akhir = np.where(result_akhir['id_titik_akhir'] == id_titik[1])[0].sum()
    dist = distance(result_mulai['api_result'][idx_mulai]['lat'], result_mulai['api_result'][idx_mulai]['lon'], result_akhir['api_result'][idx_akhir]['lat'], result_akhir['api_result'][idx_akhir]['lon'])
    api_result = {'distance': dist} 
    result_distance['api_result'].append(api_result)


Export to CSV

In [120]:
result_distance['api_result']

[{'distance': 0.0336},
 {'distance': 0.0466},
 {'distance': 0.0547},
 {'distance': 0.0267},
 {'distance': 0.2871},
 {'distance': 0.023},
 {'distance': 0.0491},
 {'distance': 0.0298},
 {'distance': 0.0249},
 {'distance': 0.0273},
 {'distance': 0.0137},
 {'distance': 0.0087},
 {'distance': 0.0329},
 {'distance': 0.0833},
 {'distance': 0.0597},
 {'distance': 0},
 {'distance': 0.0255},
 {'distance': 0},
 {'distance': 0.0584},
 {'distance': 0.0652},
 {'distance': 0.0565},
 {'distance': 0},
 {'distance': 0.0093},
 {'distance': 0.0087},
 {'distance': 0.0565},
 {'distance': 0.0267},
 {'distance': 0.0106},
 {'distance': 0.0112},
 {'distance': 0.0186},
 {'distance': 0.0447},
 {'distance': 0},
 {'distance': 0.0491},
 {'distance': 0.0043},
 {'distance': 0.0118},
 {'distance': 0.0423},
 {'distance': 0.0106},
 {'distance': 0.0199},
 {'distance': 0.0603},
 {'distance': 2.5352},
 {'distance': 0.0261},
 {'distance': 0.0211},
 {'distance': 0.0087},
 {'distance': 0.0702},
 {'distance': 0.0186},
 {'distan

In [107]:
id_distance = np.array([21390008, 1425033102])
# np.where(result_distance['id_titik_mulai_akhir'] == id_distance)
[index for index, value in enumerate(unique_mulai_akhir) if value in id_distance]


[185, 186, 222]

In [117]:
array1 = np.array([21390008, 1425033102])
array2 = np.array([21390008, 1425033102])
np.array_equal(array1, array2)

False

In [114]:
if (np.array([21390008, 1425033102]) - np.array([21390008, 1425033102]) == np.array([0, 0])):
    print('True')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [121]:
def find_idx_dist(id_titik, unique_val):
    for idx, val in enumerate(unique_val):
        if np.array_equal(val, id_titik):
            return idx
            

In [135]:
def add_data(df, filename):
    # new_col = ['lanes', 'lanes:forward', 'lit', 'maxspeed', 'mulai_lat', 'mulai_lon', 'akhir_lat', 'akhir_lon', 'distance']
    new_col = ['cycleway', 'highway','lanes', 'lanes:forward', 'lit', 'maxspeed', 'name', 'operator', 'ref', 'sidewalk', 'surface', 'turn:lanes:forward', 'mulai_lat', 'mulai_lon', 'akhir_lat', 'akhir_lon', 'distance']
    additional_data = {'cycleway':[], 'highway':[],'lanes':[], 'lanes:forward':[], 'lit':[], 'maxspeed':[], 'name':[], 'operator':[], 'ref':[], 'sidewalk':[], 'surface':[], 'turn:lanes:forward':[], 'mulai_lat':[], 'mulai_lon':[], 'akhir_lat':[], 'akhir_lon':[], 'distance':[]}
    for index, row in tqdm(df.iterrows()):
        id_jalan = row['id_jalan']
        idx_jalan = np.where(result_jalan['id_jalan'] == id_jalan)[0].sum()

        cycleway, highway, name, operator = (result_jalan['api_result'][idx_jalan]['cycleway'], result_jalan['api_result'][idx_jalan]['highway'], result_jalan['api_result'][idx_jalan]['name'], result_jalan['api_result'][idx_jalan]['operator'])
        lanes, lanesforward, lit, maxspeed = (result_jalan['api_result'][idx_jalan]['lanes'], result_jalan['api_result'][idx_jalan]['lanes:forward'], result_jalan['api_result'][idx_jalan]['lit'], result_jalan['api_result'][idx_jalan]['maxspeed'])
        ref, sidewalk, surface, turnlanesforward = (result_jalan['api_result'][idx_jalan]['ref'], result_jalan['api_result'][idx_jalan]['sidewalk'], result_jalan['api_result'][idx_jalan]['surface'], result_jalan['api_result'][idx_jalan]['turn:lanes:forward'])

        id_titik_mulai = row['id_titik_mulai']
        id_titik_akhir = row['id_titik_akhir']

        idx_mulai = np.where(result_mulai['id_titik_mulai'] == id_titik_mulai)[0].sum()
        idx_akhir = np.where(result_akhir['id_titik_akhir'] == id_titik_akhir)[0].sum()

        mulai_lat, mulai_lon = (result_mulai['api_result'][idx_mulai]['lat'], result_mulai['api_result'][idx_mulai]['lon'])
        akhir_lat, akhir_lon = (result_akhir['api_result'][idx_akhir]['lat'], result_akhir['api_result'][idx_akhir]['lon'])

        id_distance = np.array([row['id_titik_mulai'], row['id_titik_akhir']])
        idx_distance = find_idx_dist(id_distance, unique_mulai_akhir)
        distance = result_distance['api_result'][idx_distance]['distance']

        additional_data['cycleway'].append(cycleway)
        additional_data['highway'].append(highway)
        additional_data['name'].append(name)
        additional_data['operator'].append(operator)
        additional_data['lanes'].append(lanes)
        additional_data['lanes:forward'].append(lanesforward)
        additional_data['lit'].append(lit)
        additional_data['maxspeed'].append(maxspeed)
        additional_data['ref'].append(ref)
        additional_data['sidewalk'].append(sidewalk)
        additional_data['surface'].append(surface)
        additional_data['turn:lanes:forward'].append(turnlanesforward)
        
        additional_data['mulai_lat'].append(mulai_lat)
        additional_data['mulai_lon'].append(mulai_lon)
        additional_data['akhir_lat'].append(akhir_lat)
        additional_data['akhir_lon'].append(akhir_lon)
        additional_data['distance'].append(distance)
    for col in new_col:
        df[col] = additional_data[col]
    df.to_csv(filename, index=False)

In [136]:
add_data(df_train, 'new_train-2.csv')
add_data(df_test, 'new_test-2.csv')

0it [00:00, ?it/s]

398648it [05:13, 1272.71it/s]
127489it [01:37, 1303.23it/s]
