# Downloading routes that pass through a port's coordinates

Until now our route search was quite imprecise since it was based on searching for a port name on *Wikiloc*. Now that we have the exact coordinates for most ports it's quite possible to use the site's *'search routes that pass through here'* option and find routes that pass **exactly** through each port. 

Let's get to it.

## Preparing our setup

For this purpose we will need to load our usual libraries, import a few functions and load the dataframe of port coordinates.

In [3]:
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import folium
import re
import pathlib
import os
import gpxpy
import gpxpy.gpx
import time
from pathlib import Path
import os

In [4]:
df = pd.read_csv('master_df_puertos_coords.csv')

In [2]:
driver = webdriver.Chrome()
driver.get('https://es.wikiloc.com/wikiloc/start.do')

In [131]:
#Navigating to the search section with our "road bike" filter activated.

def filtro():
    search_bar = driver.find_element_by_xpath('/html/body/header/nav/div/div[1]/div/div')
    search_bar.click()
    time.sleep(1)
    search_bar_open = driver.find_element_by_xpath('/html/body/header/nav/div/div[1]/div/div[1]/input')
    search_bar_open.send_keys('bracons')
    time.sleep(1)
    bracons = driver.find_element_by_xpath('/html/body/header/nav/div/div[1]/div/div[2]/div/ul/div')
    bracons.click()
    time.sleep(1)
    actividad = driver.find_element_by_xpath('/html/body/div[1]/div[1]/div[1]/div/div[1]/button')
    actividad.click()
    time.sleep(1)
    selector_actividad = driver.find_element_by_xpath('/html/body/div[1]/div[1]/div[1]/div/div[1]/div/div/div/div[1]/div[1]/div/div[2]/div/div[2]/div')
    selector_actividad.click()
    time.sleep(1)
    aplicar_filtro = driver.find_element_by_xpath('/html/body/div[1]/div[1]/div[1]/div/div[1]/div/div/div/div[2]/button')
    aplicar_filtro.click() #Selecting the "road bike" filter. Now we're all set to scrape any destination.

In [8]:
filtro()

Our previous scrapper function only took into account the search location's name, but this time we will be including coordinates to get a far more accurate result.

# Using our function to scrape all routes

In this section we created a loop that makes use of **Wikiloc**'s function "search routes through this point" to pass our port coordinates and find routes that reach every mountain pass.

In [70]:
#Creating a reduced dataframe to test our function.

df_10 = df.head(10)

In [85]:
#Defining and testing our loop.

start = time.time() #Starting a timer.

lista_rutas = []

for i in range(len(df_10)):
    try:
        time.sleep(0.5)
        url = 'https://es.wikiloc.com/wikiloc/map.do?sw=41.55952720430883%2C2.3658370971679688&ne=41.58575748995264%2C2.415275573730469&act=29&zdp=(' + str(df['long'].iloc[i]) + '%2C' + str(df['lat'].iloc[i]) + '%2C357)&page=1'
        driver.get(url)
        time.sleep(0.5)
        listado_dict = []
        element = driver.find_elements_by_class_name('trail-list__wrapper')
        for n in range(len(element)):
            try:
                element[n].click()
                time.sleep(0.3)
                nombre_ruta = driver.find_elements_by_xpath('//*[@id="front"]/div/div/h1')[0].text
                url = driver.current_url
                trailrank = driver.find_elements_by_xpath('//*[@id="front"]/div/div/a')[0].text
                distancia = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[1]/a/span[2]')[0].text
                desnivel = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[3]/a/span[2]')[0].text
                dificultad = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[7]/a/span[2]')[0].text
                fotos = driver.find_elements_by_class_name('trail-photo')
                photo1 = fotos[0].get_attribute('href')
                photo2 = fotos[1].get_attribute('href')
                photo3 = fotos[2].get_attribute('href')
                driver.back()
                time.sleep(0.3)
                element = driver.find_elements_by_class_name('trail-list__wrapper')
                dict_ruta = {'ubicacion': df_10['puerto'].iloc[i], 'nombre': nombre_ruta, 'trailrank': trailrank,'distancia': distancia, 'desnivel': desnivel,
                             'dificultad': dificultad, 'url': url, 'photo1': photo1, 'photo2': photo2, 'photo3': photo3}
                lista_rutas.append(dict_ruta)
            except:
                pass
                driver.back()
                time.sleep(0.5)
                element = driver.find_elements_by_class_name('trail-list__wrapper')
    except:
        time.sleep(2)
        pass
    
stop = time.time() 
duration = (stop - start) / 60
print('Minutes:', duration)

Minutes: 4.988209561506907


In [86]:
#Creating a dataframe out of the list of routes.

df_rutas = pd.DataFrame(lista_rutas)

In [87]:
df_rutas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ubicacion   114 non-null    object
 1   nombre      114 non-null    object
 2   trailrank   114 non-null    object
 3   distancia   114 non-null    object
 4   desnivel    114 non-null    object
 5   dificultad  114 non-null    object
 6   url         114 non-null    object
 7   photo1      114 non-null    object
 8   photo2      114 non-null    object
 9   photo3      114 non-null    object
dtypes: object(10)
memory usage: 9.0+ KB


We can now scrape all routes. It's going to take a while.

In [88]:
start = time.time() #Starting a timer.

lista_rutas = []

for i in range(len(df)):
    try:
        time.sleep(0.5)
        url = 'https://es.wikiloc.com/wikiloc/map.do?sw=41.55952720430883%2C2.3658370971679688&ne=41.58575748995264%2C2.415275573730469&act=29&zdp=(' + str(df['long'].iloc[i]) + '%2C' + str(df['lat'].iloc[i]) + '%2C357)&page=1'
        driver.get(url)
        time.sleep(0.5)
        listado_dict = []
        element = driver.find_elements_by_class_name('trail-list__wrapper')
        for n in range(len(element)):
            try:
                element[n].click()
                time.sleep(0.3)
                nombre_ruta = driver.find_elements_by_xpath('//*[@id="front"]/div/div/h1')[0].text
                url = driver.current_url
                trailrank = driver.find_elements_by_xpath('//*[@id="front"]/div/div/a')[0].text
                distancia = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[1]/a/span[2]')[0].text
                desnivel = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[3]/a/span[2]')[0].text
                dificultad = driver.find_elements_by_xpath('//*[@id="trail-data"]/div[1]/div[7]/a/span[2]')[0].text
                fotos = driver.find_elements_by_class_name('trail-photo')
                photo1 = fotos[0].get_attribute('href')
                photo2 = fotos[1].get_attribute('href')
                photo3 = fotos[2].get_attribute('href')
                driver.back()
                time.sleep(0.3)
                element = driver.find_elements_by_class_name('trail-list__wrapper')
                dict_ruta = {'ubicacion': df['puerto'].iloc[i], 'nombre': nombre_ruta, 'trailrank': trailrank,'distancia': distancia, 'desnivel': desnivel,
                             'dificultad': dificultad, 'url': url, 'photo1': photo1, 'photo2': photo2, 'photo3': photo3}
                lista_rutas.append(dict_ruta)
            except:
                pass
                driver.back()
                time.sleep(0.5)
                element = driver.find_elements_by_class_name('trail-list__wrapper')
    except:
        time.sleep(2)
        pass
    
stop = time.time() 
duration = (stop - start) / 60
print('Minutes:', duration)

Minutes: 552.156865421931


In [89]:
df_rutas = pd.DataFrame(lista_rutas)
df_rutas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8319 entries, 0 to 8318
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ubicacion   8319 non-null   object
 1   nombre      8319 non-null   object
 2   trailrank   8319 non-null   object
 3   distancia   8319 non-null   object
 4   desnivel    8319 non-null   object
 5   dificultad  8319 non-null   object
 6   url         8319 non-null   object
 7   photo1      8319 non-null   object
 8   photo2      8319 non-null   object
 9   photo3      8319 non-null   object
dtypes: object(10)
memory usage: 650.0+ KB


In [90]:
#Saving the resulting dataframe.

df_rutas.to_csv('df_extra_routes.csv', index=False)

In [3]:
#Adding favs function.

t = 0.4 #This will set the wait between actions. If you have a good connection you can keep it between 0.3-0.5. 
        #Some trial and error might be necessary.

def fav(url_list):
    start = time.time() #Starting a counter to time our code.
    for i in url_list: #url_list will be the list of the route's urls (max 1000 and no duplicates, VERY important).
        try:
            driver.get(i) #Accessing the url.
            time.sleep(t) 
            driver.find_element_by_xpath('//*[@id="container"]/a').click() #Clicking on the 'Add to favorites' item.
            time.sleep(t)
            driver.find_element_by_xpath('//*[@id="container"]/div/div/div/div[3]/div[1]').click() #Marking the route as fav.
            time.sleep(t)
        except:
            time.sleep(1) 
            pass
    stop = time.time() #Stopping our timer.
    duration = (stop - start) / 60 #Calculating the elapsed minutes.
    print(len(url_list), 'favs added in', duration, 'minutes.')

In [92]:
url_list = df_rutas['url'].tolist()

In [93]:
fav(url_list[:1000])

1000 favs added in 21.04311106602351 minutes.


# Checking for missing routes

First of all we must delete duplicate routes.

In [39]:
df = pd.read_csv('df_extra_routes.csv')

In [40]:
df.sort_values(by='trailrank', ascending=False, inplace=True)

In [41]:
df.drop_duplicates(subset=['url'], keep='first', inplace=True)

In [42]:
df.head()

Unnamed: 0,ubicacion,nombre,trailrank,distancia,desnivel,dificultad,url,photo1,photo2,photo3
1477,Mirador De La Cabra Montés,La Cabra por Almuñecar - 8-11-2012 - Granada,95,"159,35 km",2.296 m,Moderado,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...
7704,Blancares,"Granada, Guadix y vuelta por Lugros - 5-06-2014",93,"140,26 km",1.832 m,Moderado,https://es.wikiloc.com/rutas-ciclismo/granada-...,https://es.wikiloc.com/rutas-ciclismo/granada-...,https://es.wikiloc.com/rutas-ciclismo/granada-...,https://es.wikiloc.com/rutas-ciclismo/granada-...
1478,Mirador De La Cabra Montés,La Cabra y vuelta por Guajares - Granada 24-05...,93,"157,12 km",2.692 m,Difícil,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...,https://es.wikiloc.com/rutas-ciclismo/la-cabra...
7705,Blancares,El Mendrugo - 15-02-2015,93,"119,08 km",1.104 m,Moderado,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...
7706,Blancares,"El Mendrugo. Pto.Blancares, La Peza, Darro, Mo...",92,"126,83 km",1.242 m,Moderado,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...,https://es.wikiloc.com/rutas-ciclismo/el-mendr...


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5263 entries, 1477 to 1846
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ubicacion   5263 non-null   object
 1   nombre      5263 non-null   object
 2   trailrank   5263 non-null   int64 
 3   distancia   5263 non-null   object
 4   desnivel    5263 non-null   object
 5   dificultad  5263 non-null   object
 6   url         5263 non-null   object
 7   photo1      5263 non-null   object
 8   photo2      5263 non-null   object
 9   photo3      5263 non-null   object
dtypes: int64(1), object(9)
memory usage: 452.3+ KB


Performing the necessary cleaning.

In [44]:
df['distancia'] = df['distancia'].str.replace(" km", "")
df['distancia'] = df['distancia'].str.replace(",", ".")
df['desnivel'] = df['desnivel'].str.replace(" m", "")
df['desnivel'] = df['desnivel'].str.replace(".", "")

  df['desnivel'] = df['desnivel'].str.replace(".", "")


In [45]:
#Adding the alpha name column.

def stripper(name):
    return re.sub(r'\W+', '', name)[:20]

df['alpha_name'] = df['nombre'].apply(stripper)

In [52]:
#Renaming every gpx file with the new shortened and stripped name, matching our new df column.

for path in pathlib.Path("/gpx").iterdir(): #Using iterdir to iterate through every file in our gpx folder. 
    try:
        if path.is_file():
            old_name = path.stem
            old_extension = path.suffix
            directory = path.parent
            strip = re.sub(r'\W+', '', old_name) #Stripping the name using the same regex as before.
            new_name = strip[:20] + old_extension #Only keeping the first 20 characters.
            path.rename(pathlib.Path(directory, new_name)) #Renaming the file.
    except:
        pass

In [26]:
#Creating a list of our shortened alphanumeric names.

alpha_list = df['alpha_name'].tolist()
len(alpha_list)

5263

In [29]:
#Now that our dataframe and gpx files match we can easily check for missing entries.

missing_routes = [] #This list will store our missing filenames.
ok_routes = [] #Successful downloads.

for i in alpha_list:
    gpx_path = 'gpx/' + i + '.gpx'
    isExist = os.path.exists(gpx_path) #Checking if the file with the given filename exists.
    if isExist == True:
        ok_routes.append(i)
        pass
    else:
        missing_routes.append(i)

In [34]:
len(missing_routes)

975

In [35]:
#Creating a list of missing routes url.

missing_list = []

for i in range(1138):
    if df['alpha_name'].iloc[i] in missing_routes:
        missing_list.append(df['url'].iloc[i])

In [36]:
len(missing_list)

102

In [None]:
fav(missing_list)

## Parsing all gpx files

Now that we have all files it's time to parse them so that we can extract valuable data and further analyze them. For this purpose we will be using our previously created parser function.

This function takes a given gpx file and returns a dictionary of the most important values, thus making it easy to convert a list of such dictionaries into a dataframe.

In [2]:
def parser(file):
    try:
        gpx_file = open(file, 'r', encoding='utf-8') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        alt = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
                    alt.append(point.elevation)
        parsed_file = {'name': track.name, 
                       'coords': [coords], 
                       'alt': [alt], 
                       'distance': track.length_3d()/1000, 
                       'climb': int(gpx.get_uphill_downhill()[0]),
                       'min_alt': int(gpx.get_elevation_extremes()[0]),
                       'max_alt': int(gpx.get_elevation_extremes()[1]),
                       'start': coords[0][0],
                       's_lo': coords[0][1],
                       'f_la': coords[-1][0],
                       'f_lo': coords[-1][1],}
        return parsed_file
    except:
        pass

In [23]:
start = time.time()

dict_list = []

directory = 'gpx' #Our very original gpx folder name.
 
files = Path(directory).glob('*')
for file in files:
    dict_list.append(parser(file))
    
stop = time.time() 
duration = (stop - start) / 60
print('Minutes:', duration)

Minutes: 18.890571089585624


In [24]:
len(dict_list)

1528

In [25]:
df9 = pd.DataFrame(dict_list)

In [26]:
df9.to_csv('gpx_9.csv', index=False)

In [33]:
df1 =pd.read_csv('gpx_1.csv')
df2 =pd.read_csv('gpx_2.csv')
df3 =pd.read_csv('gpx_3.csv')
df4 =pd.read_csv('gpx_4.csv')
df5 =pd.read_csv('gpx_5.csv')
df6 =pd.read_csv('gpx_6.csv')
df7 =pd.read_csv('gpx_7.csv')
df8 =pd.read_csv('gpx_8.csv')
df9 =pd.read_csv('gpx_9.csv')

In [34]:
df_parsed = pd.concat([df1, df2, df3, df4, df5, df8, df7, df8, df9])

In [37]:
df_parsed.drop_duplicates('name', keep='first', inplace=True)

In [38]:
df_parsed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14137 entries, 0 to 1527
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      14137 non-null  object 
 1   coords    14137 non-null  object 
 2   alt       14137 non-null  object 
 3   distance  14137 non-null  float64
 4   climb     14137 non-null  float64
 5   min_alt   14137 non-null  float64
 6   max_alt   14137 non-null  float64
 7   s_la      14137 non-null  float64
 8   s_lo      14137 non-null  float64
 9   f_la      14137 non-null  float64
 10  f_lo      14137 non-null  float64
dtypes: float64(8), object(3)
memory usage: 1.3+ MB


In [39]:
df_parsed.to_csv('master_parsed_all.csv', index=False)

In [46]:
df_parsed.head()

Unnamed: 0,name,coords,alt,distance,climb,min_alt,max_alt,s_la,s_lo,f_la,f_lo
0,010-19. Alcoceber. Torreblanca. Vilanova. Benl...,"[[(40.255667, 0.28779), (40.255619, 0.287881),...","[[21.778, 21.133, 20.726, 21.508, 21.45, 21.34...",73.450869,848.666,5.407,365.372,40.255667,0.28779,40.255109,0.287932
1,01: Alfarrás (Lleida) a Huesca,"[[(41.832578, 0.566965), (41.832578, 0.566965)...","[[300.255, 299.807, 299.617, 299.608, 299.647,...",103.598595,961.8401,255.286,486.425,41.832578,0.566965,42.141745,-0.411256
2,01-Madrid - Motilla del Palancar,"[[(40.39467, -3.67912), (40.39546, -3.67998), ...","[[592.065, 597.068, 596.014, 597.008, 598.067,...",230.130901,2158.2991,544.081,976.08,40.39467,-3.67912,39.561042,-1.906754
3,01-MAY-16 ALMÁCERA-BÉTERA-OLOCAU-GÁTOVA-ALTO D...,"[[(39.510125, -0.355943), (39.510517, -0.35574...","[[-79.616, -79.676, -79.613, -79.208, -79.662,...",117.58934,1584.8682,-81.149,729.381,39.510125,-0.355943,39.510162,-0.355961
4,01-MAY TROFEO LACTURALE Juveniles,"[[(42.921524, -1.846321), (42.92144, -1.846497...","[[468.335, 468.386, 467.814, 466.407, 466.974,...",67.951624,1339.3475,437.554,800.081,42.921524,-1.846321,42.949894,-1.835245


## 3.1 Merging the parsed gpx dataframe with the routes dataframe

In [40]:
#Loading our routes dataframe.

df_rutas = pd.read_csv('master_rutas_final.csv')

In [43]:
#Merging the parsed gpx files with the corresponding routes to create a final dataframe.

df_merge = pd.merge(df_rutas, df_parsed, how='inner', left_on=['nombre'], right_on=['name'])

In [45]:
#We now have 9171 parsed routes, but some of them will be outside Spain and will have to be discarded.

df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9171 entries, 0 to 9170
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ubicacion   9171 non-null   object 
 1   nombre      9171 non-null   object 
 2   trailrank   9171 non-null   int64  
 3   distancia   9171 non-null   float64
 4   desnivel    9171 non-null   int64  
 5   dificultad  9171 non-null   object 
 6   url         9171 non-null   object 
 7   photo1      9171 non-null   object 
 8   photo2      9171 non-null   object 
 9   photo3      9171 non-null   object 
 10  alpha_name  9170 non-null   object 
 11  name        9171 non-null   object 
 12  coords      9171 non-null   object 
 13  alt         9171 non-null   object 
 14  distance    9171 non-null   float64
 15  climb       9171 non-null   float64
 16  min_alt     9171 non-null   float64
 17  max_alt     9171 non-null   float64
 18  s_la        9171 non-null   float64
 19  s_lo        9171 non-null  

**<div align="right">Ironhack DA PT 2021</div>**
    
**<div align="right">Xavier Esteban</div>**