In [608]:
import json
import requests
from lxml import etree
import time

In [457]:
# Root URL of the per-hike pages on wta.org; functions in this notebook append a hike id to it.
BASE_URL = 'https://www.wta.org/go-hiking/hikes/' # base + <id> for individual page

In [642]:
def retrieve_hike_urls(timeout=30):
    """Download the WTA trailhead listing and attach a page URL to every hike.

    Args:
        timeout: Seconds to wait for the server before `requests` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded listing dict. Its 'start' key is removed, and every entry
        of its 'data' list loses its 'm' key and gains a 'url' key built from
        BASE_URL and the entry's 'id'.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    HIKE_LIST_URL = 'https://www.wta.org/go-hiking/@@trailhead-text-search?jsonp_callback=&query=&start=0&num=9999&_=1629843008980'
    response = requests.get(HIKE_LIST_URL, timeout=timeout)
    # Fail loudly on an HTTP error instead of feeding an error page to the JSON parser.
    response.raise_for_status()
    # The first and last characters are not part of the JSON document
    # (presumably the parentheses of the empty-named JSONP callback -- confirm).
    data = json.loads(response.text[1:-1])
    # pop() with a default tolerates a response that omits these optional keys.
    data.pop('start', None)
    for hike in data['data']:
        hike.pop('m', None)
        hike['url'] = BASE_URL + hike['id']
    return data

In [639]:
def retrieve_and_save_hike_urls(filename):
    """Fetch the hike listing via retrieve_hike_urls() and write it to `filename` as JSON."""
    listing = retrieve_hike_urls()
    with open(filename, 'w+') as out:
        # Serialise straight into the open handle.
        json.dump(listing, out)

In [636]:
def load_hike_urls(filename):
    """Read the JSON document stored in `filename` and return the decoded object."""
    with open(filename) as source:
        listing = json.load(source)
    return listing

In [635]:
def retrieve_hike_html(hike_id, delay=1, timeout=30):
    """Download the HTML of one hike page.

    Args:
        hike_id: Identifier appended to BASE_URL to form the page address.
        delay: Seconds to sleep before the request, so that consecutive calls
            are spaced out. Defaults to the original one second.
        timeout: Seconds to wait for the server before `requests` gives up.
            Without it a single stalled connection would hang a whole scrape.

    Returns:
        The response body as text. The HTTP status is not checked, so the body
        of an error page is returned as-is.

    Raises:
        requests.Timeout: If the server does not answer within `timeout`.
    """
    time.sleep(delay)
    response = requests.get(BASE_URL + hike_id, timeout=timeout)
    return response.text

In [684]:
def save_all_hikes(filename, hike_list_filename='wta_trail_list.json'):
    """Scrape every hike in a saved trail list and write the results as JSON.

    Args:
        filename: Path of the JSON file that receives the list of hike details.
        hike_list_filename: Path of the trail list to read, in the format
            written by retrieve_and_save_hike_urls(). Defaults to the file name
            that used to be hard-coded here.

    Progress is printed for each hike. The output file is only written after
    every hike has been processed.
    """
    all_hike_urls = load_hike_urls(hike_list_filename)['data']
    total = len(all_hike_urls)  # loop-invariant, so computed once
    hikes = []
    for position, hike in enumerate(all_hike_urls, start=1):
        print(str(position) + ' out of ' + str(total))
        print('(' + hike['name'] + ')')
        hikes.append(extract_details(hike['id']))
    with open(filename, 'w') as file:
        json.dump(hikes, file)

In [686]:
def extract_details(hike_id, include_trip_reports=False):
    """Scrape one hike page and collect its details in a dict.

    Every field is optional: when the markup a field is read from is missing,
    the corresponding key is left out of the result.

    Args:
        hike_id: Identifier of the hike; passed to retrieve_hike_html().
        include_trip_reports: When true, also download every trip report
            linked from the hike's report listing and store them under
            'trip_reports'. This costs one request (and a one second pause)
            per listing page and per report, so it is off by default.

    Returns:
        A dict that always contains 'trail_id', 'features' and
        'hike_description', plus whichever of 'alerts', 'last_maintained',
        'name', 'distance', 'elevation', 'rating', 'location',
        'trailhead_coordinates', 'wta_author', 'driving_directions' and
        'trip_reports' could be extracted.
    """
    hike_html = etree.HTML(retrieve_hike_html(hike_id))

    details = {
        'trail_id' : hike_id
    }

    def put_text(key, path, index=0):
        # Store the text of the index-th element matched by `path`; leave the
        # key out when the page has too few matches.
        try:
            details[key] = hike_html.xpath(path)[index].text
        except IndexError:
            pass

    try:
        alerts = []
        for alert in hike_html.xpath('//div[@id="hike-top"]/div[@class="alerts-and-conditions"]/div'):
            if 'red' in alert.attrib['class']:
                # "red" alerts carry their text in the first child element.
                alerts.append({
                    'type' : 'red',
                    'text' : alert.getchildren()[0].text
                })
            else:
                # Every other alert is recorded as a parking alert, with its
                # text taken from the second child element.
                alerts.append({
                    'type' : 'parking',
                    'text' : alert.getchildren()[1].text
                })

        details['alerts'] = alerts
    except IndexError:
        # An alert without the expected children: skip the whole 'alerts' key.
        pass

    put_text('last_maintained', '//div[@class="last-maintained"]/div')
    put_text('name', '//div[@id="hike-top"]/h1')

    # Feature divs without a data-title attribute are skipped.
    details['features'] = [
        feature.attrib['data-title']
        for feature in hike_html.xpath('//div[@id="hike-features"]/div')
        if 'data-title' in feature.attrib
    ]

    put_text('distance', '//div[@id="distance"]/span')

    try:
        elevation = hike_html.xpath('//div[@class="hike-stat"]/div/span')
        details['elevation'] = {
            'gain' : elevation[0].text,
            'highest_point' : elevation[1].text
        }
    except IndexError:
        pass

    try:
        details['rating'] = {
            'stars' : hike_html.xpath('//div[@class="current-rating"]')[0].text,
            # Drop the first character of the stripped text and keep the first
            # space-separated token.
            'num_votes' : hike_html.xpath('//div[@class="rating-count"]')[0].text.strip()[1:].split(' ')[0]
        }
    except (IndexError, AttributeError):
        # AttributeError: the rating-count element exists but has no text (.text is None).
        pass

    put_text('location', '//div[@id="hike-stats"]/div[@class="hike-stat"]/div')

    try:
        coordinates = hike_html.xpath('//div[@class="latlong"]/span')
        details['trailhead_coordinates'] = {
            'latitude' : coordinates[0].text,
            'longitude' : coordinates[1].text
        }
    except IndexError:
        pass

    put_text('wta_author', '//div[@class="img-text-grouping"]/p/a/span')
    # The directions text is read from the second <p> of the section.
    put_text('driving_directions', '//div[@id="driving-directions"]/p', index=1)

    details['hike_description'] = '\n'.join(hike_html.xpath('//div[@id="hike-body-text"]/p/text()'))

    if include_trip_reports:
        tr_urls = _get_all_tr_urls(BASE_URL + hike_id + '/@@related_tripreport_listing')
        print('getting ' + str(len(tr_urls)) + ' TRs...')
        details['trip_reports'] = [_get_tr(url) for url in tr_urls]
    print()

    return details


def _get_tr_page(url):
    """Fetch one trip-report listing page; return its report links and the next page URL (None on the last page)."""
    time.sleep(1)

    html = etree.HTML(requests.get(url, timeout=30).text)

    tr_urls = [tr.attrib['href'] for tr in html.xpath('//a[@class="show-with-full full-report-link visualNoPrint hidden-480 wta-action button"]')]
    next_links = html.xpath('//li[@class="next"]/a')
    return {
        # A page without a "next" link is the last one; its reports still count.
        'next_page_url' : next_links[0].attrib['href'] if next_links else None,
        'tr_urls' : tr_urls,
    }


def _get_all_tr_urls(first_page_url):
    """Follow the "next" links from `first_page_url` and return the report links of every page, first and last included."""
    tr_urls = []
    visited = set()
    next_url = first_page_url
    # The visited set stops the walk if a "next" link ever points back to a page already read.
    while next_url and next_url not in visited:
        visited.add(next_url)
        page = _get_tr_page(next_url)
        tr_urls += page['tr_urls']
        next_url = page['next_page_url']
    return tr_urls


def _get_tr(url):
    """Fetch one trip report page and return the fields that could be extracted from it."""
    time.sleep(1)

    html = etree.HTML(requests.get(url, timeout=30).text)

    tr = {}

    # string(...) evaluates to '' when nothing matches, so this cannot fail on a missing author.
    tr['author'] = html.xpath('string(//span[@itemprop="author"]/a)').strip()

    try:
        condition_elements = html.xpath('//div[@class="trip-condition"]')
        # One single-entry dict per condition: first child's text (lower-cased,
        # spaces replaced by underscores) mapped to the second child's text.
        tr['conditions'] = [{
            c.getchildren()[0].text.lower().replace(' ', '_') : c.getchildren()[1].text
        } for c in condition_elements]
    except (IndexError, AttributeError):
        # Too few children, or a label element without text (.text is None).
        pass

    try:
        tr['likes'] = html.xpath('//span[@class="tally-total"]/text()')[0]
    except IndexError:
        pass

    tr['report'] = '\n'.join(html.xpath('//div[@id="tripreport-body-text"]/p/text()'))

    try:
        tr['date_hiked'] = html.xpath('//span[@class="elapsed-time"]/text()')[0]
    except IndexError:
        pass

    return tr

In [None]:
# Load the saved trail list, then scrape every hike in it into wta_hikes.json.
# load_hike_urls is the loader this notebook defines; the previously used name
# `load_hikes` is not defined in any cell, so the cell failed on a fresh kernel.
hikes = load_hike_urls('wta_trail_list.json')
save_all_hikes('wta_hikes.json')

1 out of 3897
(45 Mile Sheep Drive (Central Cascades -- Entiat Mountains/Lake Chelan))

2 out of 3897
(76 Creek (North Cascades -- Mountain Loop Highway))

3 out of 3897
(360 Trails (Puget Sound and Islands -- Seattle-Tacoma Area))

4 out of 3897
(520 Trail (Puget Sound and Islands -- Seattle-Tacoma Area))

5 out of 3897
(2773' 353'P (Central Cascades))

6 out of 3897
(A Quiet Place Park (Puget Sound and Islands -- Seattle-Tacoma Area))

7 out of 3897
(Aasgard Pass (Central Cascades -- Leavenworth Area))

8 out of 3897
(Abercrombie Mountain Trail (Eastern Washington -- Selkirk Range))

9 out of 3897
(Abernathy Pass via Cedar Creek (North Cascades -- North Cascades Highway - Hwy 20))

10 out of 3897
(Abernathy Peak (North Cascades -- Methow/Sawtooth))

11 out of 3897
(Abiel Pass (Snoqualmie Region))
