In [1]:
## Notebook file for testing and playing around with methods, have fun! ##

In [1]:
# Import relevant packages.
from bs4 import BeautifulSoup 
import requests
import urllib3
from datetime import datetime
import pandas as pd
import json

# Silence urllib3's InsecureRequestWarning; the page downloads in this notebook
# are made with verify=False, which would otherwise emit that warning each time.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Using Requests & BeautifulSoup, pass a URL to this method and it will return the URL's HTML code as a list of lines.
def get_html_rows(url, timeout=30):
    """Download `url` and return its prettified HTML as a list of lines.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to `requests.get`; how long to wait for the server before
        giving up. Defaults to 30 seconds so that a stalled connection cannot
        block the scrape forever.

    Returns
    -------
    list of str
        The lines of the document after parsing it with html5lib and
        re-serialising it with `BeautifulSoup.prettify()`.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # NOTE: verify=False disables TLS certificate validation for this request.
    # It is kept for compatibility with the original behaviour, but it means
    # the downloaded content is not protected against tampering in transit.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html5lib')
    return soup.prettify().splitlines()

In [3]:
# Taking 'https://www.wta.org/go-outside/hikes' as a parameter,
# this method recursively finds all links to pages consisting of a list of hikes and returns them as a list.
# Note: this method does not include 'https://www.wta.org/go-outside/hikes' itself in the returned list.
def get_hikes_list_page_urls(url, last_page=None):
    """Collect the URLs of the paginated hike-listing pages reachable from `url`.

    The page at `url` is downloaded and scanned for its pagination markers.
    The links between the row containing '"active"' and the row containing
    '"last"' (or, when there is no '"last"' row, the row containing '"next"')
    are collected. While a '"last"' marker is present the function recurses
    into the first collected link and merges the results.

    Parameters
    ----------
    url : str
        URL of a hike-listing page.
    last_page : int, optional
        Page number read from an earlier page's '"last"' marker. It is passed
        along by the recursion; callers normally leave it as None.

    Returns
    -------
    list of str
        The listing-page URLs that were found. Merged results pass through a
        set, so duplicates are dropped and the order is not guaranteed. The
        starting `url` is never added explicitly. The list is empty when the
        active page is already `last_page` or when no links were found.

    Raises
    ------
    ValueError
        If the text expected to hold a page number is not an integer.
    """
    html_rows = get_html_rows(url)

    stop_after_this_page = False
    active_page = 0
    index_start = 0
    index_end = 0

    # Use the running index for slicing and look-ahead: list.index(row) would
    # return the first line with identical text, which need not be this one.
    for idx, row in enumerate(html_rows):
        if '"active"' in row:
            index_start = idx
            # The page number sits two lines below the marker in the prettified HTML.
            active_page = int(html_rows[idx + 2].lstrip())

        if '"last"' in row:
            index_end = idx
            last_page = int(html_rows[idx + 2].lstrip())
            break
        elif '"next"' in row:
            # No "last" marker was seen before "next": every remaining page
            # link is already listed on this page, so no further recursion.
            index_end = idx
            stop_after_this_page = True
            break
        elif active_page == last_page:
            # Already on the final page; return an empty list (not None) so
            # the caller's set union keeps working.
            return []

    rows_range = html_rows[index_start:index_end]
    pages_found = [item[item.find('https'):item.find('">')]
                   for item in rows_range if 'www.wta.org' in item]

    # Nothing to follow: either this page closes the pagination or no links
    # were found at all (indexing pages_found[0] would fail in that case).
    if stop_after_this_page or not pages_found:
        return pages_found

    next_page = pages_found[0]
    return list(set().union(pages_found, get_hikes_list_page_urls(next_page, last_page)))

In [4]:
# Taking a list of links to WTA's pages that house their lists of hikes as a parameter,
# this method will return a list of links to all hikes found on those pages.
def get_individual_hike_urls(hikes_list_page_urls):
    """Return the hike URLs found on each of the given listing pages.

    Every page in `hikes_list_page_urls` is downloaded with get_html_rows.
    For each line containing "listitem-title", the text from the first
    'https' up to '" title=' is taken as a hike URL.

    Parameters
    ----------
    hikes_list_page_urls : iterable of str
        URLs of the pages that list hikes.

    Returns
    -------
    list of str
        The extracted hike URLs, in page order and then line order.
    """
    return [
        line[line.find('https'):line.find('" title=')]
        for page_url in hikes_list_page_urls
        for line in get_html_rows(page_url)
        if "listitem-title" in line
    ]

In [5]:
# Taking a list of individual hike links as a parameter,
# this method will return specific information for each hike in the format of a DataFrame.
def get_hike_info(hike_urls):
    """Scrape every hike page in `hike_urls` and return the results as a DataFrame.

    For each URL the hike page and its '/@@related_tripreport_listing' page
    are downloaded with get_html_rows and scanned line by line for known
    markers. A field whose marker is not found is stored as None. One record
    is built per hike, so all columns always have the same length. A progress
    line is printed after each hike and a summary is printed at the end.

    Parameters
    ----------
    hike_urls : iterable of str
        URLs of individual hike pages.

    Returns
    -------
    pandas.DataFrame
        One row per URL, in input order, with the columns TITLE, REGION,
        DISTANCE, DIST_TYPE, GAIN, HIGHEST, RATING, RATING_COUNT, LATITUDE,
        LONGITUDE, REPORT_DATE, REPORT_COUNT and URL.

    Raises
    ------
    ValueError
        If the text next to a matched marker cannot be parsed as the expected
        number, date or JSON document.
    """
    records = []
    for rownum, url in enumerate(hike_urls, start=1):
        record = _parse_hike_page(get_html_rows(url))

        report_rows = get_html_rows(url + '/@@related_tripreport_listing')
        record['report_count'], record['report_date'] = _parse_report_page(report_rows)
        record['url'] = url
        records.append(record)

        print(str(rownum) + ' Hikes loaded...')

    print('Finished loading hikes!\n' + str(len(records)) + ' Hikes successfully loaded.')

    # (DataFrame column, record key, summary label, summary suffix), in output order.
    layout = [
        ('TITLE', 'title', 'titles: ', ' entries'),
        ('REGION', 'region', 'regions: ', ' entries'),
        ('DISTANCE', 'distance', 'distances: ', ' entries'),
        ('DIST_TYPE', 'dist_type', 'dist_types: ', ' entries'),
        ('GAIN', 'gain', 'gains: ', ' entries'),
        ('HIGHEST', 'highest', 'highests: ', ' entries'),
        ('RATING', 'rating', 'ratings: ', ' entries'),
        ('RATING_COUNT', 'rating_count', 'rating_counts: ', ' entries'),
        ('LATITUDE', 'latitude', 'latitudes: ', 'Entries'),
        ('LONGITUDE', 'longitude', 'longitudes: ', 'Entries'),
        ('REPORT_DATE', 'report_date', 'report_dates: ', ' entries'),
        ('REPORT_COUNT', 'report_count', 'report_counts: ', ' entries'),
        ('URL', 'url', 'hike_links: ', ' entries'),
    ]
    columns = {name: [record[key] for record in records] for name, key, _, _ in layout}

    for name, _, label, suffix in layout:
        print(label, len(columns[name]), suffix)

    return pd.DataFrame(columns)


def _parse_hike_page(rows):
    """Extract one hike's attributes from the prettified HTML lines of its page."""
    info = {
        'title': None, 'region': None, 'distance': None, 'dist_type': None,
        'gain': None, 'highest': None, 'rating': None, 'rating_count': None,
        'latitude': None, 'longitude': None,
    }

    # Only the first occurrence of each marker is used, so a marker that shows
    # up twice on a page cannot produce two values for a single hike.
    for idx, row in enumerate(rows):
        if info['title'] is None and '"documentFirstHeading"' in row:
            info['title'] = rows[idx + 1].lstrip()

        if info['region'] is None and '"hike-region"' in row:
            info['region'] = rows[idx + 3].lstrip()

        if info['distance'] is None and '"distance"' in row:
            text = rows[idx + 2].lstrip()
            distance = float(text[:text.find(' mile')])
            if ',' in text:
                dist_type = text[text.find(', ') + 2:]
            elif 'of trails' in text:
                # Skip past "of " so that only "trails" remains.
                dist_type = text[text.find('of trails') + 3:]
            else:
                # Unrecognised layout: flag the distance and leave the type
                # empty instead of reusing a value from an earlier hike.
                distance = 'ERROR'
                dist_type = None
            info['distance'] = distance
            info['dist_type'] = dist_type

        if info['gain'] is None and 'Gain:' in row:
            info['gain'] = float(rows[idx + 2].lstrip())

        if info['highest'] is None and 'Highest Point:' in row:
            info['highest'] = float(rows[idx + 2].lstrip())

        if info['rating'] is None and '"current-rating"' in row:
            text = rows[idx + 1].lstrip()
            info['rating'] = float(text[:text.find(' out of')])

        if info['rating_count'] is None and '"rating-count"' in row:
            text = rows[idx + 1].lstrip()
            info['rating_count'] = int(text[text.find('(') + 1:text.find(' vote')])

        if info['latitude'] is None and '<script type="application/ld+json">' in row:
            hike_json = json.loads(rows[idx + 1].lstrip())
            try:
                geo = hike_json['geo']
                latitude, longitude = geo['latitude'], geo['longitude']
            except (KeyError, TypeError):
                # This JSON-LD block carries no coordinates; keep looking.
                pass
            else:
                info['latitude'] = latitude
                info['longitude'] = longitude

    return info


def _parse_report_page(rows):
    """Return (report_count, first_report_date) from a trip-report listing page."""
    report_count = None
    report_date = None

    for idx, row in enumerate(rows):
        if report_count is None and '"count-data"' in row:
            report_count = int(rows[idx + 1].lstrip())

        # Only the first "elapsed-time" entry on the page is kept.
        if report_date is None and '"elapsed-time"' in row:
            date_text = row[row.find('title="') + 7:row.find('">')]
            report_date = datetime.strptime(date_text, '%b %d, %Y')

    return report_count, report_date

In [6]:
# Get all hike page links.
# The crawl starts at the hikes index page; that page's own URL is then merged
# in through a set union, so it ends up in the final list exactly once.
hikes_index_url = 'https://www.wta.org/go-outside/hikes'
paginated_page_urls = get_hikes_list_page_urls(hikes_index_url)
all_hikes_list_pages = list(set().union(paginated_page_urls, [hikes_index_url]))

In [7]:
# Get all individual hike links.
# Every listing page gathered above is downloaded and scanned for hike URLs,
# so this cell issues one HTTP request per listing page.
all_individual_hikes = get_individual_hike_urls(all_hikes_list_pages)

In [8]:
# Get all hike data, initialize to DataFrame.
# Two pages are fetched per hike (the hike page and its trip-report listing),
# and a progress line is printed after each hike.
wta_hikes_df = get_hike_info(all_individual_hikes)

In [None]:
# Write to csv file.
# NOTE: 'YOUR_FILE_LOCATION' appears to be a placeholder for a real output path.
# str.format() only inserts the date where the string contains a '{}' field;
# as written there is none, so the date does not become part of the file name.
curr_date = datetime.now().date()
csv_path = 'YOUR_FILE_LOCATION'.format(curr_date)
wta_hikes_df.to_csv(csv_path, index=False)