## Notebook Objectives

Our objective is to scrape bridge height and coordinate data via the surface tracks API.

In [9]:
import pandas as pd
import numpy as np
import os
import glob
import json
import re

import time
from selenium import webdriver

## Selenium

View documentation [here](https://selenium-python.readthedocs.io/) to see how to web scrape using the Selenium library. 

In [52]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--dns-prefetch-disable')

In [83]:
# json file contains username and password

with open('st_credentials.json') as creds:    
    credentials = json.load(creds)

In [84]:
# Instantiate webdriver object for chrome browser
driver = webdriver.Chrome(chrome_options=chrome_options)

# Login page
driver.get("https://www.surfacetracks.com/amember/login")

# Locate login elements
user_element = driver.find_element_by_name("amember_login")
pass_element = driver.find_element_by_name("amember_pass")

# Input user credentials
user_element.send_keys(credentials['amember_login'])
pass_element.send_keys(credentials['amember_pass'])

# Click the login button
driver.find_element_by_xpath("//input[@value='Login']").click()

  


Now that we have our login credentials instantiated into the selenium driver object, we can now access the surface tracks API.

In [55]:
# Sample API url
driver.get('https://www.surfacetracks.com/plus/get-feature.php?id=100660')

In [56]:
# Locate element containing json data
api_get = driver.find_element_by_xpath("//pre[@style='word-wrap: break-word; white-space: pre-wrap;']")

In [57]:
api_get.text

'null'

## Auto Scrape (w/ Selenium)

Our next goal is to automate the scraping process by iterating the id number in each API url. We want every combination of ID to be scraped so that we ensure all data is recorded.

In [69]:
# We will initially test with the last two digits of the ID
base_url = 'https://www.surfacetracks.com/plus/get-feature.php?id=1'

num_dict = {
    0:'0',
    1:'1',
    2:'2',
    3:'3',
    4:'4',
    5:'5',
    6:'6',
    7:'7',
    8:'8',
    9:'9'
}

In [70]:
url_list = []

# Counters will increase using 'abacus' function below
ones_counter = 0
twos_counter = 0
threes_counter = 0
fours_counter = 0
fives_counter = 0

for i in range(1,11001):
    
    if ones_counter == 10:
        ones_counter = ones_counter - 10
        twos_counter += 1
        
    if twos_counter == 10:
        twos_counter = twos_counter - 10
        threes_counter += 1
        
    if threes_counter == 10:
        threes_counter = threes_counter - 10
        fours_counter += 1
        
    if fours_counter == 10:
        fours_counter = fours_counter - 10
        fives_counter += 1
    
    # url will change based upon the position of the counters
    url = base_url + num_dict[fives_counter] + num_dict[fours_counter] + num_dict[threes_counter] + num_dict[twos_counter] + num_dict[ones_counter]
    url_list.append(url)
    ones_counter += 1

In [71]:
# Check list function is working properly
url_list

['https://www.surfacetracks.com/plus/get-feature.php?id=200000',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200001',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200002',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200003',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200004',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200005',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200006',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200007',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200008',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200009',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200010',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200011',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200012',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200013',
 'https://www.surfacetracks.com/plus/get-feature.php?id=200014',
 'https://www.surfacetrac

In [72]:
# Check last iteration
url_list[-1]

'https://www.surfacetracks.com/plus/get-feature.php?id=299999'

In [73]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [74]:
json_list = []

# Scrape 
for url in url_list:
    
    try:
        driver.get(url)
        api_scrape = driver.find_element_by_xpath("//pre[@style='word-wrap: break-word; white-space: pre-wrap;']").text

        if api_scrape == "null":
            pass
        else:
            json_list.append(api_scrape)

        # Time function so that we do not overload the server 
        time.sleep(.5)
    except NoSuchElementException:
        pass
    
    except TimeoutException:
        pass

KeyboardInterrupt: 

## DataFrame Creation

In [79]:
df = pd.DataFrame(data=eval(json_list[0]), index=[0])

In [80]:
for i in json_list[1:]:
    
    index =1
    
    df_test = pd.DataFrame(data = eval(i), index=[index])
    df = pd.concat([df, df_test])   
    index += 1

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1595 entries, 0 to 1
Data columns (total 28 columns):
UID               1595 non-null object
Type              1595 non-null object
Est               1595 non-null object
Latitude          1595 non-null object
Longitude         1595 non-null object
ChainName         1595 non-null object
Name              1595 non-null object
Address           1595 non-null object
City              1595 non-null object
St                1595 non-null object
Zip               1595 non-null object
Phone             1595 non-null object
Web               1595 non-null object
Directions        1595 non-null object
Cat               1595 non-null object
County            1595 non-null object
RecordID          1595 non-null object
Elev              1595 non-null object
trail             1595 non-null object
hike              1595 non-null object
AmenitiesExtra    1595 non-null object
Comments          1595 non-null object
brand             1595 non-null object

In [78]:
df.to_csv('bridge_data.csv')