## Dependencies

In [1]:
from splinter import Browser
from selenium.common.exceptions import InvalidElementStateException
from selenium.common.exceptions import ElementNotInteractableException
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import html5lib
import lxml
import shutil
import time
import urllib3
import requests
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
import json

## Variables

In [2]:
# Set the zip codes
codes = []
codes.append('27606')   # just hard code one in
code = codes[0]    # set the variable for zip_and_click
global browser   # calling this 'global' because it needs to be type set later
forecast_type = str   # so I can add the dict field in the function argument
data = {}
host = 'localhost'
port = 27017

In [3]:
# Set the url of the website to be scraped
url = 'https://weather.com/'

## Functions

In [4]:
def chrome():
    '''
    Finds the chromedriver in the system and creates a Chrome browser
    '''
    executable_path = {'executable_path': shutil.which('chromedriver')}
    browser = Browser('chrome', **executable_path)
    return(browser)

In [5]:
def zip_and_click(code):
    '''
    Enter zip codes into the search bar on weather.com and click the first result.
    No return, just leaves browser at the first data page.
    '''
    
    filled = False
    clicked = False
    
    inputs = browser.find_by_tag('input') # get the search box reference
    search_box = inputs[0]
    while not filled:
        try:
            search_box.fill(code)
            filled = True
        except InvalidElementStateException:
            time.sleep(1)
    while not clicked:
        try:
            browser.click_link_by_partial_href('/weather/today/l')
            print('no exception this time')
            clicked = True
        except ElementNotInteractableException:
            print('exception found....waiting...')
            time.sleep(1)
    return   

In [6]:
def scrape_now(browser):
    ''' Get observed weather data from the current weather page '''
    right_now = {}
    divs = browser.find_by_tag('div').first
    nowcard = divs.find_by_tag('section.today_nowcard-container').value
    now = nowcard.split('\n')
    right_now.update({'as_of': [x.lower() for  x in now[1].split()[3:5]],
                 'observations': {'temp F': {'measured': now[2].lower(),
                                            'feels_like': [x.lower() for x in now[4][11:]]
                                            },
                                 'description': now[3].lower(),
                                 'uv_index': now[6].split()[3].lower(),
                                 'wind': wind_dict(now[10]),
                                 'humidity': now[12].lower(),
                                 'dew_point': now[14].lower(),
                                 'pressure': {'in': now[16].split()[0].lower(),
                                             'change': ''
                                             },
                                  'visibility mi': now[18].lower()
                                 }
                 })
    data['right_now'] = right_now
    return

In [7]:
def wind_dict(wind_str):
    ''' handle the cases where the wind value is simply "calm" rather than direction and speed '''
    if wind_str == 'Calm':
        return {
            'speed': 0,
            'direction': None
        }
    else:
        return {
            'speed': wind_str.split()[1].lower(),
            'direction': wind_str.split()[0].lower()
        }

In [8]:
def scrape_next36(browser):
    ''' Get the "at a glance" data from the next 36 hours '''

    next_36 = {}
    
    # Pulling data from the "next 36 hours" overview displays. There are 5 sections
    # that are pulled from. Each section gets a different label.
    hour_code = {'0':'now',
                '1':'at_start',
                '2':'at_12',
                '3':'at_24',
                '4':'at_36'
                }
    for i in range(5):
        try:
            browser.find_by_tag(f'div#daypart-{i}').click()
        except ElementNotInteractableException:
            print('ElementNotInteractableException occured: breaking loop.')
            break
        snapshot = browser.find_by_tag(f'div#daypart-{i}')
        snap_values = snapshot.value
        details = browser.find_by_tag('span.wx-detail-value')
        detaillist = [details[j].value for j in range(4)]

        # store the data
        next_36.update({hour_code[str(i)]: {'hour': 'CALCULATED HOUR OF PREDICTION',
                                            'condition': snap_values.split('\n')[1].lower(),
                                            'high/low': snap_values.split('\n')[2].lower(),
                                            'temp_f': snap_values.split('\n')[3].lower(),
                                            'chance_precip': snap_values.split('\n')[4].lower(),
                                            'description': browser.find_by_id(f'dp{i}-details-narrative').first.value.lower(),
                                            'wind': wind_dict(detaillist[0]),
                                            'humidity': detaillist[1].lower(),
                                            'uv_index': detaillist[2].lower(),
                                            'sun': {'rise': detaillist[3].lower(),
                                                    'set': detaillist[3].lower()
                                                   }
                                              }
                      })
    data['next_36'] = next_36
    return

In [9]:
def goto_hourly(browser):
    ''' Take the browser to the hourly numbers '''
    browser.find_by_text('Hourly').click()
    time.sleep(.5)
    n=0
    while browser.is_text_present('Next 8 Hours'):
        n+=1
        browser.find_by_text('Next 8 Hours').click()
        time.sleep(.5)
    else:
        print(f'got {8*n} hours of forcast')
    return

In [10]:
def get_tables(forecast_type, browser):
    ''' Get the tables from the current page and turn to pandas df's '''
    url = browser.url
    r = requests.get(url)
    dfs = pd.read_html(r.text)
    df = dfs[0]
#     d_table = df.to_dict(orient='index')
#     data[forecast_type] = d_table
    return(df)

In [None]:
def goto_tenday(browser):
    ''' Get the ten day forcast '''
    tenday = browser.find_by_text('10 Day Forecast')
    if tenday:
        tenday[1].click()
    return

In [11]:
def check_db_access(str: host, port):
    '''A check that there is write access to the database'''
   
    client = MongoClient(host=host, port=port)
    try:
        # The ismaster command is cheap and does not require auth.
        client.admin.command('ismaster')
        print('Connection made')
    except ConnectionFailure:
        print("Server not available")
        return

    # check the database connections
        # Get a count of the number of databases at the connection (accessible through that port)
        # before attempting to add to it
    db_count_pre = len(client.list_database_names())
        # Add a database and collection
    db = client.test_db
    col = db.test_col

    # Insert something to the db
    post = {'name':'Chuck VanHoff',
           'age':'38',
           'hobby':'gardening'
           }
    col.insert_one(post)

        # Get a count of the databases after adding one
    db_count_post = len(client.list_database_names())

    if db_count_pre-db_count_post>=0:
        print('Your conneciton is flipped up')
    else:
        print('You have write access')

    client.drop_database(db)
    client.close()
    return(client)

In [None]:
def load(data):
    ''' Load the data to the database '''
    db = client.forcast
    col = db.code
    col.insert_one(data)
    return

## Run the functions

In [11]:
browser = chrome()

In [12]:
browser.visit(url)

In [13]:
zip_and_click(code)

<splinter.driver.webdriver.chrome.WebDriver at 0x113673b50>

In [16]:
scrape_now(browser)

{'as_of': ['pm', 'EST'],
 'observations': {'temp F': {'measured': '41°', 'feels_like': '37°'},
  'description': 'CLEAR',
  'uv_index': 'of',
  'wind': {'speed mph': '6', 'direction': 'WSW'},
  'humidity': '63%',
  'dew_point': '29°',
  'pressure': {'in': '29.84', 'change': ''},
  'visibility mi': '10.0 mi'}}

In [17]:
scrape_next36(browser)

looking for now snapshot
completed section 0 snapshot
looking for at_start snapshot
completed section 1 snapshot
looking for at_12 snapshot
completed section 2 snapshot
looking for at_24 snapshot
completed section 3 snapshot
looking for at_36 snapshot
completed section 4 snapshot


{'now': {'hour': 'CALCULATED HOUR OF PREDICTION',
  'condition': 'CLEAR',
  'high/low': 'LOW',
  'temp_f': '35°',
  'chance_precip': '0%',
  'description': 'Clear skies. Low near 35F. Winds WNW at 5 to 10 mph.',
  'wind': {'speed': '6', 'direction': 'WNW'},
  'humidity': '67%',
  'uv_index': '0 of 10',
  'sun': {'rise': '7:07 am5:01 pm', 'set': '7:07 am5:01 pm'}},
 'at_start': {'hour': 'CALCULATED HOUR OF PREDICTION',
  'condition': 'MOSTLY SUNNY',
  'high/low': 'HIGH',
  'temp_f': '51°',
  'chance_precip': '0%',
  'description': 'Generally sunny despite a few afternoon clouds. High 51F. Winds W at 5 to 10 mph.',
  'wind': {'speed': '8', 'direction': 'W'},
  'humidity': '57%',
  'uv_index': '3 of 10',
  'sun': {'rise': '7:07 am5:00 pm', 'set': '7:07 am5:00 pm'}},
 'at_12': {'hour': 'CALCULATED HOUR OF PREDICTION',
  'condition': 'CLOUDY',
  'high/low': 'LOW',
  'temp_f': '36°',
  'chance_precip': '0%',
  'description': 'Cloudy skies. Low 36F. Winds light and variable.',
  'wind': {'spe

In [18]:
goto_hourly(browser)

got 40 hours of forcast


In [19]:
get_hourly(browser)

'{"columns":["Time","Description","Temp","Feels","Precip","Humidity","Wind","Unnamed: 7"],"index":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],"data":[[null,"11:00 pmMon","Clear","41\\u00b0","37\\u00b0","0%","63%","WNW 6 mph"],[null,"12:00 amTue","Clear","40\\u00b0","36\\u00b0","0%","65%","WNW 6 mph"],[null,"1:00 amTue","Clear","40\\u00b0","37\\u00b0","0%","65%","WNW 5 mph"],[null,"2:00 amTue","Clear","39\\u00b0","36\\u00b0","0%","67%","WNW 5 mph"],[null,"3:00 amTue","Clear","38\\u00b0","34\\u00b0","0%","70%","WNW 5 mph"],[null,"4:00 amTue","Clear","37\\u00b0","34\\u00b0","0%","71%","WNW 4 mph"],[null,"5:00 amTue","Clear","37\\u00b0","33\\u00b0","0%","71%","WNW 5 mph"],[null,"6:00 amTue","Clear","37\\u00b0","32\\u00b0","5%","70%","WNW 6 mph"],[null,"7:00 amTue","Clear","36\\u00b0","32\\u00b0","5%","73%","WNW 5 mph"],[null,"8:00 amTue","Sunny","37\\u00b0","32\\u00b0","0%","72%","WNW 6 mph"],[null,"9:00 amTue","Sunny","40\\u00b0","34\\u00b0","0%","66%","WNW 8 mph"],[null,"10:00 amTue","Sunny"

In [12]:
check_db_access(host, port)

Connection made
You have write access


()

In [13]:
browser.quit()

NameError: name 'browser' is not defined