In [1]:
from splinter import Browser
from selenium.common.exceptions import InvalidElementStateException
from selenium.common.exceptions import ElementNotInteractableException
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd
import html5lib
import lxml
import shutil
import time
import urllib3
import requests
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

In [62]:
# Set the zip codes
codes = []
codes.append('27006')   # just hard code one in
code = codes[0]    # set the variable for zip_and_click
global browser
data = {}
host = 'localhost'
port = 27017

In [3]:
# inititial website to visit
url = 'https://weather.com'

In [4]:
def chrome():
    '''
    Finds the chromedriver in the system and creates a Chrome browser
    '''
    executable_path = {'executable_path': shutil.which('chromedriver')}
    browser = Browser('chrome', **executable_path)
    return(browser)

In [5]:
def zip_and_click(code):
    '''
    Enter zip codes into the search bar on weather.com and click the first result.
    No return, just leaves browser at the first data page.
    '''
    import time
    
    filled = False
    clicked = False
    
    inputs = browser.find_by_tag('input') # get the search box reference
    search_box = inputs[0]
    while not filled:
        try:
            search_box.fill(code)
            filled = True
        except InvalidElementStateException:
            time.sleep(1)
    while not clicked:
        try:
            browser.click_link_by_partial_href('/weather/today/l')
            print('no exception this time')
            clicked = True
        except ElementNotInteractableException:
            print('exception found....waiting...')
            time.sleep(1)
    return(browser)     

In [6]:
def scrape_now(browser):
    ''' Get observed weather data from the current weather page '''
    right_now = {}
    divs = browser.find_by_tag('div').first
    nowcard = divs.find_by_tag('section.today_nowcard-container').value
    now = nowcard.split('\n')
    right_now.update({'as_of': now[1].split()[3:5],
                 'observations': {'temp F': {'measured': now[2],
                                            'feels_like': now[4][11:]
                                            },
                                 'description': now[3],
                                 'uv_index': now[6].split()[3],
                                 'wind': {'speed mph': now[10].split()[1],
                                         'direction': now[10].split()[0]
                                         },
                                 'humidity': now[12],
                                 'dew_point': now[14],
                                 'pressure': {'in': now[16].split()[0],
                                             'change': ''
                                             },
                                  'visibility mi': now[18]
                                 }
                 })
    data['right_now'] = right_now
    return(right_now)

In [7]:
def wind_dict(wind_str):
    ''' handle the cases where the wind value is simply "calm" rather than direction and speed '''
    if wind_str == 'Calm':
        return {
            'speed': 0,
            'direction': None
        }
    else:
        return {
            'speed': wind_str.split()[1],
            'direction': wind_str.split()[0]
        }

In [8]:
def scrape_next36(browser):
    ''' Get the "at a glance" data from the next 36 hours '''

    next_36 = {}
    
    # Pulling data from the "next 36 hours" overview displays. There are 5 sections
    # that are pulled from. Each section gets a different label.
    hour_code = {'0':'now',
                '1':'at_start',
                '2':'at_12',
                '3':'at_24',
                '4':'at_36'
                }
    for i in range(5):
        print(f'looking for {hour_code[str(i)]} snapshot')
        try:
            browser.find_by_tag(f'div#daypart-{i}').click()
        except ElementNotInteractableException:
            print('ElementNotInteractableException occured: breaking loop.')
            break
        snapshot = browser.find_by_tag(f'div#daypart-{i}')
        snap_values = snapshot.value
        details = browser.find_by_tag('span.wx-detail-value')
        detaillist = [details[j].value for j in range(4)]
        print(f'completed section {i} snapshot')

        #store the data
        next_36.update({hour_code[str(i)]: {'hour': 'CALCULATED HOUR OF PREDICTION',
                                            'condition': snap_values.split('\n')[1],
                                            'high/low': snap_values.split('\n')[2],
                                            'temp_f': snap_values.split('\n')[3],
                                            'chance_precip': snap_values.split('\n')[4],
                                            'description': browser.find_by_id(f'dp{i}-details-narrative').first.value,
                                            'wind': wind_dict(detaillist[0]),
                                            'humidity': detaillist[1],
                                            'uv_index': detaillist[2],
                                            'sun': {'rise': detaillist[3],
                                                    'set': detaillist[3]
                                                   }
                                              }
                      })
    data['next_36'] = next_36
    return(next_36)

In [9]:
def goto_hourly(browser):
    ''' Take the browser to the hourly numbers '''
    browser.find_by_text('Hourly').click()
    time.sleep(.5)
    n=0
    while browser.is_text_present('Next 8 Hours'):
        n+=1
        browser.find_by_text('Next 8 Hours').click()
        time.sleep(.5)
    else:
        print(f'got {8*n} hours of forcast')
    return

In [10]:
def get_hourly(browser):
    ''' Get the tables from the current page and turn to pandas df's '''
    url = browser.url
    r = requests.get(url)
    dfs = pd.read_html(r.text)
    df = dfs[0]
    hourly = df.to_json(orient='split')
    data['hourly'] = hourly
    return(hourly)

In [24]:
def check_db_access(str: host, port):
    '''A check that there is write access to the database'''
   
    client = MongoClient(host=host, port=port)
    try:
        # The ismaster command is cheap and does not require auth.
        client.admin.command('ismaster')
        print('Connection made')
    except ConnectionFailure:
        print("Server not available")
        return()

    # check the database connections
        # Get a count of the number of databases at the connection (accessible through that port)
        # before attempting to add to it
    db_count_pre = len(client.list_database_names())
        # Add a database and collection
    db = client.test_db
    col = db.test_col

    # Insert something to the db
    post = {'name':'Chuck VanHoff',
           'age':'38',
           'hobby':'gardening'
           }
    col.insert_one(post)

        # Get a count of the databases after adding one
    db_count_post = len(client.list_database_names())

    if db_count_pre-db_count_post>=0:
        print('Your conneciton is flipped up')
    else:
        print('You have write access')

    client.drop_database(db)
    client.close()
    return(client)

In [73]:
def load(data):
    ''' Load the data to the database '''
#     client = MongoClient(host=host, port=port)
    db = client.forcast
#     code = str(code)
    col = db.code
#     post = data
    col.insert_one(data)
    return()

In [70]:
client = check_db_access(host, port)

Connection made
You have write access


In [13]:
shutil.which('chromedriver')

'/Users/mastacow/data/forcast-forcast/env/bin/chromedriver'

In [37]:
browser = chrome()

In [63]:
browser.visit(url)

In [64]:
zip_and_click(code)

no exception this time


<splinter.driver.webdriver.chrome.WebDriver at 0x1137c7700>

In [65]:
right_now = scrape_now(browser)
# pprint(right_now)

In [66]:
next_36 = scrape_next36(browser)
# pprint(next_36)

looking for now snapshot
completed section 0 snapshot
looking for at_start snapshot
completed section 1 snapshot
looking for at_12 snapshot
completed section 2 snapshot
looking for at_24 snapshot
completed section 3 snapshot
looking for at_36 snapshot
completed section 4 snapshot


In [67]:
goto_hourly(browser)

got 40 hours of forcast


In [49]:
# A line that will click out of an advertisement
browser.find_by_tag('a#bx-close-inside-970188')[0].click()

In [68]:
hourly = get_hourly(browser)
# hourly

In [69]:
# data

In [71]:
load(code)

()

In [72]:
db = client.forcast
db.list_collection_names()

['code']

In [61]:
for x in db.code.find():
    pprint(x)

{'_id': ObjectId('5de6575e08b5184ef0c66939')}
{'_id': ObjectId('5de79796cad08f0bd0c801d8')}
{'_id': ObjectId('5de871afff395937db2f6f30'),
 'hourly': '{"columns":["Time","Description","Temp","Feels","Precip","Humidity","Wind","Unnamed: '
           '7"],"index":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],"data":[[null,"9:45 '
           'pmWed","Clear","44\\u00b0","44\\u00b0","0%","60%","W 2 '
           'mph"],[null,"10:00 '
           'pmWed","Clear","43\\u00b0","43\\u00b0","0%","60%","W 2 '
           'mph"],[null,"11:00 '
           'pmWed","Clear","42\\u00b0","42\\u00b0","0%","60%","WSW 1 '
           'mph"],[null,"12:00 '
           'amThu","Clear","42\\u00b0","42\\u00b0","0%","62%","WSW 2 '
           'mph"],[null,"1:00 '
           'amThu","Clear","41\\u00b0","41\\u00b0","0%","63%","WSW 2 '
           'mph"],[null,"2:00 '
           'amThu","Clear","42\\u00b0","39\\u00b0","0%","62%","W 4 '
           'mph"],[null,"3:00 '
           'amThu","Clear","41\\u00b0","38\\u00b0","0%","62%",

In [18]:
time.time()

1575322635.426256

In [20]:
goto_hourly(browser)

1
got next 8 hour spotted
clicked and waiting
end n=1 th pass
2
got next 8 hour spotted
clicked and waiting
end n=2 th pass
3
got next 8 hour spotted
clicked and waiting
end n=3 th pass
4
got next 8 hour spotted
clicked and waiting
end n=4 th pass
5
got next 8 hour spotted
clicked and waiting
end n=5 th pass


In [36]:
browser.quit()

In [55]:
now

['ADVANCE, NC',
 'as of 11:27 pm EST',
 '43°',
 'CLEAR',
 'feels like 41°',
 'H -- L 35°',
 'UV Index 0 of 10',
 'Holiday Week Travel Forecast',
 'RIGHT NOW',
 'Wind',
 'NNW 4 mph',
 'Humidity',
 '57%',
 'Dew Point',
 '29°',
 'Pressure',
 '30.31 in',
 'Visibility',
 '10.0 mi']

In [None]:
if now[10]

In [57]:
from pprint import pprint
pprint(next_36['now'])
# next_36

{'chance_precip': '0%',
 'condition': 'MOSTLY CLEAR',
 'description': 'Clear to partly cloudy. Low near 35F. Winds light and '
                'variable.',
 'high/low': 'LOW',
 'hour': 'CALCULATED HOUR OF PREDICTION',
 'humidity': '68%',
 'sun': {'rise': '7:10 am5:08 pm', 'set': '7:10 am5:08 pm'},
 'temp_f': '35°',
 'uv_index': '0 of 10',
 'wind': {'direction': 'NNE', 'speed': '4'}}


In [22]:
get_table(browser)

[    Time  Description        Temp Feels Precip Humidity Wind Unnamed: 7
 0    NaN  12:30 pmSat      Cloudy   48°    48°       0%  86%  SSE 2 mph
 1    NaN   1:00 pmSat      Cloudy   49°    49°       0%  85%  SSE 2 mph
 2    NaN   2:00 pmSat      Cloudy   50°    50°       0%  81%  SSE 2 mph
 3    NaN   3:00 pmSat      Cloudy   50°    50°      10%  78%  SSE 3 mph
 4    NaN   4:00 pmSat      Cloudy   50°    50°      25%  77%   SE 2 mph
 5    NaN   5:00 pmSat        Rain   50°    50°      70%  80%  ESE 1 mph
 6    NaN   6:00 pmSat     Showers   49°    49°      65%  89%  ESE 1 mph
 7    NaN   7:00 pmSat        Rain   48°    48°      60%  93%   SE 2 mph
 8    NaN   8:00 pmSat  Light Rain   47°    47°      65%  93%   SE 2 mph
 9    NaN   9:00 pmSat  Light Rain   47°    47°      65%  94%  ESE 2 mph
 10   NaN  10:00 pmSat        Rain   47°    47°      85%  96%  ESE 2 mph
 11   NaN  11:00 pmSat        Rain   47°    47°      90%  95%   SE 2 mph
 12   NaN  12:00 amSun        Rain   47°    46°    

In [36]:
browser.back()

In [35]:
len(dfs)

1

In [None]:
# snapshot = browser.find_by_tag('div#daypart-0').value
# snaplist = snapshot.split('\n')
# details = browser.find_by_tag('span.wx-detail-value')
# detailslist = [details[i].value for i in range(4)]
# print(snaplist)
# detailslist

In [49]:
print(snaplist)

['TONIGHT', 'RAIN', 'LOW', '42°', '90%']


None


<splinter.element_list.ElementList object at 0x104ad09d0>


NNW 10 mph
86%
0 of 10
6:43 am5:13 pm


'TODAY\nHIGH\n70°\n10%'

'A clear sky. Scattered frost possible. Low around 30F. Winds light and variable.'

In [40]:
browser.quit()

1

'Today'

got input box


1

In [25]:
hour_code = {'0':'now',
            '1':'at_start',
            '2':'at_12',
            '3':'at_24',
            '4':'at_36'
}
print(hour_code['0'])
for i in range(5):
    print(hour_code[str(i)])
    print(f'looking for {hour_code[str(i)]} snapshot')

now
now
looking for now snapshot
at_start
looking for at_start snapshot
at_12
looking for at_12 snapshot
at_24
looking for at_24 snapshot
at_36
looking for at_36 snapshot


In [None]:
### SEARCH FOR THE TABLE THAT HOLDS THE SEARCH RESULTS ###