In [106]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

# store in dictionary for starters
data = {}

In [134]:
current = "https://www.nwac.us/avalanche-forecast/current/mt-hood/"
r = requests.get(current)
sup = BeautifulSoup(r.content, "html5lib")

In [147]:
ps = sup.find(id="problems")
l = ps.find_all(class_='problem')
for e in l:
    print(e.attrs)

{'class': ['problem', 'wind-slab']}
{'class': ['problem', 'loose-wet']}


In [204]:
def extract(r):
    row = {}

    soup = BeautifulSoup(r.content, "html5lib")
    elevs = soup.find(id="elevation-levels")
    problem_section = soup.find(id="problems")
    problem_tags = problem_section.find_all(class_="problem", recursive=False)
    discussion = soup.find(id='discussion').contents
    
    problems = []
    sizes = []
    likelihoods = []
    octagons = []
    
    for div in problem_tags:
        # print(div.attrs)
        problems.append(div.attrs['class'][1])
        sizes.append(div.find(class_='problem-sizes').attrs['src'])
        likelihoods.append(div.find(class_='problem-likelihood').attrs['src'])
        octagons.append(div.find(class_='problem-octagon').attrs['src'])

    row['problems'] = problems
    row['sizes'] = sizes
    row['likelihoods'] = likelihoods
    row['octagons'] = octagons
    row['discussion'] = discussion
    
    issued = elevs.contents[1].contents[1].string.strip().strip('Issued: ')

    for tag in ["treeline-above", "treeline-near", "treeline-below"]:
        el = elevs.find(id=tag)
        danger = el.contents[3].contents[5].h4.string
        # print(tag, danger)
        row[tag] = danger

    print('\r' + str(row['problems']))
    return issued, row

# extract()

In [196]:
ex = extract(requests.get(current))

[['wind-slab', 'loose-wet']]


In [84]:
first_forecast = "https://www.nwac.us/avalanche-forecast/avalanche-region-forecast/73/mt-hood/"
fake_forecast = "https://www.nwac.us/avalanche-forecast/avalanche-region-forecast/3623/mt-hood/"
# url to format, 73 - 3900 ?:
url_base = "https://www.nwac.us/avalanche-forecast/avalanche-region-forecast/{}/mt-hood/"

In [None]:
for i in range(192, 4000):
# for i in range(73, 75):
    url = url_base.format(i)
    r = requests.get(url)
    print('\r' + str(i) + ' ' + str(r.status_code))
    if r.status_code == 404:
        continue
    else:
        # be nice to server
        sleep(.5)

    try:
        issued, row = extract(r)
        data[issued] = row
    except AttributeError:
        continue

192 200
['wind-slab']
193 200
['wind-slab']
194 404
195 404
196 404
197 404
198 200
['wind-slab']
199 200
['wind-slab']
200 200
['wind-slab']
201 200
202 404
203 200
['wind-slab']
204 404
205 404
206 200
207 200
['wind-slab']
208 200
209 200
['wind-slab', 'storm-slabs']
210 200
['wind-slab']
211 200
['wind-slab']
212 200
213 200
['wind-slab', 'storm-slabs']
214 404
215 404
216 200
['wind-slab']
217 200
['wind-slab']
218 200
['wind-slab']
219 200
['wind-slab']
220 200
['wind-slab']
221 200
['wind-slab', 'loose-wet']
222 200
['wind-slab', 'loose-wet']
223 404
224 404
225 200
['wind-slab', 'loose-wet']
226 200
['wind-slab', 'loose-wet']
227 200
['wind-slab', 'loose-wet']
228 404
229 404
230 200
['wind-slab', 'loose-wet']
231 200
['wind-slab']
232 200
['wind-slab']
233 200
['wind-slab']
234 200
['wind-slab']
235 200
['wind-slab']
236 200
['wind-slab']
237 200
['wind-slab']
238 200
['wind-slab']
239 404
240 404
241 404
242 404
243 200
['wind-slab', 'cornices']
244 200
['wind-slab']
245 200


In [201]:
data

{'10:57 AM PST Saturday, December 21, 2013': {'discussion': ['\n            ',
   <div class="forecast-snowpack" id="slider1-title" style="display: none">
                   <h3>Recent images from NWAC:</h3>
               </div>,
   '\n            ',
   ' Jssor Slider Begin ',
   '\n            ',
   ' You can move inline styles to css file or css block. ',
   '\n            ',
   <div id="slider1_container" style="position: relative; top: 0px; left: 0px; width: 809px; height: 150px; overflow: hidden; display: none;">
                   <!-- Loading Screen -->
                   <div style="position: absolute; top: 0px; left: 0px;" u="loading">
                       <div style="filter: alpha(opacity=70); opacity:0.7; position: absolute; display: block;
                           background-color: #000; top: 0px; left: 0px; width: 100%;height:100%;">
                       </div>
                       <div style="position: absolute; display: block; background: url(../../../static/ima

In [159]:
df = pd.DataFrame.from_dict(data, orient='index')
df.head()

Unnamed: 0,treeline-above,treeline-below,treeline-near,problems
"2:36 PM PST Tuesday, December 10, 2013",Moderate,Low,Moderate,"[[problem, wind-slab]]"
"2:42 PM PST Tuesday, December 10, 2013",Moderate,Low,Moderate,"[[problem, wind-slab]]"
"3:26 PM PST Wednesday, December 11, 2013",High,Considerable,High,"[[problem, wind-slab], [problem, storm-slabs]]"
"9:24 PM PST Tuesday, December 10, 2013",Moderate,Low,Moderate,"[[problem, wind-slab]]"


In [120]:
df.to_csv('nwac_mount_hood_problems.csv')

In [121]:
len(df)

164