In [36]:
import requests
from bs4 import BeautifulSoup as bs
import re
import dill
import numpy as np
from collections import defaultdict
from selenium import webdriver
import time
from datetime import datetime
import pickle
import resource
import sys
# Set the soft stack-size limit to 0x100 * 0x100000 bytes (256 MiB) and the
# hard limit to RLIM_INFINITY.
# NOTE(review): presumably needed so the very deep recursion allowed below
# (e.g. while pickling nested objects) does not overflow the C stack - confirm.
resource.setrlimit(resource.RLIMIT_STACK, [0x100 * 0x100000, resource.RLIM_INFINITY])
# Allow the interpreter to nest up to 0x100000 (1,048,576) Python frames.
sys.setrecursionlimit(0x100000)

In [2]:
def create_data_dict():
    """Create the empty container that the scraping functions fill in.

    Returns:
        defaultdict: a mapping that creates a fresh empty ``dict`` for any
        key that is looked up before it has been assigned.

    ``dict`` is used as the default factory instead of ``lambda: {}``: both
    produce a new empty dict per missing key, but a defaultdict holding a
    lambda cannot be serialised with ``pickle``, while one holding the
    built-in ``dict`` can.
    """
    return defaultdict(dict)

In [32]:
def get_soup(URL, timeout=30):
    """Download a page and parse it with BeautifulSoup's lxml parser.

    Args:
        URL: address of the page to download.
        timeout: seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block the whole scrape indefinitely.

    Returns:
        The BeautifulSoup parse tree built from the response body.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # Send a browser-like User-Agent to avoid being detected as a web scraping bot.
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
    raw = requests.get(URL, headers=headers, timeout=timeout)
    return bs(raw.content, features='lxml')

In [21]:
def get_regions(data_dict):
    """Fill ``data_dict`` with the regions and sub-regions listed on the site.

    Args:
        data_dict: mapping that creates missing region entries on access
            (see ``create_data_dict``); it is modified in place.

    Returns:
        dict: a plain-dict copy of the top level of ``data_dict``, keyed by
        region label, whose values map ``(name, href)`` tuples to empty
        dicts.
    """
    soup = get_soup('https://www.onthesnow.com/ski-resort.html')

    # One label per child node of the first <span> inside each '.country' element.
    region_names = []
    for country in soup.select('.country'):
        region_names.extend(str(child) for child in country.find('span'))

    # One list of (link text, href) pairs per '.relatedRegions' element.
    grouped_links = []
    for related in soup.select('.relatedRegions'):
        links = []
        for anchor in related.find_all('a'):
            href = anchor['href']
            links.append((str(anchor.text), str(href)))
        grouped_links.append(links)

    # The n-th group of links is filed under the n-th region label.
    for index, links in enumerate(grouped_links):
        for link in links:
            data_dict[region_names[index]][link] = {}

    return dict(data_dict)

In [22]:
def get_resorts(data_dict):
    """Add every resort found on each sub-region page to ``data_dict``.

    A Chrome session is opened per sub-region, the page is scrolled so that
    additional entries get a chance to render, and every link found inside
    a '.name' element is stored as a ``(title, href)`` key with an empty
    dict as its value.

    Args:
        data_dict: nested mapping ``region -> (name, href) -> {}`` as
            produced by ``get_regions``; it is modified in place.

    Returns:
        The same ``data_dict`` object, now with resort keys added.
    """
    for region in data_dict.keys():
        for sub_region in data_dict[region].keys():
            driver = webdriver.Chrome('/usr/bin/chromedriver')
            try:
                driver.get('https://www.onthesnow.com/{}'.format(sub_region[1]))

                # Scroll 40 times by 600 px, pausing 0.4 s after each step.
                for _ in range(0, 40):
                    driver.execute_script('window.scrollBy(0, 600)')
                    time.sleep(0.4)

                page_source = driver.page_source
            finally:
                # Always close the browser, even when loading or scrolling
                # fails; otherwise every failure leaks a Chrome process.
                driver.quit()

            soup = bs(page_source, features='lxml')
            for x in soup.select('.name'):
                for web_add in x.find_all(href=True):
                    loc_name = web_add['title']
                    loc_path = web_add['href']

                    data_dict[region][sub_region][(str(loc_name), str(loc_path))] = {}

    return data_dict

In [40]:
def resort_data(data_dict):
    """Download snowfall and snow-depth history for every resort.

    For each resort and each year from 2008 through 2017, the yearly entry
    is filled with the keys 'snowfall', 'dates' and 'depth'. Years that are
    already present are left untouched, so the function can be re-run to
    resume an interrupted scrape.

    Args:
        data_dict: nested mapping ``region -> sub_region -> resort -> {}``;
            it is modified in place.

    Returns:
        The same ``data_dict`` object. If a download fails (or the run is
        interrupted from the keyboard) the function reports the cause and
        returns what has been collected so far, so the caller can still
        save it.

    Raises:
        ValueError: if the snowfall dates and the snow-depth dates of a
            season differ.
    """
    years = np.arange(2008, 2018)
    for region in data_dict.keys():
        for sub_region in data_dict[region].keys():
            print(sub_region)
            for resort in data_dict[region][sub_region].keys():
                seasons = data_dict[region][sub_region][resort]
                for year in years:
                    if year in seasons:
                        # data was already populated previously - don't overwrite
                        continue
                    try:
                        sf_vals, sf_dates = get_snowfall(resort, year)
                        sd_vals, sd_dates = get_depth(resort, year)
                    except (Exception, KeyboardInterrupt) as exc:
                        # Chances are the connection was closed because the
                        # scraper was detected. Report the real cause instead
                        # of stopping silently, and hand back partial data.
                        print('Stopping early at', resort, year, '-', repr(exc))
                        return data_dict
                    if (sf_vals is None) or (sd_vals is None):
                        # No data for that year
                        continue
                    if sf_dates != sd_dates:
                        raise ValueError('Snowfall dates do not match snow depth dates')
                    # Build the entry completely before storing it, so a
                    # failure cannot leave a half-filled year that later
                    # runs would skip as "already populated".
                    seasons[year] = {
                        'snowfall': str_to_int(sf_vals),
                        'dates': date_to_datetime(sf_dates),
                        'depth': str_to_int(sd_vals),
                    }
    return data_dict

In [7]:
def name_from_path(path):
    """Return the first two slash-led segments of ``path``.

    For example '/alaska/alyeska-resort/ski-resort.html' gives
    '/alaska/alyeska-resort'.

    Args:
        path: string to search.

    Returns:
        str: the first substring made of '/', a run of non-slash
        characters, '/', and another run of non-slash characters.

    Raises:
        ValueError: if ``path`` does not contain two '/' characters.
    """
    # Raw string: '\/' in a normal string literal is an invalid escape
    # sequence, and '/' needs no escaping in a Python regex anyway.
    match = re.search(r'/[^/]*/[^/]*', path)
    if match is None:
        raise ValueError('Cannot extract a resort path from {!r}'.format(path))
    return match.group()

In [29]:
def date_to_datetime(lst):
    """Convert quoted timestamp strings to ``datetime.date`` objects.

    Each item is expected to look like '"2010-01-05T00:00:00.000Z"',
    including the surrounding double quotes.

    Args:
        lst: iterable of timestamp strings.

    Returns:
        list: one entry per input item, either a ``datetime.date`` or
        ``np.nan`` when the item could not be parsed.
    """
    out = []
    for date in lst:
        try:
            out.append(datetime.strptime(date, '"%Y-%m-%dT%H:%M:%S.%fZ"').date())
        except (TypeError, ValueError):
            # ValueError: text does not match the format; TypeError: the
            # item is not a string. Anything else (e.g. KeyboardInterrupt)
            # is deliberately allowed to propagate.
            out.append(np.nan)
    return out

In [28]:
def str_to_int(lst):
    """Convert each item to ``int``, using 0 for items that cannot be converted.

    Args:
        lst: iterable of values, typically numeric strings.

    Returns:
        list: one integer per input item; unconvertible items become 0.
    """
    out = []
    for num in lst:
        try:
            out.append(int(num))
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are mapped to 0; unrelated
            # exceptions such as KeyboardInterrupt are not swallowed.
            out.append(0)
    return out

In [10]:
def get_snowfall(resort, year):
    """Fetch one season of snowfall values for a resort.

    The historical-snowfall page (query ``q=snow``) is downloaded and each
    'resBox' div is searched for the JavaScript arrays
    ``jssnowfalls<year>`` and ``jsdates<year>``.

    Args:
        resort: ``(name, path)`` tuple; only the path is used.
        year: season to request; it is interpolated into the URL and into
            the JavaScript variable names.

    Returns:
        tuple: ``(values, dates)`` as two lists of raw strings split on
        ',', or ``(None, None)`` when no div contains both arrays.

    Raises:
        ValueError: if the two arrays have different lengths.
    """
    name_path = name_from_path(resort[1])
    soup = get_soup('https://www.onthesnow.com{}/historical-snowfall.html?&y={}&q=snow'.format(name_path, year))

    # Raw strings keep '\[' and '\]' as regex escapes instead of invalid
    # string escapes. The patterns do not change between divs, so they are
    # compiled once outside the loop.
    val_regex = re.compile(r'var jssnowfalls{} = \[(.*?)\];'.format(year))
    date_regex = re.compile(r'var jsdates{} = \[(.*?)\];'.format(year))

    for box in soup.find_all('div', attrs={'class': 'resBox'}):
        text = box.get_text()
        m1 = val_regex.search(text)
        m2 = date_regex.search(text)
        if m1 and m2:
            vals = m1.group(1).split(',')
            dates = m2.group(1).split(',')

            if len(vals) != len(dates):
                raise ValueError('Values do not match number of dates')
            return vals, dates
    return None, None

In [11]:
def get_depth(resort, year):
    """Fetch one season of snow-depth values for a resort.

    The historical-snowfall page (query ``q=top``) is downloaded and each
    'resBox' div is searched for the JavaScript arrays
    ``jssnowfalls<year>`` and ``jsdates<year>``.

    Args:
        resort: ``(name, path)`` tuple; only the path is used.
        year: season to request; it is interpolated into the URL and into
            the JavaScript variable names.

    Returns:
        tuple: ``(values, dates)`` as two lists of raw strings split on
        ',', or ``(None, None)`` when no div contains both arrays.

    Raises:
        ValueError: if the two arrays have different lengths.
    """
    name_path = name_from_path(resort[1])
    soup = get_soup('https://www.onthesnow.com{}/historical-snowfall.html?&y={}&q=top'.format(name_path, year))

    # Raw strings keep '\[' and '\]' as regex escapes instead of invalid
    # string escapes. The patterns do not change between divs, so they are
    # compiled once outside the loop.
    val_regex = re.compile(r'var jssnowfalls{} = \[(.*?)\];'.format(year))
    date_regex = re.compile(r'var jsdates{} = \[(.*?)\];'.format(year))

    for box in soup.find_all('div', attrs={'class': 'resBox'}):
        text = box.get_text()
        m1 = val_regex.search(text)
        m2 = date_regex.search(text)
        if m1 and m2:
            vals = m1.group(1).split(',')
            dates = m2.group(1).split(',')

            if len(vals) != len(dates):
                raise ValueError('Values do not match number of dates')
            return vals, dates
    return None, None

In [12]:
def pickle_dict(output_file, dic):
    """Serialise ``dic`` to ``output_file`` with the highest pickle protocol.

    The data is first written to ``<output_file>.tmp`` and only moved over
    ``output_file`` once the dump has succeeded. Opening the target
    directly with 'wb' would truncate an existing checkpoint before the
    dump can fail, destroying previously saved data.

    Args:
        output_file: destination path.
        dic: object to pickle.

    Raises:
        Any exception raised while opening the file or pickling ``dic``;
        in that case an existing ``output_file`` is left untouched.
    """
    import os  # local import keeps this cell independent of the import cell

    tmp_path = os.fspath(output_file) + '.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(dic, f, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp_path, output_file)
    except BaseException:
        # Remove the partial temp file, then re-raise the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

In [13]:
def open_dict(input_file):
    """Load and return the object pickled in ``input_file``.

    Args:
        input_file: path of a file written with ``pickle``.

    Returns:
        The unpickled object.
    """
    # Unpickling can execute arbitrary code; only load files this notebook
    # wrote itself.
    handle = open(input_file, 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()

In [35]:
def dill_dict(output_file, dic):
    """Serialise ``dic`` to ``output_file`` with ``dill``.

    The data is first written to ``<output_file>.tmp`` and only moved over
    ``output_file`` once the dump has succeeded, so a failed dump cannot
    truncate a previously saved file.

    Args:
        output_file: destination path.
        dic: object to serialise.

    Raises:
        Any exception raised while opening the file or dumping ``dic``;
        in that case an existing ``output_file`` is left untouched.
    """
    import os  # local import keeps this cell independent of the import cell

    tmp_path = os.fspath(output_file) + '.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            dill.dump(dic, f, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp_path, output_file)
    except BaseException:
        # Remove the partial temp file, then re-raise the original error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

In [45]:
def run():
    """Load, build or resume the snow data set and save the result.

    Steps:
      1. Try to load the full data set from 'data/snow.pkl'.
      2. Failing that, try the resort listing in 'data/resorts.pkl'.
      3. Failing that, scrape regions and resorts and save the listing.
      4. Fill in missing seasons with ``resort_data`` and save the result
         to 'data/snow.pkl'.

    Load and save failures are reported together with their cause rather
    than being silently swallowed; none of them aborts the run.

    Returns:
        The data dictionary returned by ``resort_data``.
    """
    import os  # local import keeps this cell independent of the import cell

    try:
        data_dict = open_dict('data/snow.pkl')
    except Exception as full_exc:
        print('Unable to open full data dictionary', '-', repr(full_exc))

        try:
            data_dict = open_dict('data/resorts.pkl')
        except Exception as partial_exc:
            print('Unable to open partial data dictionary (resort info)', '-', repr(partial_exc))
            data_dict = create_data_dict()
            data_dict = get_regions(data_dict)
            data_dict = get_resorts(data_dict)

            try:
                # Create the output folder first so a missing directory
                # does not throw away the freshly scraped listing.
                os.makedirs('data', exist_ok=True)
                pickle_dict('data/resorts.pkl', data_dict)
            except Exception as save_exc:
                print('Unable to pickle partial data dictionary (resort info)', '-', repr(save_exc))

    data_dict = resort_data(data_dict)

    try:
        os.makedirs('data', exist_ok=True)
        pickle_dict('data/snow.pkl', data_dict)
    except Exception as save_exc:
        print('Unable to pickle full data dictionary', '-', repr(save_exc))

    return data_dict

In [None]:
# Run the whole pipeline; the result is kept in `foo`, which the
# sanity-check cell further down iterates over.
foo = run()
# Bare expression: shows the returned dictionary as this cell's output.
foo

('Alaska', '/alaska/ski-resorts.html')
('Arizona', '/arizona/ski-resorts.html')
('California', '/california/ski-resorts.html')


In [None]:
# Sanity check of the nested structure held in `foo`: report every
# sub-region key or resort key that is not a tuple, and every resort whose
# value is not a dict. Nothing is printed when the structure is as expected.
for region, sub_regions in foo.items():
    for sub_region, resorts in sub_regions.items():
        if not isinstance(sub_region, tuple):
            print(region,'-',sub_region)
        for resort, seasons in resorts.items():
            bad_key = not isinstance(resort, tuple)
            bad_value = not isinstance(seasons, dict)
            if bad_key or bad_value:
                print(region,'-',sub_region,'-',resort)