In [13]:
# coding: utf-8

from datetime import datetime, timedelta
import urllib
import os
import io

"""
def scrape_station(station):
    '''
    This function scrapes the weather data web pages from wunderground.com
    for the station you provide it.
    You can look up your city's weather station by performing a search for
    it on wunderground.com then clicking on the "History" section.
    The 4-letter name of the station will appear on that page.
    '''

    # Scrape between July 1, 2014 and July 1, 2015
    # You can change the dates here if you prefer to scrape a different range
    current_date = datetime(year=2012, month=7, day=1)
    end_date = datetime(year=2017, month=3, day=12)

    # Make sure a directory exists for the station web pages
    if not os.path.isdir('./%s' % (station)):
        os.mkdir(station)

    # Use .format(station, YYYY, M, D)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    while current_date != end_date:

        if current_date.day == 1:
            print(current_date)

        formatted_lookup_URL = lookup_URL.format(station,
                                                 current_date.year,
                                                 current_date.month,
                                                 current_date.day)
        html = urllib.urlopen(formatted_lookup_URL).read()

        out_file_name = '{}/{}-{}-{}.html'.format(station, current_date.year,
                                                  current_date.month,
                                                  current_date.day)

        with open(out_file_name,'w') as out_file:
            out_file.write(html)

        current_date += timedelta(days=1)


# Scrape the stations used in this article
for station in ['KNYC', 'KJRB', 'KEWR', 'KLGA', 'KJFK', 'KISP', 'KTEB', 'KFRG']:
    scrape_station(station)
"""

'\ndef scrape_station(station):\n    \'\'\'\n    This function scrapes the weather data web pages from wunderground.com\n    for the station you provide it.\n    You can look up your city\'s weather station by performing a search for\n    it on wunderground.com then clicking on the "History" section.\n    The 4-letter name of the station will appear on that page.\n    \'\'\'\n\n    # Scrape between July 1, 2014 and July 1, 2015\n    # You can change the dates here if you prefer to scrape a different range\n    current_date = datetime(year=2012, month=7, day=1)\n    end_date = datetime(year=2017, month=3, day=12)\n\n    # Make sure a directory exists for the station web pages\n    if not os.path.isdir(\'./%s\' % (station)):\n        os.mkdir(station)\n\n    # Use .format(station, YYYY, M, D)\n    lookup_URL = \'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html\'\n\n    while current_date != end_date:\n\n        if current_date.day == 1:\n            print(current

In [179]:
from bs4 import BeautifulSoup
def parse_station(station, outfile, start_date=None, end_date=None):
    '''
    Parse the saved wunderground.com daily-history pages for one station
    and append their observation rows to a tab-separated file.

    For every day from `start_date` up to (but not including) `end_date`
    the page `<station>/<year>-<month>-<day>.html` is read, and each
    `tr.no-metars` row of its `obsTable` element becomes one output line:
    station, date, then the text of every `td` cell in the row.
    Non-breaking spaces are replaced by plain spaces and degree signs are
    dropped. Make sure to run the wunderground scraper first so the pages
    exist on disk.

    Parameters:
        station:    station identifier; also the directory holding the pages.
        outfile:    path of the TSV file to append to (written as UTF-8).
        start_date: first day to parse; defaults to datetime(2012, 7, 1).
        end_date:   day to stop before; defaults to datetime(2017, 3, 12).

    Raises:
        IOError / OSError if a page file for a day in the range is missing
        or unreadable. Such a day is not skipped; re-download the page or
        narrow the date range.

    Side effects:
        Prints a notice for every day whose page has no `obsTable`, and
        prints the date on the first day of each month as a progress mark.
    '''
    current_date = start_date if start_date is not None else datetime(year=2012, month=7, day=1)
    if end_date is None:
        end_date = datetime(year=2017, month=3, day=12)

    # io.open with an explicit encoding accepts text on both Python 2 and 3,
    # so the rows no longer need to be hand-encoded to bytes before writing.
    with io.open(outfile, 'a', encoding='utf-8') as out_file:

        # `<` rather than `!=`: the loop still ends if the bounds are not an
        # exact number of days apart or the range is empty/reversed.
        while current_date < end_date:
            date_label = '{}-{}-{}'.format(current_date.year,
                                           current_date.month,
                                           current_date.day)

            with open('{}/{}.html'.format(station, date_label)) as in_file:
                soup = BeautifulSoup(in_file.read(), 'html.parser')

            # find() returns None when the page has no observation table;
            # test for that directly instead of catching every exception.
            obs_table = soup.find(id='obsTable')
            if obs_table is None:
                print("No history available for %s on %s" % (station, current_date))
            else:
                for tr in obs_table.find_all('tr', class_='no-metars'):
                    cells = [td.text.replace('\n', '').replace('\t', '')
                             for td in tr.find_all('td')]

                    # Rows with fewer than 13 cells get a placeholder at
                    # index 2 so the later cells keep their positions.
                    if len(cells) < 13:
                        cells.insert(2, u'  -')

                    line = u'\t'.join([station, date_label] + cells)
                    line = line.replace(u'\xa0', u' ').replace(u'\xb0', u'')
                    out_file.write(line + u'\n')

            if current_date.day == 1:
                print(current_date)

            current_date += timedelta(days=1)
        

In [182]:
# Start the output file afresh with a single tab-separated header line:
# station and date first, then one column per observation field.
with open('wunderground_weather.tsv', 'w') as out_file:
    out_file.write('\t'.join((
        'station', 'date', 'time', 'temperature', 'heat_index', 'dew_point',
        'humidity', 'pressure', 'visibility', 'wind_dir', 'wind_speed',
        'gust_speed', 'precip', 'events', 'conditions',
    )) + '\n')

In [None]:
# Parse the saved pages of every station into the shared TSV file.
# parse_station appends, so all stations accumulate in the same file.
# print(...) with a single pre-formatted argument emits the same text on
# Python 2 and Python 3, unlike the Python 2-only print statement.
for station in ['KNYC', 'KJRB', 'KEWR', 'KLGA', 'KJFK', 'KISP', 'KTEB', 'KFRG']:
    print("Starting %s:" % (station))
    parse_station(station, 'wunderground_weather.tsv')

print("Done!")

Starting KNYC:
2012-07-01 00:00:00
2012-08-01 00:00:00
2012-09-01 00:00:00
2012-10-01 00:00:00
2012-11-01 00:00:00
2012-12-01 00:00:00
2013-01-01 00:00:00
2013-02-01 00:00:00
2013-03-01 00:00:00
2013-04-01 00:00:00
2013-05-01 00:00:00
2013-06-01 00:00:00
2013-07-01 00:00:00
2013-08-01 00:00:00
2013-09-01 00:00:00
2013-10-01 00:00:00
2013-11-01 00:00:00
2013-12-01 00:00:00
2014-01-01 00:00:00
2014-02-01 00:00:00
2014-03-01 00:00:00
2014-04-01 00:00:00
2014-05-01 00:00:00
2014-06-01 00:00:00
2014-07-01 00:00:00
2014-08-01 00:00:00
2014-09-01 00:00:00
2014-10-01 00:00:00
2014-11-01 00:00:00
2014-12-01 00:00:00
2015-01-01 00:00:00
2015-02-01 00:00:00
2015-03-01 00:00:00
2015-04-01 00:00:00
2015-05-01 00:00:00
2015-06-01 00:00:00
2015-07-01 00:00:00
2015-08-01 00:00:00
2015-09-01 00:00:00
2015-10-01 00:00:00
2015-11-01 00:00:00
2015-12-01 00:00:00
2016-01-01 00:00:00
2016-02-01 00:00:00
2016-03-01 00:00:00
2016-04-01 00:00:00
2016-05-01 00:00:00
2016-06-01 00:00:00
2016-07-01 00:00:00
2016-