<h1> Python code for scraping one week of Weather Underground data, collecting once per hour, and saving each day's data to its own JSON file in the directory wunderground_data </h1>

<h5> Annamira O'Toole, June 2017 </h5>

<h3> Import necessary Python libraries </h3>

In [205]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
from mpl_toolkits.mplot3d import Axes3D
from bs4 import BeautifulSoup
import urllib
import json
import requests
from datetime import datetime
# import dicttoxml
%matplotlib inline

In [215]:
# Weather Underground station id codes for 37 stations around Boston,
# keyed by a short location name.
stations = {
    'BackBay': 'KMABOSTO278',
    'NorthEnd': 'KMABOSTO124',
    'Logan': 'KBOS',
    'Dorchester': 'KMABOSTO268',
    'JP': 'KMAJAMAI7',
    'FPZoo': 'KMABOSTO271',
    'ChestnutHill': 'KMABROOK7',
    'MountAuburn': 'KMAWATER3',
    'CambridgePort': 'KMACAMBR4',
    'Kendall': 'KMACAMBR70',
    'PorterDavis': 'KMASOMER9',
    'Belmont': 'KMABELMO12',
    'Winthrop': 'KMAWINTH7',
    'Everett': 'KMACHELS2',
    'Malden': 'KMAMALDE6',
    'Medford': 'KMAMEDFO4',
    'Arlington': 'KMAARLIN20',
    'NewtonCorner': 'KMANEWTO44',
    'Newton': 'KMANEWTO41',
    'Waban': 'KMAWABAN1',
    'Roslindale': 'KMABOSTO203',
    'Needham': 'KMANEEDH5',
    'SouthWaltham': 'KMAWALTH25',
    'Waltham': 'KMAWALTH26',
    'BeaverBrook': 'KMALEXIN6',
    'Lexington': 'KMALEXIN18',
    'TurkeyHill': 'KMAARLIN21',
    'Winchester': 'KMAWINCH55',
    'WestWoburn': 'KMAWOBUR15',
    'Woburn': 'KMAWOBUR4',
    'Melrose': 'KMAMELRO13',
    'Saugus': 'KMASAUGU4',
    'Lynn': 'KMALYNN3',
    'Milton': 'KMAMILTO11',
    'BlueHills': 'KMAQUINC52',
    'Quincy': 'KMAQUINC25',
    'Southie': 'KMABOSTO167',
}

# Ten-station subset of `stations`; this is the mapping get_all_data() polls.
stations2 = {
    name: stations[name]
    for name in (
        'BackBay', 'Logan', 'Dorchester', 'JP', 'ChestnutHill',
        'CambridgePort', 'Belmont', 'Malden', 'Medford', 'Waban',
    )
}

# Live view of the subset's location names, iterated by get_all_data().
station_keys = stations2.keys()


<h3> Define functions: </h3>

In [212]:
def update_time():
    """Return the current time as a datetime object (datetime.now())."""
    return datetime.now()

def convert_format(time):
    """Format a datetime object as the notebook's timestamp string.

    The layout is "month-day_hour-min", each field zero-padded to two digits
    (for example "06-19_16-26").
    """
    stamp_layout = "%m-%d_%H-%M"
    return datetime.strftime(time, stamp_layout)

def update_collect_time():
    """Return the notebook-global `time` formatted by convert_format().

    The result is a "month-day_hour-min" string. The function takes no
    arguments; it reads whatever the global `time` currently holds.
    """
    stamp = convert_format(time)
    return stamp

def get_json(timeout=30):
    """Fetch forecast/geolookup/conditions for Boston, MA from the wunderground API.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        requests waits indefinitely, which would stall a long collection run.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    ValueError
        If the response body is not valid JSON.
    """
    # NOTE(review): the API key is embedded in the URL; consider reading it
    # from configuration instead of committing it with the notebook.
    url = 'http://api.wunderground.com/api/f71059696f42912d/forecast/geolookup/conditions/q/MA/Boston.json'
    page = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    return json.loads(page.content)

def get_json_withID(ID, timeout=30):
    """Fetch forecast/geolookup/conditions for one personal weather station.

    Parameters
    ----------
    ID : str
        Station id code; it is inserted into the request URL after "pws:".
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        requests waits indefinitely, which would stall a long collection run.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    ValueError
        If the response body is not valid JSON.
    """
    # NOTE(review): the API key is embedded in the URL; consider reading it
    # from configuration instead of committing it with the notebook.
    url = 'http://api.wunderground.com/api/f71059696f42912d/forecast/geolookup/conditions/q/pws:' + ID + '.json'
    page = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    return json.loads(page.content)

def get_all_data():
    """Return a dict mapping each location in station_keys to its station data.

    Each value is whatever get_json_withID() returns for the station id
    that stations2 holds for that location.
    """
    return {place: get_json_withID(stations2[place]) for place in station_keys}

def add_to_dict(data):
    """Store `data` in the global `dictionary` under the global collect_time.

    The key is str(collect_time) as it stands at call time, so an earlier
    entry stored under the same collect_time is overwritten.
    """
    dictionary[str(collect_time)] = data

def write_file(dictionary, out=None):
    """Serialize `dictionary` as JSON into a writable file object.

    Parameters
    ----------
    dictionary : dict
        The data to dump.
    out : file-like, optional
        Where to write. When omitted, the notebook-global `file` is used,
        which is the original behavior; that global only exists once a cell
        that opens it has been run, so passing `out` explicitly avoids the
        hidden dependency.
    """
    target = file if out is None else out
    json.dump(dictionary, target)

def get_timedelta_mins(delta):
    """Return the length of a timedelta in minutes.

    `delta.seconds` alone only holds the part of the duration below one day,
    so whole days are added back in; otherwise the result wraps to 0 every
    24 hours. Microseconds are deliberately left out so that callers in this
    notebook that poll with an exact `!=` comparison still see whole-second
    values.
    """
    whole_seconds = delta.days * 86400 + delta.seconds
    return whole_seconds / 60

def get_hours(delta):
    """Return the length of a timedelta in hours.

    `delta.seconds` alone only holds the part of the duration below one day,
    so it can never reach 24 hours; whole days are added back in here.
    Microseconds are deliberately left out so that callers in this notebook
    that poll with an exact `!=` comparison still see whole-second values.
    """
    whole_seconds = delta.days * 86400 + delta.seconds
    return whole_seconds / 3600

def get_datetime_min(time):
    """Return the minute field of a datetime object as an int."""
    minute_text = datetime.strftime(time, "%M")
    return int(minute_text)

def get_datetime_hr(time):
    """Return the hour field (24-hour clock) of a datetime object as an int."""
    hour_text = datetime.strftime(time, "%H")
    return int(hour_text)



<h3> Official weekly code: </h3>

In [None]:
# Week-long collector: polls the clock, collects station data each time the
# wall-clock hour changes, and writes one JSON file per 24-hour "day" into
# wunderground_data/ until days_in_week days have passed.
#
# Only `sleep` is imported from the time module because the bare name `time`
# is used in this notebook for the "official" collection datetime.
from time import sleep

dictionary = {}

time = datetime.now()
start_time = time

collect_time = datetime.strftime(time, "%m-%d_%H-%M")

week_name = collect_time

days_in_week = 7
hrs_in_day = 24
poll_seconds = 1  # pause between clock checks so the loop does not spin the CPU
week_not_over = True
day_not_over = True

# collect data for entire week
# create a file of each day's data
prev_time = time
while week_not_over:

    day_not_over = True

    # record start time of daily collection
    day_start_time = datetime.now()

    # collect data every hour for 1 day
    while day_not_over:

        # set temporary time
        temp_time = datetime.now()

        # if the hour has changed since the last iteration, collect data
        # and add the data to the current day's dictionary
        if get_datetime_hr(prev_time) != get_datetime_hr(temp_time):
            time = update_time()  # update official time
            collect_time = update_collect_time()  # update collect_time
            try:
                data = get_all_data()  # collect data from wunderground
            except (requests.exceptions.RequestException, ValueError) as err:
                # A failed request or an undecodable body skips this hour
                # instead of ending the whole run and losing the data
                # already held in memory.
                print('collection failed ' + collect_time + ': ' + str(err))
            else:
                print('got json ' + collect_time)  # acknowledge data collection
                # add data to the day's dictionary
                add_to_dict(data)

        # check if the day is over; ">= elapsed" rather than an exact match,
        # because an exact comparison can be stepped over and never fire
        day_delta = temp_time - day_start_time
        day_not_over = day_delta.total_seconds() < hrs_in_day * 3600
        prev_time = temp_time
        if day_not_over:
            sleep(poll_seconds)

    # once the day is over, write the day's dictionary to a file;
    # the with-block closes the file even if json.dump raises
    filename = "day_" + convert_format(day_start_time)
    filepath = 'wunderground_data/' + filename
    # filepath = 'wunderground_data/' + week_name + '/'
    with open(filepath + '.json', 'w') as day_file:
        json.dump(dictionary, day_file)
    dictionary = {}

    # check if the week is over, set week_not_over boolean value appropriately
    week_delta = temp_time - start_time
    week_not_over = week_delta.days < days_in_week


print('finished')
    

<h3> Most up-to-date test of the weekly code: same logic on a minutes scale (one collection per minute, 3-minute "days", 9-minute "week") </h3>

In [214]:
# Scaled-down run of the weekly collector: collects each time the wall-clock
# minute changes, writes one JSON file per day_length-minute "day" into
# wunderground_data_test/, and stops after test_length minutes.
#
# Only `sleep` is imported from the time module because the bare name `time`
# is used in this notebook for the "official" collection datetime.
from time import sleep

dictionary = {}

time = datetime.now()
start_time = time

collect_time = datetime.strftime(time, "%m-%d_%H-%M")

week_name = collect_time

test_length = 9  # total run length, in minutes
day_length = 3   # length of one simulated "day", in minutes
poll_seconds = 1  # pause between clock checks so the loop does not spin the CPU
week_not_over = True
day_not_over = True

# collect data for the entire (simulated) week
# create a file of each (simulated) day's data
prev_time = time
while week_not_over:

    day_not_over = True

    # record start time of daily collection
    day_start_time = datetime.now()

    # collect data every minute for 1 simulated day
    while day_not_over:

        # set temporary time
        temp_time = datetime.now()

        # if the minute has changed since the last iteration, collect data
        # and add the data to the current day's dictionary
        if get_datetime_min(prev_time) != get_datetime_min(temp_time):
            time = update_time()  # update official time
            collect_time = update_collect_time()  # update collect_time
            try:
                data = get_all_data()  # collect data from wunderground
            except (requests.exceptions.RequestException, ValueError) as err:
                # A failed request or an undecodable body skips this
                # collection instead of ending the whole run.
                print('collection failed ' + collect_time + ': ' + str(err))
            else:
                print('got json ' + collect_time)  # acknowledge data collection
                # add data to the day's dictionary
                add_to_dict(data)

        # check if the day is over; ">= elapsed" rather than an exact match,
        # because an exact comparison can be stepped over and never fire
        day_delta = temp_time - day_start_time
        day_not_over = day_delta.total_seconds() < day_length * 60
        prev_time = temp_time
        if day_not_over:
            sleep(poll_seconds)

    # once the day is over, write the day's dictionary to a file;
    # the with-block closes (and flushes) the file, which was previously
    # left open
    filename = "day_" + convert_format(day_start_time)
    filepath = 'wunderground_data_test/' + filename
    with open(filepath + '.json', 'w') as day_file:
        json.dump(dictionary, day_file)
    dictionary = {}

    # check if the test run is over, set week_not_over appropriately
    week_delta = temp_time - start_time
    week_not_over = week_delta.total_seconds() < test_length * 60


print('finished')

got json 06-19_16-26
got json 06-19_16-27
got json 06-19_16-28
got json 06-19_16-29
got json 06-19_16-30
got json 06-19_16-31
got json 06-19_16-32
got json 06-19_16-33
got json 06-19_16-34
finished


<h3> Main code for one week of data -- without datetime -- NOT FINISHED: </h3>

In [None]:
# Draft of the week-long collector that tracks hours with plain integers
# instead of datetime arithmetic. It is marked NOT FINISHED and does not run
# as written (see the review notes below).
# NOTE(review): relies on `time`, `dictionary` and `file` already existing in
# the kernel from other cells.
hr = int(datetime.strftime(time, "%H"))
day = 1
collections = 0
week_length = 7
hrs_in_week = 168
time_not_expired = True

prev_hr = hr
while time_not_expired:
    temp_time = datetime.now()
    temp_hr = int(datetime.strftime(temp_time, "%H"))
    temp_day = int(datetime.strftime(temp_time, "%d"))
    if prev_hr != temp_hr:
        time = update_time()
        # NOTE(review): update_collect_time() as defined in this notebook
        # returns one string, so [0] is its first character and hold[1] below
        # would raise IndexError. Presumably written against an older helper
        # that returned a (string, hour) pair -- confirm.
        hold = update_collect_time()[0]
        collect_time_str = hold[0]
        collect_hr_int = hold[1]
        data = get_json()
        collections += 1
        print('got json' + collect_time_str)
        # NOTE(review): add_to_dict() keys on the global collect_time, which
        # this cell never updates, so every call would write the same key.
        add_to_dict(data)
        # NOTE(review): `=` is assignment, so this line is a SyntaxError;
        # `==` was presumably intended. Also, "%H" yields 00-23 and `hr` is
        # never updated inside the loop.
        if hr = 24:
            day += 1
    # NOTE(review): collections starts at 0, so this expression is False on
    # the first pass and the loop would exit immediately -- confirm intent.
    time_not_expired = (day != week_length) and collections == hrs_in_week
    # NOTE(review): temp_min is not assigned in this cell, and prev_hr is
    # never refreshed after its initial assignment.
    prev_min = temp_min

# write_file(dictionary)
json.dump(dictionary, file)

print('done')

<h3> Old code for one week of data -- with datetime -- NOT FINISHED: </h3>

In [None]:
# Older draft of the week-long collector based on timedelta arithmetic.
# It is marked NOT FINISHED.
# NOTE(review): relies on `time`, `time_not_expired`, `start_time`,
# `week_length`, `dictionary` and `file` already existing in the kernel from
# other cells.
prev_time = time
while time_not_expired:
    temp_time = datetime.now()
    hr_delta = temp_time - prev_time
    # NOTE(review): timedelta objects expose days, seconds and microseconds
    # but no `hours` attribute, so this line raises AttributeError.
    if hr_delta.hours >= 1:
        time = update_time()
        # NOTE(review): update_collect_time() as defined in this notebook
        # returns one string, so [0] is its first character and hold[1] below
        # would raise IndexError -- presumably written against an older
        # helper; confirm.
        hold = update_collect_time()[0]
        collect_time_str = hold[0]
        collect_hr_int = hold[1]
        data = get_json()
        print('got json' + collect_time_str)
        add_to_dict(data)
    week_delta = temp_time - start_time
    # NOTE(review): this is True only once exactly week_length days have
    # elapsed, so the loop would stop after its first pass; the comparison
    # looks inverted -- confirm intent.
    time_not_expired = (week_delta.days == week_length)
    prev_time = temp_time

# write_file(dictionary)
json.dump(dictionary, file)

print('done')

<h3> 5 minute json file test code -- without datetime: </h3>

In [110]:
# Short test run that collects once per minute using integer minute values.
# NOTE(review): the label says 5 minutes, but end_time below is 3.
# NOTE(review): relies on `time` and `dictionary` already existing in the
# kernel from other cells.

# Opened here and not closed in this cell; a separate scrap cell calls
# file.close().
file = open('wunderground_data_test/fifth_5min_test', 'w')

# NOTE(review): this rebinds the name of the builtin min() for the rest of
# the session.
min = int(datetime.strftime(time, "%M"))
end_time = 3
time_not_expired = True

prev_min = min
while time_not_expired:
    temp_time = datetime.now()
    temp_min = int(datetime.strftime(temp_time, "%M"))
    if prev_min != temp_min:
        time = update_time()
        # NOTE(review): update_collect_time() as defined in this notebook
        # returns one string, so [0] is its first character and hold[1] below
        # would raise IndexError. The recorded output shows this cell ran, so
        # presumably the helper returned a pair at that time -- confirm.
        hold = update_collect_time()[0]
        collect_time_str = hold[0]
        collect_hr_int = hold[1]
        data = get_json()
        print('got json')
        # NOTE(review): add_to_dict() keys on the global collect_time, which
        # this cell never updates, so each call overwrites the same entry.
        add_to_dict(data)
    # Both operands are minute-of-hour values (0-59); when `min` is 57 or
    # higher the difference can never equal end_time and the loop never ends.
    time_not_expired = ((temp_min - min) != end_time)
    prev_min = temp_min

# write_file(dictionary)
json.dump(dictionary, file)

print('done')

got json
got json
got json
done


<h3> 5 minute json file test code -- with datetime -- NOT FINISHED: </h3>

In [177]:
# Short test run that collects once per minute, using the datetime helpers.
# It is marked NOT FINISHED.
# NOTE(review): relies on `time`, `start_time` and `dictionary` already
# existing in the kernel from other cells.
test_length = 5
time_not_expired = True

# Same path as the other short test cell; mode 'w' truncates any existing
# file. The handle is not closed in this cell.
file = open('wunderground_data_test/fifth_5min_test', 'w')

prev_time = time
while time_not_expired:
    temp_time = datetime.now()
    if get_datetime_min(temp_time) != get_datetime_min(prev_time):
        time = update_time()
        # NOTE(review): update_collect_time() as defined in this notebook
        # returns one string, so hold[0] and hold[1] are its first two
        # characters. The recorded output prints a full timestamp, so
        # presumably the helper returned a (string, hour) pair at that time
        # -- confirm.
        hold = update_collect_time()
        collect_time_str = hold[0]
        collect_hr_int = hold[1]
        # The recorded run of this cell ended with a ConnectionError raised
        # from this request; nothing here catches it.
        data = get_json()
        print('got json' + collect_time_str)
        # NOTE(review): add_to_dict() keys on the global collect_time, which
        # this cell never updates, so each call overwrites the same entry.
        add_to_dict(data)
    time_passed = temp_time - start_time
    # NOTE(review): exact-equality exit test against a start_time set in
    # another cell; confirm start_time is fresh when this cell is run.
    time_not_expired = (get_timedelta_mins(time_passed) != test_length)
    prev_time = temp_time

# write_file(dictionary)
json.dump(dictionary, file)

print('done')

got json06-15 12-58
got json06-15 12-59
got json06-15 13-00
got json06-15 13-01
got json06-15 13-02


ConnectionError: HTTPConnectionPool(host='api.wunderground.com', port=80): Max retries exceeded with url: /api/f71059696f42912d/forecast/geolookup/conditions/q/MA/Boston.json (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x1119a7a90>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

<h3> Miscellaneous scraps of code: </h3>

In [180]:
# Close the global `file` handle; the short test cells open it but do not
# close it themselves.
file.close()

In [32]:
# Scratch inspection of the global `dictionary`.
# NOTE(review): the recorded output lists 'response', 'location',
# 'current_observation' and 'forecast' as top-level keys, so `dictionary`
# presumably held a single raw API response when this ran, not the
# timestamp-keyed dict built by add_to_dict() -- confirm before re-running.
#for k, v in dictionary.items():
    #print(k, v)

print(dictionary.keys())
# Last expression of the cell, so the notebook displays its value.
dictionary['current_observation'].keys()

dict_keys(['response', 'location', 'current_observation', 'forecast'])


dict_keys(['image', 'display_location', 'observation_location', 'estimated', 'station_id', 'observation_time', 'observation_time_rfc822', 'observation_epoch', 'local_time_rfc822', 'local_epoch', 'local_tz_short', 'local_tz_long', 'local_tz_offset', 'weather', 'temperature_string', 'temp_f', 'temp_c', 'relative_humidity', 'wind_string', 'wind_dir', 'wind_degrees', 'wind_mph', 'wind_gust_mph', 'wind_kph', 'wind_gust_kph', 'pressure_mb', 'pressure_in', 'pressure_trend', 'dewpoint_string', 'dewpoint_f', 'dewpoint_c', 'heat_index_string', 'heat_index_f', 'heat_index_c', 'windchill_string', 'windchill_f', 'windchill_c', 'feelslike_string', 'feelslike_f', 'feelslike_c', 'visibility_mi', 'visibility_km', 'solarradiation', 'UV', 'precip_1hr_string', 'precip_1hr_in', 'precip_1hr_metric', 'precip_today_string', 'precip_today_in', 'precip_today_metric', 'icon', 'icon_url', 'forecast_url', 'history_url', 'ob_url', 'nowcast'])