### Weather station, zipcode, and climate zone mappings

The scripts below are used to fetch, format, and process the raw data that form the mappings used by the eemeter internally.

The only manually created file is the climate zone file (`climate_zones.csv`), which was constructed from the reference documents downloaded by the first cell below; all other files are primary sources.

In [None]:
# Fetch every raw input into ./data. The commands below need wget, unzip,
# mapshaper and ogr2ogr to be available on the PATH.
!mkdir -p data

# ZIP code (ZCTA) shapefiles from Census.gov, converted to GeoJSON
!wget http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip -P data
!unzip data/cb_2014_us_zcta510_500k.zip -d data
!mapshaper -i data/cb_2014_us_zcta510_500k.shp -o format=geojson data/cb_2014_us_zcta510_500k.json

# County shapefiles from Census.gov, converted to GeoJSON
!wget http://www2.census.gov/geo/tiger/GENZ2013/cb_2013_us_county_500k.zip -P data
!unzip data/cb_2013_us_county_500k.zip -d data
!mapshaper -i data/cb_2013_us_county_500k.shp -o format=geojson data/cb_2013_us_county_500k.json

# CA climate zone shapefiles from the CEC, reprojected to EPSG:4326 and
# converted to GeoJSON
!wget http://www.energy.ca.gov/maps/renewable/CA_Building_Standards_Climate_Zones.zip -P data
!unzip data/CA_Building_Standards_Climate_Zones.zip -d data
!ogr2ogr -f "ESRI Shapefile" -t_srs EPSG:4326 data/CA_Building_Standards_Climate_Zones_reprojected.shp data/CA_Building_Standards_Climate_Zones.shp
!mapshaper -i data/CA_Building_Standards_Climate_Zones_reprojected.shp -o format=geojson data/CA_Building_Standards_Climate_Zones.json

# NCDC weather data quality (station inventory used for the data checks)
!wget ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-inventory.csv -P data

# NCDC station lat lngs
!wget ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt -P data

# IECC/Building America climate zone csv
!wget https://gist.githubusercontent.com/philngo/d3e251040569dba67942/raw/d1d2e13d73cc50147be6c90d8232f2e4c3eeaffc/climate_zones.csv -P data

# County ids - for reference - used to derive climate_zones.csv
!wget http://www2.census.gov/geo/docs/reference/codes/files/national_county.txt -P data

# Building America climate zone guide - for reference - used to derive
# climate_zones.csv (fetched above).
!wget http://energy.gov/sites/prod/files/2015/02/f19/ba_climate_guide_2013.pdf -P data

In [None]:
from collections import defaultdict
from datetime import datetime, timedelta
import json
import sys

import pandas as pd
from shapely.geometry import shape, Point, asShape

Gathers the lat/long coordinates for all of the weather stations in the isd-history.txt file, storing them by USAF ID.

In [None]:
# Build {USAF id: (lat, lng)} from the fixed-width isd-history.txt listing.
station_lat_lng = {}
with open('data/isd-history.txt', 'r') as f:
    # The first 22 lines are skipped (presumably the file's preamble and
    # column header -- confirm if the upstream layout changes).
    for row in f.readlines()[22:]:
        try:
            lat = float(row[57:64])
            lng = float(row[65:73])
        except ValueError:
            # Blank or malformed coordinate field: the station cannot be
            # located, so leave it out. Only ValueError is caught so that
            # unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
            continue

        # skip stations whose coordinates parsed to NaN
        if pd.isnull(lat) or pd.isnull(lng):
            continue

        # the first six characters of each row hold the USAF id
        station_lat_lng[row[:6]] = (lat, lng)

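Accept only stations which have sufficient data, defined here as stations that have

1. data for each of the past 7 consecutive years, ending with the current year
2. an average sampling rate of more than 22 observations per day in each of those years, not counting the current (possibly incomplete) year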


In [None]:
# Number of most recent inventory years a station must cover to be accepted.
n_years = 7

# Station ids and the year are read as strings rather than numbers; the
# checks below compare YEAR against str(...) values.
inventory_dtypes = {"USAF": str, "WBAN": str, "YEAR": str}
station_inventory = pd.read_csv('data/isd-inventory.csv', dtype=inventory_dtypes)

def has_current_year(group):
    """Return True when the last row of ``group`` is for the current year.

    "Current" is evaluated 31 days in the past, so a run in early January
    still accepts stations whose inventory has not been updated for the new
    year yet. The caller is expected to pass rows sorted by YEAR, with YEAR
    held as strings.
    """
    reference_year = (datetime.now() - timedelta(31)).year
    latest_year = group.YEAR.iloc[-1]
    return latest_year == str(reference_year)

def has_consecutive_recent_years(group):
    """Return True when the last ``n_years`` rows hold consecutive years.

    The last ``n_years`` YEAR values are paired with the last
    ``n_years - 1`` values, i.e. each year with its successor, and every
    pair must differ by exactly one. For a non-empty group with fewer than
    ``n_years`` rows both windows cover the whole column, each year is paired
    with itself, and the result is False.
    """
    window = group.YEAR[-n_years:]
    successors = group.YEAR[-(n_years - 1):]
    # Build the full list (no short-circuit) so every YEAR value is parsed.
    steps = [int(year) + 1 == int(next_year)
             for year, next_year in zip(window, successors)]
    return all(steps)

def has_recent_enough_years(group):
    """Return True when ``group`` has at least ``n_years`` rows."""
    row_count = len(group)
    return row_count >= n_years

def has_rich_recent_years(group):
    """Return True when the recent rows all hold enough observations.

    The twelve monthly columns are summed per row. Of the last ``n_years``
    rows, every one except the final row (the current year, which may be
    incomplete) must total more than 22 samples a day over 365 days.
    """
    month_columns = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    yearly_totals = group[month_columns].sum(axis=1)
    minimum_total = 365 * 22
    # drop the last row: it is the year still in progress
    complete_recent_years = yearly_totals[-n_years:-1]
    return all(complete_recent_years > minimum_total)

# Whitelist every USAF station whose inventory passes all quality checks.
# The checks run in this order and stop at the first failure.
quality_checks = (
    has_current_year,
    has_consecutive_recent_years,
    has_recent_enough_years,
    has_rich_recent_years,
)
station_whitelist = []
for station, group in station_inventory.groupby("USAF"):
    # some stations have a mix of wban numbers that change the sorting
    group = group.sort_values(by='YEAR')
    if all(check(group) for check in quality_checks):
        station_whitelist.append(station)
print("Accepted {} stations".format(len(station_whitelist)))

# A station can pass the checks above and still be absent from
# station_lat_lng (it is left out there when its coordinates are missing).
# Report and skip such stations instead of failing with a KeyError.
stations_without_coords = [s for s in station_whitelist if s not in station_lat_lng]
if stations_without_coords:
    print("Skipping {} accepted stations without coordinates".format(
        len(stations_without_coords)))

# (lat, lng) for each accepted, locatable station
station_whitelist_lat_lng = {
    s: station_lat_lng[s] for s in station_whitelist if s in station_lat_lng
}
# shapely points take (x, y), i.e. (lng, lat)
station_points = {s: Point(lng, lat) for s, (lat, lng) in station_whitelist_lat_lng.items()}
# same (lat, lng) pairs, under the name used when writing outputs
station_lat_lngs = dict(station_whitelist_lat_lng)

In [None]:
# Read the zipcode GeoJSON and index each feature's geometry by its GEOID10.
with open('data/cb_2014_us_zcta510_500k.json', 'r') as geojson_file:
    zip_js = json.load(geojson_file)

zipcode_polygons = {
    feature['properties']['GEOID10']: shape(feature['geometry'])
    for feature in zip_js['features']
}

In [None]:
# Read the county GeoJSON and index each feature's geometry by its GEOID.
with open('data/cb_2013_us_county_500k.json', 'r') as geojson_file:
    county_js = json.load(geojson_file)

county_polygons = {
    feature['properties']['GEOID']: shape(feature['geometry'])
    for feature in county_js['features']
}

In [None]:
# Load the per-county climate zone table, keeping only the columns used
# below. The FIPS codes are read as strings so they can be concatenated
# into county ids.
climate_zone_columns = [
    "State FIPS",
    "County FIPS",
    "IECC Climate Zone",
    "IECC Moisture Regime",
    "BA Climate Zone",
]
climate_zones = pd.read_csv(
    'data/climate_zones.csv',
    usecols=climate_zone_columns,
    dtype={"State FIPS": str, "County FIPS": str},
)

In [None]:
# Collect climate zone label and polygon for every county outside
# California (state FIPS "06"); California is handled through its own
# climate zone polygons.
counties_dict = {}
for _, row in climate_zones.iterrows():
    state_fips = row["State FIPS"]
    if state_fips == "06":
        continue

    county_id = state_fips + row["County FIPS"]
    county_polygon = county_polygons.get(county_id)
    if county_polygon is None:
        print("Could not find county {}, skipping.".format(county_id))
        continue

    # a missing moisture regime is written as "NA" in the label
    moisture_regime = row["IECC Moisture Regime"]
    if pd.isnull(moisture_regime):
        moisture_regime = "NA"

    counties_dict[county_id] = {
        "climate_zone": "{}_{}_{}".format(
            row["IECC Climate Zone"], moisture_regime, row["BA Climate Zone"]),
        "polygon": county_polygon,
    }

In [None]:
# Read the CA climate zone GeoJSON and index each geometry by a label of
# the form "CA_NN", where NN is the zero-padded zone number.
with open('data/CA_Building_Standards_Climate_Zones.json', 'r') as geojson_file:
    ca_js = json.load(geojson_file)

california_climate_zone_polygons = {
    "CA_{:02d}".format(int(feature['properties']['Zone'])): shape(feature['geometry'])
    for feature in ca_js['features']
}

In [None]:
# map zipcodes to climate zones

# outputs - the zipcode centroids are stored as well, for later use.
zipcode_to_climate_zone = {}
zipcode_points = {}
zipcode_centroids = {}

# bound once, outside the loop; keep these names, other cells may use them.
counties_dict_items = counties_dict.items()
california_climate_zone_polygons_items = california_climate_zone_polygons.items()
n_zipcodes = len(zipcode_polygons)

for i, (zipcode, zipcode_poly) in enumerate(zipcode_polygons.items()):
    # In-place progress counter. Written through sys.stdout instead of the
    # Python 2 print statement so the cell also runs on Python 3.
    sys.stdout.write('\r{} of {}'.format(i + 1, n_zipcodes))
    sys.stdout.flush()

    # note that centroids are not always within the zipcode, or even
    # necessarily on land (or within the county they are contained by)!
    # This is a rough approximation of location.
    zipcode_centroid = zipcode_poly.centroid
    zipcode_points[zipcode] = zipcode_centroid
    # stored as (lat, lng); shapely coordinates are (x, y) = (lng, lat)
    zipcode_centroids[zipcode] = (zipcode_centroid.coords[0][1], zipcode_centroid.coords[0][0])

    # None unless a containing polygon is found below
    climate_zone = None

    # check non-CA counties first
    for county, county_dict in counties_dict_items:
        if county_dict["polygon"].contains(zipcode_centroid):
            climate_zone = county_dict["climate_zone"]
            break
    else:
        # no county matched (for/else): check CA climate zones
        for ca_cz, ca_cz_poly in california_climate_zone_polygons_items:
            if ca_cz_poly.contains(zipcode_centroid):
                climate_zone = ca_cz
                break

    zipcode_to_climate_zone[zipcode] = climate_zone

In [None]:
# map weather stations to climate zones

station_to_climate_zone = {}

n_stations = len(station_points)

for i, (station, station_point) in enumerate(station_points.items()):
    # In-place progress counter. Written through sys.stdout instead of the
    # Python 2 print statement so the cell also runs on Python 3.
    sys.stdout.write('\r{} of {}'.format(i + 1, n_stations))
    sys.stdout.flush()

    # None unless a containing polygon is found below
    climate_zone = None

    # Is the station in a non-CA county?
    for county, county_dict in counties_dict_items:
        if county_dict["polygon"].contains(station_point):
            climate_zone = county_dict["climate_zone"]
            break
    else:
        # no county matched (for/else): is the station in a california
        # climate zone?
        for ca_cz, ca_cz_poly in california_climate_zone_polygons_items:
            if ca_cz_poly.contains(station_point):
                climate_zone = ca_cz
                break

    station_to_climate_zone[station] = climate_zone

Map each zipcode to a station by looking, if possible, for the closest weather station within
the same climate zone. If the zipcode is not within a climate zone, just pick the station which is
closest overall.

In [None]:
# zipcode to station mapping
zipcode_to_station = {}
all_stations = list(station_points)
n_zipcodes = len(zipcode_points)

# Group the stations by climate zone right here. The notebook's
# climate_zone_to_stations mapping is only built by a later cell, so reading
# it at this point fails when the cells are run top to bottom.
stations_by_climate_zone = defaultdict(list)
for station, climate_zone in station_to_climate_zone.items():
    if climate_zone is not None:
        stations_by_climate_zone[climate_zone].append(station)

for i, (zipcode, zipcode_point) in enumerate(zipcode_points.items()):
    # In-place progress counter that behaves the same on Python 2 and 3.
    sys.stdout.write("\r{} of {}".format(i + 1, n_zipcodes))
    sys.stdout.flush()

    # Candidate stations: those sharing the zipcode's climate zone. Fall back
    # to every station when the zipcode has no climate zone, or when its
    # zone contains no accepted station (previously a KeyError).
    climate_zone = zipcode_to_climate_zone[zipcode]
    stations = stations_by_climate_zone.get(climate_zone) or all_stations

    # find the nearest candidate; ties go to the first one encountered.
    # NOTE(review): shapely's distance() here is planar distance in degrees
    # of lng/lat, not great-circle distance -- confirm this is acceptable.
    if stations:
        min_station = min(
            stations,
            key=lambda s: zipcode_point.distance(station_points[s]))
    else:
        # no stations at all
        min_station = None
    zipcode_to_station[zipcode] = min_station

Make some other JSON products that may be useful.

In [None]:
# Invert zipcode -> climate zone into climate zone -> [zipcodes], leaving
# out zipcodes that have no climate zone.
climate_zone_zipcodes = defaultdict(list)
zoned_zipcodes = (
    (zipcode, zone)
    for zipcode, zone in zipcode_to_climate_zone.items()
    if zone is not None
)
for zipcode, zone in zoned_zipcodes:
    climate_zone_zipcodes[zone].append(zipcode)
# plain dict copy for output
climate_zone_to_zipcodes = dict(climate_zone_zipcodes)

In [None]:
# Invert station -> climate zone into climate zone -> [stations], leaving
# out stations that have no climate zone.
stations_grouped_by_zone = defaultdict(list)
for station, zone in station_to_climate_zone.items():
    if zone is None:
        continue
    stations_grouped_by_zone[zone].append(station)
# plain dict for output
climate_zone_to_stations = dict(stations_grouped_by_zone)

In [None]:
# Invert zipcode -> station into station -> [zipcodes], leaving out
# zipcodes that were not assigned a station.
zipcodes_grouped_by_station = defaultdict(list)
for zipcode, assigned_station in zipcode_to_station.items():
    if assigned_station is None:
        continue
    zipcodes_grouped_by_station[assigned_station].append(zipcode)
# plain dict for output
station_to_zipcodes = dict(zipcodes_grouped_by_station)

In [None]:
# Rebuild station -> climate zone from the per-zone station lists, so that
# only stations listed under some climate zone remain in the mapping.
station_to_climate_zone = {
    station: climate_zone
    for climate_zone, stations in climate_zone_to_stations.items()
    for station in stations
}

In [None]:
# write all outputs:
!mkdir -p outputs


# Every output is one mapping dumped as JSON; list them as
# (path, mapping) pairs and write them in order.
outputs = [
    ### Station -> X
    # station -> lat long
    ('outputs/usaf_station_lat_long.json', station_lat_lngs),
    # station -> zipcodes
    ('outputs/usaf_station_zipcodes.json', station_to_zipcodes),
    # station -> climate_zone
    ('outputs/usaf_station_climate_zone.json', station_to_climate_zone),

    ### Zipcode -> X
    # zipcode -> lat long
    ('outputs/zipcode_centroid_lat_long.json', zipcode_centroids),
    # zipcode -> climate_zone
    ('outputs/zipcode_climate_zone.json', zipcode_to_climate_zone),
    # zipcode -> station
    ('outputs/zipcode_usaf_station.json', zipcode_to_station),

    ### climate zone -> X
    # climate_zone -> stations
    ('outputs/climate_zone_usaf_stations.json', climate_zone_to_stations),
    # climate_zone -> zipcodes
    ('outputs/climate_zone_zipcodes.json', climate_zone_to_zipcodes),
]

for output_path, mapping in outputs:
    with open(output_path, 'w') as f:
        json.dump(mapping, f)