This notebook exists to help perform some transformations and inspections of data.  Goal is to eventually collapse this into a .py file.

Data Sources / credits:
* City and State Zip search: https://pypi.org/project/uszipcode/
* Zip codes from: http://federalgovernmentzipcodes.us/free-zipcode-database-Primary.csv
* Averages and other weather business: https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/products/
* Important codebook: https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/readme.txt
* Weather station zips: https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/station-inventories/zipcodes-normals-stations.txt
* Thank you to the good people at Geopy for their module: https://pypi.org/project/geopy/


In [1]:
# Install some helpful libraries
#!pip install uszipcode
#!pip install pygeocoder
#!pip install requests
#!pip install geopy

# This notebook will analyze two datasets: the college town and hometown
# input sources.  True/False toggles which one you're working on.  Default
# set to False, for timeliness.
hometown = True

# Importing necessary libraries
import pandas as pd

from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
from math import radians, sin, cos, sqrt, asin
import geopy.geocoders
from geopy.geocoders import Nominatim
from functools import partial
from geopy.extra.rate_limiter import RateLimiter

# Setup definition we'll need to calculate GPS coordinate distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6372.8  # Earth radius in kilometers
    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    a = sin(dLat / 2)**2 + cos(lat1) * cos(lat2) * sin(dLon / 2)**2
    c = 2 * asin(sqrt(a))
    return R * c

# Setup our geolocator
geopy.geocoders.options.default_timeout = 7
geolocator = Nominatim(user_agent="college-town-weather-extractor")
geocode = partial(geolocator.geocode, language="en", timeout=120)

In [2]:
# Reading the files
fpath = './noaa/mly-tmax-normal.txt'
max_df = pd.read_csv(fpath, delim_whitespace=True, header=None)
fpath = './noaa/mly-tmin-normal.txt'
min_df = pd.read_csv(fpath, delim_whitespace=True, header=None)
fpath = './noaa/ann-snow-normal.txt'
snow_df = pd.read_csv(fpath, delim_whitespace=True, header=None)
fpath = './noaa/ann-prcp-normal.txt'
rain_df = pd.read_csv(fpath, delim_whitespace=True, header=None)
if hometown == False:
    fpath = '../college-input.csv'
else:
    fpath = '../hometown-input.csv'
city_df = pd.read_csv(fpath,header=0)
city_df['city_lat'],city_df['city_lon']='',''
city_df['ann_rain_inch'],city_df['ann_snow_inch']='',''
city_df['min_jan'],city_df['min_feb'],city_df['min_mar'],city_df['min_apr'],city_df['min_may'],city_df['min_jun'],city_df['min_jul'],city_df['min_aug'],city_df['min_sep'],city_df['min_oct'],city_df['min_nov'],city_df['min_dec']='','','','','','','','','','','',''
city_df['max_jan'],city_df['max_feb'],city_df['max_mar'],city_df['max_apr'],city_df['max_may'],city_df['max_jun'],city_df['max_jul'],city_df['max_aug'],city_df['max_sep'],city_df['max_oct'],city_df['max_nov'],city_df['max_dec']='','','','','','','','','','','',''
fpath = './free-zipcode-database-primary.csv'
zip_df = pd.read_csv(fpath,header=0)
fpath = './zipcodes-normals-stations.txt'
zstation_df = pd.read_fwf(fpath,header=None,converters={1: lambda x: int(x)})
zstation_df.rename(columns={0:'station_id',1:'zip',2:'city'}, inplace=True)
fpath = './allstations.txt'
gstation_df = pd.read_fwf(fpath,header=None,converters={1: lambda x: float(x)})
gstation_df.rename(columns={0:'station_id',1:'lat',2:'long',4:'state'}, inplace=True)

In [3]:
# strip indicator tag about quality of data, convert to fahrenheit
for h in range(0,max_df.shape[0]):
    for w in range(0,max_df.shape[1]):
        if (len(max_df[w][h])) == 11:
            continue
        else:
            l = len(max_df[w][h])
            max_df[w][h] = float(max_df[w][h][0:l-1])/10

for h in range(0,min_df.shape[0]):
    for w in range(0,min_df.shape[1]):
        if (len(min_df[w][h])) == 11:
            continue
        else:
            l = len(min_df[w][h])
            min_df[w][h] = float(min_df[w][h][0:l-1])/10

max_df.rename(columns={0:'station_id',1:'january',2:'february',3:'march',4:'april',5:'may',6:'june',7:'july',8:'august',9:'september',10:'october',11:'november',12:'december'}, inplace=True)
min_df.rename(columns={0:'station_id',1:'january',2:'february',3:'march',4:'april',5:'may',6:'june',7:'july',8:'august',9:'september',10:'october',11:'november',12:'december'}, inplace=True)

In [4]:
# strip indicator tag about quality of data, convert to (annual) inches
for h in range(0,snow_df.shape[0]):
    for w in range(0,snow_df.shape[1]):
        if (len(snow_df[w][h])) == 11:
            continue
        else:
            l = len(snow_df[w][h])
            if snow_df[w][h][0:l-1] == '-7777':
                snow_df[w][h] = 0
            else:
                snow_df[w][h] = float(snow_df[w][h][0:l-1])/100

for h in range(0,rain_df.shape[0]):
    for w in range(0,rain_df.shape[1]):
        if (len(rain_df[w][h])) == 11:
            continue
        else:
            l = len(rain_df[w][h])
            if rain_df[w][h][0:l-1] == '-7777':
                rain_df[w][h] = 0
            else:
                rain_df[w][h] = float(rain_df[w][h][0:l-1])/100                
                
snow_df.rename(columns={0:'station_id',1:'inches'}, inplace=True)
rain_df.rename(columns={0:'station_id',1:'inches'}, inplace=True)

In [5]:
# v1: using zip code data
for i in range(city_df.shape[0]):
    sid =[]
    city,state = city_df['Geographic Location'][i].split(",")
    result = search.by_city_and_state(city,state)
    for z in result:
        ctr = 0
        for zm in zstation_df.zip:
            if int(z.zipcode) == int(zm):
                sid.append(zstation_df['station_id'][ctr])
            ctr+=1
    if len(sid) == 0:
        # nothing found in that zip
        continue
    else:
        # pick a primary station name
        sname = str(sid[0])
        # pick a backup station
        ctr=0
        for sn in max_df.station_id:
            if sname == sn:
                city_df.at[i,'max_jan'] = max_df.loc[ctr][1]
                city_df.at[i,'max_feb'] = max_df.loc[ctr][2]
                city_df.at[i,'max_mar'] = max_df.loc[ctr][3]
                city_df.at[i,'max_apr'] = max_df.loc[ctr][4]
                city_df.at[i,'max_may'] = max_df.loc[ctr][5]
                city_df.at[i,'max_jun'] = max_df.loc[ctr][6]
                city_df.at[i,'max_jul'] = max_df.loc[ctr][7]
                city_df.at[i,'max_aug'] = max_df.loc[ctr][8]
                city_df.at[i,'max_sep'] = max_df.loc[ctr][9]
                city_df.at[i,'max_oct'] = max_df.loc[ctr][10]
                city_df.at[i,'max_nov'] = max_df.loc[ctr][11]
                city_df.at[i,'max_dec'] = max_df.loc[ctr][12]
            ctr+=1
        ctr=0
        for sn in min_df.station_id:
            if sname == sn:
                city_df.at[i,'min_jan'] = min_df.loc[ctr][1]
                city_df.at[i,'min_feb'] = min_df.loc[ctr][2]
                city_df.at[i,'min_mar'] = min_df.loc[ctr][3]
                city_df.at[i,'min_apr'] = min_df.loc[ctr][4]
                city_df.at[i,'min_may'] = min_df.loc[ctr][5]
                city_df.at[i,'min_jun'] = min_df.loc[ctr][6]
                city_df.at[i,'min_jul'] = min_df.loc[ctr][7]
                city_df.at[i,'min_aug'] = min_df.loc[ctr][8]
                city_df.at[i,'min_sep'] = min_df.loc[ctr][9]
                city_df.at[i,'min_oct'] = min_df.loc[ctr][10]
                city_df.at[i,'min_nov'] = min_df.loc[ctr][11]
                city_df.at[i,'min_dec'] = min_df.loc[ctr][12]
            ctr+=1



In [8]:
# improved v2: using GPS location
for i in range(city_df.shape[0]):
    try:
        city,state = city_df['Geographic Location'][i].split(",")
        location = geolocator.geocode(city_df['Geographic Location'][i]+" usa")
        clat,clon = location.latitude, location.longitude
        city_df.at[i,'city_lat'],city_df.at[i,'city_lon']=clat,clon
        #print("Searching for station closest to:",clat,clon)
        ctr = 0
        lowest_val = []
        lowest_pos = []
        for la,lo in zip(gstation_df['lat'],gstation_df['long']):
            distance = haversine(clat,clon,la,lo)
            if ctr == 0:
                lowest_val.append(distance)
                lowest_pos.append(ctr)
            else:
                if lowest_val[0] > distance:
                    lowest_val.insert(0,distance)
                    lowest_pos.insert(0,ctr)
            ctr+=1
        #print("lowest haversine value =",lowest_val,"for station",list(gstation_df.loc[lowest_pos[0]])[0])
        #print("with lat:",list(gstation_df.loc[lowest_pos[0]])[1],"long:",list(gstation_df.loc[lowest_pos[0]])[2])
        pos = 0
        while city_df.loc[i]['min_jan'] == "":
            sname = list(gstation_df.loc[lowest_pos[pos]])[0]
            ctr=0
            for sn in max_df.station_id:
                if sname == sn:
                    city_df.at[i,'max_jan'] = max_df.loc[ctr][1]
                    city_df.at[i,'max_feb'] = max_df.loc[ctr][2]
                    city_df.at[i,'max_mar'] = max_df.loc[ctr][3]
                    city_df.at[i,'max_apr'] = max_df.loc[ctr][4]
                    city_df.at[i,'max_may'] = max_df.loc[ctr][5]
                    city_df.at[i,'max_jun'] = max_df.loc[ctr][6]
                    city_df.at[i,'max_jul'] = max_df.loc[ctr][7]
                    city_df.at[i,'max_aug'] = max_df.loc[ctr][8]
                    city_df.at[i,'max_sep'] = max_df.loc[ctr][9]
                    city_df.at[i,'max_oct'] = max_df.loc[ctr][10]
                    city_df.at[i,'max_nov'] = max_df.loc[ctr][11]
                    city_df.at[i,'max_dec'] = max_df.loc[ctr][12]
                ctr+=1
            ctr=0
            for sn in min_df.station_id:
                if sname == sn:
                    city_df.at[i,'min_jan'] = min_df.loc[ctr][1]
                    city_df.at[i,'min_feb'] = min_df.loc[ctr][2]
                    city_df.at[i,'min_mar'] = min_df.loc[ctr][3]
                    city_df.at[i,'min_apr'] = min_df.loc[ctr][4]
                    city_df.at[i,'min_may'] = min_df.loc[ctr][5]
                    city_df.at[i,'min_jun'] = min_df.loc[ctr][6]
                    city_df.at[i,'min_jul'] = min_df.loc[ctr][7]
                    city_df.at[i,'min_aug'] = min_df.loc[ctr][8]
                    city_df.at[i,'min_sep'] = min_df.loc[ctr][9]
                    city_df.at[i,'min_oct'] = min_df.loc[ctr][10]
                    city_df.at[i,'min_nov'] = min_df.loc[ctr][11]
                    city_df.at[i,'min_dec'] = min_df.loc[ctr][12]
                ctr+=1
            pos+=1
        while city_df.loc[i]['ann_rain_inch'] == "":
            sname = list(gstation_df.loc[lowest_pos[pos]])[0]
            ctr=0
            for sn in rain_df.station_id:
                if sname == sn:
                    city_df.at[i,'ann_rain_inch'] = rain_df.loc[ctr][1]
                ctr+=1
            ctr=0
            pos+=1
        while city_df.loc[i]['ann_snow_inch'] == "":
            sname = list(gstation_df.loc[lowest_pos[pos]])[0]
            ctr=0
            for sn in snow_df.station_id:
                if sname == sn:
                    city_df.at[i,'ann_snow_inch'] = snow_df.loc[ctr][1]
                ctr+=1
            ctr=0
            pos+=1
    except AttributeError:
        continue

In [9]:
city_df

Unnamed: 0,Geographic Location,city_lat,city_lon,ann_rain_inch,ann_snow_inch,min_jan,min_feb,min_mar,min_apr,min_may,...,max_mar,max_apr,max_may,max_jun,max_jul,max_aug,max_sep,max_oct,max_nov,max_dec
0,"Lakeland, FL",28.0395,-81.9498,52.17,0,50.2,52.5,56.2,60,66.5,...,81,85.7,90.7,93.2,93.9,94.2,91.7,86.6,79.9,74.5
1,"Howe, TX",33.5073,-96.6127,43.9,0.23,29.6,33.2,41.4,48.2,59.1,...,65.5,74,80.7,88.4,93.4,94.3,86.5,76.7,65,54.5
2,"Northport, AL",33.2449,-87.6728,54.95,0.02,32.3,36.2,41.9,48.7,58.3,...,66,74.1,81.2,88.2,91,90.7,85.4,75.4,65.6,55.5
3,"Bristow, VA",38.7233,-77.5366,41.38,1.52,23.9,25.3,32.3,41.6,53,...,55.7,67.1,75,84.4,88.5,86.8,79.7,68.1,58.1,46.6
4,"Covington, GA",33.5968,-83.8602,51.52,0.06,32,35.4,41.5,48.6,57.7,...,65.7,73.6,81.1,87.5,90,88.8,83.3,73.5,64.4,54.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3600,"Watts, CA",33.9406,-118.243,14.32,0,48.9,50.5,52.9,54.9,58.5,...,67.6,68.9,70.6,73.1,76.5,78.2,77.2,74.7,71,66.3
3601,"Groesbeck, TX",31.5243,-96.5339,38.46,0.01,35.2,38.9,45.8,53.8,61.9,...,69.4,77,83.2,89.9,94.1,95.6,89.9,80.2,69.1,59.4
3602,"Ogden, UT",41.223,-111.974,22.27,4.85,21,23.7,32.2,38.8,47.6,...,53.1,61.6,71.5,81.7,91,89.2,78.4,64.6,48.8,37.5
3603,"Connell, WA",46.6604,-118.861,16.46,0.4,26.3,28.8,32.9,37.3,44.2,...,56.3,64.3,73.2,80,88.9,88,78.3,63.9,47,36.1


In [10]:
if hometown == False:
    city_df.to_csv('../collegetown-weather-output.csv', index=True)
else:
    city_df.to_csv('../hometown-weather-output.csv', index=True)