This notebook helps transform and inspect the data. The goal is to eventually collapse it into a .py file.

Data Sources / credits:
* City and State Zip search: https://pypi.org/project/uszipcode/
* Zip codes from: http://federalgovernmentzipcodes.us/free-zipcode-database-Primary.csv
* Weather station zips: https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/station-inventories/zipcodes-normals-stations.txt
* Thank you to the good people at Geopy for their module: https://pypi.org/project/geopy/


In [1]:
# Install some helpful libraries
#!pip install uszipcode
#!pip install pygeocoder
#!pip install requests
#!pip install geopy

# Importing necessary libraries
import pandas as pd
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
from math import radians, sin, cos, sqrt, asin
from geopy.geocoders import Nominatim
from functools import partial

# Setup definition we'll need to calculate GPS coordinate distance
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance along a sphere of radius 6372.8 km. NaN inputs
        propagate to a NaN result.
    """
    R = 6372.8  # Earth radius in kilometers
    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
    a = sin(dLat / 2)**2 + cos(lat1) * cos(lat2) * sin(dLon / 2)**2
    # Floating-point rounding can leave `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    # An explicit comparison (rather than min()) keeps NaN propagating.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    return R * c

# Setup our geolocator
# Nominatim client from geopy, identified by a custom user-agent string.
geolocator = Nominatim(user_agent="college-town-weather-extractor")
# `geocode` is geolocator.geocode with language="en" pre-bound via partial.
geocode = partial(geolocator.geocode, language="en")

In [2]:
# Reading the files
# The NOAA monthly-normal files are whitespace separated with no header row,
# so their columns get the integer labels 0..N. sep=r'\s+' is the documented
# replacement for the deprecated delim_whitespace=True.
fpath = './noaa/mly-tmax-normal.txt'
max_df = pd.read_csv(fpath, sep=r'\s+', header=None)
fpath = './noaa/mly-tmin-normal.txt'
min_df = pd.read_csv(fpath, sep=r'\s+', header=None)

# City table; add one blank placeholder column per month for the minimum and
# then the maximum normals (min_jan..min_dec, max_jan..max_dec). The blank
# string is what later cells test for to detect "no data found yet".
fpath = './college-city-mapping.csv'
city_df = pd.read_csv(fpath, header=0)
for prefix in ('min', 'max'):
    for month in ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                  'jul', 'aug', 'sep', 'oct', 'nov', 'dec'):
        city_df[prefix + '_' + month] = ''

fpath = './free-zipcode-database-primary.csv'
zip_df = pd.read_csv(fpath, header=0)

# Fixed-width station inventories. Column 1 is parsed as int (zip code) in
# the first file and as float (latitude) in the second.
fpath = './zipcodes-normals-stations.txt'
zstation_df = pd.read_fwf(fpath, header=None, converters={1: int})
zstation_df.rename(columns={0: 'station_id', 1: 'zip', 2: 'city'}, inplace=True)
fpath = './allstations.txt'
gstation_df = pd.read_fwf(fpath, header=None, converters={1: float})
gstation_df.rename(columns={0: 'station_id', 1: 'lat', 2: 'long', 4: 'state'}, inplace=True)

In [3]:
# Strip the trailing data-quality flag from every reading and divide by 10.
# NOTE(review): the division by 10 suggests the raw values are stored in
# tenths of a degree; no unit conversion happens here - confirm against the
# NOAA readme. Sentinel values for missing data (if any) are scaled too.
def _strip_quality_flag(raw):
    """Return a cleaned reading, or `raw` unchanged when it is a station id.

    Args:
        raw: One cell of a normals table. 11-character strings are treated
            as station ids and passed through untouched.

    Returns:
        The station id string, or the reading as a float divided by 10.

    Raises:
        ValueError: If the text left after removing the flag is not numeric.
    """
    if not isinstance(raw, str):
        # Column was parsed as numeric, so there is no flag to remove.
        return float(raw) / 10
    if len(raw) == 11:
        return raw
    # Only drop the last character when it really is a flag letter, so an
    # unflagged reading does not lose its final digit.
    digits = raw[:-1] if raw[-1:].isalpha() else raw
    return float(digits) / 10


# Assign whole columns back instead of using chained indexing
# (df[col][row] = ...), which is not guaranteed to write to the frame.
for frame in (max_df, min_df):
    for col in frame.columns:
        frame[col] = frame[col].map(_strip_quality_flag)

# Positional labels 0..12 -> station id followed by the twelve month names.
normal_columns = dict(enumerate([
    'station_id', 'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]))
max_df.rename(columns=normal_columns, inplace=True)
min_df.rename(columns=normal_columns, inplace=True)

In [4]:
# v1: you probably don't want to run this one. It looks for exact zip matches
# by iterating through each city-zip pairing to extract weather station
MONTH_ABBRS = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def _copy_normals(normals, prefix, row, station_id):
    """Copy one station's twelve monthly normals into a row of city_df.

    Args:
        normals: max_df or min_df (station_id followed by 12 month columns).
        prefix: 'max' or 'min'; selects the city_df target columns.
        row: Index label of the city_df row to fill.
        station_id: Station identifier to look up in `normals`.

    Returns:
        True if the station was found and copied, False otherwise.
    """
    matches = normals[normals['station_id'] == station_id]
    if matches.empty:
        return False
    # If a station is listed more than once, the last listing wins.
    record = matches.iloc[-1]
    for offset, month in enumerate(MONTH_ABBRS, start=1):
        # .iloc makes the positional access explicit; plain record[offset]
        # relies on a deprecated fallback for string-labelled Series.
        city_df.at[row, prefix + '_' + month] = record.iloc[offset]
    return True


station_zips = zstation_df['zip'].astype(int)
for i, place in city_df['Geographic Location'].items():
    if not isinstance(place, str):
        continue
    # Split on the last comma so extra commas in the city part cannot crash
    # the unpacking; skip entries that carry no state at all.
    city, comma, state = place.rpartition(",")
    if not comma:
        continue
    result = search.by_city_and_state(city.strip(), state.strip())
    sid = []
    for z in result:
        sid.extend(zstation_df.loc[station_zips == int(z.zipcode), 'station_id'])
    if not sid:
        # nothing found in that zip
        continue
    # pick a primary station name
    # NOTE: no backup station is tried here; if the primary station has no
    # normals the row is left blank.
    sname = str(sid[0])
    _copy_normals(max_df, 'max', i, sname)
    _copy_normals(min_df, 'min', i, sname)

In [None]:
# v2: geocode each city and take its normals from the nearest weather station
# (great-circle distance), falling back to the next-nearest stations while no
# minimum-temperature normals have been found.
# NOTE(review): this issues one Nominatim request per row with no throttling;
# check the service's usage policy (geopy ships a RateLimiter helper).
MONTH_ABBRS = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec')


def _copy_normals(normals, prefix, row, station_id):
    """Copy one station's twelve monthly normals into a row of city_df.

    Args:
        normals: max_df or min_df (station_id followed by 12 month columns).
        prefix: 'max' or 'min'; selects the city_df target columns.
        row: Index label of the city_df row to fill.
        station_id: Station identifier to look up in `normals`.

    Returns:
        True if the station was found and copied, False otherwise.
    """
    matches = normals[normals['station_id'] == station_id]
    if matches.empty:
        return False
    # If a station is listed more than once, the last listing wins.
    record = matches.iloc[-1]
    for offset, month in enumerate(MONTH_ABBRS, start=1):
        # .iloc makes the positional access explicit; plain record[offset]
        # relies on a deprecated fallback for string-labelled Series.
        city_df.at[row, prefix + '_' + month] = record.iloc[offset]
    return True


for i, place in city_df['Geographic Location'].items():
    location = geolocator.geocode(place)
    if location is None:
        # The geocoder could not resolve this place; leave the row blank
        # instead of crashing on location.latitude.
        continue
    clat, clon = location.latitude, location.longitude
    distances = pd.Series(
        [haversine(clat, clon, la, lo)
         for la, lo in zip(gstation_df['lat'], gstation_df['long'])]
    )
    # Candidate stations ordered nearest first; stations without usable
    # coordinates (NaN distance) are never candidates.
    for pos in distances.dropna().sort_values().index:
        sname = gstation_df.iloc[pos, 0]
        _copy_normals(max_df, 'max', i, sname)
        # As in the original primary/backup logic, the search is finished
        # once minimum normals have been found for the row.
        if _copy_normals(min_df, 'min', i, sname):
            break

In [None]:
# Bare expression: shows the city table as the notebook cell's output.
city_df