In [1]:
from ftplib import FTP
import pandas as pd
import gzip
from io import BytesIO
import shutil
from glob import glob
import numpy as np

In [2]:
# import station information in order to build filenames for each station

isd_history = pd.read_csv("sources/isd-history.csv")
isd_history.head()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
0,7018,99999,WXPOD 7018,,,,0.0,0.0,7018.0,20110309,20130730
1,7026,99999,WXPOD 7026,AF,,,0.0,0.0,7026.0,20120713,20170822
2,7070,99999,WXPOD 7070,AF,,,0.0,0.0,7070.0,20140923,20150926
3,8260,99999,WXPOD8270,,,,0.0,0.0,0.0,19960101,20100731
4,8268,99999,WXPOD8278,AF,,,32.95,65.567,1156.7,20100519,20120323


In [3]:
# filter dataframe to keep only 'US' weather stations

is_US = isd_history['CTRY']=='US'
isd_US = isd_history[is_US]
isd_US.head()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
13138,621010,99999,MOORED BUOY,US,,,50.6,-2.933,-999.0,20080721,20080721
13140,621110,99999,MOORED BUOY,US,,,58.9,-0.2,-999.0,20041118,20041118
13141,621130,99999,MOORED BUOY,US,,,58.4,0.3,-999.0,20040726,20040726
13142,621160,99999,MOORED BUOY,US,,,58.1,1.8,-999.0,20040829,20040829
13143,621170,99999,MOORED BUOY,US,,,57.9,0.1,-999.0,20040726,20040726


In [4]:
#  Get list of USAF identifiers and define which year's weather data we want to view

usaf_list = [item for item in isd_US['USAF']]
len(usaf_list)

7327

In [6]:
# Grab all the files for each item in the usaf list
df_list = []
for thing in usaf_list:
    try:
        # Get the required parameters from the isd_history dataframe
        lat = isd_history.query(f"USAF == '{thing}'").LAT.to_list()[0]
        lon = isd_history.query(f"USAF == '{thing}'").LON.to_list()[0]
        wban = isd_history.query(f"USAF == '{thing}'").WBAN.to_list()[0]
        usaf = isd_history.query(f"USAF == '{thing}'").USAF.to_list()[0]

        # build the filename to query from FTP
        filename = f"{usaf}-{wban}-2018"

        # put the lats and lons in a dataframe so we can append it to the weather data later
        lat_lon_df = pd.DataFrame([lat], columns=['lat'])
        lat_lon_df['lon'] = [lon]
        print(filename)

        # use the FTP library to query the weather data
        with FTP("ftp.ncdc.noaa.gov") as ftp, BytesIO() as flo:
            ftp.login()
            ftp.retrbinary(f"RETR pub/data/noaa/isd-lite/2018/{filename}.gz", flo.write)
            flo.seek(0)
            with open(f"{filename}.gz", "wb") as fout, gzip.GzipFile(fileobj = flo) as gzipobj:
                shutil.copyfileobj(gzipobj, fout)
        with open(f'{filename}.gz', "r") as f:

            #append the annual average weather data to the df_list list as well as the latitudes and longitudes
            df_list.append(
                pd.read_csv(
                    f,
                    delim_whitespace=True,
                    header=None,
                    names=[
                        "year",
                        "month",
                        "day",
                        "hour",
                        "tmpc",
                        "dwpc",
                        "mslp",
                        "wdir",
                        "wspd",
                        "skct",
                        "pr1h",
                        "pr6h"
                    ],
                    na_values=-9999
                ).groupby(['year']).mean().assign(lat=lat, lon=lon)
            )
        print(lat, lon)    
    except Exception as e:

        # catch file not found errors (there is a fair bit of data missing from some weather stations)
        print(e)
        print(f'no data available for {filename}')

621010-99999-2018
550 pub/data/noaa/isd-lite/2018/621010-99999-2018.gz: No such file or directory
no data available for 621010-99999-2018
621110-99999-2018
550 pub/data/noaa/isd-lite/2018/621110-99999-2018.gz: No such file or directory
no data available for 621110-99999-2018
621130-99999-2018
550 pub/data/noaa/isd-lite/2018/621130-99999-2018.gz: No such file or directory
no data available for 621130-99999-2018
621160-99999-2018
550 pub/data/noaa/isd-lite/2018/621160-99999-2018.gz: No such file or directory
no data available for 621160-99999-2018
621170-99999-2018
550 pub/data/noaa/isd-lite/2018/621170-99999-2018.gz: No such file or directory
no data available for 621170-99999-2018
621220-99999-2018
550 pub/data/noaa/isd-lite/2018/621220-99999-2018.gz: No such file or directory
no data available for 621220-99999-2018
621250-99999-2018
550 pub/data/noaa/isd-lite/2018/621250-99999-2018.gz: No such file or directory
no data available for 621250-99999-2018
621260-99999-2018
550 pub/data/noa

In [7]:
# concatenate the dataframes in the df_list list in order to get the full dataframe
# drop the columns we don't need

df = pd.concat(df_list)
df = df.drop(['month', 'day', 'hour', 'skct', 'pr1h', 'pr6h'], axis = 1)
df

Unnamed: 0_level_0,tmpc,dwpc,mslp,wdir,wspd,lat,lon
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018,215.530118,-10.764131,10140.207549,218.283832,36.954203,34.300,-116.167
2018,215.530118,-10.764131,10140.207549,218.283832,36.954203,34.300,-116.167
2018,38.198992,3.643683,10114.252759,120.485437,32.077775,60.785,-148.839
2018,38.198992,3.643683,10114.252759,120.485437,32.077775,60.785,-148.839
2018,-25.127398,-51.723646,10119.352360,126.758268,47.243894,66.600,-159.986
...,...,...,...,...,...,...,...
2018,138.696944,98.198611,,,,37.601,-81.559
2018,203.718236,64.645438,10254.392066,126.295078,22.243747,30.350,-81.883
2018,192.189931,139.562230,10191.827624,160.891892,34.380952,30.507,-86.960
2018,120.643671,70.870168,,139.037822,30.239753,36.380,-88.985


In [21]:
from citipy import citipy

# the weather data only has coordinates, so here we are using the citipy module to get the city nearest the coordinates
df['city'] = df.apply(lambda x: citipy.nearest_city(x['lat'], x['lon']).city_name, axis=1)

In [24]:
# some stations have multiple annual entries for the same station, so we average them in order to
# get one row per station
weather_df_2018 = df.groupby(['city']).mean()
weather_df_2018.head(20)

Unnamed: 0_level_0,tmpc,dwpc,mslp,wdir,wspd,lat,lon
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abbeville,213.062144,,10177.004181,73.380167,41.332922,29.495,-92.18
aberdeen,112.785133,58.699559,10172.513732,152.44333,31.991904,41.462333,-83.584333
abilene,180.762063,79.040004,10151.258131,162.384855,48.443962,32.425667,-99.794
ada,156.676491,95.905447,,149.515826,28.436108,34.804,-96.671
adelanto,134.607129,-23.021935,10109.243828,130.180784,26.524514,34.576667,-117.480333
aguadilla,262.476744,209.95656,,93.072283,48.328348,18.498,-67.129
aiea,256.401119,190.847779,10154.831108,101.34297,45.198995,21.324,-157.929
alabaster,177.523008,127.883343,10185.769887,123.071921,17.626592,33.178,-86.782
alameda,139.729588,94.90177,10170.382529,203.646882,36.910928,37.721,-122.221
alamogordo,175.091283,-4.190469,10141.661814,158.488798,32.420724,32.845,-106.0455


In [132]:
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

# the citipy module does not have functionality to include the state, so we obtain it
# here with the geopy library
locator = Nominatim(user_agent='Sleeper_Cities', timeout=10)
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)
weather_df_2018['coords'] = weather_df_2018['lat'].map(str) + ',' + weather_df_2018['lon'].map(str)

# tqdm is used to show a progress bar
tqdm_notebook.pandas()
weather_df_2018['address'] = weather_df_2018['coords'].progress_apply(rgeocode)

weather_df_2018.head(20)

  0%|          | 0/1141 [00:00<?, ?it/s]

Unnamed: 0_level_0,tmpc,dwpc,mslp,wdir,wspd,lat,lon,coords,address
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
abbeville,213.062144,,10177.004181,73.380167,41.332922,29.495,-92.18,"29.495,-92.18","(Deadman Island, Vermilion Parish, Louisiana, ..."
aberdeen,112.785133,58.699559,10172.513732,152.44333,31.991904,41.462333,-83.584333,"41.46233333333333,-83.58433333333333","(Middleton Pike, Dunbridge, Wood County, Ohio,..."
abilene,180.762063,79.040004,10151.258131,162.384855,48.443962,32.425667,-99.794,"32.425666666666665,-99.794","(5356, Alamo Drive, Quail Park, Abilene, Taylo..."
ada,156.676491,95.905447,,149.515826,28.436108,34.804,-96.671,"34.804,-96.671","(Ada Municipal Airport, 2800, Bloomfield Loop,..."
adelanto,134.607129,-23.021935,10109.243828,130.180784,26.524514,34.576667,-117.480333,"34.57666666666666,-117.48033333333332","(Vinton Road, Adelanto, San Bernardino County,..."
aguadilla,262.476744,209.95656,,93.072283,48.328348,18.498,-67.129,"18.498,-67.129","(Maleza Alta, Aguadilla, Puerto Rico, 00604, U..."
aiea,256.401119,190.847779,10154.831108,101.34297,45.198995,21.324,-157.929,"21.324,-157.929","(Worchester Avenue, Honolulu, Honolulu County,..."
alabaster,177.523008,127.883343,10185.769887,123.071921,17.626592,33.178,-86.782,"33.178,-86.782","(Shelby County Airport, County Road 12, Calera..."
alameda,139.729588,94.90177,10170.382529,203.646882,36.910928,37.721,-122.221,"37.721,-122.221","(Oakland International Airport, Pardee Lane, O..."
alamogordo,175.091283,-4.190469,10141.661814,158.488798,32.420724,32.845,-106.0455,"32.845,-106.0455","(US 70, Alamogordo, Otero County, New Mexico, ..."


In [127]:
weather_df_2018.dtypes

tmpc       float64
dwpc       float64
mslp       float64
wdir       float64
wspd       float64
lat        float64
lon        float64
coords      object
address     object
dtype: object

In [146]:
# the 'coords' column we created to use with geopy is no longer necessary
weather_df_2018 = weather_df_2018.drop(['coords'], axis=1)

In [147]:
weather_df_2018.head(10)

Unnamed: 0_level_0,tmpc,dwpc,mslp,wdir,wspd,lat,lon,address
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
abbeville,213.062144,,10177.004181,73.380167,41.332922,29.495,-92.18,"(Deadman Island, Vermilion Parish, Louisiana, ..."
aberdeen,112.785133,58.699559,10172.513732,152.44333,31.991904,41.462333,-83.584333,"(Middleton Pike, Dunbridge, Wood County, Ohio,..."
abilene,180.762063,79.040004,10151.258131,162.384855,48.443962,32.425667,-99.794,"(5356, Alamo Drive, Quail Park, Abilene, Taylo..."
ada,156.676491,95.905447,,149.515826,28.436108,34.804,-96.671,"(Ada Municipal Airport, 2800, Bloomfield Loop,..."
adelanto,134.607129,-23.021935,10109.243828,130.180784,26.524514,34.576667,-117.480333,"(Vinton Road, Adelanto, San Bernardino County,..."
aguadilla,262.476744,209.95656,,93.072283,48.328348,18.498,-67.129,"(Maleza Alta, Aguadilla, Puerto Rico, 00604, U..."
aiea,256.401119,190.847779,10154.831108,101.34297,45.198995,21.324,-157.929,"(Worchester Avenue, Honolulu, Honolulu County,..."
alabaster,177.523008,127.883343,10185.769887,123.071921,17.626592,33.178,-86.782,"(Shelby County Airport, County Road 12, Calera..."
alameda,139.729588,94.90177,10170.382529,203.646882,36.910928,37.721,-122.221,"(Oakland International Airport, Pardee Lane, O..."
alamogordo,175.091283,-4.190469,10141.661814,158.488798,32.420724,32.845,-106.0455,"(US 70, Alamogordo, Otero County, New Mexico, ..."


In [154]:
# export the dataframe to a csv
weather_df_2018.to_csv('weather_data_2018.csv')