### Merging 2016 Weather Data and Initial Inspection and Cleaning

In [18]:
import os
import pandas as pd
file_list = os.listdir()
flat_file_list = list(filter(lambda x: x.endswith('csv'), file_list)) 
data_frames = []
for file_name in flat_file_list:
    df = pd.read_csv(file_name)
    data_frames.append(df)
merged_frame = pd.concat(data_frames)

In [19]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,7,66.7,7,...,999.9,999.9,100.4,*,87.8,*,0.0,I,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,24,69.7,24,...,999.9,999.9,98.6,*,78.8,*,0.0,I,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,24,69.3,24,...,999.9,999.9,93.2,*,69.8,*,99.99,,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,24,71.8,24,...,999.9,999.9,89.6,*,73.4,*,0.0,I,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,24,63.4,24,...,999.9,999.9,91.4,*,69.8,*,0.0,I,999.9,0


There appear to be zero and NaN latitude/longitude data.  For now lets verify that we have nonempty entries in these fields.  I can always lookup by station number later if I need to find data near a point.

In [20]:
merged_frame[(merged_frame['LATITUDE'] != 0)].dropna().head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,1001099999,2016-01-01,70.933333,-8.666667,9.0,"JAN MAYEN NOR NAVY, NO",35.0,24,31.7,24,...,11.5,15.5,36.9,*,32.5,*,0.0,G,999.9,0
1,1001099999,2016-01-02,70.933333,-8.666667,9.0,"JAN MAYEN NOR NAVY, NO",36.3,24,32.9,24,...,29.1,999.9,39.4,,32.5,,0.0,G,999.9,10000
2,1001099999,2016-01-03,70.933333,-8.666667,9.0,"JAN MAYEN NOR NAVY, NO",33.0,24,29.5,24,...,11.7,999.9,34.9,*,29.7,*,0.14,G,999.9,11000
3,1001099999,2016-01-04,70.933333,-8.666667,9.0,"JAN MAYEN NOR NAVY, NO",35.0,24,33.2,24,...,9.7,999.9,35.8,,33.4,,0.07,G,999.9,110000
4,1001099999,2016-01-05,70.933333,-8.666667,9.0,"JAN MAYEN NOR NAVY, NO",34.6,24,29.7,24,...,15.5,999.9,36.1,,33.1,*,0.03,G,999.9,0


There are entries with valid latitude and longitude.  Table will be tidied up and then exported to a merged CSV for DB import.

Temp attributes give the number of measurements averaged for mean temperature.  This will not assist me in modeling, so I am going to drop this column.

In [21]:
merged_frame.drop(merged_frame.columns[7], axis = 1, inplace = True)

In [22]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,DEWP_ATTRIBUTES,SLP,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,7,9999.9,...,999.9,999.9,100.4,*,87.8,*,0.0,I,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,24,9999.9,...,999.9,999.9,98.6,*,78.8,*,0.0,I,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,24,9999.9,...,999.9,999.9,93.2,*,69.8,*,99.99,,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,24,9999.9,...,999.9,999.9,89.6,*,73.4,*,0.0,I,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,24,9999.9,...,999.9,999.9,91.4,*,69.8,*,0.0,I,999.9,0


The same situation applies with the other attribute fields.  Removing them as well.

In [23]:
merged_frame.drop(['DEWP_ATTRIBUTES', 'MAX_ATTRIBUTES', 'MIN_ATTRIBUTES', 'PRCP_ATTRIBUTES'], axis = 1, inplace=True)

In [24]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,SLP_ATTRIBUTES,...,VISIB_ATTRIBUTES,WDSP,WDSP_ATTRIBUTES,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,9999.9,0,...,4,0.0,7,999.9,999.9,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,9999.9,0,...,24,0.0,24,999.9,999.9,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,9999.9,0,...,22,0.0,24,999.9,999.9,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,9999.9,0,...,23,0.0,24,999.9,999.9,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,9999.9,0,...,22,0.0,24,999.9,999.9,91.4,69.8,0.0,999.9,0


For Temp, dewpoint, SLP, and STP missing values are reported as 9999.9.  These are going to be replaced with NaN for clarity and DB processing purposes.

In [43]:
import numpy as np

def set_null_9999(arg):
    if arg == 9999.9:
        return np.NaN
    else:
        return arg
    
merged_frame['TEMP'] = merged_frame['TEMP'].apply(set_null_9999) 

In [44]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,SLP_ATTRIBUTES,...,VISIB_ATTRIBUTES,WDSP,WDSP_ATTRIBUTES,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,9999.9,0,...,4,0.0,7,999.9,999.9,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,9999.9,0,...,24,0.0,24,999.9,999.9,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,9999.9,0,...,22,0.0,24,999.9,999.9,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,9999.9,0,...,23,0.0,24,999.9,999.9,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,9999.9,0,...,22,0.0,24,999.9,999.9,91.4,69.8,0.0,999.9,0


In [46]:
merged_frame['DEWP'] = merged_frame['DEWP'].apply(set_null_9999)
merged_frame['SLP'] = merged_frame['SLP'].apply(set_null_9999) 
merged_frame['STP'] = merged_frame['STP'].apply(set_null_9999)

In [47]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,SLP_ATTRIBUTES,...,VISIB_ATTRIBUTES,WDSP,WDSP_ATTRIBUTES,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,0,...,4,0.0,7,999.9,999.9,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,0,...,24,0.0,24,999.9,999.9,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,0,...,22,0.0,24,999.9,999.9,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,0,...,23,0.0,24,999.9,999.9,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,0,...,22,0.0,24,999.9,999.9,91.4,69.8,0.0,999.9,0


For VISIB, WDSP, MXSPD, and GUST the missing value is encoded as 999.9.  As before replacing with NaN.

In [50]:
def set_null_999(arg):
    if arg == 999.9:
        return np.NaN
    else:
        return arg
    
merged_frame['VISIB'] = merged_frame['VISIB'].apply(set_null_999) 

In [51]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,SLP_ATTRIBUTES,...,VISIB_ATTRIBUTES,WDSP,WDSP_ATTRIBUTES,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,0,...,4,0.0,7,999.9,999.9,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,0,...,24,0.0,24,999.9,999.9,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,0,...,22,0.0,24,999.9,999.9,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,0,...,23,0.0,24,999.9,999.9,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,0,...,22,0.0,24,999.9,999.9,91.4,69.8,0.0,999.9,0


In [52]:
merged_frame['WDSP'] = merged_frame['WDSP'].apply(set_null_999)
merged_frame['MXSPD'] = merged_frame['MXSPD'].apply(set_null_999)
merged_frame['GUST'] = merged_frame['GUST'].apply(set_null_999)

In [53]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,SLP_ATTRIBUTES,...,VISIB_ATTRIBUTES,WDSP,WDSP_ATTRIBUTES,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,0,...,4,0.0,7,,,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,0,...,24,0.0,24,,,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,0,...,22,0.0,24,,,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,0,...,23,0.0,24,,,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,0,...,22,0.0,24,,,91.4,69.8,0.0,999.9,0


In [54]:
merged_frame.drop(['SLP_ATTRIBUTES', 'VISIB_ATTRIBUTES', 'WDSP_ATTRIBUTES'], axis=1, inplace=True)

In [55]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,STP_ATTRIBUTES,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,999.9,0,6.2,0.0,,,100.4,87.8,0.0,999.9,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,999.9,0,6.2,0.0,,,98.6,78.8,0.0,999.9,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,999.9,0,5.8,0.0,,,93.2,69.8,99.99,999.9,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,999.9,0,5.9,0.0,,,89.6,73.4,0.0,999.9,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,999.9,0,6.2,0.0,,,91.4,69.8,0.0,999.9,0


For snow depth missing data will be reported as 999.9.  However, most stations do not report 0 on days with no snow on ground.  As such I will clean this value to zero on such days.

In [57]:
def set_zero_999(arg):
    if arg == 999.9:
        return 0
    else:
        return arg
    
merged_frame['SNDP'] = merged_frame['SNDP'].apply(set_zero_999)

In [60]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,STP_ATTRIBUTES,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,999.9,0,6.2,0.0,,,100.4,87.8,0.0,0.0,0
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,999.9,0,6.2,0.0,,,98.6,78.8,0.0,0.0,0
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,999.9,0,5.8,0.0,,,93.2,69.8,99.99,0.0,10000
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,999.9,0,5.9,0.0,,,89.6,73.4,0.0,0.0,0
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,999.9,0,6.2,0.0,,,91.4,69.8,0.0,0.0,0


FRSHTT is a combined indicator for occurences of meteorological conditions.  The interpretation for this field is coppied from the data source readme (https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt).


FRSHTT - Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day of:
                         Fog ('F' - 1st digit).
                         Rain or Drizzle ('R' - 2nd digit).
                         Snow or Ice Pellets ('S' - 3rd digit).
                         Hail ('H' - 4th digit).
                         Thunder ('T' - 5th digit).
                         Tornado or Funnel Cloud ('T' - 6th digit).

In [63]:
def process_FRSHTT(arg):
    string = str(arg)
    if len(string)>6:
        #more than six chars would be invalid
        return np.NaN
    index = 0
    outputs = ['Fog', 'Rain', 'Snow', 'Hail', 'Thunder', 'Tornado']
    output = []
    for char in string:
        if char == '1':
            output.append(outputs[index])
        index+=1
    return "|".join(output)

In [65]:
merged_frame['FRSHTT'] = merged_frame['FRSHTT'].apply(process_FRSHTT)

In [66]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,STP_ATTRIBUTES,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,999.9,0,6.2,0.0,,,100.4,87.8,0.0,0.0,
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,999.9,0,6.2,0.0,,,98.6,78.8,0.0,0.0,
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,999.9,0,5.8,0.0,,,93.2,69.8,99.99,0.0,Fog
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,999.9,0,5.9,0.0,,,89.6,73.4,0.0,0.0,
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,999.9,0,6.2,0.0,,,91.4,69.8,0.0,0.0,


In [67]:
merged_frame.drop('STP_ATTRIBUTES', axis=1, inplace=True)

99.99 denotes a missing PRCP value.  Many stations do not report precip if the value is 0.  So I will replace those values with 0.

In [70]:
def set_zero_99(arg):
    if arg == 99.99:
        return 0
    else:
        return arg
    
merged_frame['PRCP'] = merged_frame['PRCP'].apply(set_zero_99)

In [71]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,999.9,6.2,0.0,,,100.4,87.8,0.0,0.0,
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,999.9,6.2,0.0,,,98.6,78.8,0.0,0.0,
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,999.9,5.8,0.0,,,93.2,69.8,0.0,0.0,Fog
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,999.9,5.9,0.0,,,89.6,73.4,0.0,0.0,
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,999.9,6.2,0.0,,,91.4,69.8,0.0,0.0,


MAX and MIN also report missing data as 9999.9.  Cleaning as such with NaN.

In [72]:
merged_frame['MAX'] = merged_frame['MAX'].apply(set_null_9999) 
merged_frame['MIN'] = merged_frame['MIN'].apply(set_null_9999)

In [73]:
merged_frame.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,702699999,2016-06-22,0.0,0.0,7026.0,"WXPOD 7026, AF",94.7,66.7,,999.9,6.2,0.0,,,100.4,87.8,0.0,0.0,
1,702699999,2016-06-23,0.0,0.0,7026.0,"WXPOD 7026, AF",88.3,69.7,,999.9,6.2,0.0,,,98.6,78.8,0.0,0.0,
2,702699999,2016-06-24,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,69.3,,999.9,5.8,0.0,,,93.2,69.8,0.0,0.0,Fog
3,702699999,2016-06-25,0.0,0.0,7026.0,"WXPOD 7026, AF",81.4,71.8,,999.9,5.9,0.0,,,89.6,73.4,0.0,0.0,
4,702699999,2016-06-26,0.0,0.0,7026.0,"WXPOD 7026, AF",80.5,63.4,,999.9,6.2,0.0,,,91.4,69.8,0.0,0.0,


In [74]:
merged_frame.to_csv('2016-merged-cleaned.csv.dat')#appending .dat to avoid accidental reprocessing if script is rerun.
