### Preprocess the raw data from NOAA
This notebook is setup to take in the CSV from NOAA and remove the unneccasary data. This will also seperate out each station for later positioning.

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
#Getting a list of files in raw data folder
filenames = os.listdir('D:/Nico/Desktop/full_grid')

In [3]:
header_wanted = [
 'HOURLYVISIBILITY',
 'HOURLYDRYBULBTEMPC',
 'HOURLYWETBULBTEMPC',
 'HOURLYDewPointTempC',
 'HOURLYRelativeHumidity',
 'HOURLYWindSpeed',
 'HOURLYWindGustSpeed',
 'HOURLYStationPressure',
 'HOURLYPressureTendency',
 'HOURLYPressureChange',
 'HOURLYSeaLevelPressure',
 'HOURLYPrecip',
 'HOURLYAltimeterSetting']

In [4]:
usecols = ['DATE','STATION'] + header_wanted

In [5]:
#Loading all files into a pandas Dataframe
tqdm.pandas()
df = pd.concat([pd.read_csv('D:/Nico/Desktop/full_grid/{}'.format(x), usecols=usecols, low_memory=False) for x in tqdm(filenames)])

100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [03:04<00:00,  2.25s/it]


In [6]:
#Getting the station names
wban = df['STATION'].unique()


In [7]:
def remove_letters(headers,dataframes):
    for i in tqdm(headers):
        dataframes[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
        dataframes[i] = dataframes[i].apply(pd.to_numeric)

In [8]:
remove_letters(header_wanted,df)

100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [1:16:01<00:00, 350.87s/it]


In [17]:
by_station_list = []

for i in tqdm(wban):
    by_station_list.append(df.loc[df.STATION == i])
del df

100%|████████████████████████████████████████████████████████████████████████████████| 394/394 [10:28<00:00,  1.60s/it]


In [18]:
by_station_list[0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50881 entries, 0 to 50880
Data columns (total 15 columns):
STATION                   50881 non-null object
DATE                      50881 non-null object
HOURLYVISIBILITY          50315 non-null float64
HOURLYDRYBULBTEMPC        43475 non-null float64
HOURLYWETBULBTEMPC        41842 non-null float64
HOURLYDewPointTempC       43475 non-null float64
HOURLYRelativeHumidity    43475 non-null float64
HOURLYWindSpeed           50380 non-null float64
HOURLYWindGustSpeed       3039 non-null float64
HOURLYStationPressure     48624 non-null float64
HOURLYPressureTendency    0 non-null float64
HOURLYPressureChange      0 non-null float64
HOURLYSeaLevelPressure    0 non-null float64
HOURLYPrecip              0 non-null float64
HOURLYAltimeterSetting    48624 non-null float64
dtypes: float64(13), object(2)
memory usage: 6.2+ MB


In [22]:
for i in tqdm(range(len(by_station_list))):
    by_station_list[i] = by_station_list[i].set_index(pd.DatetimeIndex(by_station_list[i]['DATE']))

100%|████████████████████████████████████████████████████████████████████████████████| 394/394 [00:05<00:00, 68.16it/s]


In [23]:
by_station_list[0].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 50881 entries, 2016-08-01 00:15:00 to 2018-07-31 23:59:00
Data columns (total 15 columns):
STATION                   50881 non-null object
DATE                      50881 non-null object
HOURLYVISIBILITY          50315 non-null float64
HOURLYDRYBULBTEMPC        43475 non-null float64
HOURLYWETBULBTEMPC        41842 non-null float64
HOURLYDewPointTempC       43475 non-null float64
HOURLYRelativeHumidity    43475 non-null float64
HOURLYWindSpeed           50380 non-null float64
HOURLYWindGustSpeed       3039 non-null float64
HOURLYStationPressure     48624 non-null float64
HOURLYPressureTendency    0 non-null float64
HOURLYPressureChange      0 non-null float64
HOURLYSeaLevelPressure    0 non-null float64
HOURLYPrecip              0 non-null float64
HOURLYAltimeterSetting    48624 non-null float64
dtypes: float64(13), object(2)
memory usage: 6.2+ MB


In [39]:
for i in tqdm(range(len(by_station_list))):
    by_station_list[i] = by_station_list[i].resample('60T').mean()

100%|████████████████████████████████████████████████████████████████████████████████| 394/394 [00:04<00:00, 80.90it/s]


In [45]:
wban_list = []
for x in wban:
    wban_list.append(x[:4] + x[5:])

In [46]:
wban_list[0]

'WBAN00445'

In [40]:
by_station_list[0]

Unnamed: 0_level_0,HOURLYVISIBILITY,HOURLYDRYBULBTEMPC,HOURLYWETBULBTEMPC,HOURLYDewPointTempC,HOURLYRelativeHumidity,HOURLYWindSpeed,HOURLYWindGustSpeed,HOURLYStationPressure,HOURLYPressureTendency,HOURLYPressureChange,HOURLYSeaLevelPressure,HOURLYPrecip,HOURLYAltimeterSetting
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-08-01 00:00:00,1000.000000,254.000000,25.600000,254.000000,100.000000,3.000000,,2992.000000,,,,,3004.000000
2016-08-01 01:00:00,1000.000000,246.333333,24.600000,246.333333,100.000000,1.000000,,2991.333333,,,,,3003.333333
2016-08-01 02:00:00,1000.000000,,,,,2.000000,,2992.000000,,,,,3004.000000
2016-08-01 03:00:00,1000.000000,,,,,0.000000,,2992.333333,,,,,3004.333333
2016-08-01 04:00:00,1000.000000,,,,,0.000000,,2994.000000,,,,,3006.000000
2016-08-01 05:00:00,1000.000000,,,,,0.000000,,2995.666667,,,,,3007.666667
2016-08-01 06:00:00,1000.000000,,,,,2.000000,,2997.333333,,,,,3009.333333
2016-08-01 07:00:00,1000.000000,253.000000,24.600000,242.666667,94.333333,3.666667,,2998.000000,,,,,3010.000000
2016-08-01 08:00:00,1000.000000,274.000000,24.933333,238.333333,81.000000,1.666667,,2998.000000,,,,,3010.000000
2016-08-01 09:00:00,1000.000000,296.333333,25.066667,231.333333,68.000000,2.000000,,2998.000000,,,,,3010.000000


In [47]:
for i in tqdm(range(len(wban))):
    by_station_list[i].to_csv('D:/Nico/Desktop/processed_data/{}.csv'.format(wban_list[i]))

100%|████████████████████████████████████████████████████████████████████████████████| 394/394 [02:04<00:00,  3.17it/s]


### Current Issues

1) data is not synced across time-zone and all entries start at midnight local time. 

2) some stations have multiple entries per hour and need to be reduced.

Solutions

Remove rows from data based on timezone to sync times
limit only 1 entry per hour for a station