### Preprocess the raw data from NOAA
This notebook is setup to take in the CSV from NOAA and remove the unneccasary data. This will also seperate out each station for later positioning.

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
#Getting a list of files in raw data folder
filenames = os.listdir('D:/Nico/Desktop/full_grid')

In [3]:
header_wanted = [
 'HOURLYVISIBILITY',
 'HOURLYDRYBULBTEMPC',
 'HOURLYWETBULBTEMPC',
 'HOURLYDewPointTempC',
 'HOURLYRelativeHumidity',
 'HOURLYWindSpeed',
 'HOURLYWindGustSpeed',
 'HOURLYStationPressure',
 'HOURLYPressureTendency',
 'HOURLYPressureChange',
 'HOURLYSeaLevelPressure',
 'HOURLYPrecip',
 'HOURLYAltimeterSetting']

In [4]:
usecols = ['DATE','STATION'] + header_wanted

In [5]:
#Loading all files into a pandas Dataframe
tqdm.pandas()
df = pd.concat([pd.read_csv('D:/Nico/Desktop/full_grid/{}'.format(x), usecols=usecols, low_memory=False) for x in tqdm(filenames)])

100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [03:05<00:00,  2.26s/it]


In [7]:
#Getting the station names
#wban = df['STATION'].unique()
stations = pd.read_csv("../Playground/stations_unique.csv", usecols = ['STATION_ID'])
wban_list = stations['STATION_ID'].tolist()
len(wban_list)

406

In [8]:
def remove_letters(headers,dataframes):
    for i in tqdm(headers):
        dataframes[i].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
        dataframes[i] = dataframes[i].apply(pd.to_numeric)

In [9]:
remove_letters(header_wanted,df)

100%|███████████████████████████████████████████████████████████████████████████████| 13/13 [1:11:23<00:00, 329.48s/it]


In [10]:
by_station_list = []

for i in tqdm(wban_list):
    by_station_list.append(df.loc[df.STATION == i])
del df

100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [10:47<00:00,  1.59s/it]


In [11]:
by_station_list[0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46719 entries, 109127 to 155845
Data columns (total 15 columns):
STATION                   46719 non-null object
DATE                      46719 non-null object
HOURLYVISIBILITY          43438 non-null float64
HOURLYDRYBULBTEMPC        46196 non-null float64
HOURLYWETBULBTEMPC        44790 non-null float64
HOURLYDewPointTempC       46195 non-null float64
HOURLYRelativeHumidity    46195 non-null float64
HOURLYWindSpeed           44852 non-null float64
HOURLYWindGustSpeed       6657 non-null float64
HOURLYStationPressure     44796 non-null float64
HOURLYPressureTendency    0 non-null float64
HOURLYPressureChange      0 non-null float64
HOURLYSeaLevelPressure    0 non-null float64
HOURLYPrecip              2096 non-null float64
HOURLYAltimeterSetting    46197 non-null float64
dtypes: float64(13), object(2)
memory usage: 5.7+ MB


In [18]:
for i in tqdm(range(len(by_station_list))):
    by_station_list[i]['STATION_ID'] = by_station_list[i]['STATION']
    by_station_list[i] = by_station_list[i].set_index(pd.DatetimeIndex(by_station_list[i]['DATE']))

100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:07<00:00, 54.83it/s]


In [19]:
by_station_list[0].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 46719 entries, 2016-08-01 00:15:00 to 2018-07-31 23:59:00
Data columns (total 16 columns):
STATION                   46719 non-null object
DATE                      46719 non-null object
HOURLYVISIBILITY          43438 non-null float64
HOURLYDRYBULBTEMPC        46196 non-null float64
HOURLYWETBULBTEMPC        44790 non-null float64
HOURLYDewPointTempC       46195 non-null float64
HOURLYRelativeHumidity    46195 non-null float64
HOURLYWindSpeed           44852 non-null float64
HOURLYWindGustSpeed       6657 non-null float64
HOURLYStationPressure     44796 non-null float64
HOURLYPressureTendency    0 non-null float64
HOURLYPressureChange      0 non-null float64
HOURLYSeaLevelPressure    0 non-null float64
HOURLYPrecip              2096 non-null float64
HOURLYAltimeterSetting    46197 non-null float64
STATION_ID                46719 non-null object
dtypes: float64(13), object(3)
memory usage: 6.1+ MB


In [20]:
for i in tqdm(range(len(by_station_list))):
    by_station_list[i] = by_station_list[i].resample('60T').mean()

100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:05<00:00, 73.63it/s]


In [21]:
wban_name = []
for x in wban_list:
    wban_name.append(x[:4] + x[5:])

In [25]:
wban_name[0]

'WBAN00184'

In [24]:
wban_list[0]

'WBAN:00184'

In [28]:
for i in tqdm(range(len(by_station_list))):
    by_station_list[i]['STATION'] = wban_list[i] 

100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:00<00:00, 675.55it/s]


In [29]:
by_station_list[0]

Unnamed: 0_level_0,HOURLYVISIBILITY,HOURLYDRYBULBTEMPC,HOURLYWETBULBTEMPC,HOURLYDewPointTempC,HOURLYRelativeHumidity,HOURLYWindSpeed,HOURLYWindGustSpeed,HOURLYStationPressure,HOURLYPressureTendency,HOURLYPressureChange,HOURLYSeaLevelPressure,HOURLYPrecip,HOURLYAltimeterSetting,STATION
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-08-01 00:00:00,525.000000,243.000000,24.400000,243.000000,100.000000,0.000000,,2999.000000,,,,,3004.000000,WBAN:00184
2016-08-01 01:00:00,433.333333,240.333333,23.900000,240.333333,100.000000,0.000000,,2999.333333,,,,,3004.333333,WBAN:00184
2016-08-01 02:00:00,316.666667,237.333333,23.500000,237.333333,100.000000,0.000000,,2998.666667,,,,,3003.666667,WBAN:00184
2016-08-01 03:00:00,91.666667,237.333333,23.900000,237.333333,100.000000,0.000000,,2998.333333,,,,,3003.333333,WBAN:00184
2016-08-01 04:00:00,733.333333,235.666667,23.500000,235.666667,100.000000,0.000000,,2999.333333,,,,,3004.333333,WBAN:00184
2016-08-01 05:00:00,633.333333,237.666667,23.900000,237.666667,100.000000,0.000000,,3000.333333,,,,,3005.333333,WBAN:00184
2016-08-01 06:00:00,900.000000,249.000000,25.000000,249.000000,100.000000,0.000000,,3001.666667,,,,,3006.666667,WBAN:00184
2016-08-01 07:00:00,1000.000000,264.000000,26.366667,262.666667,99.333333,0.000000,,3003.666667,,,,,3008.666667,WBAN:00184
2016-08-01 08:00:00,1000.000000,283.000000,27.133333,267.333333,91.000000,1.666667,,3004.000000,,,,,3009.000000,WBAN:00184
2016-08-01 09:00:00,1000.000000,296.666667,27.333333,265.000000,83.000000,1.000000,,3004.000000,,,,,3009.000000,WBAN:00184


In [30]:
for i in tqdm(range(len(wban_name))):
    by_station_list[i].to_csv('D:/Nico/Desktop/processed_data/{}.csv'.format(wban_name[i]))

100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [02:11<00:00,  3.10it/s]


### Current Issues

1) data is not synced across time-zone and all entries start at midnight local time. 

2) some stations have multiple entries per hour and need to be reduced.

Solutions

Remove rows from data based on timezone to sync times
limit only 1 entry per hour for a station