In [1]:
from pathlib import Path

import pandas as pd

## Weather Data

In [2]:
raw_df = pd.read_csv("Instructions/Resources/hawaii_measurements.csv")

# Rows that have a missing value in at least one column.
rows_with_nan = raw_df[raw_df.isnull().any(axis=1)]

# Cap the request at the number of incomplete rows available so .sample() cannot
# raise ValueError on a smaller input file.
num_samples = min(10, len(rows_with_nan))

# A fixed seed makes the spot-check sample identical on every Restart & Run All,
# so the rows inspected by hand can be found again later.
subset = rows_with_nan.sample(n=num_samples, random_state=42)
subset

Unnamed: 0,station,date,prcp,tobs
18170,USC00516128,2013-07-25,,72
8667,USC00517948,2015-11-12,,70
7793,USC00517948,2011-02-01,,65
8704,USC00517948,2016-02-08,,62
8612,USC00517948,2015-08-13,,79
8249,USC00517948,2013-08-20,,78
6560,USC00514830,2014-01-27,,66
8943,USC00517948,2017-03-13,,69
8067,USC00517948,2012-05-29,,75
7891,USC00517948,2011-07-18,,77


In [3]:
# Manually checking the sampled rows against the NOAA station pages shows no
# regular pattern in the true values behind the NaNs: some are null, some are 0,
# some are "T" (trace amount), and some are positive values.
# To check a row: open its link, scroll to the bottom of the web page, and select
# the month and year under "View Station Data".
#
# Iterate over the sampled rows themselves rather than range(num_samples), so the
# loop always matches the actual length of `subset`.
for station, date in zip(subset["station"], subset["date"]):
    print(f"https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:{station}/detail     {date}")

https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00516128/detail     2013-07-25
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2015-11-12
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2011-02-01
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2016-02-08
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2015-08-13
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2013-08-20
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00514830/detail     2014-01-27
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2017-03-13
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2012-05-29
https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USC00517948/detail     2011-07-18


In [4]:
# No consistent value can stand in for a missing precipitation reading, so rows
# without one are discarded outright instead of being filled in.
has_prcp = raw_df["prcp"].notna()
prcp_df = raw_df[has_prcp]

# Sanity check: list any row that still contains a null (expected to be empty).
still_null = prcp_df.isna().any(axis="columns")
prcp_df[still_null]

Unnamed: 0,station,date,prcp,tobs


In [5]:
# Persist the cleaned precipitation rows. to_csv() does not create missing
# directories, so make sure the output folder exists before writing.
prcp_out_path = Path("clean_resources/hawaii_prcp_measurements.csv")
prcp_out_path.parent.mkdir(parents=True, exist_ok=True)
prcp_df.to_csv(prcp_out_path)

## Temp Observations

In [6]:
# Keep only the columns needed for temperature observations, then drop every row
# that is missing a value in any of them.
temp_columns = ["station", "date", "tobs"]
temp_df = raw_df.loc[:, temp_columns].dropna()  # dropna() defaults to how="any"
temp_df.head()

Unnamed: 0,station,date,tobs
0,USC00519397,2010-01-01,65
1,USC00519397,2010-01-02,63
2,USC00519397,2010-01-03,74
3,USC00519397,2010-01-04,76
4,USC00519397,2010-01-06,73


In [7]:
# Persist the cleaned temperature rows. to_csv() does not create missing
# directories, so make sure the output folder exists before writing.
temp_out_path = Path("clean_resources/hawaii_temp_measurements.csv")
temp_out_path.parent.mkdir(parents=True, exist_ok=True)
temp_df.to_csv(temp_out_path)

## Station Data


In [8]:
# Load the station metadata table and display it for inspection.
stations_csv = "Instructions/Resources/hawaii_stations.csv"
station_df = pd.read_csv(stations_csv)
station_df

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [9]:
# The station table needs no cleaning, so write it unchanged into clean_resources
# (the original file is left in place). to_csv() does not create missing
# directories, so make sure the output folder exists before writing.
stations_out_path = Path("clean_resources/hawaii_stations.csv")
stations_out_path.parent.mkdir(parents=True, exist_ok=True)
station_df.to_csv(stations_out_path)