# Step 1 - Data Engineering

The climate data for Hawaii is provided through two CSV files. Start by using Python and Pandas to inspect the content of these files and clean the data.

* Create a Jupyter Notebook file called `data_engineering.ipynb` and use this to complete all of your Data Engineering tasks.

* Use Pandas to read in the measurement and station CSV files as DataFrames.

* Inspect the data for NaNs and missing values. You must decide what to do with this data.

* Save your cleaned CSV files with the prefix `clean_`.

---

In [1]:
# Dependencies
import pandas as pd
from datetime import datetime

In [2]:
# Locations of the two raw input CSVs, given relative to the working directory.
hawaii_measurements_path, hawaii_stations_path = (
    "Resources/hawaii_measurements.csv",
    "Resources/hawaii_stations.csv",
)

In [3]:
# Load the raw measurements CSV into a DataFrame using pandas' default parsing.
hawaii_measurements_df = pd.read_csv(filepath_or_buffer=hawaii_measurements_path)

In [4]:
# Load the raw stations CSV into a DataFrame using pandas' default parsing.
hawaii_stations_df = pd.read_csv(filepath_or_buffer=hawaii_stations_path)

In [5]:
# Report how many rows each dataset holds right after loading.
initial_measurement_rows = len(hawaii_measurements_df)
initial_station_rows = len(hawaii_stations_df)

print("Hawaii Measurements Count:  {:,.0f}".format(initial_measurement_rows))
print("Hawaii Stations Count:  {:,.0f}".format(initial_station_rows))

Hawaii Measurements Count:  19,550
Hawaii Stations Count:  9


In [6]:
# Drop measurement rows that lack a station identifier and report the outcome.
measurement_station_missing = hawaii_measurements_df["station"].isnull()

if measurement_station_missing.values.any():
    print("Hawaii Measurements were removed because of null station identifier.")
    # Keep only the rows whose station identifier is present.
    hawaii_measurements_df = hawaii_measurements_df.loc[~measurement_station_missing]
else:
    print("Hawaii Measurements had no null values for station identifier.")

Hawaii Measurements had no null values for station identifier.


In [7]:
# Drop station rows that lack a station identifier and report the outcome.
station_id_missing = hawaii_stations_df["station"].isnull()

if station_id_missing.values.any():
    print("Hawaii Stations were removed because of null station identifier.")
    # Keep only the rows whose station identifier is present.
    hawaii_stations_df = hawaii_stations_df.loc[~station_id_missing]
else:
    print("Hawaii Stations had no null values for station identifier.")

Hawaii Stations had no null values for station identifier.


In [8]:
# Compare station identifiers, assuring both datasets reference the same stations.
hawaii_measurements_indentifiers = hawaii_measurements_df.station.unique()

hawaii_stations_indentifiers = list(hawaii_stations_df.station)

# Sets ignore ordering and duplicates, so only membership is compared.
measurement_station_ids = set(hawaii_measurements_indentifiers)
listed_station_ids = set(hawaii_stations_indentifiers)

if measurement_station_ids == listed_station_ids:
    print("Unique station identifiers in both datasets are the same.")
else:
    # A mismatch must be visible: silently continuing would hide measurements
    # for unknown stations, or stations that have no measurements at all.
    print("Unique station identifiers in both datasets are NOT the same.")
    print("  Only in measurements: {}".format(sorted(measurement_station_ids - listed_station_ids)))
    print("  Only in stations: {}".format(sorted(listed_station_ids - measurement_station_ids)))

Unique station identifiers in both datasets are the same.


In [9]:
# Remove any Hawaii Measurements rows that had a null date string.
date_is_missing = hawaii_measurements_df["date"].isnull()

if date_is_missing.values.any():
    # `print` must be lowercase: the previous `Print(...)` raised NameError as soon
    # as a null date was present, so the rows were never actually removed.
    print("Hawaii Measuremeents were removed because of null dates.")
    hawaii_measurements_df = hawaii_measurements_df.loc[~date_is_missing]
else:
    print("Hawaii Measurements had no null values for date.")

Hawaii Measurements had no null values for date.


In [10]:
# Remove any Hawaii Measurements rows that have invalid dates.
#
# A date is considered valid when it parses with the '%Y-%m-%d' format. The column
# is left as strings; only rows whose date fails to parse are removed.
def _is_valid_date(value):
    """Return True when `value` parses as a '%Y-%m-%d' date, False otherwise."""
    try:
        datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError):
        # TypeError: the value is not a string (e.g. NaN).
        # ValueError: the string is malformed or names an impossible date.
        return False
    return True


# One boolean per row, aligned on the DataFrame's own index, so the check does not
# depend on the index being a contiguous 0..n-1 range.
valid_date_mask = hawaii_measurements_df["date"].map(_is_valid_date).astype(bool)

if valid_date_mask.all():
    print("Hawaii Measurements had no invalid dates.")
else:
    print("Hawaii Mesurements rows were removed because of invalid dates.")
    # Reassign the filtered frame; DataFrame.drop returns a new object, so calling
    # it without keeping the result (as before) removed nothing.
    hawaii_measurements_df = hawaii_measurements_df.loc[valid_date_mask]

Hawaii Measurements had no invalid dates.


In [11]:
# Drop measurement rows with no precipitation reading and report the outcome.
prcp_is_missing = hawaii_measurements_df["prcp"].isnull()

if prcp_is_missing.values.any():
    print("Hawaii Measuremeents rows were removed because of null precipitation values.")
    # Keep only the rows that carry a precipitation value.
    hawaii_measurements_df = hawaii_measurements_df.loc[~prcp_is_missing]
else:
    print("Hawaii Measurements had no null values for precipitation.")

Hawaii Measuremeents rows were removed because of null precipitation values.


In [12]:
# Drop measurement rows with no TOBS reading and report the outcome.
tobs_is_missing = hawaii_measurements_df["tobs"].isnull()

if tobs_is_missing.values.any():
    print("Hawaii Measuremeents were removed because of null TOBS values.")
    # Keep only the rows that carry a TOBS value.
    hawaii_measurements_df = hawaii_measurements_df.loc[~tobs_is_missing]
else:
    print("Hawaii Measurements had no null values for TOBS.")


Hawaii Measurements had no null values for TOBS.


In [13]:
# Preview the first rows of the measurements (head() shows five by default) to
# eyeball the result of the filters applied so far.
hawaii_measurements_df.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
5,USC00519397,2010-01-07,0.06,70


In [14]:
# Drop station rows with no name and report the outcome.
name_is_missing = hawaii_stations_df["name"].isnull()

if name_is_missing.values.any():
    print("Hawaii Stations were removed because of null name values.")
    # Keep only the rows that carry a name.
    hawaii_stations_df = hawaii_stations_df.loc[~name_is_missing]
else:
    print("Hawaii Stations had no null values for name.")

Hawaii Stations had no null values for name.


In [15]:
# Drop station rows with no latitude and report the outcome.
latitude_is_missing = hawaii_stations_df["latitude"].isnull()

if latitude_is_missing.values.any():
    print("Hawaii Stations were removed because of null latitude values.")
    # Keep only the rows that carry a latitude.
    hawaii_stations_df = hawaii_stations_df.loc[~latitude_is_missing]
else:
    print("Hawaii Stations had no null values for latitude.")

Hawaii Stations had no null values for latitude.


In [16]:
# Drop station rows with no longitude and report the outcome.
longitude_is_missing = hawaii_stations_df["longitude"].isnull()

if longitude_is_missing.values.any():
    print("Hawaii Stations were removed because of null longitude values.")
    # Keep only the rows that carry a longitude.
    hawaii_stations_df = hawaii_stations_df.loc[~longitude_is_missing]
else:
    print("Hawaii Stations had no null values for longitude.")

Hawaii Stations had no null values for longitude.


In [17]:
# Drop station rows with no elevation and report the outcome.
elevation_is_missing = hawaii_stations_df["elevation"].isnull()

if elevation_is_missing.values.any():
    print("Hawaii Stations were removed because of null elevation values.")
    # Keep only the rows that carry an elevation.
    hawaii_stations_df = hawaii_stations_df.loc[~elevation_is_missing]
else:
    print("Hawaii Stations had no null values for elevation.")

Hawaii Stations had no null values for elevation.


In [18]:
# Write the cleaned measurements to a `clean_`-prefixed CSV: header row kept,
# pandas index column omitted.
clean_measurements_path = "Resources/clean_hawaii_measurements.csv"
hawaii_measurements_df.to_csv(clean_measurements_path, header=True, index=False)

In [19]:
# Write the cleaned stations to a `clean_`-prefixed CSV: header row kept,
# pandas index column omitted.
clean_stations_path = "Resources/clean_hawaii_stations.csv"
hawaii_stations_df.to_csv(clean_stations_path, header=True, index=False)

In [20]:
# Report how many rows each dataset holds once cleaning is complete.
final_measurement_rows = len(hawaii_measurements_df)
final_station_rows = len(hawaii_stations_df)

print("Hawaii Measurements Count:  {:,.0f}".format(final_measurement_rows))
print("Hawaii Stations Count:  {:,.0f}".format(final_station_rows))

Hawaii Measurements Count:  18,103
Hawaii Stations Count:  9
