# Step 1 - Data Engineering

In [1]:
# The climate data for Hawaii is provided through two CSV files. 
# Start by using Python and Pandas to inspect the content of these files and clean the data.

# Create a Jupyter Notebook file called data_engineering.ipynb and 
# use this to complete all of your Data Engineering tasks.

# Use Pandas to read in the measurement and station CSV files as DataFrames.

# Inspect the data for NaNs and missing values. You must decide what to do with this data.

# Save your cleaned CSV files with the prefix clean_.

In [2]:
# Import Dependencies
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

In [3]:
# Load the raw measurement observations. The file is decoded as cp1252
# instead of pandas' default UTF-8.
df1 = pd.read_csv(
    filepath_or_buffer="Resources/hawaii_measurements.csv",
    encoding='cp1252',
)

# Preview the first five rows as the cell result.
df1.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# Non-null count per column; a column whose count is lower than the others
# contains missing values.
print(df1.count())

# Derive the affected columns from the data instead of hard-coding the answer,
# so the message stays truthful if the input file ever changes.
measurement_missing_cols = [
    str(col) for col in df1.columns if df1[col].isnull().any()
]
if measurement_missing_cols:
    print(f"{', '.join(measurement_missing_cols)} has missing data")
else:
    print("no columns with missing data")
print("------------------------------")

# Per-column flag: True where the column holds at least one missing value.
print(df1.isnull().any())
print("------------------------------")

# Total number of missing cells across the whole frame (shown as the cell result).
print("the number of missing values:")
df1.isnull().sum().sum()

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64
prcp has missing data
------------------------------
station    False
date       False
prcp        True
tobs       False
dtype: bool
------------------------------
the number of missing values:


1447

In [5]:
# Build the cleaned measurements frame: discard every row in which at least
# one column is NaN. df1 itself is left untouched.
df1a = df1.dropna(how='any', axis=0)

# Non-null count per column after the drop.
print(df1a.count())
print("------------------------------")

# Per-column flag for any missing values that remain.
print(df1a.isnull().any())
print("------------------------------")

# Total number of missing cells left in the frame (shown as the cell result).
print("the number of missing values:")
df1a.isnull().sum().sum()

station    18103
date       18103
prcp       18103
tobs       18103
dtype: int64
------------------------------
station    False
date       False
prcp       False
tobs       False
dtype: bool
------------------------------
the number of missing values:


0

### Measurements: `df1a` is the cleaned DataFrame (every row containing a NaN was dropped)

In [6]:
# Save the cleaned measurements. index=True is the pandas default; it is spelled
# out because it writes the row labels as a leading column with an empty header.
# NOTE(review): if nothing downstream relies on that column, index=False would
# avoid an "Unnamed: 0" column when the file is read back — confirm before changing.
df1a.to_csv("clean_hawaii_measurements.csv", index=True)

In [7]:
# Load the station metadata. The file is decoded as cp1252 instead of
# pandas' default UTF-8.
df2 = pd.read_csv(
    filepath_or_buffer="Resources/hawaii_stations.csv",
    encoding='cp1252',
)

# Preview the first five rows as the cell result.
df2.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [8]:
# Non-null count per column of the stations table.
print(df2.count())

# Check for missing values in the data rather than asserting the result by
# hand, so the message cannot silently become wrong if the input changes.
station_missing_cols = [
    str(col) for col in df2.columns if df2[col].isnull().any()
]
if station_missing_cols:
    print(f"{', '.join(station_missing_cols)} has missing data")
else:
    print("no columns with missing data")
print("------------------------------")

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64
no columns with missing data
------------------------------


In [9]:
# Display the stations DataFrame as the cell result.
df2

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [10]:
# Save the stations table under the clean_ prefix. index=True is the pandas
# default; it is spelled out because it writes the row labels as a leading
# column with an empty header.
# NOTE(review): if nothing downstream relies on that column, index=False would
# avoid an "Unnamed: 0" column when the file is read back — confirm before changing.
df2.to_csv("clean_hawaii_stations.csv", index=True)