In [1]:
import os
import csv
import pandas as pd
import numpy as np

In [2]:
# relative path to the raw measurements CSV (this only builds the path; nothing is opened yet)
measurement_csv = os.path.join(
    "csv_folder",
    "hawaii_measurements.csv",
)

In [3]:
# load the raw measurements; dtype=object turns off numeric type inference,
# so parsed values stay as strings (blank fields still come through as NaN)
measurement_df = pd.read_csv(
    measurement_csv,
    dtype=object,
)

In [4]:
# peek at the first two rows to check the column layout
measurement_df.head(n=2)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63


In [5]:
# size of the raw measurements frame as (rows, columns)
measurement_df.shape

(19550, 4)

In [6]:
# count missing (NaN) values per column
measurement_df.isna().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64

In [7]:
# Turn empty / whitespace-only strings into NaN so they are counted as missing,
# then summarize the frame.
# The result is assigned back instead of using inplace=True: the cell stays
# idempotent on re-run and the frame object displayed by earlier cells is not
# mutated behind the reader's back.
measurement_df = measurement_df.replace(r'^\s*$', np.nan, regex=True)
measurement_df.describe()

Unnamed: 0,station,date,prcp,tobs
count,19550,19550,18103,19550
unique,9,2792,342,35
top,USC00519281,2012-01-19,0,74
freq,2772,9,8185,1752


In [8]:
# keep only rows with no NaN in any column; 'prcp' is the only column that has
# missing values, so this removes the 1,447 rows without precipitation data
# and leaves 18,103 rows
clean_measurement_df = measurement_df.dropna(axis="index", how="any")
clean_measurement_df.head(n=2)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63


In [9]:
# sanity check: the 'count' row should now show the same number for every column
clean_measurement_df.describe()

Unnamed: 0,station,date,prcp,tobs
count,18103,18103,18103,18103
unique,9,2792,342,35
top,USC00519281,2011-04-01,0,74
freq,2772,9,8185,1627


In [10]:
# Renumber the rows 0..n-1, since dropping rows leaves gaps in the index labels.
# reset_index returns a NEW DataFrame (it does not modify the caller unless
# inplace=True), so the result has to be assigned back or the call is a no-op.
# drop=True discards the old index instead of inserting it as a column.
clean_measurement_df = clean_measurement_df.reset_index(drop=True)
clean_measurement_df.head(2)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63


In [11]:
# write the cleaned measurements to a new CSV in the same folder,
# leaving the DataFrame index out of the file
new_file_name_1 = os.path.join(
    "csv_folder",
    "clean_hawaii_measurements.csv",
)
clean_measurement_df.to_csv(path_or_buf=new_file_name_1, index=False)

In [12]:
# relative path to the raw stations CSV (this only builds the path; nothing is opened yet)
station_csv = os.path.join(
    "csv_folder",
    "hawaii_stations.csv",
)

In [13]:
# load the stations table; dtype=object turns off numeric type inference,
# so latitude/longitude/elevation are kept as parsed strings
station_df = pd.read_csv(
    station_csv,
    dtype=object,
)

In [14]:
# show up to the first ten station rows
station_df.head(n=10)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [15]:
# size of the stations frame as (rows, columns)
station_df.shape

(9, 5)

In [16]:
# count missing (NaN) values per column
station_df.isna().sum()

station      0
name         0
latitude     0
longitude    0
elevation    0
dtype: int64

In [18]:
# no missing values were found in the stations data, so it is written out
# unchanged under the 'clean' file name (index left out of the file)
new_file_name_2 = os.path.join(
    "csv_folder",
    "clean_hawaii_stations.csv",
)
station_df.to_csv(path_or_buf=new_file_name_2, index=False)