### The summary of data cleaning:
1. Find and correct errors in nominal data
2. Find and count the total number of missing values (nan/null) for each column
3. Drop columns with more than 10% missing values
4. Drop rows with more than 20% missing values
5. Calculate z-score for columns with continous data and replace outliers by the mean
6. Replace all missing values by implementing appropriate algorithms such as Linear Regression or the mean
7. Visualizing each column with appropriate graphs.

In [272]:
# Import all necessary libraries
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import collections
import datetime
import json
import re

%matplotlib inline

In [273]:
# Retrieve data
data = pd.read_csv('data/weatherAUS.csv')
columns = data.columns
print(columns)
data.describe()

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM
count,141556.0,141871.0,140787.0,81350.0,74377.0,132923.0,140845.0,139563.0,140419.0,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0,142193.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,2.360682
std,6.403283,7.117618,8.465173,4.188537,3.781525,13.588801,8.893337,8.803345,19.051293,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594,8.477969
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,0.8
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,371.0


In [274]:
# Data information
with open('data/data_info.json', 'r') as fp:
    data_info = json.load(fp)

for key, value in data_info.items():
    print(f'{key}: {value}')
    print("")

Date: The date of observation

Location: The common name of the location of the weather station

MinTemp: The minimum temperature in degrees celsius

MaxTemp: The maximum temperature in degrees celsius

Rainfall: The amount of rainfall recorded for the day in mm

Evaporation: The so-called Class A pan evaporation (mm) in the 24 hours to 9am

Sunshine: The number of hours of bright sunshine in the day

WindGustDir: The direction of the strongest wind gust in the 24 hours to midnight

WindGustSpeed: The speed (km/h) of the strongest wind gust in the 24 hours to midnight

WindDir9am: Direction of the wind at 9am

WindDir3pm: Direction of the wind at 3pm

WindSpeed9am: Wind speed (km/hr) averaged over 10 minutes prior to 9am

WindSpeed3pm: Wind speed (km/hr) averaged over 10 minutes prior to 3pm

Humidity9am: Humidity (percent) at 9am

Humidity3pm: Humidity (percent) at 3pm

Pressure9am: Atmospheric pressure (hpa) reduced to mean sea level at 9am

Pressure3pm: Atmospheric pressure (hpa) re

### 1. Find and correct errors in nominal columns

In [275]:
"""
Find nominal columns by searching non-float columns
"""

string_columns = []
float_columns = []
data_types = data.dtypes.values

for index in range(len(data_types)):
    if data_types[index] != np.dtype('float64'):
        string_columns.append(index)
    else:
        float_columns.append(index)
        
nominal_columns = [columns[col] for col in string_columns]
print("The nominal columns:", ", ".join(nominal_columns))

The nominal columns: Date, Location, WindGustDir, WindDir9am, WindDir3pm, RainToday, RainTomorrow


In [276]:
"""
Evaluate Date
"""
# Define a function to check if the date is valid
def check_valid_format(date):
    date_format = '%Y-%m-%d'
    try:
        datetime.datetime.strptime(date, date_format)
        return True
    except ValueError:
        return False
    
# In order to reduce iteration time, find all unique dates
unique_date = set(data[nominal_columns[0]])

# An array of boolean values: True if value; otherwise, False
is_valid_date_format = []
for date in unique_date:
    is_valid_date_format.append(check_valid_format(date))
    
# Find invalid data corresponding to False
count = 0
invalid_date = []
for index in range(len(is_valid_date_format)):
    if is_valid_date_format[index] == False:
        count += 1
        invalid_date.append(is_vaid_date_format[index])
        
if count == 0:
    print("There are no invalid date in 'Date' column in terms of format.")

There are no invalid date in 'Date' column in terms of format.


In [277]:
"""
Evaluate Location
"""
# In order to reduce iteration time, find all unique location
unique_location = set(data[nominal_columns[1]])
print("***Before fixed:", ", ".join(unique_location))
print("")

# There are some locations needed to be fixed
# Insert a space before capitcal letter if the location
# name has 2 words
fixed_location = []
for location in unique_location:
    fixed_name = re.sub(r"(\w)([A-Z])", r"\1 \2", location)
    fixed_location.append(fixed_name)

print("***After fixed:",", ".join(fixed_location))

# Replace the old locations with the fixed locations
data = data.replace(unique_location, fixed_location)

***Before fixed: Penrith, Nhil, Moree, Newcastle, Bendigo, PerthAirport, Perth, WaggaWagga, Townsville, Sale, Nuriootpa, Woomera, Mildura, SydneyAirport, Albany, MountGambier, Witchcliffe, CoffsHarbour, Albury, AliceSprings, SalmonGums, Ballarat, Portland, Canberra, Brisbane, GoldCoast, Adelaide, Melbourne, NorfolkIsland, Watsonia, Darwin, Dartmoor, Richmond, BadgerysCreek, Katherine, Wollongong, Cobar, MountGinini, Cairns, PearceRAAF, Hobart, MelbourneAirport, Launceston, NorahHead, Williamtown, Sydney, Tuggeranong, Walpole, Uluru

***After fixed: Penrith, Nhil, Moree, Newcastle, Bendigo, Perth Airport, Perth, Wagga Wagga, Townsville, Sale, Nuriootpa, Woomera, Mildura, Sydney Airport, Albany, Mount Gambier, Witchcliffe, Coffs Harbour, Albury, Alice Springs, Salmon Gums, Ballarat, Portland, Canberra, Brisbane, Gold Coast, Adelaide, Melbourne, Norfolk Island, Watsonia, Darwin, Dartmoor, Richmond, Badgerys Creek, Katherine, Wollongong, Cobar, Mount Ginini, Cairns, Pearce RA AF, Hobart, M

In [278]:
"""
WindGustDir
"""
# In order to reduce iteration time, find all unique WindGustDir
unique_WindGustDir = list(set(data[nominal_columns[2]]))
print("The direction:", ", ".join(unique_WindGustDir[1:]))

print("There is no invalid data in WinGusGir column in terms of format.")

The direction: NW, WNW, ENE, SE, W, WSW, NE, NNE, ESE, SSE, SSW, SW, S, E, N, NNW
There is no invalid data in WinGusGir column in terms of format.


In [279]:
"""
WindDir9am
"""
# In order to reduce iteration time, find all unique WindDir9am
unique_WindDir9am = list(set(data[nominal_columns[3]]))
print("The direction:", ", ".join(unique_WindDir9am[1:]))

print("There is no invalid data in WindDir9am column in terms of format.")

The direction: SE, NW, ENE, WNW, W, NE, WSW, NNE, ESE, SSE, SSW, SW, S, E, N, NNW
There is no invalid data in WindDir9am column in terms of format.


In [280]:
"""
WindDir3pm
"""
# In order to reduce iteration time, find all unique WindDir3pm
unique_WindDir3pm = list(set(data[nominal_columns[4]]))
print("The direction:", ", ".join(unique_WindDir9am[1:]))

print("There is no invalid data in WindDir3pm cloumn in terms of format.")

The direction: SE, NW, ENE, WNW, W, NE, WSW, NNE, ESE, SSE, SSW, SW, S, E, N, NNW
There is no invalid data in WindDir3pm cloumn in terms of format.


In [281]:
"""
RainToday
"""
# In order to reduce iteration time, find all unique RainToday
unique_RainToday = list(set(data[nominal_columns[5]]))
print("The values:", ", ".join(unique_RainToday[1:]))

print("There is no invalid data in RainToday cloumn in terms of grammar.")

The values: Yes, No
There is no invalid data in RainToday cloumn in terms of grammar.


In [282]:
"""
RainTomorrow
"""
# In order to reduce iteration time, find all unique RainTomorrow
unique_RainTomorrow = list(set(data[nominal_columns[6]]))
print("The values:", ", ".join(unique_RainTomorrow))

print("There is no invalid data in RainTomorrow cloumn in terms of grammar.")

The values: Yes, No
There is no invalid data in RainTomorrow cloumn in terms of grammar.


### 2. Find and count the total number of missing values (nan/null) for each column

In [293]:
def count_nan(column, data):
    count = int(data[column].isna().sum())
    return count

def print_index(count, column):
    if count == 0:
        print(f"There are no nan values in {column}.")
    else:
        print(f"The number of nan value in {column}: {count}")

nan_count_list = [] # Used for step 3
for column_order in range(0, data.shape[1]):
    nan_count = count_nan(column=columns[column_order], data=data)
    nan_count_list.append(nan_count)
    print_index(nan_count, columns[column_order])

There are no nan values in Date.
There are no nan values in Location.
The number of nan value in MinTemp: 637
The number of nan value in MaxTemp: 322
The number of nan value in Rainfall: 1406
The number of nan value in Evaporation: 60843
The number of nan value in Sunshine: 67816
The number of nan value in WindGustDir: 9330
The number of nan value in WindGustSpeed: 9270
The number of nan value in WindDir9am: 10013
The number of nan value in WindDir3pm: 3778
The number of nan value in WindSpeed9am: 1348
The number of nan value in WindSpeed3pm: 2630
The number of nan value in Humidity9am: 1774
The number of nan value in Humidity3pm: 3610
The number of nan value in Pressure9am: 14014
The number of nan value in Pressure3pm: 13981
The number of nan value in Cloud9am: 53657
The number of nan value in Cloud3pm: 57094
The number of nan value in Temp9am: 904
The number of nan value in Temp3pm: 2726
The number of nan value in RainToday: 1406
There are no nan values in RISK_MM.
There are no nan v

### 3. Drop columns with more than 10% missing values

In [296]:
# Find columns with missing values (nan)
missing_value_dict = dict() # Key: Column name; Value: the number of nan
for index in range(len(nan_count_list)):
    if nan_count_list[index] != 0:
        missing_value_dict[columns[index]] = nan_count_list[index]

# Calculate percent of missing value for each column
percent_nan_dict = dict()
for key, value in missing_value_dict.items():
    percent_nan_dict[key] = (value / data.shape[0]) * 100
    
# Find corrensponding columns for columns with more than 10% missing values
removed_columns = []
for key, value in percent_nan_dict.items():
    if(value > 10):
        removed_columns.append(key)

if removed_columns!= []:
    print("The columns with more than 10% missing values:", ", ".join(removed_columns))

    data.drop(columns=removed_columns, inplace=True)
else:
    print("There are no columns with more than 10% missing values")

The columns with more than 10% missing values: Evaporation, Sunshine, Cloud9am, Cloud3pm


### 4. Drop rows with more than 20% missing values

In [326]:
# Find rows with missing values (nan)
missing_values_dict = dict()

# Count the total number of missing values for each row
for i in range(0, data.shape[0]):
    total_missing_value = 0
    is_nan_list = np.array(data.iloc[i:i+1,:].isnull())[0]
    total_missing_value = np.count_nonzero(is_nan_list)  
    missing_values_dict[i] = total_missing_value

In [335]:
# Calculate percent of missing value for each row
percent_nan_dict = dict()
for key, value in missing_values_dict.items():
    percent_nan_dict[key] = (value / data.shape[0]) * 100

# Find corrensponding rows for rows with more than 20% missing values
removed_rows = []
for key, value in percent_nan_dict.items():
    if(value > 20):
        removed_rows.append(key)

if removed_rows != []:
    print("The rows with more than w0% missing values:", ", ".join(removed_columns))

    data.drop(removed_columns, inplace=True)
else:
    print("There are no rows with more than 20% missing values")

There are no rows with more than 20% missing values


### 5.Calculate z-score for columns with continous data and replace outliers by the mean