# Wet Kangaroos
## Will it rain tomorrow in Australia?

### 1. Setup workspace

In [1]:
import pandas as pd
import numpy as np

#Append folder for custom libraries
import sys
sys.path.append('libraries')

#Import custom library
import shinypanda

weather = pd.read_csv('data/weatherAUS.csv')

### 2. Explore the dataset

In [2]:
#Head of the dataframe
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [3]:
#Type of data
weather.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

In [4]:
#Descriptive stats
weather.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM
count,141556.0,141871.0,140787.0,81350.0,74377.0,132923.0,140845.0,139563.0,140419.0,138583.0,128179.0,128212.0,88536.0,85099.0,141289.0,139467.0,142193.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,39.984292,14.001988,18.637576,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,2.360682
std,6.403283,7.117618,8.465173,4.188537,3.781525,13.588801,8.893337,8.803345,19.051293,20.797772,7.105476,7.036677,2.887016,2.720633,6.492838,6.937594,8.477969
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4,0.8
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,371.0


### 3. Data cleaning
#### In order to use sklearn algorithms, the entire dataset must contain operable numerical data

#### 3.1 Remove overdata

In [5]:
#Col 'Risk_MM' includes final data
#Col 'Date' is not relevant to this analysis
weather = weather.drop(columns=['RISK_MM','Date'],axis=1)

#### 3.2 Manage missing values

In [6]:
#Check for null values
weather.count().sort_values()

Sunshine          74377
Evaporation       81350
Cloud3pm          85099
Cloud9am          88536
Pressure9am      128179
Pressure3pm      128212
WindDir9am       132180
WindGustDir      132863
WindGustSpeed    132923
WindDir3pm       138415
Humidity3pm      138583
Temp3pm          139467
WindSpeed3pm     139563
Humidity9am      140419
RainToday        140787
Rainfall         140787
WindSpeed9am     140845
Temp9am          141289
MinTemp          141556
MaxTemp          141871
Location         142193
RainTomorrow     142193
dtype: int64

In [7]:
#By default, 'wipe_empty_columns' remove columns with more than 70% of NaN
#Arg 'max_empty' is used to change the threshold
weather = shinypanda.wipe_empty_columns(weather, max_empty=0.33)
weather.count().sort_values()

Pressure9am      128179
Pressure3pm      128212
WindDir9am       132180
WindGustDir      132863
WindGustSpeed    132923
WindDir3pm       138415
Humidity3pm      138583
Temp3pm          139467
WindSpeed3pm     139563
Humidity9am      140419
Rainfall         140787
RainToday        140787
WindSpeed9am     140845
Temp9am          141289
MinTemp          141556
MaxTemp          141871
Location         142193
RainTomorrow     142193
dtype: int64

In [8]:
#Remove NAs
weather = weather.dropna(how='any')
weather.count().sort_values()

Location         112925
Temp3pm          112925
Temp9am          112925
Pressure3pm      112925
Pressure9am      112925
Humidity3pm      112925
Humidity9am      112925
WindSpeed3pm     112925
WindSpeed9am     112925
WindDir3pm       112925
WindDir9am       112925
WindGustSpeed    112925
WindGustDir      112925
Rainfall         112925
MaxTemp          112925
MinTemp          112925
RainToday        112925
RainTomorrow     112925
dtype: int64

#### 3.3 Transform strings

In [9]:
#Change bool values from string to int
weather['RainToday'].replace({'No': 0, 'Yes': 1}, inplace = True)
weather['RainTomorrow'].replace({'No': 0, 'Yes': 1}, inplace = True)

weather.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Albury,13.4,22.9,0.6,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,0,0
1,Albury,7.4,25.1,0.0,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,0,0
2,Albury,12.9,25.7,0.0,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,0,0
3,Albury,9.2,28.0,0.0,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,0,0
4,Albury,17.5,32.3,1.0,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,0,0


In [10]:
#Get dummies from categorical columns (wind directions)
weather = pd.get_dummies(weather, columns=['WindGustDir', 'WindDir3pm', 'WindDir9am'])

weather.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,Albury,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,...,0,0,0,0,0,0,0,1,0,0
1,Albury,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,...,1,0,0,0,0,0,0,0,0,0
2,Albury,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,...,0,0,0,0,0,0,0,1,0,0
3,Albury,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,...,0,0,0,1,0,0,0,0,0,0
4,Albury,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,...,0,0,0,0,0,0,0,0,0,0


## 4. Area selector _(optional)_
#### If executed, specify an area of Australia to improve algorythms accuracy

In [11]:
import ipywidgets as widgets

locations_list = list(weather['Location'].unique())
location = ''

#Charge the selector
selector = widgets.Dropdown(
    options=locations_list,
    description='Area:',
    disabled=False,
    )

display(selector)

Dropdown(description='Area:', options=('Albury', 'BadgerysCreek', 'Cobar', 'CoffsHarbour', 'Moree', 'NorahHead…

In [21]:
#Import the selector
location = selector.value

#Filter dataset by location
location_filter = weather['Location']==location
located_weather = weather[location_filter]

#Remove location column
located_weather = weather.drop(columns='Location', axis=1)

print('Data loaded for', location, "area:")
located_weather.head()

Data loaded for Ballarat area:


Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,1,0,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,1,0,0,0,0,0,0,0,0,0
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,1,0,0
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,1,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,0,0,0,0,0,0,0,0,0,0


## 5. Machine learning methods comparison

In [None]:
#Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

X = located_weather.drop('RainTomorrow', axis=1)
y = located_weather['RainTomorrow']

t0 = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=5)
clf_rf.fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)
score = accuracy_score(y_test, y_pred)

print('Accuracy :', score)
print('Time taken :' , time.time()-t0)

In [None]:
#Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)
clf_dt = DecisionTreeClassifier(random_state=5)
clf_dt.fit(X_train,y_train)
y_pred = clf_dt.predict(X_test)
score = accuracy_score(y_test, y_pred)

print('Accuracy :', score)
print('Time taken :' , time.time()-t0)

In [None]:
from sklearn import tree
from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron = perceptron.fit(X_train, y_train)
perceptron.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, perceptron.predict(X_test)))

In [None]:
#Support vector
from sklearn import svm
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)
clf_svc = svm.SVC(kernel='linear')
clf_svc.fit(X_train,y_train)
y_pred = clf_svc.predict(X_test)
score = accuracy_score(y_test,y_pred)

print('Accuracy :', score)
print('Time taken :' , time.time()-t0)

### con localizaciones dummies

In [None]:
weather_dummies = weather.copy()
weather_dummies = pd.get_dummies(weather_dummies, columns=['Location'])

In [None]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

X = weather_dummies.drop('RainTomorrow', axis=1)
y = weather_dummies['RainTomorrow']

t0 = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=5)
clf_rf.fit(X_train, y_train)
y_pred = clf_rf.predict(X_test)
score = accuracy_score(y_test, y_pred)

print('Accuracy :', score)
print('Time taken :' , time.time()-t0)

In [None]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)
clf_dt = DecisionTreeClassifier(random_state=5)
clf_dt.fit(X_train,y_train)
y_pred = clf_dt.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy :', score)
print('Time taken :' , time.time()-t0)

In [None]:
#SUPPORT VECTOR
from sklearn import svm
from sklearn.model_selection import train_test_split

t0=time.time()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33)
clf_svc = svm.SVC(kernel='linear')
clf_svc.fit(X_train,y_train)
y_pred = clf_svc.predict(X_test)
score = accuracy_score(y_test,y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)