# Weather prediction: Rains or not
I build a machine-learning classification model to make a prediction for the weather, whether it rains or not tomorrow. I work with the data from Australia in order to run the model and get results. I check accuracy scores and computation time of various models to find the best model that fits the data, including Logistic Regression, Decision Tree, Random Forest, and Support Vector Machine.

In [1]:
#Load the csv file as data frame.
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('weatherAUS.csv')


In [2]:
# The shape (rows and columns) of the data in random sample
print('Size of the data frame is :',df.shape)
df.sample(5)


Size of the data frame is : (142193, 24)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
104313,2012-10-04,Woomera,15.3,34.6,0.0,19.0,11.5,N,52.0,N,...,7.0,1018.5,1013.5,0.0,1.0,27.7,34.0,No,0.0,No
120750,2017-01-27,Perth,21.4,34.0,0.0,14.2,12.9,SW,43.0,S,...,39.0,1009.9,1007.8,3.0,1.0,27.4,31.1,No,0.0,No
4693,2013-12-24,BadgerysCreek,18.1,21.4,0.0,,,ESE,24.0,SSE,...,65.0,1018.6,1018.6,,,19.6,20.3,No,0.0,No
50017,2015-05-31,Tuggeranong,-0.9,14.0,0.0,,,WSW,46.0,,...,48.0,1016.3,1013.0,,,2.6,11.1,No,1.0,No
96682,2016-10-02,Adelaide,12.9,20.7,0.0,,,N,44.0,NNW,...,56.0,996.1,996.7,,,19.8,17.7,No,27.6,Yes


# Data Pre-processing

In [3]:
# Find out null values
df.count().sort_values()

Sunshine          74377
Evaporation       81350
Cloud3pm          85099
Cloud9am          88536
Pressure9am      128179
Pressure3pm      128212
WindDir9am       132180
WindGustDir      132863
WindGustSpeed    132923
WindDir3pm       138415
Humidity3pm      138583
Temp3pm          139467
WindSpeed3pm     139563
Humidity9am      140419
RainToday        140787
Rainfall         140787
WindSpeed9am     140845
Temp9am          141289
MinTemp          141556
MaxTemp          141871
Date             142193
Location         142193
RISK_MM          142193
RainTomorrow     142193
dtype: int64

In [4]:
# Drop columns with the highest missing values
# Drop the location column as the task is to predict rain in Australia
# Drop the Date column
# Drop RISK_MM to prevent the multicollinearity problem
df = df.drop(columns=['Sunshine','Evaporation','Cloud3pm','Cloud9am','Location','RISK_MM','Date'],axis=1)
df.shape

(142193, 17)

In [5]:
# Get rid of all rows with null values
df = df.dropna(how='any')
df.shape

(112925, 17)

In [6]:
# Get rid of the outliers in the numerical data - Use z-score to detect and remove the outliers
from scipy import stats
z = np.abs(stats.zscore(df._get_numeric_data()))
print(z)
df= df[(z < 3).all(axis=1)]
print(df.shape)

[[0.11756741 0.10822071 0.20666127 ... 1.14245477 0.08843526 0.04787026]
 [0.84180219 0.20684494 0.27640495 ... 1.04184813 0.04122846 0.31776848]
 [0.03761995 0.29277194 0.27640495 ... 0.91249673 0.55672435 0.15688743]
 ...
 [1.44940294 0.23548728 0.27640495 ... 0.58223051 1.03257127 0.34701958]
 [1.16159206 0.46462594 0.27640495 ... 0.25166583 0.78080166 0.58102838]
 [0.77784422 0.4789471  0.27640495 ... 0.2085487  0.37167606 0.56640283]]
(107868, 17)


In [7]:
# Recode the categorical columns: 'yes' to 1 and 'no' to 0 for RainToday and RainTomorrow
df['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
df['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)


In [8]:
# See values of other categorical columns
print("WindGustDir: ", np.unique(df['WindGustDir']))
print("WindDir3pm:  ", np.unique(df['WindDir3pm']))
print("WindDir9am:  ", np.unique(df['WindDir9am']))

WindGustDir:  ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
WindDir3pm:   ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
WindDir9am:   ['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']


In [9]:
# Create dummies from the categorical columns
df = pd.get_dummies(df, columns=['WindGustDir', 'WindDir3pm', 'WindDir9am'])
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,0,0,0,0,0,0,0,1,0,0
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,1,0,0,0,0,0,0,0,0,0
2,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,...,0,0,0,0,0,0,0,1,0,0
3,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,...,0,0,0,1,0,0,0,0,0,0
4,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Standardize the data with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,0.518717,0.464198,0.021429,0.506849,0.486486,0.52381,0.674157,0.22,0.268409,0.309353,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.358289,0.518519,0.0,0.506849,0.054054,0.47619,0.370787,0.25,0.337292,0.326139,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.505348,0.533333,0.0,0.534247,0.459459,0.571429,0.303371,0.3,0.266033,0.347722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.406417,0.590123,0.0,0.232877,0.243243,0.166667,0.382022,0.16,0.503563,0.446043,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.628342,0.696296,0.035714,0.465753,0.135135,0.428571,0.797753,0.33,0.342043,0.282974,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature selection

In [11]:
# Select the 3 most important features for the prediction of RainTomorrow - use SelectKBest based on chi2 values
from sklearn.feature_selection import SelectKBest, chi2
X = df.loc[:, df.columns!='RainTomorrow']
y = df[['RainTomorrow']]
selector = SelectKBest(chi2, k=3)
selector.fit(X, y)
X_new = selector.transform(X)
print(X.columns[selector.get_support(indices=True)])

Index(['Rainfall', 'Humidity3pm', 'RainToday'], dtype='object')


In [12]:
# Redefine the dataframe and find the best predictor for y
df = df[['Humidity3pm','Rainfall','RainToday','RainTomorrow']]
# Play with the variables of the df to find out the best predictor for RainTomorrow
# And see that the Humidity3pm is the best one to assign it as X for the final model
X = df[['Humidity3pm']] 
y = df[['RainTomorrow']]
print(df.shape)
df.head()

(107868, 4)


Unnamed: 0,Humidity3pm,Rainfall,RainToday,RainTomorrow
0,0.22,0.021429,0.0,0.0
1,0.25,0.0,0.0,0.0
2,0.3,0.0,0.0,0.0
3,0.16,0.0,0.0,0.0
4,0.33,0.035714,0.0,0.0


# Train test splitting

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=0.25)

# Finding the best model

In [14]:
#Logistic Regression 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time

t0=time.time()
logreg = LogisticRegression(random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy : ', score)
print('Time taken : ', time.time()-t0)

Accuracy :  0.8355026513887344
Time taken :  0.14999055862426758


  return f(**kwargs)


In [15]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

t0=time.time()
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

score = accuracy_score(y_test, y_pred)
print('Accuracy :',score)
print('Time taken :' , time.time()-t0)

Accuracy : 0.8359105573478696
Time taken : 0.08699417114257812


In [16]:
# Random Forest 
from sklearn.ensemble import RandomForestClassifier

t0=time.time()
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

score = accuracy_score(y_test, y_pred)
print('Accuracy  :', score)
print('Time taken:', time.time()-t0)

  rf.fit(X_train, y_train)


Accuracy  : 0.8359105573478696
Time taken: 4.126753807067871


In [17]:
#Support Vector Machine
from sklearn import svm

t0=time.time()
sv = svm.SVC(kernel='linear', probability=True)
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)

score = accuracy_score(y_test,y_pred)
print('Accuracy  :',score)
print('Time taken :', time.time()-t0)

  return f(**kwargs)


Accuracy  : 0.7929321021989839
Time taken : 388.7428081035614


Considering the computation time and accuracy rate the DecisionTreeClassifier model appears to be the best model.