In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [98]:
df = pd.read_csv("../weatherAUS.csv")
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,12/1/2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,12/2/2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,12/3/2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,12/4/2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,12/5/2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,6/21/2017,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,6/22/2017,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,6/23/2017,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,6/24/2017,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


In [99]:
#Remove Rows where required columns are null
df1 = df[df['RainTomorrow'].notna() & df['Location'].notna()]

#Remove columns
df1 = df1.drop('Date', axis=1)

#Replace with mean values for numerical values
mean_col = ['MinTemp','MaxTemp','Rainfall','Evaporation','Sunshine','WindGustSpeed','WindSpeed9am','WindSpeed3pm','Humidity9am','Humidity3pm','Pressure9am','Pressure3pm','Cloud9am','Cloud3pm','Temp9am','Temp3pm']
df1[mean_col] = df1[mean_col].fillna(df1.groupby('Location')[mean_col].transform(lambda x: x.fillna(x.mean())))
#Replace with mean of the column for null mean values of groups
df1[mean_col] = df1[mean_col].fillna(df1.groupby('Location')[mean_col].mean().mean())

#Replace with mode values for categorical values
mode_col = ['WindGustDir','WindDir9am','WindDir3pm','RainToday']
df1[mode_col] = df1[mode_col].fillna(df1.groupby('Location')[mode_col].transform(lambda x: x.fillna(x.mode().iloc[0])))
#Replace with mode of the column for null mode values of groups
df1[mode_col] = df1[mode_col].fillna(df1.mode().iloc[0])

In [104]:
#Convert label values into numerical values
le = LabelEncoder()
df1['Location'] = le.fit_transform(df1['Location'])
df1['WindGustDir'] = le.fit_transform(df1['WindGustDir'])
df1['WindDir9am'] = le.fit_transform(df1['WindDir9am'])
df1['WindDir3pm'] = le.fit_transform(df1['WindDir3pm'])
df1['RainToday'] = le.fit_transform(df1['RainToday'])
df1['RainTomorrow'] = le.fit_transform(df1['RainTomorrow'])

In [105]:
#Define x,y variables
x = df1.drop(['RainTomorrow'], axis=1)
y = df1['RainTomorrow']

In [106]:
#Divide the dataset for training and testing
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2)

In [110]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

#Logistic regression to the training set model 2
logreg = LogisticRegression(solver='liblinear', random_state=0)
result = logreg.fit(x_train, y_train)

#Predict the test set results model 2
y_pred_test = logreg.predict(x_test)
print('Model 2 accuracy (Logistic Regression): {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))

Model 2 accuracy (Linear Regression): 0.8383


In [111]:
#Naive Bayes
from sklearn.preprocessing import StandardScaler  
from sklearn.naive_bayes import GaussianNB  

#Feature scaling
sc = StandardScaler()  
x_train = sc.fit_transform(x_train)  
x_test = sc.transform(x_test)  

#Fit Naive Bayes to the training set model 2
classifier = GaussianNB()  
classifier.fit(x_train, y_train)  

#Predict the test set results model 2
y_pred_test = classifier.predict(x_test)  
print('Model 2 accuracy (Naive Bayes): {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))

Model 2 accuracy (Naive Bayes): 0.7995


In [112]:
#K-Nearest Neighbor
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier  

#Feature Scaling     
sc= StandardScaler()    
x_train= sc.fit_transform(x_train)    
x_test= sc.transform(x_test)  

#Fit K-Nearest Neighbor to the training set model 2
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  
classifier.fit(x_train, y_train)  

#Predict the test set results model 2
y_pred_test= classifier.predict(x_test)  
print('Model 2 accuracy (K-Nearest Neighbor): {0:0.4f}'. format(accuracy_score(y_test, y_pred_test)))

Model 2 accuracy (K-Nearest Neighbor): 0.8317
