# Rainfall Prediction in Australian Areas
* Data set and extraction 
  * Dataset is taken from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package/version/2#weatherAUS.csv 
  * Data extraction is by downloading and reading the excel file. 
* Data cleanup and organizing existing data
  * Column Dropped #These columns are not used as I have derived new columns using these.
      * 'Date', 'Location','RainToday', 'RISK_MM'
  * All the null values 'NaN' is replaced by mean of the column as this will not have wrong effect on our model for calculations  
* For Data visualization
  * The critical dimensions(columns) of the data are Rainfall, Humidity, Evaporation, Pressure, Sunshine 
      * The correlation of rainfall with Evaporation, Humidity and WindSpeed has been plotted.
* Predictive statistics
  * Target - It will Rain Tomorrow? Yes or No


# Step 1 - Importing libararies and Reading file

In [1]:
#Importing libararies and Reading file
import pandas as pd
import os
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels
import statsmodels.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
print(os.getcwd())


C:\Users\Mandy\Documents\ALY6020\Final Project\Final Project


In [2]:
mydata = pd.read_csv('weatherAUS.csv')

In [3]:
#Printing the top 5 Row of the imported dataset
mydata.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [4]:
#Following are the headers of the 
print(mydata.columns)

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')


# Step 2 - Exploring and Cleaning the Data 


## Handle the Nulls values and Reducing Variables

In [5]:
mydata.shape

(142193, 24)

In [6]:
mydata.fillna(mydata.mean(), inplace=True)

In [7]:
mydata.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow      object
dtype: object

## Converting categorical to numerical data

In [8]:
mydata['WindGustDir'],_ = pd.factorize(mydata['WindGustDir'])
mydata['WindDir9am'],_ = pd.factorize(mydata['WindDir9am'])
mydata['WindDir3pm'],_ = pd.factorize(mydata['WindDir3pm'])
mydata['RainTomorrow'],_ = pd.factorize(mydata['RainTomorrow'])

In [9]:
mydata.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,5.469824,7.624853,0,44.0,0,...,22.0,1007.7,1007.1,8.0,4.503167,16.9,21.8,No,0.0,0
1,2008-12-02,Albury,7.4,25.1,0.0,5.469824,7.624853,1,44.0,1,...,25.0,1010.6,1007.8,4.437189,4.503167,17.2,24.3,No,0.0,0
2,2008-12-03,Albury,12.9,25.7,0.0,5.469824,7.624853,2,46.0,0,...,30.0,1007.6,1008.7,4.437189,2.0,21.0,23.2,No,0.0,0
3,2008-12-04,Albury,9.2,28.0,0.0,5.469824,7.624853,3,24.0,2,...,16.0,1017.6,1012.8,4.437189,4.503167,18.1,26.5,No,1.0,0
4,2008-12-05,Albury,17.5,32.3,1.0,5.469824,7.624853,0,41.0,3,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,0


In [10]:
mydata.dtypes

Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir        int64
WindGustSpeed    float64
WindDir9am         int64
WindDir3pm         int64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RISK_MM          float64
RainTomorrow       int64
dtype: object

In [11]:
mydata.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM,RainTomorrow
count,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,...,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0,142193.0
mean,12.1864,23.226784,2.349974,5.469824,7.624853,6.994135,39.984292,6.848931,7.186528,14.001988,...,68.84381,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.987509,21.687235,2.360682,0.224181
std,6.388924,7.109554,8.423217,3.168114,2.734927,4.994925,13.138385,4.90862,4.720509,8.851082,...,18.932077,20.532065,6.746248,6.681788,2.27808,2.104709,6.472166,6.870771,8.477969,0.417043
min,-8.5,-4.8,0.0,0.0,0.0,-1.0,6.0,-1.0,-1.0,0.0,...,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4,0.0,0.0
25%,7.6,17.9,0.0,4.0,7.624853,2.0,31.0,3.0,3.0,7.0,...,57.0,37.0,1013.5,1011.0,3.0,4.0,12.3,16.7,0.0,0.0
50%,12.0,22.7,0.0,5.469824,7.624853,7.0,39.0,7.0,7.0,13.0,...,70.0,51.482606,1017.653758,1015.258204,4.437189,4.503167,16.8,21.3,0.0,0.0
75%,16.8,28.2,0.8,5.469824,8.7,11.0,46.0,11.0,11.0,19.0,...,83.0,65.0,1021.8,1019.4,6.0,6.0,21.5,26.3,0.8,0.0
max,33.9,48.1,371.0,145.0,14.5,15.0,135.0,15.0,15.0,130.0,...,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,371.0,1.0


## Normalizing the Data-set

In [12]:
mydata.drop(columns={'Date', 'Location','RainToday', 'RISK_MM'}, inplace=True) 

In [13]:
mydata.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
0,13.4,22.9,0.6,5.469824,7.624853,0,44.0,0,0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,4.503167,16.9,21.8,0
1,7.4,25.1,0.0,5.469824,7.624853,1,44.0,1,1,4.0,22.0,44.0,25.0,1010.6,1007.8,4.437189,4.503167,17.2,24.3,0
2,12.9,25.7,0.0,5.469824,7.624853,2,46.0,0,1,19.0,26.0,38.0,30.0,1007.6,1008.7,4.437189,2.0,21.0,23.2,0
3,9.2,28.0,0.0,5.469824,7.624853,3,24.0,2,2,11.0,9.0,45.0,16.0,1017.6,1012.8,4.437189,4.503167,18.1,26.5,0
4,17.5,32.3,1.0,5.469824,7.624853,0,41.0,3,3,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0


In [15]:
mydata_norm = (mydata - mydata.min()) / (mydata.max() - mydata.min())

## Load features and target values

In [16]:
y = mydata_norm['RainTomorrow']
X = mydata_norm.drop(columns=['RainTomorrow'])

## Split Train and Test Data Set

In [17]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=40)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=21)  
classifier.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=21, p=2,
           weights='uniform')

## Evaluating model performance with K=21

In [19]:
from sklearn.metrics import classification_report, confusion_matrix  
y_pred = classifier.predict(X_test)  
print(np.mean(y_pred != y_test))
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

0.16276943633742397
[[21109   879]
 [ 3750  2701]]
              precision    recall  f1-score   support

         0.0       0.85      0.96      0.90     21988
         1.0       0.75      0.42      0.54      6451

   micro avg       0.84      0.84      0.84     28439
   macro avg       0.80      0.69      0.72     28439
weighted avg       0.83      0.84      0.82     28439



In [20]:
print(np.mean(y_pred == y_test))

0.8372305636625761


## Finding the Optimal K

In [21]:
error = []

# Calculating error for K values between 1 and 300
for i in range(1, 100):  
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))

KeyboardInterrupt: 

In [None]:

plt.figure(figsize=(12, 6))  
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',  
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')  
plt.xlabel('K Value')  
plt.ylabel('Mean Error')

## Decision Tree

In [59]:
y = mydata['RainTomorrow']
X = mydata.drop(columns=['RainTomorrow'])

In [60]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=7)

In [62]:
from sklearn.tree import DecisionTreeClassifier
my_tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=5, min_samples_leaf=5).fit(X_train,y_train)
#my_tree = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5).fit(X_train,y_train)

In [63]:
y_pred = my_tree.predict(X_test)

In [64]:
print("The prediction accuracy is: ",my_tree.score(X_test,y_test)*100,"%")

The prediction accuracy is:  83.80450070323488 %


In [65]:
print(np.mean(y_pred != y_test))## Visualize the tree

0.1619549929676512


In [66]:
 from sklearn.metrics import classification_report, confusion_matrix  
y_pred = my_tree.predict(X_test) 

In [67]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[10467   556]
 [ 1747  1450]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     11023
           1       0.72      0.45      0.56      3197

   micro avg       0.84      0.84      0.84     14220
   macro avg       0.79      0.70      0.73     14220
weighted avg       0.83      0.84      0.82     14220



## Visualize the tree

In [68]:
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [69]:
from sklearn import tree
import collections

from sklearn.tree import export_graphviz
import pydotplus
import pydot

data_feature_names = X.columns
dot_data = tree.export_graphviz(my_tree,
                                feature_names=data_feature_names,
                                out_file=None,
                                filled=True,
                                rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data)

colors = ('turquoise', 'orange')
edges = collections.defaultdict(list)

for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])

graph.write_png('Rain Tree.png')

True

In [70]:
my_tree.feature_importances_

array([0.00198193, 0.        , 0.05327209, 0.        , 0.06630406,
       0.        , 0.10037497, 0.        , 0.        , 0.        ,
       0.        , 0.00264312, 0.70783295, 0.        , 0.060903  ,
       0.        , 0.        , 0.00363552, 0.00305236])

In [71]:
data_feature_names

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm'],
      dtype='object')

## Random Forest Classifier

In [72]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=800)

In [73]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=800)

In [74]:
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [75]:
y_pred=clf.predict(X_test)

In [76]:
from sklearn.metrics import classification_report, confusion_matrix 
import numpy as np
print(np.mean(y_pred == y_test))

0.8578762306610408


In [77]:
from sklearn.metrics import classification_report, confusion_matrix 
import numpy as np
print(np.mean(y_pred != y_test))

0.1421237693389592


In [78]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[10533   490]
 [ 1531  1666]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     11023
           1       0.77      0.52      0.62      3197

   micro avg       0.86      0.86      0.86     14220
   macro avg       0.82      0.74      0.77     14220
weighted avg       0.85      0.86      0.85     14220



In [79]:
clf.feature_importances_

array([0.05009842, 0.04857717, 0.07365258, 0.0277376 , 0.05788576,
       0.03029435, 0.05791373, 0.03085641, 0.03166586, 0.03331624,
       0.0355893 , 0.05750379, 0.18192996, 0.05814719, 0.06536499,
       0.02312878, 0.03962776, 0.04719449, 0.04951563])

## Logistic Regression

In [80]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1, solver='newton-cg', max_iter=550,
                         multi_class='multinomial')
clf.fit(X_train, y_train) 



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=550, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=1, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [81]:
y_pred = clf.predict(X_test)

In [82]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[10416   607]
 [ 1636  1561]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90     11023
           1       0.72      0.49      0.58      3197

   micro avg       0.84      0.84      0.84     14220
   macro avg       0.79      0.72      0.74     14220
weighted avg       0.83      0.84      0.83     14220



In [83]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 84.23%


In [84]:
from sklearn.metrics import log_loss
log_loss(y_test, y_pred)

5.448028186316792

## Linear regression

In [86]:
from sklearn import datasets, linear_model
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [87]:
y_pred = clf.predict(X_test)

In [89]:
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained R2 square score: 1 is perfect prediction
print('R2 square: %.2f' % r2_score(y_test, y_pred))

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % explained_variance_score(y_test, y_pred))

Coefficients: 
 [-0.0014597  -0.00275602  0.00401227  0.00098054 -0.01895917 -0.00030909
  0.00771082  0.00107738 -0.00143867 -0.00021213 -0.00471067 -0.00017903
  0.00835547  0.01469799 -0.02386327 -0.00242832  0.01180673 -0.00185055
  0.00595657]
Mean squared error: 0.16
R2 square: 0.09
Variance score: 0.12


In [113]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

results = smf.ols('RainTomorrow ~ Humidity9am + WindGustSpeed + Sunshine+ Rainfall+ Pressure3pm', data = mydata).fit()
print(results.params)
results.summary()

Intercept        10.863068
Humidity9am       0.004864
WindGustSpeed     0.005924
Sunshine         -0.035050
Rainfall          0.005133
Pressure3pm      -0.010791
dtype: float64


0,1,2,3
Dep. Variable:,RainTomorrow,R-squared:,0.235
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,8746.0
Date:,"Thu, 28 Mar 2019",Prob (F-statistic):,0.0
Time:,17:21:07,Log-Likelihood:,-58340.0
No. Observations:,142193,AIC:,116700.0
Df Residuals:,142187,BIC:,116800.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.8631,0.162,67.111,0.000,10.546,11.180
Humidity9am,0.0049,5.78e-05,84.083,0.000,0.005,0.005
WindGustSpeed,0.0059,8.17e-05,72.479,0.000,0.006,0.006
Sunshine,-0.0350,0.000,-91.903,0.000,-0.036,-0.034
Rainfall,0.0051,0.000,42.446,0.000,0.005,0.005
Pressure3pm,-0.0108,0.000,-68.013,0.000,-0.011,-0.010

0,1,2,3
Omnibus:,15624.841,Durbin-Watson:,1.735
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21453.898
Skew:,0.95,Prob(JB):,0.0
Kurtosis:,3.113,Cond. No.,170000.0
