# Rain Prediction in Australia

### Install Required Packages

In [90]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
import sklearn.tree as tree

### Download and Load the Dataset

In [91]:
path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv"
df = pd.read_csv(path)

In [92]:
# A brief overview
df.head()
df.columns
df.dtypes

Date              object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed      int64
WindDir9am        object
WindDir3pm        object
WindSpeed9am       int64
WindSpeed3pm       int64
Humidity9am        int64
Humidity3pm        int64
Pressure9am      float64
Pressure3pm      float64
Cloud9am           int64
Cloud3pm           int64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

### Data Preprocessing

Convert categorical variables to binary variables

In [93]:
df_sydney_processed = pd.get_dummies(data = df, 
                                     columns = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [94]:
df_sydney_processed.replace(['No', 'Yes'], [0, 1], 
                            inplace = True)

In [95]:
# Remove the Date column
df_sydney_processed.drop("Date", axis = 1, inplace = True)
# df_sydney_processed.head()

In [96]:
df_sydney_processed = df_sydney_processed.astype(float)
# df_sydney_processed.dtypes

Set **features (x)** and **target variable (Y)**

In [97]:
features = df_sydney_processed.drop(columns = 'RainTomorrow', axis = 1)
Y = df_sydney_processed['RainTomorrow']

#### Question 1, Train and Test Datasets

In [98]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, 
                                                    test_size = 0.2, 
                                                    random_state = 10)

#### Question 2, Linear Regression Model

In [99]:
LinearReg = linear_model.LinearRegression()

train_x = np.asanyarray(x_train)
train_y = np.asanyarray(y_train)
LinearReg.fit(train_x, train_y)

print("Intercept: ", LinearReg.intercept_)
print("Coefficients: ", LinearReg.coef_)

Intercept:  -41499557225.61028
Coefficients:  [-2.36933212e-02  1.30007994e-02  7.30206238e-04  6.48991926e-03
 -3.51699778e-02  4.23739763e-03  1.83047446e-03  7.90999468e-04
  9.55896155e-04  8.56089162e-03  7.70813418e-03 -9.25470178e-03
 -8.86504271e-03  1.00331733e-02  1.44689084e-02 -3.47639901e-03
  2.14785220e+10  2.14785220e+10 -5.67846844e+09 -5.67846844e+09
 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09
 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09
 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09 -5.67846844e+09
 -5.67846844e+09 -5.67846844e+09  1.70623652e+10  1.70623652e+10
  1.70623652e+10  1.70623652e+10  1.70623652e+10  1.70623652e+10
  1.70623652e+10  1.70623652e+10  1.70623652e+10  1.70623652e+10
  1.70623652e+10  1.70623652e+10  1.70623652e+10  1.70623652e+10
  1.70623652e+10  1.70623652e+10  8.63713855e+09  8.63713855e+09
  8.63713855e+09  8.63713855e+09  8.63713855e+09  8.63713855e+09
  8.63713855e+09  8.63713855e+09  8.63713855

#### Question 3, Predict on the Test Data

In [100]:
predictions = LinearReg.predict(x_test)
print(predictions)


[ 1.31782532e-01  2.76153564e-01  9.78088379e-01  2.87483215e-01
  1.32377625e-01  4.60464478e-01  3.56773376e-01  8.56460571e-01
  6.75010681e-01  3.82766724e-02  4.82940674e-03  2.81181335e-01
  3.39042664e-01  7.80868530e-02  6.26449585e-02  5.64521790e-01
 -6.15615845e-02  5.24208069e-01  1.53785706e-01  3.59672546e-01
  6.05087280e-02  9.03572083e-01  4.67338562e-01  2.03323364e-01
 -7.10830688e-02  3.83865356e-01  5.36071777e-01 -2.28652954e-02
  6.40052795e-01 -9.56726074e-02  3.78089905e-01  1.20277405e-01
 -1.81350708e-02  5.53970337e-02  5.63514709e-01  1.06302643e+00
 -6.73675537e-03  5.14488220e-01 -8.83865356e-02  6.92062378e-02
  2.44827271e-02  8.71726990e-01  2.44621277e-01  3.94760132e-01
  2.67494202e-01  4.46762085e-01 -4.75540161e-02  1.89407349e-01
  7.76573181e-01  1.57707214e-01  3.91387939e-03 -5.19638062e-02
  2.07328796e-01 -2.07908630e-01 -7.60879517e-02  2.49641418e-01
  2.79273987e-01  6.02851868e-01  6.29592896e-01  4.90715027e-01
  5.63888550e-02  1.05461



#### Question 4, Calculate the Value for Each Metric

In [101]:
LinearRegression_MAE = metrics.mean_absolute_error(y_test, predictions)
LinearRegression_MSE = metrics.mean_squared_error(y_test, predictions)
LinearRegression_R2 = np.sqrt(metrics.mean_squared_error(y_test, predictions))

print("MAE: ", LinearRegression_MAE, 
      "MSE: ", LinearRegression_MSE, 
      "R2: ", LinearRegression_R2)

MAE:  0.25631900234076815 MSE:  0.11572291759564364 R2:  0.34018071314471027


#### Question 5, Report, showing the MAE, MSE, and R2

In [102]:
Report = {"Metrics": ["MAE", "MSE", "R2"], 
          "Result": [LinearRegression_MAE, 
                     LinearRegression_MSE, 
                     LinearRegression_R2]}

print(pd.DataFrame(Report))

  Metrics    Result
0     MAE  0.256319
1     MSE  0.115723
2      R2  0.340181


#### Question 6, K-NN

In [103]:
K = 4
KNN = KNeighborsClassifier(n_neighbors = K).fit(x_train, y_train)
KNN

#### Question 7, Predict on K-NN

In [104]:
prediction_KNN = KNN.predict(x_test.values)
print(prediction_KNN)

[0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0.
 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1.
 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.



#### Question 8, Calculate the Value for Each Metric

In [105]:
KNN_Accuracy_Score = metrics.accuracy_score(y_test, prediction_KNN)
KNN_JaccardIndex = jaccard_score(y_test, prediction_KNN, pos_label = 0)
KNN_F1_Score = f1_score(y_test, prediction_KNN)

print("Accuracy Score: ", KNN_Accuracy_Score, 
      "Jaccard Score: ", KNN_JaccardIndex, 
      "F1 Score: ", KNN_F1_Score)
print("-------------------------------------------------------------------------------------------------------")
Report_KNN = {"Metrics KNN": ["Accuracy", "JaccardIndex", "F1Score"], 
              "Result": [KNN_Accuracy_Score, 
                         KNN_JaccardIndex, 
                         KNN_F1_Score]}
print(pd.DataFrame(Report_KNN))

Accuracy Score:  0.8183206106870229 Jaccard Score:  0.7901234567901234 F1 Score:  0.5966101694915255
-------------------------------------------------------------------------------------------------------
    Metrics KNN    Result
0      Accuracy  0.818321
1  JaccardIndex  0.790123
2       F1Score  0.596610


#### Question 9, Decision Tree Model

In [106]:
Tree = DecisionTreeClassifier()
Tree.fit(x_train, y_train)

#### Question 10, Predict on Test Data Using Decision Tree

In [107]:
prediction_tree = Tree.predict(x_test)
print(prediction_tree)
# tree.plot_tree(Tree)

[0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1.
 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.

#### Question 11, Calculate the Value for Each Metric

In [108]:
Tree_Accuracy_Score = metrics.accuracy_score(y_test, prediction_tree)
Tree_JaccardIndex = jaccard_score(y_test, prediction_tree)
Tree_F1_Score = f1_score(y_test, prediction_tree)

Report_Tree = {"Metrics KNN": ["Accuracy", "JaccardIndex", "F1Score"], 
              "Result": [Tree_Accuracy_Score, 
                         Tree_JaccardIndex, 
                         Tree_F1_Score]}
print(pd.DataFrame(Report_Tree))

    Metrics KNN    Result
0      Accuracy  0.748092
1  JaccardIndex  0.382022
2       F1Score  0.552846


#### Question 12, Logistic Regression Split the Dataset

In [109]:
x_log_train, x_log_test, y_log_train, y_log_test = train_test_split(features, Y, 
                                                                    test_size = 0.2, 
                                                                    random_state = 1)

#### Question 13, Logistic Regression Model

In [110]:
LR = LogisticRegression(solver = "liblinear").fit(x_log_train, y_log_train)
LR

#### Question 14, Predict Using Logistic Regression

In [111]:
predictions_log = LR.predict(x_log_test)
print(predictions_log)

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.

In [112]:
predictions_proba_log = LR.predict_proba(x_log_test)
print(predictions_proba_log)

[[0.74339483 0.25660517]
 [0.97495683 0.02504317]
 [0.50982014 0.49017986]
 ...
 [0.98010306 0.01989694]
 [0.69834832 0.30165168]
 [0.22120583 0.77879417]]


#### Question 15, Calculate the Value for Each Metric

In [113]:
LR_Accuracy_Score = metrics.accuracy_score(y_log_test, predictions_log)
LR_JaccardIndex = jaccard_score(y_log_test, predictions_log)
LR_F1_Score = f1_score(y_log_test, predictions_log)
LR_Log_Loss = log_loss(y_log_test, predictions_log)

Report_log = {"Metrics Logistic Regression":["Accuracy Score", 
                                         "JaccardIndex", 
                                         "F1Score", 
                                         "Log-Loss"], 
          "Result": [LR_Accuracy_Score, 
                     LR_JaccardIndex, 
                     LR_F1_Score, 
                     LR_Log_Loss]}
print(pd.DataFrame(Report_log))

  Metrics Logistic Regression    Result
0              Accuracy Score  0.836641
1                JaccardIndex  0.509174
2                     F1Score  0.674772
3                    Log-Loss  5.888047


#### Question 16, SVM Model

In [114]:
SVM = svm.SVC(kernel = "rbf")
SVM.fit(x_log_train, y_log_train)

#### Question 17, Predict Using SVM Model

In [115]:
predictions_svm = SVM.predict(x_log_test)
print(predictions_svm)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

#### Question 18, Calculate the Value for Each Metric

In [116]:
SVM_Accuracy_Score = metrics.accuracy_score(y_log_test, predictions_svm)
SVM_JaccardIndex = jaccard_score(y_log_test, predictions_svm)
SVM_F1_Score = f1_score(y_log_test, predictions_svm)

Report_svm = {"Metrics SVM":["Accuracy Score", 
                          "JaccardIndex", 
                          "F1-Score"], 
           "Result": [SVM_Accuracy_Score, 
                      SVM_JaccardIndex, 
                      SVM_F1_Score]}
print(pd.DataFrame(Report_svm))

      Metrics SVM    Result
0  Accuracy Score  0.722137
1    JaccardIndex  0.000000
2        F1-Score  0.000000


#### Question 19, all of the above models

In [117]:
Report_ALL = {"Total Metrics SVM": ["Accuracy", "JaccardIndex", "F1Score", "LogLoss"], 
              "Result Metrics KNN": [KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score,""], 
              "Result Metrics Tree": [Tree_Accuracy_Score, Tree_JaccardIndex, Tree_F1_Score,""], 
              "Result Metrics Logistic Reg": [LR_Accuracy_Score, LR_JaccardIndex, LR_F1_Score, LR_Log_Loss], 
              "Results Metrics SVM":[SVM_Accuracy_Score, SVM_JaccardIndex, SVM_F1_Score,""]}
pd.DataFrame(Report_ALL)

Unnamed: 0,Total Metrics SVM,Result Metrics KNN,Result Metrics Tree,Result Metrics Logistic Reg,Results Metrics SVM
0,Accuracy,0.818321,0.748092,0.836641,0.722137
1,JaccardIndex,0.790123,0.382022,0.509174,0.0
2,F1Score,0.59661,0.552846,0.674772,0.0
3,LogLoss,,,5.888047,
