# Import the required libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

# Importing the Dataset

In [2]:
df = pd.read_csv(r'C:\Users\Sidharth\OneDrive\Desktop\Mushfiq\IBM\Weather_Data.csv')
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


# Data Preprocessing

In [3]:
# Checking the Dimensions of Dataset: 
print(df.shape)

(3271, 22)


In [4]:
# A concise summary of a Dataset:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3271 non-null   object 
 1   MinTemp        3271 non-null   float64
 2   MaxTemp        3271 non-null   float64
 3   Rainfall       3271 non-null   float64
 4   Evaporation    3271 non-null   float64
 5   Sunshine       3271 non-null   float64
 6   WindGustDir    3271 non-null   object 
 7   WindGustSpeed  3271 non-null   int64  
 8   WindDir9am     3271 non-null   object 
 9   WindDir3pm     3271 non-null   object 
 10  WindSpeed9am   3271 non-null   int64  
 11  WindSpeed3pm   3271 non-null   int64  
 12  Humidity9am    3271 non-null   int64  
 13  Humidity3pm    3271 non-null   int64  
 14  Pressure9am    3271 non-null   float64
 15  Pressure3pm    3271 non-null   float64
 16  Cloud9am       3271 non-null   int64  
 17  Cloud3pm       3271 non-null   int64  
 18  Temp9am 

In [5]:
#descriptive statistics
print(df.describe(exclude=[object]))

           MinTemp      MaxTemp     Rainfall  Evaporation     Sunshine  \
count  3271.000000  3271.000000  3271.000000  3271.000000  3271.000000   
mean     14.877102    23.005564     3.342158     5.175787     7.168970   
std       4.554710     4.483752     9.917746     2.757684     3.815966   
min       4.300000    11.700000     0.000000     0.000000     0.000000   
25%      11.000000    19.600000     0.000000     3.200000     4.250000   
50%      14.900000    22.800000     0.000000     4.800000     8.300000   
75%      18.800000    26.000000     1.400000     7.000000    10.200000   
max      27.600000    45.800000   119.400000    18.400000    13.600000   

       WindGustSpeed  WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  \
count    3271.000000   3271.000000   3271.000000  3271.000000  3271.000000   
mean       41.476307     15.077041     19.294405    68.243962    54.698563   
std        10.806951      7.043825      7.453331    15.086127    16.279241   
min        17.000000 

In [6]:
print(df.describe(include=[object]))

            Date WindGustDir WindDir9am WindDir3pm RainToday RainTomorrow
count       3271        3271       3271       3271      3271         3271
unique      3271          16         16         16         2            2
top     2/1/2008           W          W          E        No           No
freq           1        1425       1260        624      2422         2422


In [7]:
#One Hot Encoding
#First, we need to perform one hot encoding to convert categorical variables to binary variables.
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [8]:
#replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

# Training Data and Test Data

In [9]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [10]:
df_sydney_processed = df_sydney_processed.astype(float)

In [11]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

# Linear Regression

In [12]:
#Enter Your Code, Execute and take the Screenshot
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size=0.2, random_state=10)

In [13]:
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (2616, 66) (2616,)
Test set: (655, 66) (655,)


In [14]:
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)
print ('Coefficients: ', LinearReg.coef_)

Coefficients:  [-2.36934171e-02  1.30126722e-02  7.29834513e-04  6.48717815e-03
 -3.51598649e-02  4.23718996e-03  1.82950068e-03  7.89684297e-04
  9.55909163e-04  8.56086738e-03  7.69398838e-03 -9.23985228e-03
 -8.87055336e-03  1.00506153e-02  1.44671142e-02 -3.48675855e-03
 -1.04898853e+10 -1.04898853e+10  7.14886639e+09  7.14886639e+09
  7.14886639e+09  7.14886639e+09  7.14886639e+09  7.14886639e+09
  7.14886639e+09  7.14886639e+09  7.14886639e+09  7.14886639e+09
  7.14886639e+09  7.14886639e+09  7.14886639e+09  7.14886639e+09
  7.14886639e+09  7.14886639e+09 -2.10308944e+10 -2.10308944e+10
 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10
 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10
 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10 -2.10308944e+10
 -2.10308944e+10 -2.10308944e+10 -3.18090902e+09 -3.18090902e+09
 -3.18090902e+09 -3.18090902e+09 -3.18090902e+09 -3.18090902e+09
 -3.18090902e+09 -3.18090902e+09 -3.18090902e+09 -3.18090902e+09
 -3.180909

In [15]:
predictions = LinearReg.predict(x_test)
print("residual sum of squares: %.2f" % np.mean((predictions - y_test) ** 2))
print("Variance score: %.2f" % LinearReg.score(x_test, y_test))

residual sum of squares: 0.12
Variance score: 0.43


In [16]:
from sklearn.metrics import r2_score
LinearRegression_MAE = np.mean(np.absolute(predictions - y_test))
LinearRegression_MSE = np.mean((predictions - y_test) **2)
LinearRegression_R2 = r2_score(predictions, y_test)
print("Mean Absolute Error: %.2f" % LinearRegression_MAE)
print("Mean Squared Error: %.2f" % LinearRegression_MSE)
print("R2-Score: %.2f" % LinearRegression_R2)

Mean Absolute Error: 0.26
Mean Squared Error: 0.12
R2-Score: -0.38


In [17]:
from tabulate import tabulate
dict = [["LinearRegression_MAE",LinearRegression_MAE],["LinearRegression_MSE",LinearRegression_MSE],
       ["LinearRegression_R2",LinearRegression_R2]]
Report = pd.DataFrame(dict)
print(tabulate(Report))

-  --------------------  ---------
0  LinearRegression_MAE   0.256318
1  LinearRegression_MSE   0.115721
2  LinearRegression_R2   -0.3848
-  --------------------  ---------


# KNN

In [18]:
k = 4
neigh = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)
neigh

In [21]:
predictions = neigh.predict(x_test.values)
predictions[0:5]



array([0., 0., 1., 0., 0.])

In [22]:
KNN_Accuracy_Score = metrics.accuracy_score(predictions, y_test)
KNN_JaccardIndex = metrics.jaccard_score(predictions, y_test)
KNN_F1_Score = metrics.f1_score(predictions, y_test)
print("KNN_Accuracy_Score: %.2f" % KNN_Accuracy_Score)
print("KNN_JaccardIndex: %.2f" % KNN_JaccardIndex)
print("KNN_F1_Score: %.2f" % KNN_F1_Score)

KNN_Accuracy_Score: 0.82
KNN_JaccardIndex: 0.43
KNN_F1_Score: 0.60


# Decision Tree

In [23]:
Tree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 4)
Tree.fit(x_train, y_train)

In [24]:
predictions = Tree.predict(x_test)

In [25]:
Tree_Accuracy_Score = metrics.accuracy_score(predictions, y_test)
Tree_JaccardIndex = metrics.jaccard_score(predictions, y_test)
Tree_F1_Score = metrics.f1_score(predictions, y_test)
print("Tree_Accuracy_Score: %.2f" % Tree_Accuracy_Score)
print("Tree_JaccardIndex: %.2f" % Tree_JaccardIndex)
print("Tree_F1_Score: %.2f" % Tree_F1_Score)

Tree_Accuracy_Score: 0.82
Tree_JaccardIndex: 0.48
Tree_F1_Score: 0.65


# logistic Regression

In [26]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size = 0.2, random_state = 1)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (2616, 66) (2616,)
Test set: (655, 66) (655,)


In [27]:
LR = LogisticRegression(solver = "liblinear")
LR.fit(x_train,y_train)

In [28]:
predictions = LR.predict(x_test)

In [29]:
LR_Accuracy_Score = metrics.accuracy_score(predictions, y_test)
LR_JaccardIndex = metrics.jaccard_score(predictions, y_test)
LR_F1_Score = metrics.f1_score(predictions, y_test)
LR_Log_Loss = metrics.log_loss(predictions, y_test)
print("LR_Accuracy_Score: %.2f" % LR_Accuracy_Score)
print("LR_JaccardIndex: %.2f" % LR_JaccardIndex)
print("LR_F1_Score: %.2f" % LR_F1_Score)
print("LR_Log_Loss: %.2f" % LR_Log_Loss)

LR_Accuracy_Score: 0.83
LR_JaccardIndex: 0.50
LR_F1_Score: 0.67
LR_Log_Loss: 6.00


# SVM

In [30]:
SVM = svm.SVC(kernel = 'linear')
SVM.fit(x_train,y_train)

In [31]:
predictions = SVM.predict(x_test)

In [32]:
SVM_Accuracy_Score = metrics.accuracy_score(predictions, y_test)
SVM_JaccardIndex = metrics.jaccard_score(predictions, y_test)
SVM_F1_Score = metrics.f1_score(predictions, y_test)
print("SVM_Accuracy_Score: %.2f" % SVM_Accuracy_Score)
print("SVM_JaccardIndex: %.2f" % SVM_JaccardIndex)
print("SVM_F1_Score: %.2f" % SVM_F1_Score)

SVM_Accuracy_Score: 0.83
SVM_JaccardIndex: 0.50
SVM_F1_Score: 0.66


# Report

In [36]:
from tabulate import tabulate
#dict = [["LinearRegression_MAE",LinearRegression_MAE],["LinearRegression_MSE",LinearRegression_MSE],
#       ["LinearRegression_R2",LinearRegression_R2]]
dict1 = {'LinearRegression' : [LinearRegression_MAE,LinearRegression_MSE,LinearRegression_R2],
         'KNN' : [KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score],
         'DecisionTree' : [Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score],
         'LogisticRegression' : [LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score],
         'SVM' : [SVM_Accuracy_Score,SVM_JaccardIndex,SVM_F1_Score]
        }

dict2 = [["LinearRegression_MAE",LinearRegression_MAE],["LinearRegression_MSE",LinearRegression_MSE],
         ["LinearRegression_R2",LinearRegression_R2],
         ["KNN_Accuracy_Score",KNN_Accuracy_Score],["KNN_JaccardIndex",KNN_JaccardIndex],
         ["KNN_F1_Score",KNN_F1_Score],
         ["Tree_Accuracy_Score",Tree_Accuracy_Score],["Tree_JaccardIndex",Tree_JaccardIndex],
         ["Tree_F1_Score",Tree_F1_Score],
         ["LR_Accuracy_Score",LR_Accuracy_Score],["LR_JaccardIndex",LR_JaccardIndex],
         ["LR_F1_Score",LR_F1_Score],["LR_log_Loss",LR_Log_Loss],
         ["SVM_Accuracy_Score",SVM_Accuracy_Score],["SVM_JaccardIndex",SVM_JaccardIndex],
         ["SVM_F1_Score",SVM_F1_Score]]
Report2 = pd.DataFrame(data=dict2)
#print(tabulate(Report, headers = ['Accuracy','Jaccard Index','F1-Score', 'LogLoss']))
print(tabulate(Report2))

--  --------------------  ---------
 0  LinearRegression_MAE   0.256318
 1  LinearRegression_MSE   0.115721
 2  LinearRegression_R2   -0.3848
 3  KNN_Accuracy_Score     0.818321
 4  KNN_JaccardIndex       0.425121
 5  KNN_F1_Score           0.59661
 6  Tree_Accuracy_Score    0.818321
 7  Tree_JaccardIndex      0.480349
 8  Tree_F1_Score          0.648968
 9  LR_Accuracy_Score      0.833588
10  LR_JaccardIndex        0.5
11  LR_F1_Score            0.666667
12  LR_log_Loss            5.9981
13  SVM_Accuracy_Score     0.832061
14  SVM_JaccardIndex       0.495413
15  SVM_F1_Score           0.662577
--  --------------------  ---------
