## **Import the required libraries**


In [1]:
# All Libraries required for this lab are listed below. The libraries pre-installed on Skills Network Labs are commented.
#!pip install -qy pandas==1.3.4 numpy==1.21.4 seaborn==0.9.0 matplotlib==3.5.0 scikit-learn==0.20.1
# Note: If your environment doesn't support "!mamba install", use "!pip install"

In [2]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

In [4]:
#!pip install jupyterlite-pyodide-kernel

### Importing the Dataset


In [5]:
df = pd.read_csv("Weather_dataa.csv")

In [6]:
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Data Preprocessing


#### One Hot Encoding


Perform one hot encoding to convert categorical variables to binary variables.


In [7]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

Next, we replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column. We do not use the `get_dummies` method because we would end up with two columns for 'RainTomorrow' and we do not want, since 'RainTomorrow' is our target.


In [8]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

### Training Data and Test Data


Now, we set our 'features' or x values and our Y or target variable.


In [9]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [10]:
df_sydney_processed = df_sydney_processed.astype(float)

In [11]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

### Linear Regression


#### Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `10`.


In [12]:
x_train, x_test, y_train, y_test = train_test_split(features, Y)

#### Create and train a Linear Regression model called LinearReg using the training data (`x_train`, `y_train`).


In [13]:
LinearReg = LinearRegression().fit(x_train, y_train)

#### Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.


In [14]:
predictions = LinearReg.predict(x_test)

#### Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

LinearRegression_MAE = mean_absolute_error(y_test, predictions)
LinearRegression_MSE = mean_squared_error(y_test, predictions)
LinearRegression_R2 = r2_score(y_test, predictions)

#### Show the MAE, MSE, and R2 in a tabular format using data frame for the linear model.


In [16]:
Report1 = pd.DataFrame({
    "Metric": ["Linear Reg MAE", "Linear Reg MSE", "Linear Reg R²"],
    "Value": [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]
})

print(Report1)

           Metric     Value
0  Linear Reg MAE  0.258230
1  Linear Reg MSE  0.118023
2   Linear Reg R²  0.352222


### KNN


#### Create and train a KNN model called KNN using the training data (`x_train`, `y_train`) with the `n_neighbors` parameter set to `4`.


In [17]:
KNN = KNeighborsClassifier(n_neighbors=4).fit(x_train, y_train)

#### Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.


In [18]:
predictions = KNN.predict(x_test)

#### Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [19]:
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)

Report2 = pd.DataFrame({
    "Metric": ["KNN Accuracy Score", "KNN Jaccard Index", "KNN F1 Score"],
    "Value": [KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score]
})

print(Report2)

               Metric     Value
0  KNN Accuracy Score  0.828851
1   KNN Jaccard Index  0.404255
2        KNN F1 Score  0.575758


### Decision Tree


#### Create and train a Decision Tree model called Tree using the training data (`x_train`, `y_train`).


In [20]:
Tree = DecisionTreeClassifier().fit(x_train, y_train)

#### Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.


In [21]:
predictions = Tree.predict(x_test)

#### Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [22]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

Report3 = pd.DataFrame({
    "Metric": ["Tree Accuracy Score", "Tree Jaccard Index", "Tree F1 Score"],
    "Value": [Tree_Accuracy_Score, Tree_JaccardIndex, Tree_F1_Score]
})

print(Report3)

                Metric     Value
0  Tree Accuracy Score  0.773839
1   Tree Jaccard Index  0.391447
2        Tree F1 Score  0.562648


### Logistic Regression


#### Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `1`.


In [23]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size=0.2, random_state=1)

#### Q13) Create and train a LogisticRegression model called LR using the training data (`x_train`, `y_train`) with the `solver` parameter set to `liblinear`.


In [24]:
LR = LogisticRegression(solver='liblinear').fit(x_train, y_train)

#### Now, use the `predict` and `predict_proba` methods on the testing data (`x_test`) and save it as 2 arrays `predictions` and `predict_proba`.


In [25]:
predictions = LR.predict(x_test)

In [26]:
predict_proba = LR.predict_proba(x_test)

#### Using the `predictions`, `predict_proba` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [27]:
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_JaccardIndex = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
LR_Log_Loss = log_loss(y_test, predict_proba)

Report4 = pd.DataFrame({
    "Metric": ["LR Accuracy Score", "LR Jaccard Index", "LR F1 Score", "LR Log Loss"],
    "Value": [LR_Accuracy_Score, LR_JaccardIndex, LR_F1_Score, LR_Log_Loss]
})

print(Report4)

              Metric     Value
0  LR Accuracy Score  0.835115
1   LR Jaccard Index  0.504587
2        LR F1 Score  0.670732
3        LR Log Loss  0.381427


### SVM


#### Create and train a SVM model called SVM using the training data (`x_train`, `y_train`).


In [28]:
SVM = SVC().fit(x_train, y_train)

#### Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.


In [29]:
predictions = SVM.predict(x_test)

#### Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [30]:
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

Report5 = pd.DataFrame({
    "Metric": ["SVM Accuracy Score", "SVM Jaccard Index", "SVM F1 Score"],
    "Value": [SVM_Accuracy_Score, SVM_JaccardIndex, SVM_F1_Score]
})

print(Report5)

               Metric     Value
0  SVM Accuracy Score  0.722137
1   SVM Jaccard Index  0.000000
2        SVM F1 Score  0.000000


#### Show the Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.

\*LogLoss is only for Logistic Regression Model


In [31]:
Final_Report = pd.concat([Report1, Report2, Report3, Report4, Report5], ignore_index=True)

print(Final_Report)

                 Metric     Value
0        Linear Reg MAE  0.258230
1        Linear Reg MSE  0.118023
2         Linear Reg R²  0.352222
3    KNN Accuracy Score  0.828851
4     KNN Jaccard Index  0.404255
5          KNN F1 Score  0.575758
6   Tree Accuracy Score  0.773839
7    Tree Jaccard Index  0.391447
8         Tree F1 Score  0.562648
9     LR Accuracy Score  0.835115
10     LR Jaccard Index  0.504587
11          LR F1 Score  0.670732
12          LR Log Loss  0.381427
13   SVM Accuracy Score  0.722137
14    SVM Jaccard Index  0.000000
15         SVM F1 Score  0.000000
