In [1]:
// Import necessary library headers.
#include <mlpack/xeus-cling.hpp>
#include <mlpack/core.hpp>
#include <mlpack/core/data/split_data.hpp>
#include <mlpack/methods/random_forest/random_forest.hpp>
#include <mlpack/core/data/scaler_methods/standard_scaler.hpp>

In [2]:
#define WITHOUT_NUMPY 1
#include "matplotlibcpp.h"
#include "xwidgets/ximage.hpp"
#include "../utils/preprocess.hpp"
#include "../utils/plot.hpp"

namespace plt = matplotlibcpp;

In [3]:
using namespace mlpack;

In [4]:
using namespace mlpack::data;

In [5]:
using namespace mlpack::tree;

### Part 1 - Modelling using Imbalanced Dataset

### Visualize the Missing Values

In [6]:
missing("weatherAUS.csv", "PuBu", "Part-1 Missing values before imputation");
auto img = xw::image_from_file("Part-1 Missing values before imputation.png").finalize();
img

A Jupyter widget with unique id: 859a9f36432f46acb113df9cd9eb02ec

The above visualization shows that high number of missing values in: Sunshine, Evaporation, Cloud9am and Cloud3pm

In [7]:
// Perform imputation on the original dataset using "mean" imputation policy.
impute("weatherAUS.csv");

Drop the dataset header using sed, sed is a Unix utility that parses and transforms text.

In [8]:
!cat weatherAUS_mean_imputed.csv | sed 1d > weatherAUS_trim.csv

Drop columns 1 ("Date") as it is not required and causes issues while loading the data.

In [9]:
!cut -d, -f1 --complement weatherAUS_trim.csv > weatherAUS_trim2.csv

Rename the newly created csv file.

In [10]:
!rm weatherAUS_trim.csv

In [11]:
!mv weatherAUS_trim2.csv weatherAUS_trim.csv

In [12]:
// Load the preprocessed dataset into armadillo matrix.
arma::mat weatherData;
mlpack::data::DatasetInfo info;

// Manually set the columns with contain categorical data in DatasetInfo.
info.Type(0) = mlpack::data::Datatype::categorical;
info.Type(6) = mlpack::data::Datatype::categorical;
info.Type(8) = mlpack::data::Datatype::categorical;
info.Type(9) = mlpack::data::Datatype::categorical;
info.Type(20) = mlpack::data::Datatype::categorical;
info.Type(21) = mlpack::data::Datatype::categorical;

data::Load("weatherAUS_trim.csv", weatherData, info);

In [13]:
data::Save("weatherAUSEnc.csv", weatherData);

In [16]:
!sed -i '1iLocation,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow' weatherAUSEnc.csv

In [17]:
// Inspect the first 5 examples in the dataset
std::cout << std::setw(15) << "Location" << std::setw(10) << "MinTemp" << std::setw(13) << "MaxTemp" 
          << std::setw(12) << "Rainfall" << std::setw(15) << "Evaporation" << std::setw(12) 
          << "Sunshine" << std::setw(14) << "WindGust" << std::setw(15) << "WindGustSpeed"
          << std::setw(12) << "WindDir9am" << std::endl;
std::cout << weatherData.submat(0, 0, weatherData.n_rows-1, 5).t() << std::endl;

       Location   MinTemp      MaxTemp    Rainfall    Evaporation    Sunshine      WindGust  WindGustSpeed  WindDir9am
            0   1.3400e+01   2.2900e+01   6.0000e-01   5.4682e+00   7.6112e+00            0   4.4000e+01            0            0   2.0000e+01   2.4000e+01   7.1000e+01   2.2000e+01   1.0077e+03   1.0071e+03   8.0000e+00   4.5099e+00   1.6900e+01   2.1800e+01            0            0
            0   7.4000e+00   2.5100e+01            0   5.4682e+00   7.6112e+00   1.0000e+00   4.4000e+01   1.0000e+00   1.0000e+00   4.0000e+00   2.2000e+01   4.4000e+01   2.5000e+01   1.0106e+03   1.0078e+03   4.4475e+00   4.5099e+00   1.7200e+01   2.4300e+01            0            0
            0   1.2900e+01   2.5700e+01            0   5.4682e+00   7.6112e+00   2.0000e+00   4.6000e+01            0   1.0000e+00   1.9000e+01   2.6000e+01   3.8000e+01   3.0000e+01   1.0076e+03   1.0087e+03   4.4475e+00   2.0000e+00   2.1000e+01   2.3200e+01            0            0
            0   9.20

In [18]:
// Visualize the distribution of target classes
countplot("weatherAUS_mean_imputed.csv", "RainTomorrow", "", "Part-1 Distribution of target class");
auto img = xw::image_from_file("Part-1 Distribution of target class.png").finalize();
img

A Jupyter widget with unique id: 779ba021d19b4333b7a0e7757f84f27b

### EDA

In [19]:
countplot("weatherAUS_mean_imputed.csv", "WindDir9am", "", "Part-1 Direction of wind at 9 am");
auto img = xw::image_from_file("Part-1 Direction of wind at 9 am.png").finalize();
img

A Jupyter widget with unique id: 15a18dd227df4db290304a9d85ddab9b

In [20]:
countplot("weatherAUS_mean_imputed.csv", "WindDir3pm", "", "Part-1 Direction of wind at 3 pm");
auto img = xw::image_from_file("Part-1 Direction of wind at 3 pm.png").finalize();
img

A Jupyter widget with unique id: 126a3d5f0fc24efeb2861fc85fe6c2b9

In [21]:
countplot("weatherAUS_mean_imputed.csv", "WindGustDir", "", "Part-1 Direction of wind Gust");
auto img = xw::image_from_file("Part-1 Direction of wind Gust.png").finalize();
img

A Jupyter widget with unique id: ab091e7725c84fae8c3b522eb367ebd8

### Visualize Correlation

In [22]:
heatmap("weatherAUSEnc.csv", "coolwarm", "Part-1 Correlation Heatmap", 1);
auto img = xw::image_from_file("Part-1 Correlation Heatmap.png").finalize();
img

A Jupyter widget with unique id: 84a3e07d7f134f4f8c464b6865c6bde7

As we can observe from the above heatmap, there is high correlation between the following features:
* MinTemp & MaxTemp
* MinTemp & Temp9am
* MinTemp & Temp3pm
* MaxTemp & Temp9am
* MaxTemp & Temp3pm
* Temp3pm & Temp9am
* Pressure9am & Pressure3pm
* Evaporation & MaxTemp

In [23]:
arma::Row<size_t> targets = arma::conv_to<arma::Row<size_t>>::from(weatherData.row(weatherData.n_rows - 1));
weatherData.shed_row(weatherData.n_rows-1);

In [24]:
arma::mat Xtrain, Xtest;
arma::Row<size_t> Ytrain, Ytest;

In [25]:
mlpack::data::Split(weatherData, targets, Xtrain, Xtest, Ytrain, Ytest, 0.25);

In [26]:
arma::mat XtrainScaled, XtestScaled;

In [27]:
StandardScaler scale;
scale.Fit(Xtrain);
scale.Transform(Xtrain, XtrainScaled);
scale.Transform(Xtest, XtestScaled);

In [28]:
RandomForest<> rf(XtrainScaled, Ytrain, 2, 100);

In [30]:
arma::Row<size_t> output;
rf.Classify(XtestScaled, output);

In [32]:
double precision(const size_t truePos, const size_t falsePos)
{
    return (double)truePos / (double)(truePos + falsePos);
}

In [31]:
double recall(const size_t truePos, const size_t falseNeg)
{
    return (double)truePos / (double)(truePos + falseNeg);
}

In [33]:
double f1score(const size_t truePos, const size_t falsePos, const size_t falseNeg)
{
    double prec = precision(truePos, falsePos);
    double rec = precision(truePos, falseNeg);
    return (prec * rec) / (prec + rec);
}

In [34]:
void classification_report(const arma::Row<size_t>& yPreds, const arma::Row<size_t>& yTrue)
{
    arma::Row<size_t> uniqs = arma::unique(yTrue);
    for(auto val: uniqs)
    {
        const size_t truePos = 
    }
}

In [38]:
classification_report(output, Ytest);

0
1


In [35]:
const size_t correct = arma::accu(output == Ytest);

In [36]:
std::cout <<  "Accuracy: " << (double)correct / (double)Ytest.n_elem << std::endl;

Accuracy: 0.857033


### Part 2 - Modelling using Random Oversampling

In [28]:
resample("weatherAUS.csv", "RainTomorrow", "No", "Yes", "oversample", "Date", 123);

In [29]:
// Visualize the distribution of target classes
countplot("weatherAUS_oversampled.csv", "RainTomorrow", "", "Part-2 Oversampled Population");
auto img = xw::image_from_file("Part-2 Oversampled Population.png").finalize();
img

A Jupyter widget with unique id: a6889c6b9395401d85bba3d4c8b03601

### Visualize the Missing Values

In [30]:
missing("weatherAUS_oversampled.csv", "PuBu", "Part-2 Missing values before imputation");
auto img = xw::image_from_file("Part-2 Missing values before imputation.png").finalize();
img

A Jupyter widget with unique id: dbd1f475d38b4bd8a2237bc230b126d3

In [31]:
// Imputation using mean
impute("weatherAUS_oversampled.csv");

In [32]:
!cat weatherAUS_oversampled_mean_imputed.csv | sed 1d > weatherAUS_os_imp.csv

In [33]:
!cut -d, -f1 --complement weatherAUS_os_imp.csv > weatherAUS_trim2.csv

In [34]:
!rm weatherAUS_trim.csv

In [35]:
!mv weatherAUS_trim2.csv weatherAUS_trim.csv

In [36]:
arma::mat overSampled;
mlpack::data::DatasetInfo info;

info.Type(0) = mlpack::data::Datatype::categorical;
info.Type(6) = mlpack::data::Datatype::categorical;
info.Type(8) = mlpack::data::Datatype::categorical;
info.Type(9) = mlpack::data::Datatype::categorical;
info.Type(20) = mlpack::data::Datatype::categorical;
info.Type(21) = mlpack::data::Datatype::categorical;

data::Load("weatherAUS_trim.csv", overSampled, info);

In [37]:
arma::Row<size_t> targets = arma::conv_to<arma::Row<size_t>>::from(overSampled.row(overSampled.n_rows - 1));
overSampled.shed_row(overSampled.n_rows-1);

In [38]:
arma::mat Xtrain, Xtest;
arma::Row<size_t> Ytrain, Ytest;

In [39]:
mlpack::data::Split(overSampled, targets, Xtrain, Xtest, Ytrain, Ytest, 0.25);

In [40]:
arma::mat XtrainScaled, XtestScaled;

In [41]:
StandardScaler scale;
scale.Fit(Xtrain);
scale.Transform(Xtrain, XtrainScaled);
scale.Transform(Xtest, XtestScaled);

In [42]:
RandomForest<> rf(XtrainScaled, Ytrain, 2, 100);

In [43]:
arma::Row<size_t> output;
rf.Classify(XtestScaled, output);

In [44]:
const size_t correct = arma::accu(output == Ytest);

In [20]:
std::cout <<  "Accuracy: " << (double)correct / (double)Ytest.n_elem << std::endl;

[1minput_line_25:2:40: [0m[0;1;31merror: [0m[1muse of undeclared identifier 'correct'[0m
 std::cout <<  "Accuracy: " << (double)correct / (double)Ytest.n_elem << std::endl;
[0;1;32m                                       ^
[0m

Interpreter Error: 

### Part 3 - Modelling using Synthetic Minority Over Sampling Technique

In [6]:
impute("weatherAUS.csv");

In [7]:
!cat weatherAUS_mean_imputed.csv | sed 1d > weatherAUS_mean_imp.csv

In [61]:
!cut -d, -f1 --complement weatherAUS_mean_imp.csv > weatherAUS_trim2.csv

In [62]:
!rm weatherAUS_trim.csv

In [63]:
!mv weatherAUS_trim2.csv weatherAUS_trim.csv

In [64]:
arma::mat smote;
mlpack::data::DatasetInfo info;

info.Type(0) = mlpack::data::Datatype::categorical;
info.Type(6) = mlpack::data::Datatype::categorical;
info.Type(8) = mlpack::data::Datatype::categorical;
info.Type(9) = mlpack::data::Datatype::categorical;
info.Type(20) = mlpack::data::Datatype::categorical;
info.Type(21) = mlpack::data::Datatype::categorical;

data::Load("weatherAUS_trim.csv", smote, info);

In [65]:
mlpack::data::Save("smote_in.csv", smote);

In [10]:
resample("smote_in.csv", "RainTomorrow", "No", "Yes", "smote", "Date", 123);

In [11]:
!cat smote_in_smotesampled.csv | sed 1d > smote_in_smotesampled_woh.csv

In [12]:
arma::mat smoteEnc;
mlpack::data::DatasetInfo info;

info.Type(0) = mlpack::data::Datatype::categorical;
info.Type(6) = mlpack::data::Datatype::categorical;
info.Type(8) = mlpack::data::Datatype::categorical;
info.Type(9) = mlpack::data::Datatype::categorical;
info.Type(20) = mlpack::data::Datatype::categorical;
info.Type(21) = mlpack::data::Datatype::categorical;

data::Load("smote_in_smotesampled_woh.csv", smoteEnc, info);

In [None]:
countplot("weatherAUS_mean_imputed.csv", "RainTomorrow", "", "Part-1 Distribution of target class");
auto img = xw::image_from_file("Part-1 Distribution of target class.png").finalize();
img

In [13]:
arma::Row<size_t> targets = arma::conv_to<arma::Row<size_t>>::from(smoteEnc.row(smoteEnc.n_rows - 1));
smoteEnc.shed_row(smoteEnc.n_rows-1);

In [14]:
arma::mat Xtrain, Xtest;
arma::Row<size_t> Ytrain, Ytest;

In [15]:
mlpack::data::Split(smoteEnc, targets, Xtrain, Xtest, Ytrain, Ytest, 0.25);

In [16]:
arma::mat XtrainScaled, XtestScaled;

In [17]:
StandardScaler scale;
scale.Fit(Xtrain);
scale.Transform(Xtrain, XtrainScaled);
scale.Transform(Xtest, XtestScaled);

In [18]:
RandomForest<> rf(XtrainScaled, Ytrain, 2, 100);

In [19]:
arma::Row<size_t> output;
rf.Classify(XtestScaled, output);

In [24]:
const size_t correct = arma::accu(output == Ytest);

In [28]:
(double)correct / (double)Ytest.n_elem

0.90697470