In [1]:
// Import necessary library headers.
#include <mlpack/xeus-cling.hpp>
#include <mlpack/core.hpp>
#include <mlpack/core/data/split_data.hpp>
#include <mlpack/methods/decision_tree/decision_tree.hpp>
#include <mlpack/core/data/scaler_methods/standard_scaler.hpp>

In [2]:
#define WITHOUT_NUMPY 1
#include "matplotlibcpp.h"
#include "xwidgets/ximage.hpp"
#include "../utils/preprocess.hpp"
#include "../utils/plot.hpp"

namespace plt = matplotlibcpp;

In [3]:
using namespace mlpack;

In [4]:
using namespace mlpack::data;

In [5]:
using namespace mlpack::tree;

In [6]:
!cat LoanDefault.csv | sed 1d > LoanDefault_trim.csv

In [7]:
!cut -d, -f1 --complement LoanDefault_trim.csv > LoanDefault_trim2.csv

In [9]:
!rm LoanDefault_trim.csv

In [10]:
!mv LoanDefault_trim2.csv LoanDefault_trim.csv

In [11]:
// Load the preprocessed dataset into an Armadillo matrix.
// The third argument (fatal = true) makes a failed load raise an error right here
// instead of silently leaving loanData empty for the cells that follow.
arma::mat loanData;
data::Load("LoanDefault_trim.csv", loanData, true);

In [19]:
// Inspect the first 6 examples in the dataset: submat() selects every row of
// columns 0 through 5 (inclusive), and the transpose prints one example per line.
std::cout << std::setw(12) << "Employed" << std::setw(15) << "Bank Balance" << std::setw(15) << "Annual Salary" 
          << std::setw(12) << "Default" << std::endl;
std::cout << loanData.submat(0, 0, loanData.n_rows-1, 5).t() << std::endl;

    Employed   Bank Balance  Annual Salary     Default
   1.0000e+00   8.7544e+03   5.3234e+05            0
            0   9.8062e+03   1.4527e+05            0
   1.0000e+00   1.2883e+04   3.8121e+05            0
   1.0000e+00   6.3510e+03   4.2845e+05            0
   1.0000e+00   9.4279e+03   4.6156e+05            0
            0   1.1035e+04   8.9899e+04            0



In [20]:
// Visualize the distribution of target classes.
// The image loaded below is named after the last countplot argument plus ".png",
// so countplot presumably saves its figure under that title -- confirm in the plotting helper.
countplot("LoanDefault.csv", "Defaulted?", "", "Part-1 Distribution of target class");
auto img = xw::image_from_file("Part-1 Distribution of target class.png").finalize();
// A bare expression as the last line of the cell renders the image widget.
img

A Jupyter widget with unique id: 7f92595a62d24fdcb209eb1b52c50019

In [21]:
// Same count plot of the "Defaulted?" column, this time with "Employed" as the third argument
// (presumably a grouping/hue column -- confirm against the countplot helper).
countplot("LoanDefault.csv", "Defaulted?", "Employed", "Part-1 Distribution of target class & Employed");
auto img = xw::image_from_file("Part-1 Distribution of target class & Employed.png").finalize();
// A bare expression as the last line of the cell renders the image widget.
img

A Jupyter widget with unique id: 8b8618a44eeb4c62959fbc34d6896716

In [23]:
!cut -d, -f1 --complement LoanDefault.csv > LoanDefaultEnc.csv

In [24]:
// Plot the correlation heatmap from LoanDefaultEnc.csv.
// NOTE(review): "coolwarm" looks like a colormap name and the trailing 1 like a flag
// (e.g. annotate cells) -- confirm both against the heatmap helper's signature.
heatmap("LoanDefaultEnc.csv", "coolwarm", "Part-1 Correlation Heatmap", 1);
auto img = xw::image_from_file("Part-1 Correlation Heatmap.png").finalize();
// A bare expression as the last line of the cell renders the image widget.
img

A Jupyter widget with unique id: 2800d4169ab842249cd6045fbe1bab8d

In [25]:
// Copy the last row of loanData into an integer label vector, then remove that row
// so loanData holds only the remaining (feature) rows. The copy must happen before shed_row().
arma::Row<size_t> targets = arma::conv_to<arma::Row<size_t>>::from(loanData.row(loanData.n_rows - 1));
loanData.shed_row(loanData.n_rows-1);

In [26]:
// Output containers for the train/test split: feature matrices and their label vectors.
arma::mat Xtrain, Xtest;
arma::Row<size_t> Ytrain, Ytest;

In [28]:
// Split features and labels into training and test sets; the final argument (0.25)
// is the test ratio, i.e. the fraction of examples held out for testing.
mlpack::data::Split(loanData, targets, Xtrain, Xtest, Ytrain, Ytest, 0.25);

In [30]:
// Train a decision tree with default template parameters on the training split;
// the third argument (2) is the number of classes.
DecisionTree<> dt(Xtrain, Ytrain, 2);

In [36]:
// Classify the held-out test set: `output` receives the predicted labels and
// `probs` the per-class probabilities for each test example.
arma::Row<size_t> output;
arma::mat probs;
dt.Classify(Xtest, output, probs);

In [39]:
/**
 * Compute the fraction of predictions that match the ground-truth labels.
 *
 * @param yPreds Predicted labels.
 * @param yTrue Ground-truth labels, compared element-wise against yPreds.
 * @return Number of matching elements divided by yTrue.n_elem, or 0.0 when
 *         yTrue is empty.
 */
double accuracy(const arma::Row<size_t>& yPreds, const arma::Row<size_t>& yTrue)
{
    // Compare first so that any size mismatch is still reported by Armadillo.
    const size_t correct = arma::accu(yPreds == yTrue);

    // Guard the division: with no labels the ratio would be 0/0 (NaN).
    if (yTrue.n_elem == 0)
    {
        return 0.0;
    }

    return static_cast<double>(correct) / static_cast<double>(yTrue.n_elem);
}

In [40]:
// Report the fraction of test examples whose predicted label matches the true label.
std::cout <<  "Accuracy: " << accuracy(output, Ytest) << std::endl;

Accuracy: 0.9712


### Part 2 - Modelling using Random Oversampling

In [None]:
// Randomly oversample LoanDefault.csv on its target column.
// The target column is spelled "Defaulted?" (with the question mark), matching the
// countplot calls on this same file earlier in the notebook.
// NOTE(review): the "No"/"Yes" class labels and the "Date" column look carried over from a
// different dataset -- the labels printed earlier are 0/1 and no Date column is shown.
// Confirm these arguments against the resample helper's signature before running.
resample("LoanDefault.csv", "Defaulted?", "No", "Yes", "oversample", "Date", 123);