Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making specifying k necessary and svm using dlibml #101

Merged
merged 6 commits into from Aug 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Expand Up @@ -187,6 +187,7 @@ endif
# git version of mlpack is used.)
#cd methods/mlpack/src/ && ./build_scripts.sh
# Compile the DLIBML scripts.
g++ -O2 -std=c++11 methods/dlibml/src/SVM.cpp -o methods/dlibml/dlibml_svm -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/ANN.cpp -o methods/dlibml/dlibml_ann -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/ALLKNN.cpp -o methods/dlibml/dlibml_allknn -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/KMEANS.cpp -o methods/dlibml/dlibml_kmeans -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
Expand Down
29 changes: 28 additions & 1 deletion config.yaml
Expand Up @@ -1312,6 +1312,8 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
options:
k: 5

QDA:
run: ['metric']
Expand Down Expand Up @@ -1767,6 +1769,9 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

options:
k: 5

DECISIONTREE:
run: ['metric']
Expand Down Expand Up @@ -2235,6 +2240,10 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

options:
k: 5

DTC:
run: ['timing', 'metric']
script: methods/shogun/decision_tree.py
Expand Down Expand Up @@ -3000,7 +3009,7 @@ methods:
options:
clusters: 7

- files: [ ['datasets/isolet.csv', 'datasets/isolet_centroids.csv'] ]
- files: [ ['datasets/isolet.csv', 'datasets/isolet_centroids.csv'] ]
options:
clusters: 26

Expand All @@ -3012,3 +3021,21 @@ methods:
- files: [ ['datasets/1000000-10-randu.csv'] ]
options:
clusters: 75
SVM:
run: ['metric']
script: methods/dlibml/SVM.py
format: [csv, txt, arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

144 changes: 144 additions & 0 deletions methods/dlibml/SVM.py
@@ -0,0 +1,144 @@
'''
@file SVM.py
Class to benchmark the SVM method with dlib-ml.
'''

import os
import sys
import inspect

# Import the util path, this method even works if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)

from profiler import *

import shlex
import subprocess
import re
import collections

from log import *
from timer import *
from definitions import *
from misc import *

import numpy as np

'''
This class implements the SVM benchmark.
'''
class SVM(object):
  '''
  This class benchmarks the dlib-ml SVM implementation.  It shells out to the
  compiled ``dlibml_svm`` executable, reads the predictions it writes to
  ``predictions.csv`` and computes classification metrics against the true
  labels.
  '''

  def __init__(self, dataset, timeout=0, path=os.environ["DLIBML_PATH"],
      verbose=True):
    '''
    Create the SVM benchmark instance, show some information and return the
    instance.

    @param dataset - Input dataset [train, test, labels] to perform SVM on.
    @param timeout - The time until the timeout. Default no timeout.
    @param path - Path to the dlib-ml executables.
    @param verbose - Display informational messages.
    '''
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout

  def RunMetrics(self, options):
    '''
    Perform SVM. If the method has been successfully completed return the
    metric results (including the elapsed time in seconds).

    @param options - Extra options for the method.
    @return - Dictionary of metrics, or a negative value if the method was
    not successful (-2 on timeout, -1 on any other failure).
    '''
    Log.Info("Perform SVM.", self.verbose)

    # Build the option string for the executable; fall back to the documented
    # defaults for every parameter the user did not specify.
    optionsStr = "-k " + str(options.pop("kernel", "rbf"))
    optionsStr += " -c " + str(options.pop("C", "0.1"))
    optionsStr += " -g " + str(options.pop("coef", "1"))
    optionsStr += " -d " + str(options.pop("degree", "2"))

    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    cmd = shlex.split(self.path + "dlibml_svm -t " + self.dataset[0] + " -T " +
        self.dataset[1] + " -v " + optionsStr)

    # Run command with the necessary arguments and return its output as a byte
    # string. We have untrusted input so we disable all shell based features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Load the predictions written by the executable and the true labels.
    predictions = np.genfromtxt("predictions.csv", delimiter=',')
    truelabels = np.genfromtxt(self.dataset[2], delimiter=',')

    # Parse data: runtime.
    timer = self.parseTimer(s)
    if timer == -1:
      # Bug fix: previously an unparsable output left `metrics` empty and the
      # final Log.Info crashed with a KeyError on 'Runtime'.  Report failure
      # explicitly instead.
      return -1

    # Datastructure to store the results.
    metrics = {}
    metrics['Runtime'] = timer.runtime
    confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
    metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
    metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
    metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
    metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
    metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

    Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics

  def parseTimer(self, data):
    '''
    Parse the timer data from a given string.

    @param data - Byte string to parse timer data from.
    @return - Namedtuple that contains the timer data or -1 in case of an
    error.
    '''
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?runtime: (?P<runtime>.*?)s.*
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1

    # Create a namedtuple and return the timer data.
    timer = collections.namedtuple("timer", ["runtime"])
    return timer(float(match.group("runtime")))
120 changes: 120 additions & 0 deletions methods/dlibml/src/SVM.cpp
@@ -0,0 +1,120 @@
#include <dlib/svm_threaded.h>
#include <dlib/rand.h>
#include <mlpack/core.hpp>
#include <mlpack/core/util/timers.hpp>
#include <string>

using namespace mlpack;
using namespace std;
using namespace dlib;

// Information about the program itself.
PROGRAM_INFO("Support Vector Machines",
"This program will perform SVM with the DLib-ml library");

// Define our input parameters that this program will take.
// Each PARAM_* macro registers (long name, description, short alias, default);
// CLI::GetParam must be called with the long name.
PARAM_STRING_IN("training_file", "File containing the training dataset.",
"t", "");
PARAM_STRING_IN("test_file", "File containing the test dataset.",
"T", "");
PARAM_STRING_IN("kernel", "Name of the kernel to be used.", "k", "");
// NOTE(review): the description "Bandwidth" looks mislabeled -- C is the
// SVM trade-off/regularization parameter; confirm and reword the string.
PARAM_DOUBLE_IN("C", "Bandwidth", "c", 0);
PARAM_DOUBLE_IN("g", "Coef", "g", 0);
PARAM_DOUBLE_IN("d", "Degree", "d", 0);

int main(int argc, char** argv)
{
  // Parse command line options.
  CLI::ParseCommandLine(argc, argv);

  // Get all the parameters.  mlpack's CLI keys parameters by their long
  // names ("kernel", "C", ...), not by the one-letter aliases, so the
  // previous GetParam("k") / GetParam("c") lookups fetched nothing.
  const string trainFile = CLI::GetParam<string>("training_file");
  const string testFile = CLI::GetParam<string>("test_file");

  const string kernel = CLI::GetParam<string>("kernel");
  // These are real-valued parameters; storing them in size_t (as before)
  // truncated values such as C = 0.1 down to 0.
  const double c = CLI::GetParam<double>("C");
  const double g = CLI::GetParam<double>("g");
  const double d = CLI::GetParam<double>("d");

  arma::mat trainData;
  arma::mat testData; // So it doesn't go out of scope.

  data::Load(trainFile, trainData, true);

  Log::Info << "Loaded train data from '" << trainFile << "' ("
      << trainData.n_rows << " x " << trainData.n_cols << ")." << endl;

  data::Load(testFile, testData, true);

  Log::Info << "Loaded test data from '" << testFile << "' ("
      << testData.n_rows << " x " << testData.n_cols << ")." << endl;

  typedef matrix<double, 0, 1> sample_type;

  std::vector<sample_type> train;
  std::vector<sample_type> test;
  std::vector<double> labels;

  typedef one_vs_one_trainer<any_trainer<sample_type> > ovo_trainer;
  ovo_trainer trainer;

  typedef polynomial_kernel<sample_type> poly_kernel;
  typedef radial_basis_kernel<sample_type> rbf_kernel;

  krr_trainer<rbf_kernel> rbf_trainer;
  svm_nu_trainer<poly_kernel> poly_trainer;
  if (kernel.compare("polynomial") == 0)
  {
    poly_trainer.set_kernel(poly_kernel(c, g, d));
    trainer.set_trainer(poly_trainer, 1, 2);
  }
  else if (kernel.compare("rbf") == 0)
  {
    // NOTE(review): rbf_kernel's constructor argument is gamma; passing the
    // C parameter here looks suspicious -- confirm intended behavior.
    rbf_trainer.set_kernel(rbf_kernel(c));
    trainer.set_trainer(rbf_trainer);
  }

  // The last row of the training matrix holds the labels, so each training
  // sample has (n_rows - 1) features.  Previously the samples were sized
  // n_rows, leaving one uninitialized element per vector and producing a
  // dimension mismatch with the test samples below.
  sample_type m;
  m.set_size(trainData.n_rows - 1);
  for (size_t i = 0; i < trainData.n_cols; ++i)
  {
    for (size_t j = 0; j < trainData.n_rows - 1; ++j)
      m(j) = trainData(j, i);

    train.push_back(m);
  }

  // Extract the labels from the last row of the training data.
  for (size_t j = 0; j < trainData.n_cols; ++j)
    labels.push_back(trainData(trainData.n_rows - 1, j));

  // The test file contains features only, so every row is used.
  sample_type te;
  te.set_size(testData.n_rows);
  for (size_t i = 0; i < testData.n_cols; ++i)
  {
    for (size_t j = 0; j < testData.n_rows; ++j)
      te(j) = testData(j, i);

    test.push_back(te);
  }

  Timer::Start("runtime");

  // Train the one-vs-one classifier and predict labels for the test set.
  one_vs_one_decision_function<ovo_trainer> df = trainer.train(train, labels);
  arma::mat predictions(1, test.size());

  for (size_t i = 0; i < test.size(); i++)
  {
    predictions(i) = df(test[i]);
  }

  Timer::Stop("runtime");

  data::Save("predictions.csv", predictions);

  return 0;
}

3 changes: 3 additions & 0 deletions methods/matlab/knc.py
Expand Up @@ -73,6 +73,9 @@ def RunMetrics(self, options):
self.opts = {}
if "k" in options:
self.opts["n_neighbors"] = int(options.pop("k"))
else:
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")
# No options accepted for this task.
if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
Expand Down
3 changes: 3 additions & 0 deletions methods/scikit/knc.py
Expand Up @@ -82,6 +82,9 @@ def RunKNCScikit(q):
self.opts = {}
if "k" in options:
self.opts["n_neighbors"] = int(options.pop("k"))
else:
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")
if "algorithm" in options:
self.opts["algorithm"] = str(options.pop("algorithm"))
if "leaf_size" in options:
Expand Down
3 changes: 2 additions & 1 deletion methods/shogun/knc.py
Expand Up @@ -62,7 +62,8 @@ def BuildModel(self, data, labels, options):
if "k" in options:
n_neighbors = int(options.pop("k"))
else:
n_neighbors = 5
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")

if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
Expand Down
4 changes: 2 additions & 2 deletions tests/benchmark_knc.py
Expand Up @@ -44,7 +44,7 @@ def test_Constructor(self):
Test the 'RunMetrics' function.
'''
def test_RunMetrics(self):
result = self.instance.RunMetrics({})
result = self.instance.RunMetrics({"k": 5})
self.assertTrue(result["Runtime"] > 0)
self.assertTrue(result["Avg Accuracy"] > 0)
self.assertTrue(result["MultiClass Precision"] > 0)
Expand Down Expand Up @@ -79,7 +79,7 @@ def test_Constructor(self):
Test the 'RunMetrics' function.
'''
def test_RunMetrics(self):
result = self.instance.RunMetrics({})
result = self.instance.RunMetrics({"k": 5})
self.assertTrue(result["Runtime"] > 0)

'''
Expand Down