Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making specifying k necessary and svm using dlibml #101

Merged
merged 6 commits into from Aug 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Expand Up @@ -187,6 +187,7 @@ endif
# git version of mlpack is used.)
#cd methods/mlpack/src/ && ./build_scripts.sh
# Compile the DLIBML scripts.
g++ -O2 -std=c++11 methods/dlibml/src/SVM.cpp -o methods/dlibml/dlibml_svm -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/ANN.cpp -o methods/dlibml/dlibml_ann -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/ALLKNN.cpp -o methods/dlibml/dlibml_allknn -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
g++ -O2 -std=c++11 methods/dlibml/src/KMEANS.cpp -o methods/dlibml/dlibml_kmeans -I"$(INCLUDEPATH)" -L"$(LIBPATH)" -ldlib -lmlpack -lboost_program_options -lblas -llapack
Expand Down
29 changes: 28 additions & 1 deletion config.yaml
Expand Up @@ -1312,6 +1312,8 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
options:
k: 5

QDA:
run: ['metric']
Expand Down Expand Up @@ -1767,6 +1769,9 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

options:
k: 5

DECISIONTREE:
run: ['metric']
Expand Down Expand Up @@ -2235,6 +2240,10 @@ methods:
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

options:
k: 5

DTC:
run: ['timing', 'metric']
script: methods/shogun/decision_tree.py
Expand Down Expand Up @@ -3000,7 +3009,7 @@ methods:
options:
clusters: 7

- files: [ ['datasets/isolet.csv', 'datasets/isolet_centroids.csv'] ]
- files: [ ['datasets/isolet.csv', 'datasets/isolet_centroids.csv'] ]
options:
clusters: 26

Expand All @@ -3012,3 +3021,21 @@ methods:
- files: [ ['datasets/1000000-10-randu.csv'] ]
options:
clusters: 75
SVM:
run: ['metric']
script: methods/dlibml/SVM.py
format: [csv, txt, arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

144 changes: 144 additions & 0 deletions methods/dlibml/SVM.py
@@ -0,0 +1,144 @@
'''
@file SVM.py
Class to benchmark the SVM method with dlib-ml.
'''

import os
import sys
import inspect

# Import the util path, this method even works if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)

from profiler import *

import shlex
import subprocess
import re
import collections

from log import *
from timer import *
from definitions import *
from misc import *

import numpy as np

'''
This class implements the SVM benchmark.
'''
class SVM(object):
  '''
  This class benchmarks the dlib-ml SVM implementation.  It shells out to the
  compiled ``dlibml_svm`` executable, reads the predictions it writes to
  ``predictions.csv`` and computes classification metrics against the true
  labels.
  '''

  def __init__(self, dataset, timeout=0, path=os.environ["DLIBML_PATH"],
      verbose=True):
    '''
    Create the SVM benchmark instance, show some information and return the
    instance.

    @param dataset - Input dataset [train, test, labels] to perform SVM on.
    @param timeout - The time until the timeout. Default no timeout.
    @param path - Path to the dlib-ml executables.
    @param verbose - Display informational messages.
    '''
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout

  def RunMetrics(self, options):
    '''
    Perform SVM. If the method has been successfully completed return the
    metric results (including the elapsed time in seconds).

    @param options - Extra options for the method.
    @return - Dictionary of metrics, or a negative value if the method was
    not successful (-2 on timeout, -1 on any other failure).
    '''
    Log.Info("Perform SVM.", self.verbose)

    # Build the option string for the executable; fall back to the documented
    # defaults for every parameter the user did not specify.
    optionsStr = "-k " + str(options.pop("kernel", "rbf"))
    optionsStr += " -c " + str(options.pop("C", "0.1"))
    optionsStr += " -g " + str(options.pop("coef", "1"))
    optionsStr += " -d " + str(options.pop("degree", "2"))

    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    cmd = shlex.split(self.path + "dlibml_svm -t " + self.dataset[0] + " -T " +
        self.dataset[1] + " -v " + optionsStr)

    # Run command with the necessary arguments and return its output as a byte
    # string. We have untrusted input so we disable all shell based features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Load the predictions written by the executable and the true labels.
    predictions = np.genfromtxt("predictions.csv", delimiter=',')
    truelabels = np.genfromtxt(self.dataset[2], delimiter=',')

    # Parse data: runtime.
    timer = self.parseTimer(s)
    if timer == -1:
      # Bug fix: previously an unparsable output left `metrics` empty and the
      # final Log.Info crashed with a KeyError on 'Runtime'.  Report failure
      # explicitly instead.
      return -1

    # Datastructure to store the results.
    metrics = {}
    metrics['Runtime'] = timer.runtime
    confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
    metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
    metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
    metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
    metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
    metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

    Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics

  def parseTimer(self, data):
    '''
    Parse the timer data from a given string.

    @param data - Byte string to parse timer data from.
    @return - Namedtuple that contains the timer data or -1 in case of an
    error.
    '''
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?runtime: (?P<runtime>.*?)s.*
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1

    # Create a namedtuple and return the timer data.
    timer = collections.namedtuple("timer", ["runtime"])
    return timer(float(match.group("runtime")))
120 changes: 120 additions & 0 deletions methods/dlibml/src/SVM.cpp
@@ -0,0 +1,120 @@
#include <dlib/svm_threaded.h>
#include <dlib/rand.h>
#include <mlpack/core.hpp>
#include <mlpack/core/util/timers.hpp>
#include <string>

using namespace mlpack;
using namespace std;
using namespace dlib;

// Information about the program itself.
PROGRAM_INFO("Support Vector Machines",
"This program will perform SVM with the DLib-ml library");

// Define our input parameters that this program will take.
// Each PARAM_* macro registers (long name, description, short alias, default);
// CLI::GetParam must be called with the long name.
PARAM_STRING_IN("training_file", "File containing the training dataset.",
"t", "");
PARAM_STRING_IN("test_file", "File containing the test dataset.",
"T", "");
PARAM_STRING_IN("kernel", "Name of the kernel to be used.", "k", "");
// NOTE(review): the description "Bandwidth" looks mislabeled -- C is the
// SVM trade-off/regularization parameter; confirm and reword the string.
PARAM_DOUBLE_IN("C", "Bandwidth", "c", 0);
PARAM_DOUBLE_IN("g", "Coef", "g", 0);
PARAM_DOUBLE_IN("d", "Degree", "d", 0);

int main(int argc, char** argv)
{
  // Parse command line options.
  CLI::ParseCommandLine(argc, argv);

  // Get all the parameters.  mlpack's CLI keys parameters by their long
  // names ("kernel", "C", ...), not by the one-letter aliases, so the
  // previous GetParam("k") / GetParam("c") lookups fetched nothing.
  const string trainFile = CLI::GetParam<string>("training_file");
  const string testFile = CLI::GetParam<string>("test_file");

  const string kernel = CLI::GetParam<string>("kernel");
  // These are real-valued parameters; storing them in size_t (as before)
  // truncated values such as C = 0.1 down to 0.
  const double c = CLI::GetParam<double>("C");
  const double g = CLI::GetParam<double>("g");
  const double d = CLI::GetParam<double>("d");

  arma::mat trainData;
  arma::mat testData; // So it doesn't go out of scope.

  data::Load(trainFile, trainData, true);

  Log::Info << "Loaded train data from '" << trainFile << "' ("
      << trainData.n_rows << " x " << trainData.n_cols << ")." << endl;

  data::Load(testFile, testData, true);

  Log::Info << "Loaded test data from '" << testFile << "' ("
      << testData.n_rows << " x " << testData.n_cols << ")." << endl;

  typedef matrix<double, 0, 1> sample_type;

  std::vector<sample_type> train;
  std::vector<sample_type> test;
  std::vector<double> labels;

  typedef one_vs_one_trainer<any_trainer<sample_type> > ovo_trainer;
  ovo_trainer trainer;

  typedef polynomial_kernel<sample_type> poly_kernel;
  typedef radial_basis_kernel<sample_type> rbf_kernel;

  krr_trainer<rbf_kernel> rbf_trainer;
  svm_nu_trainer<poly_kernel> poly_trainer;
  if (kernel.compare("polynomial") == 0)
  {
    poly_trainer.set_kernel(poly_kernel(c, g, d));
    trainer.set_trainer(poly_trainer, 1, 2);
  }
  else if (kernel.compare("rbf") == 0)
  {
    // NOTE(review): rbf_kernel's constructor argument is gamma; passing the
    // C parameter here looks suspicious -- confirm intended behavior.
    rbf_trainer.set_kernel(rbf_kernel(c));
    trainer.set_trainer(rbf_trainer);
  }

  // The last row of the training matrix holds the labels, so each training
  // sample has (n_rows - 1) features.  Previously the samples were sized
  // n_rows, leaving one uninitialized element per vector and producing a
  // dimension mismatch with the test samples below.
  sample_type m;
  m.set_size(trainData.n_rows - 1);
  for (size_t i = 0; i < trainData.n_cols; ++i)
  {
    for (size_t j = 0; j < trainData.n_rows - 1; ++j)
      m(j) = trainData(j, i);

    train.push_back(m);
  }

  // Extract the labels from the last row of the training data.
  for (size_t j = 0; j < trainData.n_cols; ++j)
    labels.push_back(trainData(trainData.n_rows - 1, j));

  // The test file contains features only, so every row is used.
  sample_type te;
  te.set_size(testData.n_rows);
  for (size_t i = 0; i < testData.n_cols; ++i)
  {
    for (size_t j = 0; j < testData.n_rows; ++j)
      te(j) = testData(j, i);

    test.push_back(te);
  }

  Timer::Start("runtime");

  // Train the one-vs-one classifier and predict labels for the test set.
  one_vs_one_decision_function<ovo_trainer> df = trainer.train(train, labels);
  arma::mat predictions(1, test.size());

  for (size_t i = 0; i < test.size(); i++)
  {
    predictions(i) = df(test[i]);
  }

  Timer::Stop("runtime");

  data::Save("predictions.csv", predictions);

  return 0;
}

3 changes: 3 additions & 0 deletions methods/matlab/knc.py
Expand Up @@ -73,6 +73,9 @@ def RunMetrics(self, options):
self.opts = {}
if "k" in options:
self.opts["n_neighbors"] = int(options.pop("k"))
else:
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")
# No options accepted for this task.
if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
Expand Down
3 changes: 3 additions & 0 deletions methods/scikit/knc.py
Expand Up @@ -82,6 +82,9 @@ def RunKNCScikit(q):
self.opts = {}
if "k" in options:
self.opts["n_neighbors"] = int(options.pop("k"))
else:
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")
if "algorithm" in options:
self.opts["algorithm"] = str(options.pop("algorithm"))
if "leaf_size" in options:
Expand Down
3 changes: 2 additions & 1 deletion methods/shogun/knc.py
Expand Up @@ -62,7 +62,8 @@ def BuildModel(self, data, labels, options):
if "k" in options:
n_neighbors = int(options.pop("k"))
else:
n_neighbors = 5
Log.Fatal("Required parameter 'k' not specified!")
raise Exception("missing parameter")

if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
Expand Down
4 changes: 2 additions & 2 deletions tests/benchmark_knc.py
Expand Up @@ -44,7 +44,7 @@ def test_Constructor(self):
Test the 'RunMetrics' function.
'''
def test_RunMetrics(self):
result = self.instance.RunMetrics({})
result = self.instance.RunMetrics({"k": 5})
self.assertTrue(result["Runtime"] > 0)
self.assertTrue(result["Avg Accuracy"] > 0)
self.assertTrue(result["MultiClass Precision"] > 0)
Expand Down Expand Up @@ -79,7 +79,7 @@ def test_Constructor(self):
Test the 'RunMetrics' function.
'''
def test_RunMetrics(self):
result = self.instance.RunMetrics({})
result = self.instance.RunMetrics({"k": 5})
self.assertTrue(result["Runtime"] > 0)

'''
Expand Down