Merge pull request #95 from Iron-Stark/weka_update

Updating Weka implementations.
rcurtin committed Jul 29, 2017
2 parents 58bcaca + 54b27e6 commit 09ed4ee3d05b6a346d6dcf1a123fc278bd61442e
@@ -2235,15 +2235,6 @@ methods:
options:
new_dimensionality: 2
whiten: True
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv'],
['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'] ]
KMEANS:
run: ['metric']
iteration: 3
@@ -2310,6 +2301,94 @@ methods:
['datasets/ticdata2000.csv'], ['datasets/TomsHardware.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'],
['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
DTC:
run: ['metric']
script: methods/weka/dtc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
LogisticRegression:
run: ['metric']
iteration: 3
script: methods/weka/logistic_regression.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
RANDOMFOREST:
run: ['metric']
script: methods/weka/random_forest.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
PERCEPTRON:
run: ['metric']
script: methods/weka/perceptron.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
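Each entry added above pairs a train/test split with a labels file; the Weka scripts below consume the triple by position. A minimal sketch of that mapping (file names illustrative, taken from the first entry):

# How one 'files' triple from the config reaches a Weka benchmark script:
dataset = ['datasets/iris_train.csv',   # self.dataset[0] -> passed to Weka as -t
           'datasets/iris_test.csv',    # self.dataset[1] -> passed to Weka as -T
           'datasets/iris_labels.csv']  # self.dataset[2] -> true labels read for the metrics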
---
# ANN:
# A Library for Approximate Nearest Neighbor Searching
@@ -0,0 +1,139 @@
'''
@file decision_stump.py
Class to benchmark the Weka Decision Stump Classifier method.
'''
import os
import sys
import inspect
# Import the util path; this works even if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)
# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
sys.path.insert(0, metrics_folder)
from log import *
from profiler import *
from definitions import *
from misc import *
import shlex
import subprocess
import re
import collections
import numpy as np
'''
This class implements the Decision Stump Classifier benchmark.
'''
class DECISIONSTUMP(object):
'''
Create the Decision Stump Classifier benchmark instance.
@param dataset - Input dataset to perform DECISIONSTUMP on.
@param timeout - The time until the timeout. Default no timeout.
@param path - Path to the directory containing weka.jar.
@param verbose - Display informational messages.
'''
def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
verbose=True):
self.verbose = verbose
self.dataset = dataset
self.path = path
self.timeout = timeout
def __del__(self):
Log.Info("Clean up.", self.verbose)
filelist = ["weka_predicted.csv"]
for f in filelist:
if os.path.isfile(f):
os.remove(f)
'''
Run the Decision Stump Classifier benchmark. If the method completes
successfully, return the collected metrics, including the total runtime.
@param options - Extra options for the method.
@return - Dictionary of metrics, or a negative value if the method was not
successful.
'''
def RunMetrics(self, options):
Log.Info("Perform DECISIONSTUMP.", self.verbose)
if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
raise Exception("unknown parameters")
if len(self.dataset) < 3:
Log.Fatal("This method requires three datasets: train, test and labels.")
return -1
# Split the command using shell-like syntax.
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
":methods/weka" + " DECISIONSTUMP -t " + self.dataset[0] + " -T " +
self.dataset[1])
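# For illustration (JAVAPATH=/opt/weka and the iris pair are assumptions), the
# split produces a token list such as:
#   ['java', '-classpath', '/opt/weka/weka.jar:methods/weka', 'DECISIONSTUMP',
#    '-t', 'datasets/iris_train.csv', '-T', 'datasets/iris_test.csv']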
# Run command with the necessary arguments and return its output as a byte
# string. We have untrusted input so we disable all shell based features.
try:
s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
timeout=self.timeout)
except subprocess.TimeoutExpired as e:
Log.Warn(str(e))
return -2
except Exception as e:
Log.Fatal("Could not execute command: " + str(cmd))
return -1
# Data structure to store the results.
metrics = {}
# Parse data: runtime.
timer = self.parseTimer(s)
if timer != -1:
predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
truelabels = np.genfromtxt(self.dataset[2], delimiter=',')
metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)
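# The returned dictionary then looks like (values illustrative):
#   {'Runtime': 0.42, 'ACC': 0.95, 'MCC': 0.93, 'Precision': 0.94,
#    'Recall': 0.95, 'MSE': 0.05}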
return metrics
'''
Parse the timer data from a given string.
@param data - String to parse timer data from.
@return - Namedtuple that contains the timer data or -1 in case of an error.
'''
def parseTimer(self, data):
# Compile the regular expression pattern into a regular expression object to
# parse the timer data.
pattern = re.compile(r"""
.*?total_time: (?P<total_time>.*?)s.*?
""", re.VERBOSE|re.MULTILINE|re.DOTALL)
match = pattern.match(data.decode())
if not match:
Log.Fatal("Can't parse the data: wrong format")
return -1
else:
# Create a namedtuple and return the timer data.
timer = collections.namedtuple("timer", ["total_time"])
if match.group("total_time").count(".") == 1:
return timer(float(match.group("total_time")))
else:
return timer(float(match.group("total_time").replace(",", ".")))
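For reference, parseTimer only needs the wrapper's output to contain a line of the form "total_time: <seconds>s". A minimal usage sketch, assuming the benchmark environment (JAVAPATH set, datasets present) and with the sample output invented for illustration:

stump = DECISIONSTUMP(['datasets/iris_train.csv', 'datasets/iris_test.csv',
                       'datasets/iris_labels.csv'], timeout=9000)
timer = stump.parseTimer(b"DECISIONSTUMP\ntotal_time: 0.42s\n")
print(timer.total_time)  # 0.42; a comma decimal such as 'total_time: 0,42s' is also handled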
@@ -0,0 +1,143 @@
'''
@file dtc.py
Class to benchmark the Weka Decision Tree Classifier method.
'''
import os
import sys
import inspect
# Import the util path; this works even if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)
# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
sys.path.insert(0, metrics_folder)
from log import *
from profiler import *
from definitions import *
from misc import *
import shlex
import subprocess
import re
import collections
import numpy as np
'''
This class implements the Decision Tree Classifier benchmark.
'''
class DTC(object):
'''
Create the Decision Tree Classifier benchmark instance.
@param dataset - Input dataset to perform DTC on.
@param timeout - The time until the timeout. Default no timeout.
@param path - Path to the directory containing weka.jar.
@param verbose - Display informational messages.
'''
def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
verbose=True):
self.verbose = verbose
self.dataset = dataset
self.path = path
self.timeout = timeout
def __del__(self):
Log.Info("Clean up.", self.verbose)
filelist = ["weka_predicted.csv"]
for f in filelist:
if os.path.isfile(f):
os.remove(f)
'''
Run the Decision Tree Classifier benchmark. If the method completes
successfully, return the collected metrics, including the total runtime.
@param options - Extra options for the method.
@return - Dictionary of metrics, or a negative value if the method was not
successful.
'''
def RunMetrics(self, options):
Log.Info("Perform DTC.", self.verbose)
opts = {}
if "minimum_leaf_size" in options:
opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size"))
else:
opts["minimum_leaf_size"] = 2
if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
raise Exception("unknown parameters")
if len(self.dataset) < 3:
Log.Fatal("This method requires three datasets: train, test and labels.")
return -1
# Split the command using shell-like syntax.
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
":methods/weka" + " DTC -t " + self.dataset[0] + " -T " +
self.dataset[1] + " -M " + str(opts["minimum_leaf_size"]))
# Run command with the necessary arguments and return its output as a byte
# string. We have untrusted input so we disable all shell based features.
try:
s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
timeout=self.timeout)
except subprocess.TimeoutExpired as e:
Log.Warn(str(e))
return -2
except Exception as e:
Log.Fatal("Could not execute command: " + str(cmd))
return -1
# Data structure to store the results.
metrics = {}
# Parse data: runtime.
timer = self.parseTimer(s)
if timer != -1:
predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
truelabels = np.genfromtxt(self.dataset[2], delimiter=',')
metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)
return metrics
'''
Parse the timer data from a given string.
@param data - String to parse timer data from.
@return - Namedtuple that contains the timer data or -1 in case of an error.
'''
def parseTimer(self, data):
# Compile the regular expression pattern into a regular expression object to
# parse the timer data.
pattern = re.compile(r"""
.*?total_time: (?P<total_time>.*?)s.*?
""", re.VERBOSE|re.MULTILINE|re.DOTALL)
match = pattern.match(data.decode())
if not match:
Log.Fatal("Can't parse the data: wrong format")
return -1
else:
# Create a namedtuple and return the timer data.
timer = collections.namedtuple("timer", ["total_time"])
if match.group("total_time").count(".") == 1:
return timer(float(match.group("total_time")))
else:
return timer(float(match.group("total_time").replace(",", ".")))
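A minimal usage sketch for the class above, assuming the benchmark environment (JAVAPATH, datasets) is set up as in the config; the benchmark driver normally constructs this from the YAML entries:

dtc = DTC(['datasets/iris_train.csv', 'datasets/iris_test.csv',
           'datasets/iris_labels.csv'], timeout=9000)
metrics = dtc.RunMetrics({'minimum_leaf_size': 2})
if isinstance(metrics, dict):
    print(metrics['Runtime'], metrics['ACC'])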