Updating Weka implementations. #95

Merged
merged 11 commits on Jul 29, 2017

Update weka implementation

rcurtin authored and Iron-Stark committed Jul 17, 2017
commit d7d050c9bd4364f386eb96f1d6eb5a0cbf8fc65a
@@ -2235,15 +2235,6 @@ methods:
options:
new_dimensionality: 2
whiten: True
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv'],
['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'] ]
KMEANS:
run: ['metric']
iteration: 3
@@ -2310,6 +2301,94 @@ methods:
['datasets/ticdata2000.csv'], ['datasets/TomsHardware.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'],
['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
DTC:
run: ['metric']
script: methods/weka/dtc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
LOGISTICREGRESSION:

@rcurtin

rcurtin Jul 25, 2017

Member

Isn't this called LogisticRegression in the other blocks?

run: ['metric']
iteration: 3
script: methods/weka/logistic_regression.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
RANDOMFOREST:
run: ['metric']
script: methods/weka/random_forest.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
PERCEPTRON:
run: ['metric']
script: methods/weka/perceptron.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
---
# ANN:
# A Library for Approximate Nearest Neighbor Searching
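For anyone who wants to exercise the new blocks locally: assuming the benchmark suite's usual Makefile targets apply (an assumption, they are not shown in this diff), a run would look something like:

make run BLOCK=weka METHODBLOCK=DTC

where METHODBLOCK names one of the blocks added above (DTC, LOGISTICREGRESSION, NBC, RANDOMFOREST, PERCEPTRON).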
@@ -0,0 +1,132 @@
'''
  @file dtc.py
  Class to benchmark the weka Decision Stump Classifier method.
'''

import os
import sys
import inspect

# Import the util path; this method works even if the path contains symlinks
# to modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
  sys.path.insert(0, cmd_subfolder)

# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
  sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np

'''
This class implements the Decision Stump Classifier benchmark.
'''
class DTC(object):

@zoq

zoq Jul 23, 2017

Member

We should probably rename the class here, to avoid confusions with the Decision Tree Classifier class, what do you think?

  '''
  Create the Decision Stump Classifier benchmark instance.

  @param dataset - Input dataset to perform DECISIONSTUMP on.
  @param timeout - The time until the timeout. Default no timeout.
  @param path - Path to the directory containing weka.jar.
  @param verbose - Display informational messages.
  '''
  def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
      verbose=True):
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout

@zoq

zoq Jul 23, 2017

Member

I think we should add a destructor here to clean up, e.g. remove the weka_predicted.csv file.
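A minimal sketch of what that could look like (an assumption, not part of this diff; it presumes the Java wrapper always writes weka_predicted.csv to the working directory, as RunMetrics below expects):

  def __del__(self):
    # Clean up the predictions file the Java wrapper leaves behind, if any.
    if os.path.isfile("weka_predicted.csv"):
      os.remove("weka_predicted.csv")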

  '''
  Run the Decision Stump Classifier. If the method has completed successfully,
  return the elapsed time in seconds.

  @param options - Extra options for the method.
  @return - Elapsed time in seconds or a negative value if the method was not
      successful.
  '''
  def RunMetrics(self, options):
    Log.Info("Perform DECISIONSTUMP.", self.verbose)

    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    if len(self.dataset) < 2:
      Log.Fatal("This method requires two or more datasets.")
      return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " DECISIONSTUMP -t " + self.dataset[0] + " -T " +
        self.dataset[1])

    # Run command with the nessecary arguments and return its output as a byte

@rcurtin

rcurtin Jul 25, 2017

Member

Typo: "necessary" :)

    # string. We have untrusted input so we disable all shell based features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Data structure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)

    if timer != -1:
      predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
      truelabels = np.genfromtxt(self.dataset[2], delimiter=',')

      metrics['Runtime'] = timer.total_time
      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
      metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
      metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
      metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
      metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
      metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics

  '''
  Parse the timer data from a given string.

  @param data - String to parse timer data from.
  @return - Namedtuple that contains the timer data or -1 in case of an error.
  '''
  def parseTimer(self, data):
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?total_time: (?P<total_time>.*?)s.*?
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1
    else:
      # Create a namedtuple and return the timer data.
      timer = collections.namedtuple("timer", ["total_time"])

      # Handle locales where the JVM prints a decimal comma instead of a dot.
      if match.group("total_time").count(".") == 1:
        return timer(float(match.group("total_time")))
      else:
        return timer(float(match.group("total_time").replace(",", ".")))
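As a side note on that comma branch: under some locales the JVM prints a decimal comma, so both spellings should parse to the same value. A standalone sketch of the expected behavior (illustration only, using a simplified copy of the regex above):

import re
import collections

pattern = re.compile(r".*?total_time: (?P<total_time>.*?)s.*?", re.DOTALL)
timer = collections.namedtuple("timer", ["total_time"])

for out in (b"[INFO ] total_time: 1.5s", b"[INFO ] total_time: 1,5s"):
  m = pattern.match(out.decode())
  value = m.group("total_time").replace(",", ".")
  print(timer(float(value)))  # timer(total_time=1.5) in both cases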
@@ -0,0 +1,132 @@
'''
  @file dtc.py
  Class to benchmark the weka Decision Tree Classifier method.
'''

import os
import sys
import inspect

# Import the util path; this method works even if the path contains symlinks
# to modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
  sys.path.insert(0, cmd_subfolder)

# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
  sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np

'''
This class implements the Decision Tree Classifier benchmark.
'''
class DTC(object):
  '''
  Create the Decision Tree Classifier benchmark instance.

  @param dataset - Input dataset to perform DTC on.
  @param timeout - The time until the timeout. Default no timeout.
  @param path - Path to the directory containing weka.jar.
  @param verbose - Display informational messages.
  '''
  def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
      verbose=True):
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout
  '''
  Run the Decision Tree Classifier. If the method has completed successfully,
  return the elapsed time in seconds.

  @param options - Extra options for the method.
  @return - Elapsed time in seconds or a negative value if the method was not
      successful.
  '''
  def RunMetrics(self, options):
    Log.Info("Perform DTC.", self.verbose)

    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

@rcurtin

rcurtin Jul 25, 2017

Member

We should at least add a minimum leaf size parameter here. I realize that the other decision tree benchmark implementations don't support that, but they should, so we can at least start with this one.

@Iron-Stark

Iron-Stark Jul 25, 2017

Contributor

Could you tell me how to use the options in the weka code? I see that

-M number
Set minimum number of instances per leaf. (Default: 2)

is available, but where does this fit in the code?

@zoq

zoq Jul 26, 2017

Member

In the case of J48, you can write:

J48 cModel = new J48();
cModel.setOptions(weka.core.Utils.splitOptions("-M 2"));

You can also add more options, e.g. weka.core.Utils.splitOptions("-M 2 -C 0.3").
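On the Python side, the benchmark script could forward such an option to the wrapper roughly like this (a sketch only; the "minimum_leaf_size" key and the wrapper accepting -M are assumptions, not part of this diff):

# Hypothetical: pop a minimum leaf size from the options dict and forward it
# to the Java wrapper's -M flag, falling back to weka's default of 2.
minLeafSize = str(options.pop("minimum_leaf_size", 2))
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
    ":methods/weka" + " DTC -t " + self.dataset[0] + " -T " +
    self.dataset[1] + " -M " + minLeafSize)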

    if len(self.dataset) < 2:
      Log.Fatal("This method requires two or more datasets.")
      return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " DTC -t " + self.dataset[0] + " -T " +
        self.dataset[1])

    # Run command with the necessary arguments and return its output as a byte
    # string. We have untrusted input so we disable all shell based features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Data structure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)

    if timer != -1:
      predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
      truelabels = np.genfromtxt(self.dataset[2], delimiter=',')

      metrics['Runtime'] = timer.total_time
      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
      metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
      metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
      metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
      metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
      metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics

  '''
  Parse the timer data from a given string.

  @param data - String to parse timer data from.
  @return - Namedtuple that contains the timer data or -1 in case of an error.
  '''
  def parseTimer(self, data):
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?total_time: (?P<total_time>.*?)s.*?
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1
    else:
      # Create a namedtuple and return the timer data.
      timer = collections.namedtuple("timer", ["total_time"])

      # Handle locales where the JVM prints a decimal comma instead of a dot.
      if match.group("total_time").count(".") == 1:
        return timer(float(match.group("total_time")))
      else:
        return timer(float(match.group("total_time").replace(",", ".")))
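For reference, a sketch of how a run of this class could be driven by hand (illustration only; it assumes JAVAPATH points at the directory containing weka.jar and that the iris files from the config above exist):

import os
os.environ.setdefault("JAVAPATH", "/opt/weka")  # assumption for this sketch

from dtc import DTC

# Train/test/labels triple, matching the DTC block in the YAML config above.
instance = DTC(['datasets/iris_train.csv', 'datasets/iris_test.csv',
    'datasets/iris_labels.csv'], timeout=9000)
metrics = instance.RunMetrics({})
# e.g. {'Runtime': ..., 'ACC': ..., 'MCC': ..., 'Precision': ..., 'Recall': ..., 'MSE': ...}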