
Updating Weka implementations. #95

Merged: 11 commits, Jul 29, 2017
97 changes: 88 additions & 9 deletions config.yaml
@@ -2235,15 +2235,6 @@ methods:
options:
new_dimensionality: 2
whiten: True
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv'],
['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'] ]
KMEANS:
run: ['metric']
iteration: 3
@@ -2310,6 +2301,94 @@ methods:
['datasets/ticdata2000.csv'], ['datasets/TomsHardware.csv'],
['datasets/madelon_train.csv', 'datasets/madelon_test.csv'],
['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
DTC:
run: ['metric']
script: methods/weka/dtc.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
LOGISTICREGRESSION:
Member:
Isn't this called LogisticRegression in the other blocks?

run: ['metric']
iteration: 3
script: methods/weka/logistic_regression.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
NBC:
run: ['metric']
iteration: 3
script: methods/weka/nbc.py
format: [arff]
datasets:
- files: [ ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
RANDOMFOREST:
run: ['metric']
script: methods/weka/random_forest.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

PERCEPTRON:
run: ['metric']
script: methods/weka/perceptron.py
format: [arff]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]

---
# ANN:
# A Library for Approximate Nearest Neighbor Searching
132 changes: 132 additions & 0 deletions methods/weka/decision_stump.py
@@ -0,0 +1,132 @@
'''
@file decision_stump.py
Class to benchmark the weka Decision Stump Classifier method.
'''

import os
import sys
import inspect

# Import the util path, this method even works if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)

#Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np

'''
This class implements the Decision Stump Classifier benchmark.
'''
class DTC(object):
Member:
We should probably rename the class here to avoid confusion with the Decision Tree Classifier class; what do you think?


'''
Create the Decision Stump Classifier benchmark instance.
@param dataset - Input dataset to perform DECISIONSTUMP on.
@param timeout - The time until the timeout. Default no timeout.
@param path - Path to the directory containing weka.jar.
@param verbose - Display informational messages.
'''
def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
verbose=True):
self.verbose = verbose
self.dataset = dataset
self.path = path
self.timeout = timeout

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add a destructor here to clean up, e.g. remove the weka_predicted.csv file.
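A minimal sketch of such a destructor, assuming the Java wrapper always writes its predictions to weka_predicted.csv in the working directory, could be:

  def __del__(self):
    # Remove the temporary predictions file written by the Java wrapper.
    if os.path.isfile("weka_predicted.csv"):
      os.remove("weka_predicted.csv")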

'''
Run the Decision Stump Classifier benchmark. If the method completes
successfully, return the measured metrics.
@param options - Extra options for the method.
@return - Dictionary of runtime and classification metrics, or a negative
value if the method was not successful.
'''
def RunMetrics(self, options):
Log.Info("Perform DECISIONSTUMP.", self.verbose)

if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
raise Exception("unknown parameters")

if len(self.dataset) < 3:
Log.Fatal("This method requires three datasets: training data, test data and true labels.")
return -1

# Split the command using shell-like syntax.
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
":methods/weka" + " DECISIONSTUMP -t " + self.dataset[0] + " -T " +
self.dataset[1])

# Run command with the nessecary arguments and return its output as a byte
Member:
Typo: "necessary" :)

# string. We have untrusted input so we disable all shell based features.
try:
s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
timeout=self.timeout)
except subprocess.TimeoutExpired as e:
Log.Warn(str(e))
return -2
except Exception as e:
Log.Fatal("Could not execute command: " + str(cmd))
return -1

# Data structure to store the results.
metrics = {}

# Parse data: runtime.
timer = self.parseTimer(s)

if timer != -1:
predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
truelabels = np.genfromtxt(self.dataset[2], delimiter=',')
metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)

Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

return metrics

'''
Parse the timer data from a given string.
@param data - String to parse timer data from.
@return - Namedtuple that contains the timer data or -1 in case of an error.
'''
def parseTimer(self, data):
# Compile the regular expression pattern into a regular expression object to
# parse the timer data.
pattern = re.compile(r"""
.*?total_time: (?P<total_time>.*?)s.*?
""", re.VERBOSE|re.MULTILINE|re.DOTALL)

match = pattern.match(data.decode())
if not match:
Log.Fatal("Can't parse the data: wrong format")
return -1
else:
# Create a namedtuple and return the timer data.
timer = collections.namedtuple("timer", ["total_time"])

if match.group("total_time").count(".") == 1:
return timer(float(match.group("total_time")))
else:
return timer(float(match.group("total_time").replace(",", ".")))
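
For context, a hypothetical standalone use of this benchmark class (the harness normally drives it via config.yaml, and the JAVAPATH environment variable must point at the directory containing weka.jar; dataset paths below are illustrative):

from decision_stump import DTC

# Train file, test file and true labels, in that order.
bench = DTC(dataset=['datasets/iris_train.csv',
                     'datasets/iris_test.csv',
                     'datasets/iris_labels.csv'])
metrics = bench.RunMetrics({})  # no extra options
if isinstance(metrics, dict):
  print(metrics.get('Runtime'), metrics.get('ACC'))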
132 changes: 132 additions & 0 deletions methods/weka/dtc.py
@@ -0,0 +1,132 @@
'''
@file dtc.py
Class to benchmark the weka Decision Tree Classifier method.
'''

import os
import sys
import inspect

# Import the util path, this method even works if the path contains symlinks to
# modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)

#Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np

'''
This class implements the Decision Tree Classifier benchmark.
'''
class DTC(object):

'''
Create the Decision Tree Classifier benchmark instance.
@param dataset - Input dataset to perform DTC on.
@param timeout - The time until the timeout. Default no timeout.
@param path - Path to the directory containing weka.jar.
@param verbose - Display informational messages.
'''
def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
verbose=True):
self.verbose = verbose
self.dataset = dataset
self.path = path
self.timeout = timeout

'''
Run the Decision Tree Classifier benchmark. If the method completes
successfully, return the measured metrics.
@param options - Extra options for the method.
@return - Dictionary of runtime and classification metrics, or a negative
value if the method was not successful.
'''
def RunMetrics(self, options):
Log.Info("Perform DTC.", self.verbose)

if len(options) > 0:
Log.Fatal("Unknown parameters: " + str(options))
raise Exception("unknown parameters")
Member:
We should at least add a minimum leaf size parameter here. I realize that the other decision tree benchmark implementations don't support that, but they should, so we can at least start with this one.

Contributor Author:
Could you explain how to use the options in the Weka code? I see that

-M number
Set minimum number of instances per leaf. (Default: 2)

is available, but where does this fit in the code?

Member:
In case of J48 you can write:

J48 cModel = new J48();
cModel.setOptions(weka.core.Utils.splitOptions("-M 2"));

You can also add more options e.g.: weka.core.Utils.splitOptions("-M 2 -C 0.3").
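
On the Python side, one purely illustrative way to forward that setting would be to read a minimum_leaf_size entry from the options dictionary and append -M to the command, assuming the Java DTC wrapper is extended to accept that flag:

# Hypothetical: assumes 'options' is a dict populated from config.yaml and
# that the Java DTC wrapper passes -M through to the Weka classifier.
minimum_leaf_size = options.pop("minimum_leaf_size", 2)
if len(options) > 0:
  Log.Fatal("Unknown parameters: " + str(options))
  raise Exception("unknown parameters")

cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
    ":methods/weka" + " DTC -M " + str(minimum_leaf_size) + " -t " +
    self.dataset[0] + " -T " + self.dataset[1])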


if len(self.dataset) < 3:
Log.Fatal("This method requires three datasets: training data, test data and true labels.")
return -1

# Split the command using shell-like syntax.
cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
":methods/weka" + " DTC -t " + self.dataset[0] + " -T " +
self.dataset[1])

# Run command with the necessary arguments and return its output as a byte
# string. We have untrusted input so we disable all shell based features.
try:
s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
timeout=self.timeout)
except subprocess.TimeoutExpired as e:
Log.Warn(str(e))
return -2
except Exception as e:
Log.Fatal("Could not execute command: " + str(cmd))
return -1

# Data structure to store the results.
metrics = {}

# Parse data: runtime.
timer = self.parseTimer(s)

if timer != -1:
predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
truelabels = np.genfromtxt(self.dataset[2], delimiter=',')

metrics['Runtime'] = timer.total_time
confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

return metrics

'''
Parse the timer data from a given string.
@param data - String to parse timer data from.
@return - Namedtuple that contains the timer data or -1 in case of an error.
'''
def parseTimer(self, data):
# Compile the regular expression pattern into a regular expression object to
# parse the timer data.
pattern = re.compile(r"""
.*?total_time: (?P<total_time>.*?)s.*?
""", re.VERBOSE|re.MULTILINE|re.DOTALL)

match = pattern.match(data.decode())
if not match:
Log.Fatal("Can't parse the data: wrong format")
return -1
else:
# Create a namedtuple and return the timer data.
timer = collections.namedtuple("timer", ["total_time"])

if match.group("total_time").count(".") == 1:
return timer(float(match.group("total_time")))
else:
return timer(float(match.group("total_time").replace(",", ".")))