Updating Weka implementations. #95

Merged: 11 commits, Jul 29, 2017
@@ -2235,15 +2235,6 @@ methods:
    options:
      new_dimensionality: 2
      whiten: True
  NBC:
    run: ['metric']
    iteration: 3
    script: methods/weka/nbc.py
    format: [arff]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv'],
                 ['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
                 ['datasets/madelon_train.csv', 'datasets/madelon_test.csv'] ]
  KMEANS:
    run: ['metric']
    iteration: 3
@@ -2310,6 +2301,94 @@ methods:
                 ['datasets/ticdata2000.csv'], ['datasets/TomsHardware.csv'],
                 ['datasets/madelon_train.csv', 'datasets/madelon_test.csv'],
                 ['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
  DTC:
    run: ['metric']
    script: methods/weka/dtc.py
    format: [arff]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
                 ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
  LogisticRegression:
    run: ['metric']
    iteration: 3
    script: methods/weka/logistic_regression.py
    format: [arff]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
                 ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
  NBC:
    run: ['metric']
    iteration: 3
    script: methods/weka/nbc.py
    format: [arff]
    datasets:
      - files: [ ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
  RANDOMFOREST:
    run: ['metric']
    script: methods/weka/random_forest.py
    format: [arff]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
                 ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
  PERCEPTRON:
    run: ['metric']
    script: methods/weka/perceptron.py
    format: [arff]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
                 ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
---
# ANN:
# A Library for Approximate Nearest Neighbor Searching
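For orientation, here is a minimal, hypothetical sketch of how one of the new DTC entries above maps onto the wrapper class added below. The dataset triple and the minimum_leaf_size option come straight from the config; the JAVAPATH location and the timeout value are made-up placeholders, and in practice the benchmark runner performs this dispatch:

import os
os.environ.setdefault("JAVAPATH", "/opt/weka")  # assumed weka.jar location

from dtc import DTC  # methods/weka/dtc.py, added below

dataset = ['datasets/iris_train.csv', 'datasets/iris_test.csv',
           'datasets/iris_labels.csv']
method = DTC(dataset, timeout=9000)  # timeout chosen for illustration
metrics = method.RunMetrics({"minimum_leaf_size": 2})
print(metrics)  # e.g. {'Runtime': ..., 'ACC': ..., 'MCC': ..., ...}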
@@ -0,0 +1,139 @@
'''
  @file decision_stump.py

  Class to benchmark the weka Decision Stump Classifier method.
'''

import os
import sys
import inspect

# Import the util path. This method works even if the path contains symlinks
# to modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
  sys.path.insert(0, cmd_subfolder)

# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
  sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np
'''
This class implements the Decision Stump Classifier benchmark.
'''
class DECISIONSTUMP(object):

  '''
  Create the Decision Stump Classifier benchmark instance.

  @param dataset - Input dataset to perform DECISIONSTUMP on.
  @param timeout - The time until the timeout. Default no timeout.
  @param path - Path to the directory that contains weka.jar.
  @param verbose - Display informational messages.
  '''
  def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
      verbose=True):
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout

zoq (Member) commented on Jul 23, 2017:

I think we should add a destructor here to clean up, e.g. remove the weka_predicted.csv file.

  def __del__(self):
    Log.Info("Clean up.", self.verbose)
    filelist = ["weka_predicted.csv"]
    for f in filelist:
      if os.path.isfile(f):
        os.remove(f)
  '''
  Run the Decision Stump Classifier benchmark. On success, return a dictionary
  of the collected metrics, including the total runtime in seconds.

  @param options - Extra options for the method.
  @return - A dictionary with the metrics, or a negative value if the method
  was not successful.
  '''
  def RunMetrics(self, options):
    Log.Info("Perform DECISIONSTUMP.", self.verbose)

    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    if len(self.dataset) < 2:
      Log.Fatal("This method requires two or more datasets.")
      return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " DECISIONSTUMP -t " + self.dataset[0] + " -T " +
        self.dataset[1])

    # Run the command with the necessary arguments and return its output as a
    # byte string. We have untrusted input, so we disable all shell based
    # features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Data structure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)
    if timer != -1:
      predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
      truelabels = np.genfromtxt(self.dataset[2], delimiter=',')
      metrics['Runtime'] = timer.total_time
      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
      metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
      metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
      metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
      metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
      metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics
  '''
  Parse the timer data from a given string.

  @param data - String to parse timer data from.
  @return - Namedtuple that contains the timer data or -1 in case of an error.
  '''
  def parseTimer(self, data):
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?total_time: (?P<total_time>.*?)s.*?
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1
    else:
      # Create a namedtuple and return the timer data; a comma decimal
      # separator is normalized to a dot first.
      timer = collections.namedtuple("timer", ["total_time"])
      if match.group("total_time").count(".") == 1:
        return timer(float(match.group("total_time")))
      else:
        return timer(float(match.group("total_time").replace(",", ".")))
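A side note on the comma handling in parseTimer: some JVM/locale combinations print the decimal separator as a comma, so the parser normalizes it before converting. A standalone sketch (the log lines are made up for illustration) exercises both paths:

import re
import collections

pattern = re.compile(r"""
    .*?total_time: (?P<total_time>.*?)s.*?
    """, re.VERBOSE|re.MULTILINE|re.DOTALL)
timer = collections.namedtuple("timer", ["total_time"])

# One dot-formatted and one comma-formatted sample output.
for sample in [b"[INFO ] total_time: 0.42s\n", b"[INFO ] total_time: 0,42s\n"]:
  value = pattern.match(sample.decode()).group("total_time")
  if value.count(".") != 1:
    value = value.replace(",", ".")  # normalize the comma separator
  print(timer(float(value)))  # timer(total_time=0.42) in both cases

(Since re.VERBOSE strips the literal space in the pattern, the captured group carries a leading blank; float() accepts it, matching the behavior of the methods above.)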
@@ -0,0 +1,143 @@
'''
  @file dtc.py

  Class to benchmark the weka Decision Tree Classifier method.
'''

import os
import sys
import inspect

# Import the util path. This method works even if the path contains symlinks
# to modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
  sys.path.insert(0, cmd_subfolder)

# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
  sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *

import shlex
import subprocess
import re
import collections
import numpy as np
'''
This class implements the Decision Tree Classifier benchmark.
'''
class DTC(object):

  '''
  Create the Decision Tree Classifier benchmark instance.

  @param dataset - Input dataset to perform DTC on.
  @param timeout - The time until the timeout. Default no timeout.
  @param path - Path to the directory that contains weka.jar.
  @param verbose - Display informational messages.
  '''
  def __init__(self, dataset, timeout=0, path=os.environ["JAVAPATH"],
      verbose=True):
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout
  def __del__(self):
    Log.Info("Clean up.", self.verbose)
    filelist = ["weka_predicted.csv"]
    for f in filelist:
      if os.path.isfile(f):
        os.remove(f)
  '''
  Run the Decision Tree Classifier benchmark. On success, return a dictionary
  of the collected metrics, including the total runtime in seconds.

  @param options - Extra options for the method.
  @return - A dictionary with the metrics, or a negative value if the method
  was not successful.
  '''
  def RunMetrics(self, options):
    Log.Info("Perform DTC.", self.verbose)

    opts = {}
    if "minimum_leaf_size" in options:
      opts["minimum_leaf_size"] = int(options.pop("minimum_leaf_size"))
    else:
      opts["minimum_leaf_size"] = 2
    if len(options) > 0:
      Log.Fatal("Unknown parameters: " + str(options))
      raise Exception("unknown parameters")

    if len(self.dataset) < 2:
      Log.Fatal("This method requires two or more datasets.")
      return -1

    # Split the command using shell-like syntax.
    cmd = shlex.split("java -classpath " + self.path + "/weka.jar" +
        ":methods/weka" + " DTC -t " + self.dataset[0] + " -T " +
        self.dataset[1] + " -M " + str(opts["minimum_leaf_size"]))

    # Run the command with the necessary arguments and return its output as a
    # byte string. We have untrusted input, so we disable all shell based
    # features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Data structure to store the results.
    metrics = {}

    # Parse data: runtime.
    timer = self.parseTimer(s)
    if timer != -1:
      predictions = np.genfromtxt("weka_predicted.csv", delimiter=',')
      truelabels = np.genfromtxt(self.dataset[2], delimiter=',')
      metrics['Runtime'] = timer.total_time
      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictions)
      metrics['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
      metrics['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
      metrics['Precision'] = Metrics.AvgPrecision(confusionMatrix)
      metrics['Recall'] = Metrics.AvgRecall(confusionMatrix)
      metrics['MSE'] = Metrics.SimpleMeanSquaredError(truelabels, predictions)
      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    return metrics
  '''
  Parse the timer data from a given string.

  @param data - String to parse timer data from.
  @return - Namedtuple that contains the timer data or -1 in case of an error.
  '''
  def parseTimer(self, data):
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(r"""
        .*?total_time: (?P<total_time>.*?)s.*?
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data.decode())
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1
    else:
      # Create a namedtuple and return the timer data; a comma decimal
      # separator is normalized to a dot first.
      timer = collections.namedtuple("timer", ["total_time"])
      if match.group("total_time").count(".") == 1:
        return timer(float(match.group("total_time")))
      else:
        return timer(float(match.group("total_time").replace(",", ".")))
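The Metrics helpers used above (ConfusionMatrix, AverageAccuracy, and so on) come from the shared metrics definitions imported at the top of each script. As a rough orientation only, not the repository's actual implementation, the core computation looks something like this, assuming integer class labels:

import numpy as np

def confusion_matrix(truelabels, predictions):
  # cm[i][j] counts samples of true class i predicted as class j.
  labels = np.unique(np.concatenate((truelabels, predictions)))
  index = {label: i for i, label in enumerate(labels)}
  cm = np.zeros((len(labels), len(labels)), dtype=int)
  for t, p in zip(truelabels, predictions):
    cm[index[t], index[p]] += 1
  return cm

truelabels = np.array([0, 0, 1, 1, 2])
predictions = np.array([0, 1, 1, 1, 2])
cm = confusion_matrix(truelabels, predictions)
print(cm.trace() / cm.sum())  # overall accuracy: 0.8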