Update of scikit libraries #60

Merged: 10 commits, Jun 7, 2017
28 changes: 27 additions & 1 deletion config.yaml
@@ -809,6 +809,15 @@ methods:
# Scikit-Learn: machine learning in Python
library: scikit
methods:
ICA:
run: ['metric']
iteration: 3
script: methods/scikit/ica.py
format: [csv, txt]
datasets:
- files: ['datasets/iris.csv', 'datasets/wine.csv',
'datasets/cities.csv', 'datasets/diabetes_X.csv']

PCA:
run: ['metric']
iteration: 3
@@ -1173,7 +1182,6 @@ methods:
'datasets/vehicle.csv', 'datasets/USCensus1990.csv',
'datasets/optdigits.csv', 'datasets/isolet.csv',
'datasets/TomsHardware.csv', 'datasets/covtype.csv']
options: '-s 42'
LinearRegression:
run: ['metric']
iteration: 3
@@ -1214,6 +1222,24 @@ methods:
- files: [ ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'] ]
options: '-t 50.0'
LogisticRegression:
run: ['metric']
iteration: 3
script: methods/scikit/logistic_regression.py
format: [csv, txt]
datasets:
- files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
['datasets/reuters_train.csv', 'datasets/reuters_test.csv', 'datasets/reuters_labels.csv'],
['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
---
# mlpy is a Python module for Machine Learning built on top of NumPy/SciPy
# and the GNU Scientific Libraries.
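For reference, the new ICA entry can be read back out of config.yaml the same way as any other method block. A minimal PyYAML sketch (illustrative only; the benchmark suite's own config loader is not part of this diff):

# Illustrative sketch: read the scikit ICA entry added above with PyYAML.
# This is not the benchmark suite's own parser.
import yaml

with open("config.yaml") as f:
    # config.yaml holds multiple YAML documents separated by '---'.
    for doc in yaml.safe_load_all(f):
        if doc and doc.get("library") == "scikit":
            ica = doc["methods"]["ICA"]
            print(ica["script"])                # methods/scikit/ica.py
            print(ica["datasets"][0]["files"])  # ['datasets/iris.csv', ...]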
36 changes: 27 additions & 9 deletions methods/scikit/LSHForest.py
@@ -45,8 +45,8 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.n = 10
self.k = 5
self.n_estimators = 10
self.n_neighbors = 5

'''
Build the model for the Approximate Nearest Neighbors.
@@ -57,8 +57,12 @@ def __init__(self, dataset, timeout=0, verbose=True):
'''
def BuildModel(self, data, labels):
# Create and train the classifier.
lshf = LSHForest(n_estimators = self.n,
n_neighbors = self.k)
lshf = LSHForest(n_estimators = self.n_estimators,
min_hash_match = self.min_hash_match,
n_candidates = self.n_candidates,
radius_cutoff_ratio = self.radius_cutoff_ratio,
radius = self.radius,
n_neighbors = self.n_neighbors)
lshf.fit(data)
return lshf

@@ -76,16 +80,30 @@ def RunAnnScikit(q):
Log.Info("Loading dataset", self.verbose)
trainData, labels = SplitTrainData(self.dataset)
testData = LoadDataset(self.dataset[1])
n = re.search("-n (\d+)", options) #Number of Estimators.
k = re.search("-k (\d+)", options) #Number of Neighbors.
self.n = 10 if not n else int(n.group(1))
self.k = 5 if not k else int(k.group(1))
#Number of trees in the LSH Forest.
n_estimators = re.search("-n (\d+)", options)
#Number of neighbors to be returned from the query function.
n_neighbors = re.search("-k (\d+)", options)
#Lowest hash length to be searched when candidate selection is performed.
min_hash_match = re.search("-H (\d+)", options)
#Minimum number of candidates evaluated per estimator.
n_candidates = re.search("--n_candidates (\d+)", options)
#Radius of the neighborhood used for range queries.
radius = re.search("--radius ([\d.]+)", options)
#Cutoff ratio for radius neighbors; a value in the range [0, 1).
radius_cutoff_ratio = re.search("--radius_cutoff_ratio ([\d.]+)", options)
self.n_estimators = 10 if not n_estimators else int(n_estimators.group(1))
self.n_neighbors = 5 if not n_neighbors else int(n_neighbors.group(1))
self.min_hash_match = 4 if not min_hash_match else int(min_hash_match.group(1))
self.n_candidates = 10 if not n_candidates else int(n_candidates.group(1))
self.radius = 1.0 if not radius else float(radius.group(1))
self.radius_cutoff_ratio = 0.9 if not radius_cutoff_ratio else float(radius_cutoff_ratio.group(1))
try:
with totalTimer:
self.model = self.BuildModel(trainData, labels)
# Run Approximate on the test dataset.
distances,indices = self.model.kneighbors(testData,
n_neighbors = self.k)
n_neighbors = self.n_neighbors)
except Exception as e:
Log.Debug(str(e))
q.put(-1)
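Taken together, the parsing above wires six flags (-n, -k, -H, --n_candidates, --radius, --radius_cutoff_ratio) through to scikit-learn's LSHForest. A standalone sketch of the resulting call at this file's defaults (LSHForest was current at the time of this PR; it was deprecated in scikit-learn 0.19 and later removed):

# Sketch of the LSHForest call BuildModel assembles, at this PR's defaults.
import numpy as np
from sklearn.neighbors import LSHForest  # removed in scikit-learn 0.21

X = np.random.rand(100, 5)
lshf = LSHForest(n_estimators=10,          # -n: trees in the forest
                 min_hash_match=4,         # -H: lowest hash length searched
                 n_candidates=10,          # --n_candidates
                 radius_cutoff_ratio=0.9,  # --radius_cutoff_ratio
                 radius=1.0,               # --radius
                 n_neighbors=5)            # -k: neighbors returned per query
lshf.fit(X)
distances, indices = lshf.kneighbors(X[:3], n_neighbors=5)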
42 changes: 38 additions & 4 deletions methods/scikit/allknn.py
@@ -64,7 +64,14 @@ def RunAllKnnScikit(q):
# Get all the parameters.
k = re.search("-k (\d+)", options)
leafSize = re.search("-l (\d+)", options)

radius = re.search("--radius ([\d.]+)", options)
tree_type = re.search("-t (\S+)", options)
metric = re.search("--metric (\S+)", options)
# Parameter for the Minkowski metric: p=1 is equivalent to the Manhattan
# distance and p=2 to the Euclidean distance; minkowski_distance is used
# for arbitrary p.
p = re.search("-p (\d+)", options)
n_jobs = re.search("--n_jobs (\d+)", options)
if not k:
Log.Fatal("Required option: Number of furthest neighbors to find.")
q.put(-1)
@@ -78,18 +85,45 @@
return -1

if not leafSize:
l = 20
leafSize = 20
elif int(leafSize.group(1)) < 0:
Log.Fatal("Invalid leaf size: " + str(leafSize.group(1)) + ". Must" +
" be greater than or equal to 0.")
q.put(-1)
return -1
else:
l = int(leafSize.group(1))
leafSize = int(leafSize.group(1))
if not tree_type:
tree_type = 'kd_tree'
else:
tree_type = str(tree_type.group(1))
if tree_type not in ['auto', 'ball_tree', 'kd_tree', 'brute']:
Log.Fatal("Invalid tree type: " + tree_type
+ ". Must be either auto, ball_tree, kd_tree or brute.")
q.put(-1)
return -1
radius = 1.0 if not radius else float(radius.group(1))
p = 2 if not p else int(p.group(1))
if not metric:
metric = 'minkowski'
else:
metric = str(metric.group(1))
if metric not in ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']:
Log.Fatal("Invalid metric type: " + metric
+ ". Must be either cityblock, cosine, euclidean, l1, l2 or manhattan.")
q.put(-1)
return -1
n_jobs = 1 if not n_jobs else int(n_jobs.group(1))

try:
# Perform All K-Nearest-Neighbors.
model = NearestNeighbors(n_neighbors=k, algorithm='kd_tree', leaf_size=l)
model = NearestNeighbors(n_neighbors=k,
algorithm=tree_type,
leaf_size=leafSize,
radius=radius,
metric=metric,
p=p,
n_jobs=n_jobs)
model.fit(referenceData)

if len(self.dataset) == 2:
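For review purposes, the fully expanded NearestNeighbors call with the defaults chosen above. One caveat: kd_tree, the default algorithm here, does not support the 'cosine' metric, so that combination would need algorithm='brute'.

# Sketch: NearestNeighbors configured with this file's default option values.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(200, 4)
model = NearestNeighbors(n_neighbors=3,        # -k
                         algorithm='kd_tree',  # -t
                         leaf_size=20,         # -l
                         radius=1.0,           # --radius
                         metric='minkowski',   # --metric
                         p=2,                  # -p (p=2 is Euclidean)
                         n_jobs=1)             # --n_jobs
model.fit(X)
distances, indices = model.kneighbors(X)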
36 changes: 33 additions & 3 deletions methods/scikit/elastic_net.py
@@ -47,8 +47,18 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.rho = 0.5
self.rho = 1.0
self.alpha = 0.5
self.fit_intercept = True
self.normalize = False
self.precompute = False
self.max_iter = 1000
self.copy_X = True
self.tol = 0.0001
self.warm_start = False
self.positive = False
self.selection = 'cyclic'


'''
Build the model for the Elastic Net Classifier.
@@ -60,7 +70,16 @@ def BuildModel(self, data, labels):
def BuildModel(self, data, labels):
# Create and train the classifier.
elasticNet = SElasticNet(alpha=self.rho,
l1_ratio=self.alpha)
l1_ratio=self.alpha,
fit_intercept = self.fit_intercept,
normalize = self.normalize,
precompute = self.precompute,
max_iter = self.max_iter,
copy_X = self.copy_X,
tol = self.tol,
warm_start = self.warm_start,
positive = self.positive,
selection = self.selection)
elasticNet.fit(data, labels)
return elasticNet

@@ -81,9 +100,20 @@ def RunElasticNetScikit(q):

r = re.search("-r ([\d.]+)", options)
a = re.search("-a ([\d.]+)", options)
max_iter = re.search("--max_iter (\d+)", options)
tol = re.search("--tol ([\d.]+)", options)
selection = re.search("--selection (\S+)", options)

self.rho = 0.5 if not r else int(r.group(1))
self.rho = 1.0 if not r else float(r.group(1))
self.alpha = 0.5 if not a else float(a.group(1))
self.max_iter = 1000 if not max_iter else int(max_iter.group(1))
self.tol = 0.0001 if not tol else float(tol.group(1))
self.selection = 'cyclic' if not selection else str(selection.group(1))
if self.selection not in ['cyclic', 'random']:
Log.Fatal("Invalid selection: " + self.selection
+ ". Must be either cyclic or random.")
q.put(-1)
return -1

try:
with totalTimer:
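One naming pitfall in this file: self.rho is passed as scikit-learn's alpha (overall regularization strength) and self.alpha as l1_ratio (the L1/L2 mix). A minimal standalone equivalent of the call, at the defaults set above:

# Sketch: the ElasticNet call BuildModel assembles. Note the name swap:
# the benchmark's rho -> scikit-learn's alpha, alpha -> l1_ratio.
import numpy as np
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(42)
X = rng.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.01 * rng.rand(100)

model = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                   precompute=False, max_iter=1000, copy_X=True,
                   tol=1e-4, warm_start=False, positive=False,
                   selection='cyclic')
model.fit(X, y)
print(model.coef_)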
18 changes: 12 additions & 6 deletions methods/scikit/gmm.py
@@ -55,18 +55,24 @@ def RunGMMScikit(q):

# Get all the parameters.
g = re.search("-g (\d+)", options)
n = re.search("-n (\d+)", options)
s = re.search("-n (\d+)", options)

s = re.search("-s (\d+)", options)
tol = re.search("-T ([\d.]+)", options)
n_init = re.search("-t (\d+)", options)
max_iter = re.search("-n (\d+)", options)
g = 1 if not g else int(g.group(1))
n = 250 if not n else int(n.group(1))
s = 0 if not s else int(s.group(1))
tol = 0.001 if not tol else float(tol.group(1))
max_iter = 100 if not max_iter else int(max_iter.group(1))
n_init = 1 if not n_init else int(n_init.group(1))

try:
# Create the Gaussian Mixture Model
# Some params changed to match mlpack defaults.
model = mixture.GaussianMixture(n_components=g, covariance_type='full',
random_state=s, n_iter=n, n_init=10, thresh=1e-10)
model = mixture.GaussianMixture(n_components=g,
random_state=s,
n_init=n_init,
tol=tol,
max_iter = max_iter)
with totalTimer:
model.fit(dataPoints)
except Exception as e:
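This hunk is essentially the scikit-learn 0.18 API migration: mixture.GMM became mixture.GaussianMixture, and its n_iter/thresh arguments became max_iter/tol. A minimal sketch of the new call at the defaults parsed above:

# Sketch: GaussianMixture as constructed above (0.18+ API).
import numpy as np
from sklearn import mixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (150, 2)), rng.normal(5, 1, (150, 2))])

model = mixture.GaussianMixture(n_components=2,  # -g
                                random_state=0,  # -s
                                n_init=1,        # -t
                                tol=1e-3,        # -T
                                max_iter=100)    # -n
model.fit(X)
print(model.means_)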
26 changes: 25 additions & 1 deletion methods/scikit/ica.py
@@ -54,12 +54,36 @@ def RunICAScikit(q):
data = np.genfromtxt(self.dataset, delimiter=',')

s = re.search('-s (\d+)', options)
n_components = re.search('--n_components (\d+)', options)
algorithm = re.search('--algorithm (\S+)', options)
fun = re.search('--fun (\S+)', options)
max_iter = re.search('--max_iter (\d+)', options)
tol = re.search('--tol ([\d.]+)', options)

s = 0 if not s else int(s.group(1))
n_components = None if not n_components else int(n_components.group(1))
algorithm = 'parallel' if not algorithm else str(algorithm.group(1))
if algorithm not in ['parallel', 'deflation']:
Log.Fatal("Invalid value for algorithm: " + algorithm
+ ". Must be either parallel or deflation.")
q.put(-1)
return -1
fun = 'logcosh' if not fun else str(fun.group(1))
if fun not in ['logcosh', 'exp', 'cube']:
Log.Fatal("Invalid value for fun: " + fun
+ ". Must be either logcosh, exp or cube.")
q.put(-1)
return -1
max_iter = 200 if not max_iter else int(max_iter.group(1))
tol = 0.0001 if not tol else float(tol.group(1))

try:
# Perform ICA.
with totalTimer:
model = FastICA(random_state=s)
model = FastICA(n_components = n_components,
algorithm = algorithm,
fun = fun,
max_iter = max_iter,
tol = tol,
random_state = s)
ic = model.fit(data).transform(data)
except Exception as e:
q.put(-1)
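A standalone sketch of the FastICA call with the option defaults above; fit_transform is equivalent to the fit(data).transform(data) chain used in the script:

# Sketch: FastICA at this file's defaults, on a toy two-source mixture.
import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
S = rng.standard_t(1.5, size=(200, 2))      # non-Gaussian sources
X = S @ np.array([[1.0, 0.5], [0.5, 1.0]])  # mixed observations

model = FastICA(n_components=None,     # --n_components
                algorithm='parallel',  # --algorithm
                fun='logcosh',         # --fun
                max_iter=200,          # --max_iter
                tol=1e-4,              # --tol
                random_state=0)        # -s
ic = model.fit_transform(X)            # same as fit(X).transform(X)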
4 changes: 4 additions & 0 deletions methods/scikit/kernel_pca.py
@@ -84,6 +84,10 @@ def RunKPCAScikit(q):
degree = 1 if not degree else int(degree.group(1))

model = KernelPCA(n_components=d, kernel="poly", degree=degree)
elif kernel.group(1) == "cosine":
model = KernelPCA(n_components=d, kernel="cosine", degree=degree)
elif kernel.group(1) == "gaussian":
model = KernelPCA(n_components=d, kernel="rbf", degree=degree)
else:
Log.Fatal("Invalid kernel type (" + kernel.group(1) + "); valid " +
"choices are 'linear', 'hyptan' and 'polynomial'.")
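The two branches added here map the benchmark's kernel names onto scikit-learn's: 'cosine' passes straight through, while 'gaussian' becomes the 'rbf' kernel; degree is accepted but only meaningful for 'poly'. A quick sketch:

# Sketch: the two KernelPCA kernels this diff adds.
import numpy as np
from sklearn.decomposition import KernelPCA

X = np.random.rand(100, 6)
X_cosine = KernelPCA(n_components=2, kernel="cosine").fit_transform(X)
X_rbf = KernelPCA(n_components=2, kernel="rbf").fit_transform(X)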
30 changes: 24 additions & 6 deletions methods/scikit/kmeans.py
@@ -64,6 +64,7 @@ def RunKMeansScikit(q):
clusters = re.search("-c (\d+)", options)
maxIterations = re.search("-m (\d+)", options)
seed = re.search("-s (\d+)", options)
algorithm = re.search("-a (\S+)", options)

# Now do validation of options.
if not clusters and len(self.dataset) != 2:
@@ -77,18 +78,35 @@
return -1

m = 1000 if not maxIterations else int(maxIterations.group(1))

algorithm = 'auto' if not algorithm else str(algorithm.group(1))

if algorithm not in ['naive', 'elkan', 'auto']:
Log.Fatal("Invalid value for algorithm: " + algorithm
+ ". Must be either auto, elkan or naive.")
q.put(-1)
return -1
# scikit-learn has no 'naive' option; map it to 'auto'.
if algorithm == 'naive':
algorithm = 'auto'
try:
# Create the KMeans object and perform K-Means clustering.
with totalTimer:
if len(self.dataset) == 2:
kmeans = KMeans(n_clusters=int(clusters.group(1)), init=centroids,
n_init=1, max_iter=m)
kmeans = KMeans(n_clusters=int(clusters.group(1)),
init=centroids,
n_init=1,
max_iter=m,
algorithm=algorithm)
elif seed:
kmeans = KMeans(n_clusters=int(clusters.group(1)), init='random',
n_init=1, max_iter=m, random_state=int(seed.group(1)))
kmeans = KMeans(n_clusters=int(clusters.group(1)),
init='random',
n_init=1,
max_iter=m,
random_state=int(seed.group(1)),
algorithm=algorithm)
else:
kmeans = KMeans(n_clusters=int(clusters.group(1)), n_init=1, max_iter=m)
kmeans = KMeans(n_clusters=int(clusters.group(1)),
n_init=1,
max_iter=m,
algorithm=algorithm)

kmeans.fit(data)
labels = kmeans.labels_
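The new -a flag threads an algorithm choice through all three KMeans construction paths. scikit-learn's own names (since 0.18) are 'auto', 'full' (plain Lloyd iteration) and 'elkan'; this file accepts 'naive' and maps it to 'auto'. A sketch of one of the resulting calls:

# Sketch: KMeans with the algorithm option added above.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(42)
X = rng.rand(150, 2)
kmeans = KMeans(n_clusters=3, n_init=1, max_iter=1000,
                random_state=42, algorithm='elkan')
kmeans.fit(X)
print(kmeans.labels_[:10])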
9 changes: 8 additions & 1 deletion methods/scikit/logistic_regression.py
@@ -47,6 +47,8 @@ def __init__(self, dataset, timeout=0, verbose=True):
self.dataset = dataset
self.timeout = timeout
self.model = None
self.e = 1e-4
self.n = 100

'''
Build the model for the Logistic Regression.
@@ -57,7 +59,8 @@
'''
def BuildModel(self, data, responses):
# Create and train the classifier.
lr = SLogisticRegression()
lr = SLogisticRegression(max_iter = self.n,
tol = self.e)
lr.fit(data, responses)
return lr

@@ -80,6 +83,10 @@ def RunLogisticRegressionScikit(q):

# Use the last row of the training set as the responses.
X, y = SplitTrainData(self.dataset)
e = re.search("-e ([\d.e-]+)", options) # Tolerance.
n = re.search("-n (\d+)", options) # Maximum number of iterations.
self.e = 1e-4 if not e else float(e.group(1))
self.n = 100 if not n else int(n.group(1))

try:
with totalTimer:
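With the two new options in place, the classifier is built as below; -n sets max_iter and -e sets tol, per the parsing added in this diff.

# Sketch: LogisticRegression at the -n (max_iter) and -e (tol) defaults.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

lr = LogisticRegression(max_iter=100, tol=1e-4)
lr.fit(X, y)
print(lr.score(X, y))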