Refactor bench PCA.

luispedro · Jun 28, 2011 · 97c8d64 · 97c8d64
1 parent e70ec0d
commit 97c8d64
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 63 deletions.
diff --git a/benchmarks/bench_lassolars.py b/benchmarks/bench_lassolars.py
@@ -45,9 +45,6 @@ def bench_pymvpa(X, y, T, valid):
 #    mvpa_pred = mvpa_clf.predict(X)
     return None, datetime.now() - tstart
 
-
-
-
 if __name__ == '__main__':
     import sys, misc
 

diff --git a/benchmarks/bench_pca.py b/benchmarks/bench_pca.py
@@ -3,51 +3,60 @@
 import numpy as np
 from datetime import datetime
 
-#
-#       .. Load dataset ..
-#
-from misc import load_data, bench
-print 'Loading data ...'
-X, y, T = load_data()
-print 'Done, %s samples with %s features loaded into ' \
-      'memory' % X.shape
 n_components = 9
 
-
-
-def bench_skl():
+def explained_variance(X, W):
+    """
+    We compute explained variance from the principal directions W using the
+    principle that W are the eigenvectors for the covariance matrix dot(X.T,
+    X).
+    """
+    mean = np.mean(X, axis=0)
+    X -= mean
+    C = np.dot(X.T, X)
+    s = np.zeros(W.shape[0])
+    for i in range(W.shape[0]):
+        s[i] = np.dot(np.dot(W[i], C.T), W[i].T) / np.dot(W[i], W[i].T)
+    return s / X.shape[0]
+
+
+def bench_skl(X, y, T, valid):
 #
 #       .. scikits.learn ..
 #
-    from scikits.learn import pca as skl_pca
+    from scikits.learn import decomposition
     start = datetime.now()
-    clf = skl_pca.RandomizedPCA(n_components=n_components)
+    clf = decomposition.RandomizedPCA(n_components=n_components)
     clf.fit(X)
-    return datetime.now() - start
+    ev = explained_variance(X, clf.components_).sum()
+    return ev, datetime.now() - start
 
 
-def bench_pybrain():
+def bench_pybrain(X, y, T, valid):
 #
 #       .. pybrain ..
 #
-    from pybrain.auxiliary import pca as pybrain_pca
+    from pybrain.auxiliary import pca
     start = datetime.now()
-    pybrain_pca.pca(X, n_components)
-    return datetime.now() - start
+    W = pca.pPca(X, n_components)
+    ev = explained_variance(X, W).sum()
+    return ev, datetime.now() - start
 
 
-def bench_mdp():
+def bench_mdp(X, y, T, valid):
 #
 #       .. MDP ..
 #
     from mdp.nodes import PCANode
     start = datetime.now()
-    mdp_clf = PCANode(output_dim=n_components)
-    mdp_clf.train(X)
-    return datetime.now() - start
+    clf = PCANode(output_dim=n_components)
+    clf.train(X)
+    clf.stop_training()
+    ev = explained_variance(X, clf.v.T).sum()
+    return ev, datetime.now() - start
 
 
-def bench_pymvpa():
+def bench_pymvpa(X, y, T, valid):
 #
 #       .. PyMVPA ..
 #
@@ -57,42 +66,63 @@ def bench_pymvpa():
     clf = MVPA_PCA(output_dim=n_components)
     data = dataset_wizard(samples=X)
     clf.train(data)
-    return datetime.now() - start
+    ev = explained_variance(X, clf.proj.T).sum()
+    return ev, datetime.now() - start
 
-def bench_milk():
+
+def bench_milk(X, y, T, valid):
 #
 #       .. milk ..
 #
-    from milk.unsupervised import pca as milk_pca
+    from milk.unsupervised import pca
     start = datetime.now()
-    _ = milk_pca(X)
-    return datetime.now() - start
+    Y, W = pca(X, zscore=False)
+    ev = explained_variance(X, W).sum()
+    return ev, datetime.now() - start
 
 
 if __name__ == '__main__':
+    import sys, misc
 
     # don't bother me with warnings
     import warnings; warnings.simplefilter('ignore')
     np.seterr(all='ignore')
 
     print __doc__ + '\n'
+    if not len(sys.argv) == 2:
+        print misc.USAGE % __file__
+        sys.exit(-1)
+    else:
+        dataset = sys.argv[1]
+
+    print 'Loading data ...'
+    data = misc.load_data(dataset)
 
-    res_mdp = bench(bench_mdp)
+    print 'Done, %s samples with %s features loaded into ' \
+      'memory' % data[0].shape
+
+    score, res_mdp = misc.bench(bench_mdp, data)
     print 'MDP: mean %s, std %s' % (
         np.mean(res_mdp), np.std(res_mdp))
+    print 'Explained variance: %s\n'% score
 
-    res_skl = bench(bench_skl)
-    print 'scikits.learn: mean %s, std %s' % (
+    score, res_skl = misc.bench(bench_skl, data)
+    print 'scikits.learn: mean %.2f, std %.2f' % (
         np.mean(res_skl), np.std(res_skl))
+    print 'Explained variance: %s\n'% score
 
-    res_pybrain = bench(bench_pybrain)
+    score, res_pybrain = misc.bench(bench_pybrain, data)
     print 'Pybrain: mean %s, std %s' % (
         np.mean(res_pybrain), np.std(res_pybrain))
+    print 'Explained variance: %s\n'% score
 
-    res_milk = bench(bench_milk)
+    score, res_milk = misc.bench(bench_milk, data)
     print 'milk: mean %s, std %s' % (
         np.mean(res_milk), np.std(res_milk))
+    print 'Explained variance: %s\n'% score
 
-    res_pymvpa = bench(bench_pymvpa)
+    score, res_pymvpa = misc.bench(bench_pymvpa, data)
     print 'PyMVPA: mean %s, std %s' % (
         np.mean(res_pymvpa), np.std(res_pymvpa))
+    print 'Explained variance: %s\n'% score
+
diff --git a/doc/index.rst b/doc/index.rst
@@ -29,7 +29,7 @@ We used the latest released version as of June 2011:
   - scikits.learn 0.8
   - MDP 3.1
   - MLPy 2.2.2
-  - PyMVPA 0.6.0.dev
+  - PyMVPA 0.6.0~rc3
   - Shogun 0.10.0
 
 I ran it on an Intel(R) Core(TM)2 CPU @ 1.86GHz.
@@ -61,26 +61,24 @@ Support Vector Machines
 We used several Support Vector Machine (RBF kernel) implementations. Numbers
 represent the time in seconds (lower is better) it took to train the dataset
 and perform prediction on a test dataset. In the plot, results are normalized
-so that the fastest method has value 1.0.
-
+to have the fastest method at 1.0.
 
 
 .. table:: Results in scikits.learn ml-benchmarks
 
-     ============         =======           ======     ====     =======         ========    =============         ========
-          Dataset          PyMVPA           Shogun      MDP     Pybrain             MLPy    scikits.learn             Milk
-     ============         =======           ======     ====     =======         ========    =============         ========
-          Madelon           12.89             6.03       --          --            10.88             6.23             4.90
-          Arcene             1.37             0.42       --          --             1.75             0.41         **0.34**
-     ============         =======           ======     ====     =======         ========    =============         ========
+     ============      =======       ======     ====     =======     ========    =============      ========
+          Dataset       PyMVPA       Shogun      MDP     Pybrain         MLPy    scikits.learn          Milk
+     ============      =======       ======     ====     =======     ========    =============      ========
+          Madelon        12.89         6.03       --          --        10.88             6.23          4.90
+          Arcene          1.37         0.42       --          --         1.75             0.41      **0.34**
+     ============      =======       ======     ====     =======     ========    =============      ========
 
 
 
 .. figure:: bench_svm.png
    :scale: 60%
    :align: center
 
-
 The score of these classifiers on a test dataset is:
 
 .. warning::
@@ -102,9 +100,12 @@ The score by these calssfifiers in in a test dataset is.
 K-means
 -------
 
-bla bla bla. NC = not converging.
+We run the k-means algorithm on both the Madelon and Arcene datasets. To make
+sure the methods are converging, we show in the second table the inertia
+reached by each method; the values are mostly equivalent.
 
-.. table:: Results in scikits.learn ml-benchmarks
+
+.. table:: Timing for k-Means algorithm
 
      ============         =======       ======     ====     =======         ========    =============         ========
           Dataset         PyMVPA        Shogun      MDP     Pybrain             MLPy    scikits.learn             milk
@@ -114,6 +115,8 @@ bla bla bla. NC = not converging.
      ============         =======       ======     ====     =======         ========    =============         ========
 
 
+NC = Not Converging after one hour of iterations.
+
 .. figure:: bench_kmeans.png
    :scale: 60%
    :align: center
@@ -123,18 +126,18 @@ The following table shows the inertia, criterion that the k-means algorithm mini
 
 .. table:: Inertia
 
-     ============         =======           ======     =============     =======     =============    =============     ==============
-          Inertia          PyMVPA           Shogun               MDP     Pybrain              MLPy    scikits.learn               Milk
-     ============         =======           ======     =============     =======     =============    =============     ==============
-          Madelon              --               --                --          --       739171883.6      745421891.3                 --
-           Arcene              --               --     1403820558.52          --     1429740165.89      745421891.3      1451970835.28
-     ============         =======           ======     =============     =======     =============    =============     ==============
+     ============         =======    ======     =============     =======     =============    =============     ==============
+          Dataset          PyMVPA    Shogun               MDP     Pybrain              MLPy    scikits.learn               Milk
+     ============         =======    ======     =============     =======     =============    =============     ==============
+          Madelon              --        --                --          --       739171883.6      745421891.3                 --
+           Arcene              --        --     1403820558.52          --     1429740165.89      745421891.3      1451970835.28
+     ============         =======    ======     =============     =======     =============    =============     ==============
 
 
 Elastic Net
 -----------
 
-Bla bla bla bla
+We solve the elastic net using a coordinate descent algorithm on both the Madelon and Arcene datasets.
 
 
 .. table:: Results in scikits.learn ml-benchmarks
@@ -155,12 +158,63 @@ Bla bla bla bla
 Lasso (LARS algorithm)
 ----------------------
 
+We solve the Lasso model using the Least Angle Regression (LARS) algorithm. MLPy
+and scikits.learn use a pure Python implementation, while PyMVPA uses bindings
+to R code.
+
+We also show the mean squared error as a sanity check for the model. Note
+that some NaN values arise, probably due to collinearity in the data.
+
+
+.. table:: Timing
+
+     ============         =======  =============    =============
+          Dataset          PyMVPA           MLPy    scikits.learn
+     ============         =======  =============    =============
+          Madelon           33.45           72.2             0.88
+           Arcene              NW           3.75      745421891.3
+     ============         =======  =============    =============
+
+
+.. table:: Mean Squared Error on a test dataset.
+
+     ============  =======  =============    =============
+          Dataset   PyMVPA           MLPy    scikits.learn
+     ============  =======  =============    =============
+          Madelon      NaN         682.32           680.91
+           Arcene       NW            NaN            66.61
+     ============  =======  =============    =============
+
+
+Principal Component Analysis
+----------------------------
+
+We run principal component analysis on both datasets. In the libraries that
+support it (scikit-learn, MDP, PyMVPA), we set the number of components in the
+projection to 9.
+
+.. table:: Timing PCA
+
+     ============     =======   ====    =======    ========    =============    ========
+          Dataset      PyMVPA    MDP    Pybrain        MLPy    scikits.learn        milk
+     ============     =======   ====    =======    ========    =============    ========
+          Madelon        0.48   0.50       6.51        0.79             1.36        2.66
+           Arcene          --     --         --        0.81             1.77     **1.0**
+     ============     =======   ====    =======    ========    =============    ========
+
+
+.. table:: explained variance
+
+     ============     =======   ========     ========    ========    =============         =========
+          Dataset      PyMVPA        MDP      Pybrain        MLPy    scikits.learn              milk
+     ============     =======   ========     ========    ========    =============         =========
+          Madelon          --   136705.5     228941.0        0.79         135788.2         455715.83
+           Arcene          --         --           --        0.81             1.77          **1.0**
+     ============     =======   ========     ========    ========    =============         =========
+
 
-     ============         =======  =============     =======     =============    =============     ==============
-              MSE          PyMVPA            MDP     Pybrain              MLPy    scikits.learn               Milk
-     ============         =======  =============     =======     =============    =============     ==============
-          Madelon              --             --          --       739171883.6      745421891.3                 --
-           Arcene              --  1403820558.52          --     1429740165.89      745421891.3      1451970835.28
-     ============         =======  =============     =======     =============    =============     ==============
+Misc
+----
 
-TODO
+Author : Fabian Pedregosa
+License : Simplified BSD