Skip to content

Commit

Permalink
Refactor bench PCA.
Browse files Browse the repository at this point in the history
  • Loading branch information
Fabian Pedregosa committed Jun 28, 2011
1 parent e70ec0d commit 97c8d64
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 63 deletions.
3 changes: 0 additions & 3 deletions benchmarks/bench_lassolars.py
Expand Up @@ -45,9 +45,6 @@ def bench_pymvpa(X, y, T, valid):
# mvpa_pred = mvpa_clf.predict(X)
return None, datetime.now() - tstart




if __name__ == '__main__':
import sys, misc

Expand Down
98 changes: 64 additions & 34 deletions benchmarks/bench_pca.py
Expand Up @@ -3,51 +3,60 @@
import numpy as np
from datetime import datetime

#
# .. Load dataset ..
#
from misc import load_data, bench
print 'Loading data ...'
X, y, T = load_data()
print 'Done, %s samples with %s features loaded into ' \
'memory' % X.shape
n_components = 9



def bench_skl():
def explained_variance(X, W):
    """
    Compute the variance explained by each principal direction in W.

    For a direction w (one row of W) and the centred data Xc, the explained
    variance is the Rayleigh quotient of the covariance matrix
    C = dot(Xc.T, Xc):

        dot(w, dot(C, w)) / dot(w, w) / n_samples

    Since dot(w, dot(C, w)) equals the squared norm of dot(Xc, w), the
    quotient is evaluated from the projected data, which avoids building
    the (n_features, n_features) matrix C.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Data matrix. It is left untouched; centring is done on a copy.
    W : array, shape (n_directions, n_features)
        Principal directions, one per row. Rows need not be normalised.

    Returns
    -------
    s : array, shape (n_directions,)
        Explained variance along each row of W.
    """
    # Centre on a fresh array. An in-place `X -= mean` would modify the
    # caller's data (which is shared between the benchmarked libraries) and
    # fails outright on integer-typed input.
    X_centered = X - np.mean(X, axis=0)

    # Column i holds the projection of every sample onto direction W[i].
    projected = np.dot(X_centered, W.T)

    # Squared norm of each projection, normalised by the squared norm of the
    # corresponding direction so that non-unit directions are handled.
    s = np.sum(projected ** 2, axis=0) / np.sum(W ** 2, axis=1)
    return s / X.shape[0]


def bench_skl(X, y, T, valid):
#
# .. scikits.learn ..
#
from scikits.learn import pca as skl_pca
from scikits.learn import decomposition
start = datetime.now()
clf = skl_pca.RandomizedPCA(n_components=n_components)
clf = decomposition.RandomizedPCA(n_components=n_components)
clf.fit(X)
return datetime.now() - start
ev = explained_variance(X, clf.components_).sum()
return ev, datetime.now() - start


def bench_pybrain():
def bench_pybrain(X, y, T, valid):
#
# .. pybrain ..
#
from pybrain.auxiliary import pca as pybrain_pca
from pybrain.auxiliary import pca
start = datetime.now()
pybrain_pca.pca(X, n_components)
return datetime.now() - start
W = pca.pPca(X, n_components)
ev = explained_variance(X, W).sum()
return ev, datetime.now() - start


def bench_mdp():
def bench_mdp(X, y, T, valid):
#
# .. MDP ..
#
from mdp.nodes import PCANode
start = datetime.now()
mdp_clf = PCANode(output_dim=n_components)
mdp_clf.train(X)
return datetime.now() - start
clf = PCANode(output_dim=n_components)
clf.train(X)
clf.stop_training()
ev = explained_variance(X, clf.v.T).sum()
return ev, datetime.now() - start


def bench_pymvpa():
def bench_pymvpa(X, y, T, valid):
#
# .. PyMVPA ..
#
Expand All @@ -57,42 +66,63 @@ def bench_pymvpa():
clf = MVPA_PCA(output_dim=n_components)
data = dataset_wizard(samples=X)
clf.train(data)
return datetime.now() - start
ev = explained_variance(X, clf.proj.T).sum()
return ev, datetime.now() - start

def bench_milk():

def bench_milk(X, y, T, valid):
#
# .. milk ..
#
from milk.unsupervised import pca as milk_pca
from milk.unsupervised import pca
start = datetime.now()
_ = milk_pca(X)
return datetime.now() - start
Y, W = pca(X, zscore=False)
ev = explained_variance(X, W).sum()
return ev, datetime.now() - start


if __name__ == '__main__':
import sys, misc

# don't bother me with warnings
import warnings; warnings.simplefilter('ignore')
np.seterr(all='ignore')

print __doc__ + '\n'
if not len(sys.argv) == 2:
print misc.USAGE % __file__
sys.exit(-1)
else:
dataset = sys.argv[1]

print 'Loading data ...'
data = misc.load_data(dataset)

res_mdp = bench(bench_mdp)
print 'Done, %s samples with %s features loaded into ' \
'memory' % data[0].shape

score, res_mdp = misc.bench(bench_mdp, data)
print 'MDP: mean %s, std %s' % (
np.mean(res_mdp), np.std(res_mdp))
print 'Explained variance: %s\n'% score

res_skl = bench(bench_skl)
print 'scikits.learn: mean %s, std %s' % (
score, res_skl = misc.bench(bench_skl, data)
print 'scikits.learn: mean %.2f, std %.2f' % (
np.mean(res_skl), np.std(res_skl))
print 'Explained variance: %s\n'% score

res_pybrain = bench(bench_pybrain)
score, res_pybrain = misc.bench(bench_pybrain, data)
print 'Pybrain: mean %s, std %s' % (
np.mean(res_pybrain), np.std(res_pybrain))
print 'Explained variance: %s\n'% score

res_milk = bench(bench_milk)
score, res_milk = misc.bench(bench_milk, data)
print 'milk: mean %s, std %s' % (
np.mean(res_milk), np.std(res_milk))
print 'Explained variance: %s\n'% score

res_pymvpa = bench(bench_pymvpa)
score, res_pymvpa = misc.bench(bench_pymvpa, data)
print 'PyMVPA: mean %s, std %s' % (
np.mean(res_pymvpa), np.std(res_pymvpa))
print 'Explained variance: %s\n'% score

106 changes: 80 additions & 26 deletions doc/index.rst
Expand Up @@ -29,7 +29,7 @@ We used the latest released version as of June 2011:
- scikits.learn 0.8
- MDP 3.1
- MLPy 2.2.2
- PyMVPA 0.6.0.dev
- PyMVPA 0.6.0~rc3
- Shogun 0.10.0

I ran it on an Intel(R) Core(TM)2 CPU @ 1.86GHz.
Expand Down Expand Up @@ -61,26 +61,24 @@ Support Vector Machines
We used several Support Vector Machine (RBF kernel) implementations. Numbers
represent the time in seconds (lower is better) it took to train the dataset
and perform prediction on a test dataset. In the plot, results are normalized
so that the fastest method has value 1.0.

to have the fastest method at 1.0.


.. table:: Results in scikits.learn ml-benchmarks

============ ======= ====== ==== ======= ======== ============= ========
Dataset PyMVPA Shogun MDP Pybrain MLPy scikits.learn Milk
============ ======= ====== ==== ======= ======== ============= ========
Madelon 12.89 6.03 -- -- 10.88 6.23 4.90
Arcene 1.37 0.42 -- -- 1.75 0.41 **0.34**
============ ======= ====== ==== ======= ======== ============= ========
============ ======= ====== ==== ======= ======== ============= ========
Dataset PyMVPA Shogun MDP Pybrain MLPy scikits.learn Milk
============ ======= ====== ==== ======= ======== ============= ========
Madelon 12.89 6.03 -- -- 10.88 6.23 4.90
Arcene 1.37 0.42 -- -- 1.75 0.41 **0.34**
============ ======= ====== ==== ======= ======== ============= ========



.. figure:: bench_svm.png
:scale: 60%
:align: center


The score obtained by these classifiers on a test dataset is:

.. warning::
Expand All @@ -102,9 +100,12 @@ The score by these calssfifiers in in a test dataset is.
K-means
-------

bla bla bla. NC = not converging.
We run the k-means algorithm on both the Madelon and Arcene datasets. To make
sure the methods are converging, the second table shows the inertia reached by
each method; these are mostly equivalent.

.. table:: Results in scikits.learn ml-benchmarks

.. table:: Timing for k-Means algorithm

============ ======= ====== ==== ======= ======== ============= ========
Dataset PyMVPA Shogun MDP Pybrain MLPy scikits.learn milk
Expand All @@ -114,6 +115,8 @@ bla bla bla. NC = not converging.
============ ======= ====== ==== ======= ======== ============= ========


NC = not converging after one hour of iterations.

.. figure:: bench_kmeans.png
:scale: 60%
:align: center
Expand All @@ -123,18 +126,18 @@ The following table shows the inertia, criterion that the k-means algorithm mini

.. table:: Inertia

============ ======= ====== ============= ======= ============= ============= ==============
Inertia PyMVPA Shogun MDP Pybrain MLPy scikits.learn Milk
============ ======= ====== ============= ======= ============= ============= ==============
Madelon -- -- -- -- 739171883.6 745421891.3 --
Arcene -- -- 1403820558.52 -- 1429740165.89 745421891.3 1451970835.28
============ ======= ====== ============= ======= ============= ============= ==============
============ ======= ====== ============= ======= ============= ============= ==============
Dataset PyMVPA Shogun MDP Pybrain MLPy scikits.learn Milk
============ ======= ====== ============= ======= ============= ============= ==============
Madelon -- -- -- -- 739171883.6 745421891.3 --
Arcene -- -- 1403820558.52 -- 1429740165.89 745421891.3 1451970835.28
============ ======= ====== ============= ======= ============= ============= ==============


Elastic Net
-----------

Bla bla bla bla
We solve the elastic net using a coordinate descent algorithm on both the Madelon and Arcene datasets.


.. table:: Results in scikits.learn ml-benchmarks
Expand All @@ -155,12 +158,63 @@ Bla bla bla bla
Lasso (LARS algorithm)
----------------------

We solve the Lasso model by Least Angle Regression (LARS) algorithm. MLPy and
scikits.learn use a pure Python implementation, while PyMVPA uses bindings to
R code.

We also show the Mean Squared Error as a sanity check for the model. Note
that some NaN values arise, probably due to collinearity in the data.


.. table:: Timing

============ ======= ============= =============
Dataset PyMVPA MLPy scikits.learn
============ ======= ============= =============
Madelon 33.45 72.2 0.88
Arcene NW 3.75 745421891.3
============ ======= ============= =============


.. table:: Mean Squared Error on a test dataset.

============ ======= ============= =============
Dataset PyMVPA MLPy scikits.learn
============ ======= ============= =============
Madelon NaN 682.32 680.91
Arcene NW NaN 66.61
============ ======= ============= =============


Principal Component Analysis
----------------------------

We run principal component analysis on both datasets. In the libraries that
support it (scikits.learn, MDP, PyMVPA), we set the number of components in
the projection to 9.

.. table:: Timing PCA

============ ======= ==== ======= ======== ============= ========
Dataset PyMVPA MDP Pybrain MLPy scikits.learn milk
============ ======= ==== ======= ======== ============= ========
Madelon 0.48 0.50 6.51 0.79 1.36 2.66
Arcene -- -- -- 0.81 1.77 **1.0**
============ ======= ==== ======= ======== ============= ========


.. table:: explained variance

============ ======= ======== ======== ======== ============= =========
Dataset PyMVPA MDP Pybrain MLPy scikits.learn milk
============ ======= ======== ======== ======== ============= =========
Madelon -- 136705.5 228941.0 0.79 135788.2 455715.83
Arcene -- -- -- 0.81 1.77 **1.0**
============ ======= ======== ======== ======== ============= =========


============ ======= ============= ======= ============= ============= ==============
MSE PyMVPA MDP Pybrain MLPy scikits.learn Milk
============ ======= ============= ======= ============= ============= ==============
Madelon -- -- -- 739171883.6 745421891.3 --
Arcene -- 1403820558.52 -- 1429740165.89 745421891.3 1451970835.28
============ ======= ============= ======= ============= ============= ==============
Misc
----

TODO
Author : Fabian Pedregosa
License : Simplified BSD

0 comments on commit 97c8d64

Please sign in to comment.