# memkite/DeepLearningBibliography

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
6964 lines (6326 sloc) 505 KB
 %Aigaion2 BibTeX export from LISA - Publications %Tuesday 31 March 2015 11:46:28 AM @TECHREPORT{Alain+al-arxiv-2012, author = {Alain, Guillaume and Bengio, Yoshua}, title = {What Regularized Auto-Encoders Learn from the Data Generating Distribution}, number = {Arxiv report 1211.4246}, year = {2012}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, abstract = {What do auto-encoders learn about the underlying data generating distribution? Recent work suggests that some auto-encoder variants do a good job of capturing the local manifold structure of data. This paper clarifies some of these previous intuitive observations by showing that minimizing a particular form of regularized reconstruction error yields a reconstruction function that locally characterizes the shape of the data generating density. We show that the auto-encoder captures the score (derivative of the log-density with respect to the input), along with the second derivative of the density and the local mean associated with the unknown data-generating density. This is the second result linking denoising auto-encoders and score matching, but in way that is different from previous work, and can be applied to the case when the auto-encoder reconstruction function does not necessarily correspond to the derivative of an energy function. The theorems provided here are completely generic and do not depend on the parametrization of the auto-encoder: they show what the auto-encoder would tend to if given enough capacity and examples. These results are for a contractive training criterion we show to be similar to the denoising auto-encoder training criterion with small corruption noise, but with contraction applied on the whole reconstruction function rather than just encoder. 
Similarly to score matching, one can consider the proposed training criterion as a convenient alternative to maximum likelihood, i.e., one not involving a partition function.} } @TECHREPORT{Alain+al-arxiv-2015, author = {Alain, Guillaume and Bengio, Yoshua and Yao, Li and Yosinski, Jason and Thibodeau-Laufer, Eric and Zhang, Saizheng and Vincent, Pascal}, title = {GSNs : Generative Stochastic Networks}, number = {Arxiv report 1503.05571}, year = {2015}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, url = {http://arxiv.org/abs/1503.05571}, abstract = {We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood. The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a {Markov} chain whose stationary distribution estimates the data distribution. Because the transition distribution is a conditional distribution generally involving a small move, it has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn, more like learning to perform supervised function approximation, with gradients that can be obtained by back-propagation. The theorems provided here generalize recent work on the probabilistic interpretation of denoising auto-encoders and provide an interesting justification for dependency networks and generalized pseudolikelihood (along with defining an appropriate joint distribution and sampling mechanism, even when the conditionals are not consistent). We study how GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. 
Successful experiments are conducted, validating these theoretical results, on two image datasets and with a particular architecture that mimics the Deep {Boltzmann} Machine Gibbs sampler but allows training to proceed with backprop, without the need for layerwise pretraining.} } @INPROCEEDINGS{Alain+Bengio-ICLR2013, author = {Alain, Guillaume and Bengio, Yoshua}, title = {What Regularized Auto-Encoders Learn from the Data Generating Distribution}, booktitle = {International Conference on Learning Representations (ICLR'2013)}, year = {2013}, abstract = {What do auto-encoders learn about the underlying data generating distribution? Recent work suggests that some auto-encoder variants do a good job of capturing the local manifold structure of data. This paper clarifies some of these previous intuitive observations by showing that minimizing a particular form of regularized reconstruction error yields a reconstruction function that locally characterizes the shape of the data generating density. We show that the auto-encoder captures the score (derivative of the log-density with respect to the input), along with the second derivative of the density and the local mean associated with the unknown data-generating density. This is the second result linking denoising auto-encoders and score matching, but in way that is different from previous work, and can be applied to the case when the auto-encoder reconstruction function does not necessarily correspond to the derivative of an energy function. The theorems provided here are completely generic and do not depend on the parametrization of the auto-encoder: they show what the auto-encoder would tend to if given enough capacity and examples. These results are for a contractive training criterion we show to be similar to the denoising auto-encoder training criterion with small corruption noise, but with contraction applied on the whole reconstruction function rather than just encoder. 
Similarly to score matching, one can consider the proposed training criterion as a convenient alternative to maximum likelihood, i.e., one not involving a partition function.} }

@ARTICLE{alain-jmlr14,
    author = {Alain, Guillaume and Bengio, Yoshua},
    title = {What Regularized Auto-Encoders Learn from the Data-Generating Distribution},
    volume = {15},
    year = {2014},
    pages = {3563--3593},
    crossref = {JMLR},
    abstract = {What do auto-encoders learn about the underlying data-generating distribution? Recent work suggests that some auto-encoder variants do a good job of capturing the local manifold structure of data. This paper clarifies some of these previous observations by showing that minimizing a particular form of regularized reconstruction error yields a reconstruction function that locally characterizes the shape of the data-generating density. We show that the auto-encoder captures the score (derivative of the log-density with respect to the input). It contradicts previous interpretations of reconstruction error as an energy function. Unlike previous results, the theorems provided here are completely generic and do not depend on the parameterization of the auto-encoder: they show what the auto-encoder would tend to if given enough capacity and examples. These results are for a contractive training criterion we show to be similar to the denoising auto-encoder training criterion with small corruption noise, but with contraction applied on the whole reconstruction function rather than just encoder. Similarly to score matching, one can consider the proposed training criterion as a convenient alternative to maximum likelihood because it does not involve a partition function. 
Finally, we show how an approximate Metropolis-Hastings MCMC can be setup to recover samples from the estimated distribution, and this is confirmed in sampling experiments.} } @TECHREPORT{ARXIV-2010, author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Bengio, Yoshua and Bergeron, Arnaud and Boulanger-Lewandowski, Nicolas and Breuel, Thomas and Chherawala, Youssouf and Cisse, Moustapha and C{\^{o}}t{\'{e}}, Myriam and Erhan, Dumitru and Eustache, Jeremy and Glorot, Xavier and Muller, Xavier and Pannetier Lebeuf, Sylvain and Pascanu, Razvan and Rifai, Salah and Savard, Fran{\c c}ois and Sicard, Guillaume}, keywords = {Computer Vision and Pattern Recognition, Learning, Neural and Evolutionary Computing}, title = {Deep Self-Taught Learning for Handwritten Character Recognition}, number = {Arxiv report 1009.3589}, year = {2010}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, abstract = {Recent theoretical and empirical work in statistical machine learning has demonstrated the importance of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple non-linear transformations. Self-taught learning (exploiting unlabeled examples or examples from other distributions) has already been applied to deep learners, but mostly to show the advantage of unlabeled examples. Here we explore the advantage brought by {\em out-of-distribution examples}. For this purpose we developed a powerful generator of stochastic variations and noise processes for character images, including not only affine transformations but also slant, local elastic deformations, changes in thickness, background images, grey level changes, contrast, occlusion, and various types of noise. The out-of-distribution examples are obtained from these highly distorted images or by including examples of object classes different from those in the target test set. 
We show that {\em deep learners benefit more from out-of-distribution examples than a corresponding shallow learner}, at least in the area of handwritten character recognition. In fact, we show that they beat previously published results and reach human-level performance on both handwritten digit classification and 62-class handwritten character recognition.} } @TECHREPORT{ARXIV-2011, author = {Bordes, Antoine and Glorot, Xavier and Weston, Jason and Bengio, Yoshua}, title = {Towards Open-Text Semantic Parsing via Multi-Task Learning of Structured Embeddings}, number = {Arxiv report 1107.3663}, year = {2011}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, url = {http://arxiv.org/abs/1107.3663}, abstract = {Open-text (or open-domain) semantic parsers are designed to interpret any statement in natural language by inferring a corresponding meaning representation (MR). Unfortunately, large scale systems cannot be easily machine-learned due to lack of directly supervised data. We propose here a method that learns to assign MRs to a wide range of text (using a dictionary of more than 70,000 words, which are mapped to more than 40,000 entities) thanks to a training scheme that combines learning from WordNet and ConceptNet with learning from raw text. The model learns structured embeddings of words, entities and MRs via a multi-task training process operating on these diverse sources of data that integrates all the learnt knowledge into a single system. This work ends up combining methods for knowledge acquisition, semantic parsing, and word-sense disambiguation. 
Experiments on various tasks indicate that our approach is indeed successful and can form a basis for future more sophisticated systems.} }

@INPROCEEDINGS{Attardi+al-2009,
    author = {Attardi, Giuseppe and Dell'Orletta, Felice and Simi, Maria and Turian, Joseph},
    keywords = {classifier, dependency parsing, natural language, parser, perceptron},
    title = {Accurate Dependency Parsing with a Stacked Multilayer Perceptron},
    booktitle = {Proceedings of Evalita 2009},
    series = {LNCS},
    year = {2009},
    publisher = {Springer},
    abstract = {DeSR is a statistical transition-based dependency parser which learns from annotated corpora which actions to perform for building parse trees while scanning a sentence. We describe recent improvements to the parser, in particular stacked parsing, exploiting a beam search strategy and using a Multilayer Perceptron classifier. For the Evalita 2009 Dependency Parsing task DeSR was configured to use a combination of stacked parsers. The stacked combination achieved the best accuracy scores in both the main and pilot subtasks. The contribution to the result of various choices is analyzed, in particular for taking advantage of the peculiar features of the TUT Treebank.}
}

@INPROCEEDINGS{bastien+all-NIPS2011,
    author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Bergeron, Arnaud and Kl{\"{o}}ckner, Andreas and Vincent, Pascal and Bengio, Yoshua},
    title = {A Common {GPU} n-Dimensional Array for {Python} and {C}},
    booktitle = {Big Learn workshop, NIPS'11},
    year = {2011},
    abstract = {Currently there are multiple incompatible array/matrix/n-dimensional base object implementations for GPUs. This hinders the sharing of GPU code and causes duplicate development work. This paper proposes and presents a first version of a common GPU n-dimensional array (tensor) named GpuNdArray [1] that works with both CUDA and OpenCL. 
It will be usable from Python, C, and possibly other programming languages.} } @MISC{Bastien-Theano-2012, author = {Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Bergstra, James and Goodfellow, Ian J. and Bergeron, Arnaud and Bouchard, Nicolas and Bengio, Yoshua}, title = {Theano: new features and speed improvements}, year = {2012}, howpublished = {Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop}, abstract = {Theano is a linear algebra compiler that optimizes a user's symbolically-specified mathematical computations to produce efficient low-level implementations. In this paper, we present new features and efficiency improvements to Theano, and benchmarks demonstrating Theano's performance relative to Torch7, a recently introduced machine learning library, and to RNNLM, a C++ library targeted at recurrent neural networks.} } @INPROCEEDINGS{Bengio+al-2009, author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason}, title = {Curriculum Learning}, year = {2009}, crossref = {ICML09}, abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved by using a particular curriculum, i.e., the selection and order of training examples. 
We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).} } @TECHREPORT{Bengio+al-2009-TR, author = {Bengio, Yoshua and Louradour, Jerome and Collobert, Ronan and Weston, Jason}, title = {Curriculum Learning}, number = {1330}, year = {2009}, institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them 'curriculum learning'. In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. 
We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions).} } @INPROCEEDINGS{Bengio+al-AI-2011, author = {Bengio, Yoshua and Bastien, Fr{\'{e}}d{\'{e}}ric and Bergeron, Arnaud and Boulanger-Lewandowski, Nicolas and Breuel, Thomas and Chherawala, Youssouf and Cisse, Moustapha and C{\^{o}}t{\'{e}}, Myriam and Erhan, Dumitru and Eustache, Jeremy and Glorot, Xavier and Muller, Xavier and Pannetier Lebeuf, Sylvain and Pascanu, Razvan and Rifai, Salah and Savard, Fran{\c c}ois and Sicard, Guillaume}, month = apr, title = {Deep Learners Benefit More from Out-of-Distribution Examples}, booktitle = {JMLR W\&CP: Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2011)}, year = {2011}, location = {Fort Lauderdale, FL, USA}, abstract = {Recent theoretical and empirical work in statistical machine learning has demonstrated the potential of learning algorithms for deep architectures, i.e., function classes obtained by composing multiple levels of representation. The hypothesis evaluated here is that intermediate levels of representation, because they can be shared across tasks and examples from different but related distributions, can yield even more benefits. Comparative experiments were performed on a large-scale handwritten character recognition setting with 62 classes (upper case, lower case, digits), using both a multi-task setting and perturbed examples in order to obtain out-of-distribution examples. 
The results agree with the hypothesis, and show that a deep learner did beat previously published results and reached human-level performance.} }

@ARTICLE{Bengio+al-CI-2012,
    author = {Bengio, Yoshua and Chapados, Nicolas and Delalleau, Olivier and Larochelle, Hugo and Saint-Mleux, Xavier},
    title = {Detonation Classification from Acoustic Signature with the Restricted {Boltzmann} Machine},
    journal = {Computational Intelligence},
    volume = {28},
    number = {2},
    year = {2012},
    pages = {261--288}
}

@MISC{Bengio+al-patent-2000,
    author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and {LeCun}, Yann},
    title = {Module for constructing trainable modular network in which each module outputs and inputs data structured as a graph},
    year = {2000},
    howpublished = {U.S. Patent 6,128,606, October 3}
}

@MISC{Bengio+al-patent-2001,
    author = {Bengio, Yoshua and Bottou, {L{\'{e}}on} and Howard, Paul G.},
    title = {{Z-Coder}: a fast adaptive binary arithmetic coder},
    year = {2001},
    howpublished = {U.S. Patent 6,188,334, February 13, 2001, along with patents 6,225,925, 6,281,817, and 6,476,740}
}

@MISC{Bengio+al-patent-94, author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris}, title = {Visitor Registration System Using Automatic Handwriting Recognition}, year = {1994}, howpublished = {Patent submitted in the U.S.A. 
in October 1994, submission number 1-16-18-1} }

@INCOLLECTION{Bengio+al-spectral-2006,
    author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
    editor = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi},
    title = {Spectral Dimensionality Reduction},
    booktitle = {Feature Extraction, Foundations and Applications},
    year = {2006},
    publisher = {Springer},
    url = {http://www.iro.umontreal.ca/~lisa/pointeurs/eigenfn_chapter.pdf},
    abstract = {In this chapter, we study and put under a common framework a number of non-linear dimensionality reduction methods, such as Locally Linear Embedding, {Isomap}, Laplacian eigenmaps and kernel {PCA}, which are based on performing an eigen-decomposition (hence the name ``spectral''). That framework also includes classical methods such as {PCA} and metric multidimensional scaling ({MDS}). It also includes the data transformation step used in spectral clustering. We show that in all of these cases the learning algorithm estimates the principal eigenfunctions of an operator that depends on the unknown data density and on a kernel that is not necessarily positive semi-definite. This helps to generalize some of these algorithms so as to predict an embedding for out-of-sample examples without having to retrain the model. 
It also makes it more transparent what these algorithms are minimizing on the empirical data and gives a corresponding notion of generalization error.}, cat={B},topics={HighDimensional,Kernel,Unsupervised}, } @INCOLLECTION{Bengio+al-ssl-2006, author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas}, editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander}, title = {Label Propagation and Quadratic Criterion}, booktitle = {Semi-Supervised Learning}, year = {2006}, pages = {193--216}, publisher = {{MIT} Press}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_ssl.pdf}, abstract = {Various graph-based algorithms for semi-supervised learning have been proposed in the recent literature. They rely on the idea of building a graph whose nodes are data points (labeled and unlabeled) and edges represent similarities between points. Known labels are used to propagate information through the graph in order to label all nodes. In this chapter, we show how these different algorithms can be cast into a common framework where one minimizes a quadratic cost criterion whose closed-form solution is found by solving a linear system of size $n$ (total number of data points). The cost criterion naturally leads to an extension of such algorithms to the inductive setting, where one obtains test samples one at a time: the derived induction formula can be evaluated in $O(n)$ time, which is much more efficient than solving again exactly the linear system (which in general costs $O(kn^2)$ time for a sparse graph where each data point has $k$ neighbors). 
We also use this inductive formula to show that when the similarity between points satisfies a locality property, then the algorithms are plagued by the curse of dimensionality, with respect to the dimensionality of an underlying manifold.}, cat={B},topics={Unsupervised}, } @TECHREPORT{Bengio+al-treecurse-2007, author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence}, month = jun, title = {Decision Trees do not Generalize to New Variations}, number = {1304}, year = {2007}, institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+al-tr1304.pdf} } @INPROCEEDINGS{Bengio+Bengio96, author = {Bengio, Samy and Bengio, Yoshua}, editor = {Xu, L.}, title = {An {EM} Algorithm for Asynchronous Input/Output Hidden {M}arkov Models}, booktitle = {International Conference On Neural Information Processing}, year = {1996}, pages = {328--334}, address = {Hong-Kong}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iconip96.pdf}, abstract = {In learning tasks in which input sequences are mapped to output sequences, it is often the case that the input and output sequences are not synchronous. For example, in speech recognition, acoustic sequences are longer than phoneme sequences. Input/Output Hidden {Markov} Models have already been proposed to represent the distribution of an output sequence given an input sequence of the same length. 
We extend here this model to the case of asynchronous sequences and show an Expectation-Maximization algorithm for training such models.}, topics={Markov},cat={C}, } @MISC{bengio+bergstra:2007, author = {Bengio, Yoshua and Bergstra, James}, month = jun, title = {On the Challenge of Learning Long-term Dependencies}, year = {2007}, howpublished = {CIFAR Workshop on Modelling Sequential Data} } @INCOLLECTION{Bengio+chapter2007, author = {Bengio, Yoshua and {LeCun}, Yann}, editor = {Bottou, {L{\'{e}}on} and Chapelle, Olivier and DeCoste, D. and Weston, J.}, title = {Scaling Learning Algorithms towards {AI}}, booktitle = {Large Scale Kernel Machines}, year = {2007}, publisher = {MIT Press}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio+lecun_chapter2007.pdf}, abstract = {One long-term goal of machine learning research is to produce methods that are applicable to highly complex tasks, such as perception (vision, audition), reasoning, intelligent control, and other artificially intelligent behaviors. We argue that in order to progress toward this goal, the Machine Learning community must endeavor to discover algorithms that can learn highly complex functions, with minimal need for prior knowledge, and with minimal human intervention. We present mathematical and empirical evidence suggesting that many popular approaches to non-parametric learning, particularly kernel methods, are fundamentally limited in their ability to learn complex high-dimensional functions. Our analysis focuses on two problems. First, kernel machines are shallow architectures, in which one large layer of simple template matchers is followed by a single layer of trainable coefficients. We argue that shallow architectures can be very inefficient in terms of required number of computational elements and examples. 
Second, we analyze a limitation of kernel machines with a local kernel, linked to the curse of dimensionality, that applies to supervised, unsupervised (manifold learning) and semi-supervised kernel machines. Using empirical results on invariant image recognition tasks, kernel methods are compared with deep architectures, in which lower-level features or concepts are progressively combined into more abstract and higher-level representations. We argue that deep architectures have the potential to generalize in non-local ways, i.e., beyond immediate neighbors, and that this is crucial in order to make progress on the kind of complex tasks required for artificial intelligence.}, cat={B},topics={HighDimensional}, } @TECHREPORT{Bengio+Courville+Vincent-arxiv2012, author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal}, title = {Representation Learning: A Review and New Perspectives}, number = {Arxiv report 1206.5538}, year = {2012}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, url = {http://arxiv.org/abs/1206.5538} } @INCOLLECTION{Bengio+Courville-2013, author = {Bengio, Yoshua and Courville, Aaron}, title = {Deep Learning of Representations}, booktitle = {Handbook on Neural Information Processing}, volume = {49}, year = {2013}, publisher = {Springer: Berlin Heidelberg}, isbn = {978-3-642-36656-7}, doi = {10.1007/978-3-642-36657-4} } @ARTICLE{Bengio+Delalleau-2009, author = {Bengio, Yoshua and Delalleau, Olivier}, month = jun, title = {Justifying and Generalizing Contrastive Divergence}, journal = {Neural Computation}, volume = {21}, number = {6}, year = {2009}, pages = {1601--1621}, abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). 
We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its residual term converges to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence (CD) estimator of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain. We present theoretical and empirical evidence linking the number of Gibbs steps $k$ and the magnitude of the RBM parameters to the bias in the CD estimator. These experiments also suggest that the sign of the CD estimator is correct most of the time, even when the bias is large, so that CD-$k$ is a good descent direction even for small $k$.} } @INPROCEEDINGS{Bengio+Delalleau-ALT-2011, author = {Bengio, Yoshua and Delalleau, Olivier}, title = {On the Expressive Power of Deep Architectures}, booktitle = {Proceedings of the 22nd International Conference on Algorithmic Learning Theory}, year = {2011}, note = {Eds. 
Jyrki Kivinen, Csaba Szepesv{\'{a}}ri, Esko Ukkonen, and Thomas Zeugmann} } @TECHREPORT{Bengio+Delalleau-TR2007, author = {Bengio, Yoshua and Delalleau, Olivier}, keywords = {Contrastive Divergence, Restricted {Boltzmann} Machine}, title = {Justifying and Generalizing Contrastive Divergence}, number = {1311}, year = {2007}, institution = {D{\'{e}}partement d'Informatique et Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, abstract = {We study an expansion of the log-likelihood in undirected graphical models such as the Restricted {Boltzmann} Machine (RBM), where each term in the expansion is associated with a sample in a Gibbs chain alternating between two random variables (the visible vector and the hidden vector, in RBMs). We are particularly interested in estimators of the gradient of the log-likelihood obtained through this expansion. We show that its terms converge to zero, justifying the use of a truncation, i.e. running only a short Gibbs chain, which is the main idea behind the Contrastive Divergence approximation of the log-likelihood gradient. By truncating even more, we obtain a stochastic reconstruction error, related through a mean-field approximation to the reconstruction error often used to train autoassociators and stacked auto-associators. 
The derivation is not specific to the particular parametric forms used in RBMs, and only requires convergence of the Gibbs chain.} } @INPROCEEDINGS{Bengio+DeMori88, author = {Bengio, Yoshua and De Mori, Renato}, title = {Use of neural networks for the recognition of place of articulation}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {1988}, pages = {103--106}, address = {New-York, NY}, topics={Speech},cat={C}, } @INPROCEEDINGS{Bengio+DeMori89, author = {Bengio, Yoshua and Cardin, Regis and Cosi, Piero and De Mori, Renato}, title = {Speech coding with multi-layer networks}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {1989}, pages = {164--167}, address = {Glasgow, Scotland}, topics={Speech},cat={C}, } @INCOLLECTION{Bengio+DeMori90a, author = {Bengio, Yoshua and De Mori, Renato}, editor = {Sethi, I. K. and Jain, A. K.}, title = {Connectionist models and their application to automatic speech recognition}, booktitle = {Artificial Neural Networks and Statistical Pattern Recognition: Old and New Connections}, year = {1990}, pages = {175--192}, publisher = {Elsevier, Machine Intelligence and Pattern Recognition Series}, topics={Speech},cat={B}, } @ARTICLE{Bengio+Frasconi-jair95, author = {Bengio, Yoshua and Frasconi, Paolo}, title = {Diffusion of Context and Credit Information in {M}arkovian Models}, journal = {Journal of Artificial Intelligence Research}, volume = {3}, year = {1995}, pages = {249--270}, abstract = {This paper studies the problem of ergodicity of transition probability matrices in {Markovian} models, such as hidden {Markov} models ({HMM}s), and how it makes very difficult the task of learning to represent long-term context for sequential data. 
This phenomenon hurts the forward propagation of long-term context information, as well as learning a hidden state representation to represent long-term context, which depends on propagating credit information backwards in time. Using results from {Markov} chain theory, we show that this problem of diffusion of context and credit is reduced when the transition probabilities approach 0 or 1, i.e., the transition probability matrices are sparse and the model essentially deterministic. The results found in this paper apply to learning approaches based on continuous optimization, such as gradient descent and the Baum-Welch algorithm.}, topics={Markov,LongTerm},cat={J}, } @INPROCEEDINGS{Bengio+Frasconi-nips7-diffuse, author = {Bengio, Yoshua and Frasconi, Paolo}, title = {Diffusion of Credit in {M}arkovian Models}, year = {1995}, pages = {553--560}, crossref = {NIPS7}, abstract = {This paper studies the problem of diffusion in {Markovian} models, such as hidden {Markov} models ({HMM}s) and how it makes very difficult the task of learning of long-term dependencies in sequences. Using results from {Markov} chain theory, we show that the problem of diffusion is reduced if the transition probabilities approach 0 or 1. Under this condition, standard {HMM}s have very limited modeling capabilities, but input/output {HMM}s can still perform interesting computations.}, topics={Markov},cat={C}, } @INPROCEEDINGS{Bengio+Frasconi-nips7-iohmms, author = {Bengio, Yoshua and Frasconi, Paolo}, title = {An Input/Output {HMM} Architecture}, year = {1995}, pages = {427--434}, crossref = {NIPS7}, abstract = {We introduce a recurrent architecture having a modular structure and we formulate a training procedure based on the {EM} algorithm. 
The resulting model has similarities to hidden {Markov} models, but supports recurrent networks processing style and allows to exploit the supervised learning paradigm while using maximum likelihood estimation.},
  topics = {Markov},
  cat = {C},
}

@INPROCEEDINGS{Bengio+Frasconi-nips94,
  author = {Bengio, Yoshua and Frasconi, Paolo},
  title = {Credit Assignment through Time: Alternatives to Backpropagation},
  year = {1994},
  pages = {75--82},
  crossref = {NIPS6},
  abstract = {Learning to recognize or predict sequences using long-term context has many applications. However, practical and theoretical problems are found in training recurrent neural networks to perform tasks in which input/output dependencies span long intervals. Starting from a mathematical analysis of the problem, we consider and compare alternative algorithms and architectures on tasks for which the span of the input/output dependencies can be controlled. Results on the new algorithms show performance qualitatively superior to that obtained with backpropagation.},
  topics = {LongTerm},
  cat = {C},
}

@ARTICLE{Bengio+Pouliot90,
  author = {Bengio, Yoshua and Pouliot, Yannick},
  title = {Efficient recognition of immunoglobulin domains from amino-acid sequences using a neural network},
  journal = {Computer Applications in the Biosciences},
  volume = {6},
  number = {2},
  year = {1990},
  pages = {319--324},
  topics = {Bioinformatic,PriorKnowledge},
  cat = {J},
}

@INPROCEEDINGS{Bengio+Senecal-2003,
  author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
  title = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
  booktitle = {Proceedings of the conference on Artificial Intelligence and Statistics (AISTATS)},
  year = {2003},
  abstract = {Our previous work on statistical language modeling introduced the use of probabilistic feedforward neural networks to help dealing with the curse of dimensionality.
Training this model by maximum likelihood however requires for each example to perform as many network passes as there are words in the vocabulary. Inspired by the contrastive divergence model, we propose and evaluate sampling-based methods which require network passes only for the observed ``positive example'' and a few sampled negative example words. A very significant speed-up is obtained with an adaptive importance sampling.}
}

@ARTICLE{Bengio+Senecal-2008,
  author = {Bengio, Yoshua and S{\'{e}}n{\'{e}}cal, Jean-S{\'{e}}bastien},
  keywords = {Energy-based models, fast training, importance sampling, language modeling, Monte Carlo methods, probabilistic neural networks},
  title = {Adaptive Importance Sampling to Accelerate Training of a Neural Probabilistic Language Model},
  journal = {IEEE Transactions on Neural Networks},
  volume = {19},
  number = {4},
  year = {2008},
  pages = {713--722},
  abstract = {Previous work on statistical language modeling has shown that it is possible to train a feedforward neural network to approximate probabilities over sequences of words, resulting in significant error reduction when compared to standard baseline models based on n-grams. However, training the neural network model with the maximum-likelihood criterion requires computations proportional to the number of words in the vocabulary. In this paper, we introduce adaptive importance sampling as a way to accelerate training of the model. The idea is to use an adaptive n-gram model to track the conditional distributions produced by the neural network.
We show that a very significant speedup can be obtained on standard problems.}
}

@INCOLLECTION{Bengio-2007,
  author = {Bengio, Yoshua},
  editor = {Cisek, Paul and Kalaska, John and Drew, Trevor},
  title = {On the Challenge of Learning Complex Functions},
  booktitle = {Computational Neuroscience: Theoretical Insights into Brain Function},
  series = {Progress in Brain Research},
  year = {2007},
  publisher = {Elsevier},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/PBR_chapter.pdf},
  abstract = {A common goal of computational neuroscience and of artificial intelligence research based on statistical learning algorithms is the discovery and understanding of computational principles that could explain what we consider adaptive intelligence, in animals as well as in machines. This chapter focuses on what is required for the learning of complex behaviors. We believe it involves the learning of highly varying functions, in a mathematical sense. We bring forward two types of arguments which convey the message that many currently popular machine learning approaches to learning flexible functions have fundamental limitations that render them inappropriate for learning highly varying functions. The first issue concerns the representation of such functions with what we call shallow model architectures. We discuss limitations of shallow architectures, such as so-called kernel machines, boosting algorithms, and one-hidden-layer artificial neural networks. The second issue is more focused and concerns kernel machines with a local kernel (the type used most often in practice), that act like a collection of template matching units. We present mathematical results on such computational architectures showing that they have a limitation similar to those already proved for older non-parametric methods, and connected to the so-called curse of dimensionality.
Though it has long been believed that efficient learning in deep architectures is difficult, recently proposed computational principles for learning in deep architectures may offer a breakthrough.}
}

@ARTICLE{Bengio-2009,
  author = {Bengio, Yoshua},
  title = {Learning deep architectures for {AI}},
  journal = {Foundations and Trends in Machine Learning},
  volume = {2},
  number = {1},
  year = {2009},
  pages = {1--127},
  note = {Also published as a book. Now Publishers, 2009.},
  abstract = {Theoretical results suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need \emph{deep architectures}. Deep architectures are composed of multiple levels of non-linear operations, such as in neural nets with many hidden layers or in complicated propositional formulae re-using many sub-formulae. Searching the parameter space of deep architectures is a difficult task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas. This paper discusses the motivations and principles regarding learning algorithms for deep architectures, in particular those exploiting as building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines, used to construct deeper models such as Deep Belief Networks.}
}

@TECHREPORT{Bengio-96-TR,
  author = {Bengio, Yoshua},
  month = feb,
  title = {Using a Financial Training Criterion Rather than a Prediction Criterion},
  number = {1019},
  year = {1996},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengioy_TR1019.pdf},
  abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms.
The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest. Experiments were performed on portfolio selection with 35 Canadian stocks.},
  topics = {Finance,Discriminant},
  cat = {T},
}

@TECHREPORT{Bengio-arxiv-mixing-2012,
  author = {Bengio, Yoshua and Mesnil, Gr{\'{e}}goire and Dauphin, Yann and Rifai, Salah},
  title = {Better Mixing via Deep Representations},
  number = {Arxiv report 1207.4404},
  year = {2012},
  institution = {Universit{\'{e}} de Montr{\'{e}}al}
}

@TECHREPORT{Bengio-arxiv-moments-2012,
  author = {Bengio, Yoshua and Alain, Guillaume and Rifai, Salah},
  title = {Implicit Density Estimation by Local Moment Matching to Sample from Auto-Encoders},
  number = {Arxiv report 1207.0057},
  year = {2012},
  institution = {Universit{\'{e}} de Montr{\'{e}}al}
}

@BOOK{Bengio-book-2009,
  author = {Bengio, Yoshua},
  title = {Learning deep architectures for {AI}},
  year = {2009},
  publisher = {Now Publishers},
  abstract = {Theoretical results suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need \emph{deep architectures}. Deep architectures are composed of multiple levels of non-linear operations, such as in neural nets with many hidden layers or in complicated propositional formulae re-using many sub-formulae. Searching the parameter space of deep architectures is a difficult task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas.
This paper discusses the motivations and principles regarding learning algorithms for deep architectures, in particular those exploiting as building blocks unsupervised learning of single-layer models such as Restricted {Boltzmann} Machines, used to construct deeper models such as Deep Belief Networks.}
}

@BOOK{bengio-book96,
  author = {Bengio, Yoshua},
  title = {Neural Networks for Speech and Sequence Recognition},
  year = {1996},
  publisher = {International Thomson Computer Press},
  address = {London, UK},
  topics = {Speech},
  cat = {B},
}

@INCOLLECTION{Bengio-chapter-2013,
  author = {Bengio, Yoshua},
  editor = {Kowaliw, T. and Bredeche, N. and Doursat, R.},
  month = mar,
  title = {Evolving Culture vs Local Minima},
  booktitle = {Growing Adaptive Machines: Integrating Development and Learning in Artificial Neural Networks},
  year = {2013},
  publisher = {Springer-Verlag},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  note = {Also as arXiv:1203.2990v1},
  url = {http://arxiv.org/abs/1203.2990},
  abstract = {We propose a theory that relates difficulty of learning in deep architectures to culture and language. It is articulated around the following hypotheses: (1) learning in an individual human brain is hampered by the presence of effective local minima; (2) this optimization difficulty is particularly important when it comes to learning higher-level abstractions, i.e., concepts that cover a vast and highly-nonlinear span of sensory configurations; (3) such high-level abstractions are best represented in brains by the composition of many levels of representation, i.e., by deep architectures; (4) a human brain can learn such high-level abstractions if guided by the signals produced by other humans, which act as hints or indirect supervision for these high-level abstractions; and (5), language and the recombination and optimization of mental concepts provide an efficient evolutionary recombination operator, and this gives rise to rapid search in the space of communicable ideas that help humans build up better high-level internal representations of their world. These hypotheses put together imply that human culture and the evolution of ideas have been crucial to counter an optimization difficulty: this optimization difficulty would otherwise make it very difficult for human brains to capture high-level knowledge of the world. The theory is grounded in experimental observations of the difficulties of training deep artificial neural networks. Plausible consequences of this theory for the efficiency of cultural evolutions are sketched.}
}

@INCOLLECTION{Bengio-chapterSLSP-2013,
  author = {Bengio, Yoshua},
  title = {Deep learning of representations: looking forward},
  booktitle = {Statistical Language and Speech Processing},
  series = {Lecture Notes in Computer Science},
  volume = {7978},
  year = {2013},
  pages = {1--37},
  publisher = {Springer},
  note = {Also available as arXiv:1305.0445},
  url = {http://arxiv.org/abs/1305.0445}
}

@TECHREPORT{Bengio-convex-05,
  author = {Bengio, Yoshua and Le Roux, Nicolas and Vincent, Pascal and Delalleau, Olivier and Marcotte, Patrice},
  title = {Convex neural networks},
  number = {1263},
  year = {2005},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1263.pdf},
  abstract = {Convexity has recently received a lot of attention in the machine learning community, and the lack of convexity has been seen as a major disadvantage of many learning algorithms, such as multi-layer artificial neural networks. We show that training multi-layer neural networks in which the number of hidden units is learned can be viewed as a convex optimization problem.
This problem involves an infinite number of variables, but can be solved by incrementally inserting a hidden unit at a time, each time finding a linear classifier that minimizes a weighted sum of errors.},
  topics = {Boosting},
  cat = {T},
}

@ARTICLE{Bengio-Courville-Vincent-TPAMI-2012,
  author = {Bengio, Yoshua and Courville, Aaron and Vincent, Pascal},
  keywords = {Abstracts, AI, Artificial intelligence, autoencoder, autoencoders, {Boltzmann} machine, data representation, data structures, Deep Learning, density estimation, Feature extraction, feature learning, geometrical connections, Learning systems, machine learning, machine learning algorithms, manifold learning, Manifolds, neural nets, neural networks, probabilistic models, probability, representation learning, Speech recognition, unsupervised feature learning, unsupervised learning},
  month = aug,
  title = {Representation Learning: A Review and New Perspectives},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {35},
  number = {8},
  year = {2013},
  pages = {1798--1828},
  issn = {0162-8828},
  abstract = {The success of machine learning algorithms generally depends on data representation, and we hypothesize that this is because different representations can entangle and hide more or less the different explanatory factors of variation behind the data. Although specific domain knowledge can be used to help design representations, learning with generic priors can also be used, and the quest for AI is motivating the design of more powerful representation-learning algorithms implementing such priors. This paper reviews recent work in the area of unsupervised feature learning and deep learning, covering advances in probabilistic models, autoencoders, manifold learning, and deep networks.
This motivates longer term unanswered questions about the appropriate objectives for learning good representations, for computing representations (i.e., inference), and the geometrical connections between representation learning, density estimation, and manifold learning.}
}

@ARTICLE{Bengio-decision-trees10,
  author = {Bengio, Yoshua and Delalleau, Olivier and Simard, Clarence},
  keywords = {curse of dimensionality, decision trees, parity function},
  month = nov,
  title = {Decision Trees do not Generalize to New Variations},
  journal = {Computational Intelligence},
  volume = {26},
  number = {4},
  year = {2010},
  pages = {449--467}
}

@ARTICLE{bengio-demori89,
  author = {Bengio, Yoshua and De Mori, Renato},
  title = {Use of multilayer networks for the recognition of phonetic features and phonemes},
  journal = {Computational Intelligence},
  volume = {5},
  year = {1989},
  pages = {134--141},
  topics = {Speech},
  cat = {J},
}

@ARTICLE{Bengio-eigen-NC2004,
  author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Ouimet, Marie},
  title = {Learning eigenfunctions links spectral embedding and kernel {PCA}},
  journal = {Neural Computation},
  volume = {16},
  number = {10},
  year = {2004},
  pages = {2197--2219},
  abstract = {In this paper, we show a direct relation between spectral embedding methods and kernel {PCA}, and how both are special cases of a more general learning problem, that of learning the principal eigenfunctions of an operator defined from a kernel and the unknown data generating density. Whereas spectral embedding methods only provided coordinates for the training points, the analysis justifies a simple extension to out-of-sample examples (the Nystr{\"{o}}m formula) for Multi-Dimensional Scaling, spectral clustering, Laplacian eigenmaps, Locally Linear Embedding ({LLE}) and {Isomap}.
The analysis provides, for all such spectral embedding methods, the definition of a loss function, whose empirical average is minimized by the traditional algorithms. The asymptotic expected value of that loss defines a generalization performance and clarifies what these algorithms are trying to learn. Experiments with {LLE}, {Isomap}, spectral clustering and {MDS} show that this out-of-sample embedding formula generalizes well, with a level of error comparable to the effect of small perturbations of the training set on the embedding.},
  topics = {HighDimensional,Kernel,Unsupervised},
  cat = {J},
}

@INPROCEEDINGS{Bengio-et-al-ICASSP-2013,
  author = {Bengio, Yoshua and Boulanger-Lewandowski, Nicolas and Pascanu, Razvan},
  keywords = {Deep Learning, long-term dependencies, Recurrent networks, representation learning},
  title = {Advances in Optimizing Recurrent Networks},
  booktitle = {Proc. ICASSP 38},
  year = {2013},
  abstract = {After a more than decade-long period of relatively little research activity in the area of recurrent neural networks, several new developments will be reviewed here that have allowed substantial progress both in understanding and in technical solutions towards more efficient training of recurrent networks. These advances have been motivated by and related to the optimization issues surrounding deep learning. Although recurrent networks are extremely powerful in what they can in principle represent in terms of modeling sequences, their training is plagued by two aspects of the same issue regarding the learning of long-term dependencies. Experiments reported here evaluate the use of clipping gradients, spanning longer time ranges with leaky integration, advanced momentum techniques, using more powerful output probability models, and encouraging sparser gradients to help symmetry breaking and credit assignment.
The experiments are performed on text and music data and show off the combined effects of these techniques in generally improving both training and test error.}
}

@INPROCEEDINGS{Bengio-et-al-ICLR2014,
  author = {Bengio, Yoshua and Yao, Li and Cho, Kyunghyun},
  title = {Bounding the Test Log-Likelihood of Generative Models},
  year = {2014},
  crossref = {ICLR2014-conf},
  abstract = {Several interesting generative learning algorithms involve a complex probability distribution over many random variables, involving intractable normalization constants or latent variable normalization. Some of them may even not have an analytic expression for the unnormalized probability function and no tractable approximation. This makes it difficult to estimate the quality of these models, once they have been trained, or to monitor their quality (e.g. for early stopping) while training. A previously proposed method is based on constructing a non-parametric density estimator of the model's probability function from samples generated by the model. We revisit this idea, propose a more efficient estimator, and prove that it provides a lower bound on the true test log-likelihood, and an unbiased estimator as the number of generated samples goes to infinity, although one that incorporates the effect of poor mixing (making the estimated likelihood worse, i.e., more conservative).}
}

@INPROCEEDINGS{Bengio-et-al-ICML2013,
  author = {Bengio, Yoshua and Mesnil, Gr{\'{e}}goire and Dauphin, Yann and Rifai, Salah},
  title = {Better Mixing via Deep Representations},
  year = {2013},
  crossref = {ICML13}
}

@INPROCEEDINGS{bengio-et-al-ICML2014,
  author = {Bengio, Yoshua and Thibodeau-Laufer, Eric and Yosinski, Jason},
  title = {Deep Generative Stochastic Networks Trainable by Backprop},
  year = {2014},
  crossref = {ICML14},
  abstract = {We introduce a novel training principle for probabilistic models that is an alternative to maximum likelihood.
The proposed Generative Stochastic Networks (GSN) framework is based on learning the transition operator of a Markov chain whose stationary distribution estimates the data distribution. The transition distribution of the {Markov} chain is conditional on the previous state, generally involving a small move, so this conditional distribution has fewer dominant modes, being unimodal in the limit of small moves. Thus, it is easier to learn because it is easier to approximate its partition function, more like learning to perform supervised function approximation, with gradients that can be obtained by backprop. We provide theorems that generalize recent work on the probabilistic interpretation of denoising autoencoders and obtain along the way an interesting justification for dependency networks and generalized pseudolikelihood, along with a definition of an appropriate joint distribution and sampling mechanism even when the conditionals are not consistent. GSNs can be used with missing inputs and can be used to sample subsets of variables given the rest. We validate these theoretical results with experiments on two image datasets using an architecture that mimics the Deep {Boltzmann} Machine Gibbs sampler but allows training to proceed with simple backprop, without the need for layerwise pretraining.}
}

@INPROCEEDINGS{Bengio-et-al-NIPS2013,
  author = {Bengio, Yoshua and Yao, Li and Alain, Guillaume and Vincent, Pascal},
  title = {Generalized Denoising Auto-Encoders as Generative Models},
  year = {2013},
  crossref = {NIPS26}
}

@INPROCEEDINGS{Bengio-Gingras-nips8,
  author = {Bengio, Yoshua and Gingras, Fran{\c c}ois},
  title = {Recurrent Neural Networks for Missing or Asynchronous Data},
  year = {1996},
  pages = {395--401},
  crossref = {NIPS8},
  abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems.
On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more discriminant approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
  topics = {Finance,Missing},
  cat = {C},
}

@ARTICLE{Bengio-Grandvalet-JMLR-04,
  author = {Bengio, Yoshua and Grandvalet, Yves},
  title = {No Unbiased Estimator of the Variance of K-Fold Cross-Validation},
  volume = {5},
  year = {2004},
  pages = {1089--1105},
  crossref = {JMLR},
  abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance.
This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
  topics = {Comparative},
  cat = {J},
}

@TECHREPORT{bengio-hyper-TR99,
  author = {Bengio, Yoshua},
  title = {Continuous Optimization of Hyper-Parameters},
  number = {1144},
  year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyperTR.pdf},
  abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves (1) training errors on each training example and (2) some hyper-parameters, which are kept fixed during this minimization. When there is only a single hyper-parameter one can easily explore how its value affects a model selection criterion (that is not the same as the training criterion, and is used to select hyper-parameters). In this paper we present a methodology to select many hyper-parameters that is based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. We first consider the case of a training criterion that is quadratic in the parameters. In that case, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition.
In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient, but this formula requires the computation of second derivatives of the training criterion.},
  topics = {ModelSelection},
  cat = {T},
}

@INPROCEEDINGS{Bengio-icnn93,
  author = {Bengio, Yoshua and Frasconi, Paolo and Simard, Patrice},
  title = {The problem of learning long-term dependencies in recurrent networks},
  booktitle = {IEEE International Conference on Neural Networks},
  year = {1993},
  pages = {1183--1195},
  publisher = {IEEE Press},
  address = {San Francisco},
  note = {(invited paper)},
  topics = {LongTerm},
  cat = {C},
}

@ARTICLE{Bengio-ijprai93,
  author = {Bengio, Yoshua},
  title = {A Connectionist Approach to Speech Recognition},
  journal = {International Journal of Pattern Recognition and Artificial Intelligence},
  volume = {7},
  number = {4},
  year = {1993},
  pages = {647--668},
  abstract = {The task discussed in this paper is that of learning to map input sequences to output sequences. In particular, problems of phoneme recognition in continuous speech are considered, but most of the discussed techniques could be applied to other tasks, such as the recognition of sequences of handwritten characters. The systems considered in this paper are based on connectionist models, or artificial neural networks, sometimes combined with statistical techniques for recognition of sequences of patterns, stressing the integration of prior knowledge and learning.
Different architectures for sequence and speech recognition are reviewed, including recurrent networks as well as hybrid systems involving hidden {Markov} models.},
  topics = {PriorKnowledge,Speech},
  cat = {J},
}

@TECHREPORT{Bengio-iohmms-TR99,
  author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
  title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
  number = {1146},
  year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/iohmms-returnsTR.pdf},
  abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare the generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments are performed on modeling the returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average.
The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as measured by the out-of-sample likelihood.},
  topics = {Markov},
  cat = {T},
}

@ARTICLE{bengio-lauzon-ducharme:2000,
  author = {Bengio, Yoshua and Lauzon, Vincent-Philippe and Ducharme, R{\'{e}}jean},
  title = {Experiments on the Application of {IOHMM}s to Model Financial Returns Series},
  journal = {IEEE Transactions on Neural Networks},
  volume = {12},
  number = {1},
  year = {2001},
  pages = {113--123},
  abstract = {Input/Output Hidden {Markov} Models ({IOHMM}s) are conditional hidden {Markov} models in which the emission (and possibly the transition) probabilities can be conditioned on an input sequence. For example, these conditional distributions can be linear, logistic, or non-linear (using for example multi-layer neural networks). We compare generalization performance of several models which are special cases of Input/Output Hidden {Markov} Models on financial time-series prediction tasks: an unconditional Gaussian, a conditional linear Gaussian, a mixture of Gaussians, a mixture of conditional linear Gaussians, a hidden {Markov} model, and various {IOHMM}s. The experiments compare these models on predicting the conditional density of returns of market and sector indices. Note that the unconditional Gaussian estimates the first moment with the historical average. The results show that, although for the first moment the historical average gives the best results, for the higher moments, the {IOHMM}s yielded significantly better performance, as estimated by the out-of-sample likelihood.},
  topics = {Markov,Finance},
  cat = {J},
}

@INPROCEEDINGS{bengio-lecun-94,
  author = {Bengio, Yoshua and {LeCun}, Yann},
  month = oct,
  title = {Word normalization for on-line handwritten word recognition},
  booktitle = {Proc.
of the International Conference on Pattern Recognition},
  volume = {II},
  year = {1994},
  pages = {409--413},
  publisher = {IEEE},
  address = {Jerusalem},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/icpr-norm.ps},
  abstract = {We introduce a new approach to normalizing words written with an electronic stylus that applies to all styles of handwriting (upper case, lower case, printed, cursive, or mixed). A geometrical model of the word spatial structure is fitted to the pen trajectory using the {EM} algorithm. The fitting process maximizes the likelihood of the trajectory given the model and a set of priors on its parameters. The method was evaluated and integrated to a recognition system that combines neural networks and hidden {Markov} models.},
  topics = {PriorKnowledge,Speech},
  cat = {C},
}

@TECHREPORT{Bengio-localfailure-TR-2005,
  author = {Bengio, Yoshua and Delalleau, Olivier and Le Roux, Nicolas},
  title = {The Curse of Dimensionality for Local Kernel Machines},
  number = {1258},
  year = {2005},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/tr1258.pdf},
  abstract = {We present a series of theoretical arguments supporting the claim that a large class of modern learning algorithms based on local kernels are sensitive to the curse of dimensionality. These include local manifold learning algorithms such as {Isomap} and {LLE}, support vector classifiers with Gaussian or other local kernels, and graph-based semisupervised learning algorithms using a local similarity function. These algorithms are shown to be local in the sense that crucial properties of the learned function at x depend mostly on the neighbors of x in the training set. This makes them sensitive to the curse of dimensionality, well studied for classical non-parametric statistical learning.
There is a large class of data distributions for which non-local solutions could be expressed compactly and potentially be learned with few examples, but which will require a large number of local bases and therefore a large number of training examples when using a local learning algorithm.}, topics={HighDimensional,Kernel,Unsupervised},cat={T}, } @INPROCEEDINGS{Bengio-nips-2006, author = {Bengio, Yoshua and Lamblin, Pascal and Popovici, Dan and Larochelle, Hugo}, title = {Greedy Layer-Wise Training of Deep Networks}, year = {2007}, pages = {153--160}, crossref = {NIPS19}, abstract = {Complexity theory of circuits strongly suggests that deep architectures can be much more efficient (sometimes exponentially) than shallow architectures, in terms of computational elements required to represent some functions. Deep multi-layer neural networks have many levels of non-linearities allowing them to compactly represent highly non-linear and highly-varying functions. However, until recently it was not clear how to train such deep networks, since gradient-based optimization starting from random initialization appears to often get stuck in poor solutions. Hinton et al. recently introduced a greedy layer-wise unsupervised learning algorithm for Deep Belief Networks (DBN), a generative model with many layers of hidden causal variables. In the context of the above optimization problem, we study this algorithm empirically and explore variants to better understand its success and extend it to cases where the inputs are continuous or where the structure of the input distribution is not revealing enough about the variable to be predicted in a supervised task. 
Our experiments also confirm the hypothesis that the greedy layer-wise unsupervised training strategy mostly helps the optimization, by initializing weights in a region near a good local minimum, giving rise to internal distributed representations that are high-level abstractions of the input, bringing better generalization.} } @INPROCEEDINGS{Bengio-nips10, author = {Bengio, Yoshua and Bengio, Samy and Isabelle, Jean-Fran{\c c}ois and Singer, Yoram}, title = {Shared Context Probabilistic Transducers}, year = {1998}, crossref = {NIPS10}, abstract = {Recently, a model for supervised learning of probabilistic transducers represented by suffix trees was introduced. However, this algorithm tends to build very large trees, requiring very large amounts of computer memory. In this paper, we propose a new, more compact, transducer model in which one shares the parameters of distributions associated to contexts yielding similar conditional output distributions. We illustrate the advantages of the proposed algorithm with comparative experiments on inducing a noun phrase recognizer.}, topics={HighDimensional},cat={C}, } @TECHREPORT{Bengio-NLMP-TR-2005, author = {Bengio, Yoshua and Larochelle, Hugo}, title = {Non-Local Manifold Parzen Windows}, number = {1264}, year = {2005}, institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/NLMP-techreport.pdf}, abstract = {In order to escape from the curse of dimensionality, we claim that one can learn non-local functions, in the sense that the value and shape of the learned function at x must be inferred using examples that may be far from x. With this objective, we present a non-local non-parametric density estimator. It builds upon previously proposed Gaussian mixture models with regularized covariance matrices to take into account the local shape of the manifold. 
It also builds upon recent work on non-local estimators of the tangent plane of a manifold, which are able to generalize in places with little training data, unlike traditional, local, non-parametric models.}, topics={HighDimensional,Kernel,Unsupervised},cat={T}, } @INPROCEEDINGS{Bengio-nncm96, author = {Bengio, Yoshua}, editor = {Weigend, A. S. and Abu-Mostafa, Y. S. and Refenes, A.-P. N.}, title = {Training A Neural Network with a Financial Criterion Rather than a Prediction Criterion}, booktitle = {Proceedings of the Fourth International Conference on Neural Networks in the Capital Markets ({NNCM}-96)}, year = {1997}, pages = {433--443}, publisher = {World Scientific}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nncm.pdf}, abstract = {A common approach to quantitative decision taking with financial time-series is to train a model using a prediction criterion (e.g., squared error). We find on a portfolio selection problem that better results can be obtained when the model is directly trained in order to optimize the financial criterion of interest, with a differentiable decision module.}, topics={Finance,PriorKnowledge,Discriminant},cat={C}, } @TECHREPORT{Bengio-NonStat-Hyper-TR, author = {Bengio, Yoshua and Dugas, Charles}, title = {Learning Simple Non-Stationarities with Hyper-Parameters}, number = {1145}, year = {1999}, institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/nonstatTR.pdf}, abstract = {We consider sequential data that is sampled from an unknown process, so that the data are not necessarily i.i.d. Most approaches to machine learning assume that data points are i.i.d. 
Instead we consider a measure of generalization that does not make this assumption, and we consider in this context a recently proposed approach to optimizing hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to hyper-parameters. Here we use hyper-parameters that control a function that gives different weights to different time steps in the historical data sequence. The approach is successfully applied to modeling the volatility of stock returns one month ahead. Comparative experiments with more traditional methods are presented.}, topics={ModelSelection,Finance},cat={T}, } @ARTICLE{Bengio-scholarpedia-2007, author = {Bengio, Yoshua}, title = {Neural net language models}, journal = {Scholarpedia}, volume = {3}, number = {1}, year = {2008}, pages = {3881}, abstract = {A language model is a function, or an algorithm for learning such a function, that captures the salient statistical characteristics of the distribution of sequences of words in a natural language, typically allowing one to make probabilistic predictions of the next word given preceding ones. A neural network language model is a language model based on Neural Networks, exploiting their ability to learn distributed representations to reduce the impact of the curse of dimensionality. In the context of learning algorithms, the curse of dimensionality refers to the need for huge numbers of training examples when learning highly complex functions. When the number of input variables increases, the number of required examples can grow exponentially. The curse of dimensionality arises when a huge number of different combinations of values of the input variables must be discriminated from each other, and the learning algorithm needs at least one example per relevant combination of values. 
In the context of language models, the problem comes from the huge number of possible sequences of words, e.g., with a sequence of 10 words taken from a vocabulary of 100,000 there are $10^{50}$ possible sequences... A distributed representation of a symbol is a tuple (or vector) of features which characterize the meaning of the symbol, and are not mutually exclusive. If a human were to choose the features of a word, he might pick grammatical features like gender or plurality, as well as semantic features like animate or invisible. With a neural network language model, one relies on the learning algorithm to discover these features, and the features are continuous-valued (making the optimization problem involved in learning much simpler). The basic idea is to learn to associate each word in the dictionary with a continuous-valued vector representation. Each word corresponds to a point in a feature space. One can imagine that each dimension of that space corresponds to a semantic or grammatical characteristic of words. The hope is that functionally similar words get to be closer to each other in that space, at least along some directions. A sequence of words can thus be transformed into a sequence of these learned feature vectors. The neural network learns to map that sequence of feature vectors to a prediction of interest, such as the probability distribution over the next word in the sequence. What pushes the learned word features to correspond to a form of semantic and grammatical similarity is that when two words are functionally similar, they can be replaced by one another in the same context, helping the neural network to compactly represent a function that makes good predictions on the training set, the set of word sequences used to train the model. 
The advantage of this distributed representation approach is that it allows the model to generalize well to sequences that are not in the set of training word sequences, but that are similar in terms of their features, i.e., their distributed representation. Because neural networks tend to map nearby inputs to nearby outputs, the predictions corresponding to word sequences with similar features are mapped to similar predictions. Because many different combinations of feature values are possible, a very large set of possible meanings can be represented compactly, allowing a model with a comparatively small number of parameters to fit a large training set.} } @TECHREPORT{Bengio-TR1312, author = {Bengio, Yoshua}, title = {Learning deep architectures for {AI}}, number = {1312}, year = {2007}, institution = {Dept. IRO, Universit{\'{e}} de Montr{\'{e}}al}, note = {Preliminary version of journal article with the same title appearing in Foundations and Trends in Machine Learning (2009)}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/TR1312.pdf}, abstract = {Theoretical results strongly suggest that in order to learn the kind of complicated functions that can represent high-level abstractions (e.g. in vision, language, and other AI-level tasks), one may need deep architectures. Deep architectures are composed of multiple levels of non-linear operations, such as in neural nets with many hidden layers. Searching the parameter space of deep architectures is a difficult optimization task, but learning algorithms such as those for Deep Belief Networks have recently been proposed to tackle this problem with notable success, beating the state-of-the-art in certain areas. 
This paper discusses the motivations and principles regarding learning algorithms for deep architectures and in particular for those based on unsupervised learning such as Deep Belief Networks, using as building blocks single-layer models such as Restricted {Boltzmann} Machines.} } @TECHREPORT{Bengio-tricks-arxiv2012, author = {Bengio, Yoshua}, title = {Practical recommendations for gradient-based training of deep architectures}, number = {Arxiv report 1206.5533}, year = {2012}, institution = {Universit{\'{e}} de Montr{\'{e}}al}, url = {http://arxiv.org/abs/1206.5533} } @ARTICLE{Bengio-trnn94, author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo}, title = {Learning Long-Term Dependencies with Gradient Descent is Difficult}, journal = {IEEE Transactions on Neural Networks}, volume = {5}, number = {2}, year = {1994}, pages = {157--166}, abstract = {Recurrent neural networks can be used to map input sequences to output sequences, such as for recognition, production or prediction problems. However, practical difficulties have been reported in training recurrent neural networks to perform tasks in which the temporal contingencies present in the input/output sequences span long intervals. We show why gradient based learning algorithms face an increasingly difficult problem as the duration of the dependencies to be captured increases. These results expose a trade-off between efficient learning by gradient descent and latching on information for long periods. Based on an understanding of this problem, alternatives to standard gradient descent are considered.}, optnote={(Special Issue on Recurrent Neural Networks)},topics={LongTerm},cat={J}, } @INPROCEEDINGS{Bengio-wirn93, author = {Bengio, Yoshua and Frasconi, Paolo and Gori, Marco and Soda, G.}, editor = {Caianello, E.}, title = {Recurrent Neural Networks for Adaptive Temporal Processing}, booktitle = {Proc. 
of the 6th Italian Workshop on Neural Networks, WIRN-93}, year = {1993}, pages = {1183--1195}, publisher = {World Scientific Publ.}, address = {Vietri, Italy}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/rnn_review93.ps}, topics={LongTerm},cat={C}, } @ARTICLE{Bengio2000c, author = {Bengio, Yoshua}, title = {Gradient-Based Optimization of Hyperparameters}, journal = {Neural Computation}, volume = {12}, number = {8}, year = {2000}, pages = {1889--1900}, abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion. In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. 
In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.}, topics={ModelSelection},cat={J}, } @ARTICLE{Bengio89a, author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Merlo, Ettore}, title = {Programmable execution of multi-layered networks for automatic speech recognition}, journal = {Communications of the Association for Computing Machinery}, volume = {32}, number = {2}, year = {1989}, pages = {195--199}, topics={Speech},cat={J}, } @INPROCEEDINGS{Bengio89c, author = {Bengio, Yoshua and Cosi, Piero and Cardin, Regis and De Mori, Renato}, title = {Use of multi-layered networks for coding speech with phonetic features}, year = {1989}, pages = {224--231}, address = {Denver, CO}, crossref = {NIPS1}, abstract = {Preliminary results on speaker-independent speech recognition are reported. A method that combines expertise on neural networks with expertise on speech recognition is used to build the recognition systems. For transient sounds, event-driven property extractors with variable resolution in the time and frequency domains are used. 
For sonorant speech, a model of the human auditory system is preferred to FFT as a front-end module.}, topics={Speech},cat={C}, } @INPROCEEDINGS{Bengio89d, author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero}, title = {On the generalization capability of multilayered networks in the extraction of speech properties}, booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence}, year = {1989}, pages = {1531--1536}, publisher = {IEEE}, address = {Detroit}, topics={Speech},cat={C}, } @INPROCEEDINGS{Bengio90, author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato}, title = {Speaker Independent Speech Recognition with Neural Networks and Speech Knowledge}, year = {1990}, pages = {218--225}, address = {Denver, CO}, crossref = {NIPS2}, abstract = {We attempt to combine neural networks with knowledge from speech science to build a speaker independent speech recognition system. This knowledge is utilized in designing the preprocessing, input coding, output coding, output supervision and architectural constraints. To handle the temporal aspect of speech we combine delays, copies of activations of hidden and output units at the input level, and Back-Propagation for Sequences (BPS), a learning algorithm for networks with local self-loops. This strategy is demonstrated in several experiments, in particular a nasal discrimination task for which the application of a speech theory hypothesis dramatically improved generalization.}, topics={PriorKnowledge,Speech},cat={C}, } @INCOLLECTION{Bengio90b, author = {Bengio, Yoshua}, title = {Radial Basis Functions for speech recognition}, booktitle = {Speech Recognition and Understanding: Recent Advances, Trends and Applications}, year = {1990}, pages = {293--298}, publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences}, topics={Kernel,Speech},cat={B}, } @INCOLLECTION{Bengio90c, author = {Bengio, Yoshua and De Mori, Renato}, editor = {{Fogelman Soulie}, F. 
and Herault, J.}, title = {Speech coding with multilayer networks}, booktitle = {Neurocomputing: Algorithms, Architectures and Applications}, year = {1990}, pages = {207--216}, publisher = {NATO Advanced Study Institute Series F: Computer and Systems Sciences}, topics={Speech},cat={B}, } @INPROCEEDINGS{Bengio90e, author = {Bengio, Yoshua and Pouliot, Yannick and Bengio, Samy and Agin, Patrick}, title = {A neural network to detect homologies in proteins}, year = {1990}, pages = {423--430}, address = {Denver, CO}, crossref = {NIPS2}, abstract = {In order to detect the presence and location of immunoglobulin (Ig) domains from amino acid sequences we built a system based on a neural network with one hidden layer trained with back propagation. The program was designed to efficiently identify proteins exhibiting such domains, characterized by a few localized conserved regions and a low overall homology. When the National Biomedical Research Foundation (NBRF) NEW protein sequence database was scanned to evaluate the program's performance, we obtained very low rates of false negatives coupled with a moderate rate of false positives.}, topics={Bioinformatic,PriorKnowledge},cat={C}, } @INPROCEEDINGS{Bengio90z, author = {Bengio, Yoshua and De Mori, Renato and Gori, Marco}, editor = {Caianello, E.}, title = {Experiments on automatic speech recognition using BPS}, booktitle = {Parallel Architectures and Neural Networks}, year = {1990}, pages = {223--232}, publisher = {World Scientific Publ.}, topics={Speech},cat={C}, } @INPROCEEDINGS{Bengio91a, author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf}, title = {A comparative study of hybrid acoustic phonetic decoders based on artificial neural networks}, booktitle = {Proceedings of EuroSpeech'91}, year = {1991}, topics={PriorKnowledge,Speech},cat={C}, } @INPROCEEDINGS{Bengio91b, author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf}, title = {Global Optimization of a Neural 
Network - Hidden {Markov} Model Hybrid}, booktitle = {Proceedings of EuroSpeech'91}, year = {1991}, topics={Markov},cat={C}, } @INPROCEEDINGS{Bengio91z, author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf}, title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks}, booktitle = {Proceedings of EuroSpeech'91}, year = {1991}, location = {Genova, Italy}, cat={C}, } @ARTICLE{Bengio92b, author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf}, title = {Phonetically motivated acoustic parameters for continuous speech recognition using artificial neural networks}, journal = {Speech Communication}, volume = {11}, number = {2--3}, year = {1992}, pages = {261--271}, note = {Special issue on neurospeech}, topics={PriorKnowledge,Speech},cat={J}, } @INPROCEEDINGS{Bengio92c, author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf}, title = {Neural Network - Gaussian Mixture Hybrid for Speech Recognition or Density Estimation}, year = {1992}, pages = {175--182}, address = {Denver, CO}, crossref = {NIPS4}, abstract = {The subject of this paper is the integration of multi-layered Artificial Neural Networks ({ANN}) with probability density functions such as Gaussian mixtures found in continuous density hidden {Markov} Models ({HMM}). In the first part of this paper we present an {ANN}/{HMM} hybrid in which all the parameters of the system are simultaneously optimized with respect to a single criterion. In the second part of this paper, we study the relationship between the density of the inputs of the network and the density of the outputs of the networks. 
A few experiments are presented to explore how to perform density estimation with {ANN}s.}, topics={Speech},cat={C}, } @INPROCEEDINGS{Bengio94d, author = {Frasconi, Paolo and Bengio, Yoshua}, title = {An {EM} Approach to Grammatical Inference: Input/Output {HMMs}}, booktitle = {International Conference on Pattern Recognition (ICPR'94)}, year = {1994}, pages = {289--294}, address = {Jerusalem}, topics={Markov,LongTerm},cat={C}, } @ARTICLE{Bengio96, author = {Bengio, Yoshua and Frasconi, Paolo}, title = {Input/{O}utput {HMM}s for Sequence Processing}, journal = {IEEE Transactions on Neural Networks}, volume = {7}, number = {5}, year = {1996}, pages = {1231--1249}, abstract = {We consider problems of sequence processing and propose a solution based on a discrete state model in order to represent past context. We introduce a recurrent connectionist architecture having a modular structure that associates a subnetwork to each state. The model has a statistical interpretation we call Input/Output Hidden {Markov} Model ({IOHMM}). It can be trained by the {EM} or {GEM} algorithms, considering state trajectories as missing data, which decouples temporal credit assignment and actual parameter estimation. The model presents similarities to hidden {Markov} models ({HMM}s), but allows us to map input sequences to output sequences, using the same processing style as recurrent neural networks. {IOHMM}s are trained using a more discriminant learning paradigm than {HMM}s, while potentially taking advantage of the {EM} algorithm. We demonstrate that {IOHMM}s are well suited for solving grammatical inference problems on a benchmark problem. 
Experimental results are presented for the seven Tomita grammars, showing that these adaptive models can attain excellent generalization.}, topics={Markov},cat={J}, } @TECHREPORT{Bengio96-hmmsTR, author = {Bengio, Yoshua}, month = oct, title = {Markovian Models for Sequential Data}, number = {1049}, year = {1996}, institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hmmsTR.pdf}, abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many applications, especially for speech recognition. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s, weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this area.}, topics={Markov},cat={T}, } @ARTICLE{Bengio97, author = {Bengio, Yoshua}, title = {Using a Financial Training Criterion Rather than a Prediction Criterion}, journal = {International Journal of Neural Systems}, volume = {8}, number = {4}, year = {1997}, pages = {433--443}, note = {Special issue on noisy time-series}, abstract = {The application of this work is to decision taking with financial time-series, using learning algorithms. The traditional approach is to train a model using a prediction criterion, such as minimizing the squared error between predictions and actual values of a dependent variable, or maximizing the likelihood of a conditional model of the dependent variable. We find here with noisy time-series that better results can be obtained when the model is directly trained in order to maximize the financial criterion of interest, here gains and losses (including those due to transactions) incurred during trading. 
Experiments were performed on portfolio selection with 35 Canadian stocks.}, topics={Finance,PriorKnowledge,Discriminant},cat={J}, } @ARTICLE{Bengio99a, author = {Bengio, Yoshua}, title = {Markovian Models for Sequential Data}, journal = {Neural Computing Surveys}, volume = {2}, year = {1999}, pages = {129--162}, abstract = {Hidden {Markov} Models ({HMM}s) are statistical models of sequential data that have been used successfully in many machine learning applications, especially for speech recognition. Furthermore, in the last few years, many new and promising probabilistic models related to {HMM}s have been proposed. We first summarize the basics of {HMM}s, and then review several recent related learning algorithms and extensions of {HMM}s, including in particular hybrids of {HMM}s with artificial neural networks, Input-Output {HMM}s (which are conditional {HMM}s using neural networks to compute probabilities), weighted transducers, variable-length {Markov} models and {Markov} switching state-space models. Finally, we discuss some of the challenges of future research in this very active area.}, topics={Markov},cat={J}, } @ARTICLE{Bengio99b, author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles}, title = {Stochastic Learning of Strategic Equilibria for Auctions}, journal = {Neural Computation}, volume = {11}, number = {5}, year = {1999}, pages = {1199--1209}, abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions. The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. 
Experiments confirm the correctness and usefulness of the approach.}, topics={Auction},cat={J}, } @TECHREPORT{bengio:1990, author = {Bengio, Yoshua}, title = {Learning a Synaptic Learning Rule}, number = {751}, year = {1990}, institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al}, address = {Montr{\'{e}}al (QC) Canada}, topics={BioRules},cat={T}, } @INPROCEEDINGS{bengio:1990:snowbird, author = {Bengio, Yoshua and De Mori, Renato}, title = {Recurrent networks with Radial Basis Functions for speech recognition}, booktitle = {1990 Neural Networks for Computing Conference}, year = {1990}, address = {Snowbird, Utah, USA}, topics={Speech},cat={C}, } @INPROCEEDINGS{bengio:1991:ijcnn, author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn}, title = {Learning a Synaptic Learning Rule}, booktitle = {Proceedings of the International Joint Conference on Neural Networks}, year = {1991}, pages = {II--A969}, address = {Seattle, USA}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1991_ijcnn.ps}, abstract = {This paper presents an original approach to neural modeling based on the idea of searching, with learning methods, for a synaptic learning rule which is biologically plausible, and yields networks that are able to learn to perform difficult tasks. The proposed method of automatically finding the learning rule relies on the idea of considering the synaptic modification rule as a parametric function. This function has local inputs and is the same in many neurons. The parameters that define this function can be estimated with known learning methods. For this optimization, we give particular attention to gradient descent and genetic algorithms. In both cases, estimation of this function consists of a joint global optimization of (a) the synaptic modification function, and (b) the networks that are learning to perform some tasks. 
The proposed methodology can be used as a tool to explore the missing pieces of the puzzle of neural networks learning. Both network architecture, and the learning function can be designed within constraints derived from biological knowledge.}, addressfr={Seattle, USA},topics={BioRules},cat={C}, } @INPROCEEDINGS{bengio:1991:nnc, author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn}, title = {Learning Synaptic Learning Rules}, booktitle = {Neural Networks for Computing}, year = {1991}, address = {Snowbird, Utah, USA}, addressfr={Snowbird, Utah, USA},topics={BioRules},cat={C}, } @INPROCEEDINGS{bengio:1991:snowbird, author = {Bengio, Yoshua and Bengio, Samy and Cloutier, Jocelyn}, title = {Learning a Synaptic Learning Rule}, booktitle = {1991 Neural Networks for Computing Conference}, year = {1991}, address = {Snowbird, Utah, USA}, topics={BioRules},cat={C}, } @INPROCEEDINGS{bengio:1992:nn, author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan}, title = {Aspects th{\'{e}}oriques de l'optimisation d'une r{\`{e}}gle d'apprentissage}, booktitle = {Actes de la conf{\'{e}}rence Neuro-N{\^{\i}}mes 1992}, year = {1992}, address = {N{\^{\i}}mes, France}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_nn.ps}, abstract = {Ayant expos{\'{e}} dans de pr{\'{e}}c{\'{e}}dentes publications (voir [Beng90, Beng92] notamment) l'id{\'{e}}e que l'on pouvait optimiser des r{\`{e}}gles d'apprentissage param{\'{e}}triques pour r{\'{e}}seaux de neurones, nous montrons dans cet article comment d{\'{e}}velopper, par la m{\'{e}}thode du Lagrangien, le gradient n{\'{e}}cessaire {\`{a}} l'optimisation d'une r{\`{e}}gle d'apprentissage par descente du gradient. Nous pr{\'{e}}sentons aussi les bases th{\'{e}}oriques qui permettent d'{\'{e}}tudier la g{\'{e}}n{\'{e}}ralisation {\`{a}} de nouvelles t{\^{a}}ches d'une r{\`{e}}gle d'apprentissage dont les param{\`{e}}tres ont {\'{e}}t{\'{e}} estim{\'{e}}s {\`{a}} partir d'un certain ensemble de t{\^{a}}ches. 
Enfin, nous exposons bri{\`{e}}vement les r{\'{e}}sultats d'une exp{\'{e}}rience consistant {\`{a}} trouver, par descente du gradient, une r{\`{e}}gle d'apprentissage pouvant r{\'{e}}soudre plusieurs t{\^{a}}ches bool{\'{e}}ennes lin{\'{e}}airement et non lin{\'{e}}airement s{\'{e}}parables.}, addressfr={N{\^{\i}}mes, France},topics={BioRules},cat={C}, } @INPROCEEDINGS{bengio:1992:oban, author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan}, title = {On the Optimization of a Synaptic Learning rule}, booktitle = {Conference on Optimality in Biological and Artificial Networks}, year = {1992}, address = {Dallas, USA}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1992_oban.ps}, abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks. 
The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.}, addressfr={Dallas, USA},topics={BioRules},cat={C}, } @INPROCEEDINGS{bengio:1992:snowbird, author = {Bengio, Yoshua}, title = {Representations Based on Articulatory Dynamics for Speech Recognition}, booktitle = {1992 Neural Networks for Computing Conference}, year = {1992}, address = {Snowbird, Utah, USA}, topics={PriorKnowledge,Speech},cat={C}, } @INPROCEEDINGS{bengio:1993:icann, author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan}, editor = {Gielen, S. and Kappen, B.}, title = {Generalization of a Parametric Learning Rule}, booktitle = {{ICANN} '93: Proceedings of the International Conference on Artificial Neural Networks}, year = {1993}, pages = {502}, publisher = {Springer-Verlag}, address = {Amsterdam, Netherlands}, url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1993_icann.ps}, abstract = {In previous work ([4,2,1]) we discussed the subject of parametric learning rules for neural networks. In this article, we present a theoretical basis permitting to study the generalization property of a learning rule whose parameters are estimated from a set of learning tasks. By generalization, we mean the possibility of using the learning rule to learn to solve new tasks. 
Finally, we describe simple experiments on two-dimensional categorization tasks and show how they corroborate the theoretical results.},
  addressfr = {Amsterdam, Pays-Bas},
  topics = {BioRules},
  cat = {C},
}

@INPROCEEDINGS{bengio:1993:snowbird,
  author = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
  title = {The Problem of Learning Long-Term Dependencies in Recurrent Networks},
  booktitle = {1993 Neural Networks for Computing Conference},
  year = {1993},
  address = {Snowbird, Utah, USA},
  topics = {LongTerm},
  cat = {C},
}

@TECHREPORT{bengio:1994,
  author = {Bengio, Yoshua and Frasconi, Paolo},
  title = {An {EM} Approach to Learning Sequential Behavior},
  number = {DSI 11-94},
  year = {1994},
  institution = {Universit{\`{a}} di Firenze, Dipartimento di Sistemi e Informatica},
  topics = {LongTerm},
  cat = {T},
}

@INPROCEEDINGS{bengio:1994:acfas,
  author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
  title = {Optimisation d'une r{\`{e}}gle d'apprentissage pour r{\'{e}}seaux de neurones artificiels},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
  year = {1994},
  topics = {BioRules},
  cat = {C},
}

@INPROCEEDINGS{bengio:1994:snowbird,
  author = {Bengio, Yoshua and Frasconi, Paolo},
  title = {An {EM} Algorithm for Target Propagation},
  booktitle = {1994 Neural Networks for Computing Conference},
  year = {1994},
  address = {Snowbird, Utah, USA},
  topics = {LongTerm},
  cat = {C},
}

@INPROCEEDINGS{bengio:1994:wcci,
  author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
  title = {Use of Genetic Programming for the Search of a New Learning Rule for Neural Networks},
  booktitle = {Proceedings of the First Conference on Evolutionary Computation, {IEEE} World Congress on Computational Intelligence},
  year = {1994},
  pages = {324--327},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1994_wcci.ps},
  abstract = {In previous
work ([1,2,3]), we explained how to use standard optimization methods such as simulated annealing, gradient descent and genetic algorithms to optimize a parametric function which could be used as a learning rule for neural networks. To use these methods, we had to choose a fixed number of parameters and a rigid form for the learning rule. In this article, we propose to use genetic programming to find not only the values of rule parameters but also the optimal number of parameters and the form of the rule. Experiments on classification tasks suggest genetic programming finds better learning rules than other optimization methods. Furthermore, the best rule found with genetic programming outperformed the well-known backpropagation algorithm for a given set of tasks.},
  topics = {BioRules},
  cat = {C},
}

@INPROCEEDINGS{bengio:1994b:acfas,
  author = {Bengio, Yoshua and Frasconi, Paolo},
  title = {R{\'{e}}seaux de neurones {Markoviens} pour l'inf{\'{e}}rence grammaticale},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
  year = {1994},
  topics = {Markov,Language},
  cat = {C},
}

@ARTICLE{bengio:1995:npl,
  author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn},
  title = {On the Search for New Learning Rules for {ANN}s},
  journal = {Neural Processing Letters},
  volume = {2},
  number = {4},
  year = {1995},
  pages = {26--30},
  abstract = {In this paper, we present a framework where a learning rule can be optimized within a parametric learning rule space. We define what we call parametric learning rules and present a theoretical study of their generalization properties when estimated from a set of learning tasks and tested over another set of tasks.
We corroborate the results of this study with practical experiments.},
  topics = {BioRules},
  cat = {J},
}

@INCOLLECTION{bengio:1995:oban,
  author = {Bengio, Samy and Bengio, Yoshua and Cloutier, Jocelyn and Gecsei, Jan},
  editor = {Levine, D. S. and Elsberry, W. R.},
  title = {On the Optimization of a Synaptic Learning Rule},
  booktitle = {Optimality in Biological and Artificial Networks},
  year = {1995},
  publisher = {Lawrence Erlbaum},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1995_oban.pdf},
  abstract = {This paper presents a new approach to neural modeling based on the idea of using an automated method to optimize the parameters of a synaptic learning rule. The synaptic modification rule is considered as a parametric function. This function has local inputs and is the same in many neurons. We can use standard optimization methods to select appropriate parameters for a given type of task. We also present a theoretical analysis permitting to study the generalization property of such parametric learning rules. By generalization, we mean the possibility for the learning rule to learn to solve new tasks. Experiments were performed on three types of problems: a biologically inspired circuit (for conditioning in Aplysia), Boolean functions (linearly separable as well as non linearly separable) and classification tasks.
The neural network architecture as well as the form and initial parameter values of the synaptic learning function can be designed using a priori knowledge.},
  topics = {BioRules},
  cat = {B},
}

@TECHREPORT{bengio:1996:udem,
  author = {Bengio, Yoshua and Bengio, Samy},
  title = {Training Asynchronous Input/Output Hidden {Markov} Models},
  number = {1013},
  year = {1996},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  address = {Montr{\'{e}}al (QC) Canada},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1996_udem.ps},
  topics = {Markov},
  cat = {T},
}

@INPROCEEDINGS{bengio:1997:snowbird,
  author = {Bengio, Yoshua and Bengio, Samy and Singer, Yoram and Isabelle, Jean-Fran{\c c}ois},
  title = {On the Clusterization of Probabilistic Transducers},
  booktitle = {1997 Neural Networks for Computing Conference},
  year = {1997},
  address = {Snowbird, Utah, USA},
  topics = {HighDimensional},
  cat = {C},
}

@INPROCEEDINGS{bengio:1998:snowbird,
  author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
  title = {Stochastic Learning of Strategic Equilibria for Auctions},
  booktitle = {Learning Conference},
  year = {1998},
  address = {Snowbird, Utah, USA},
  topics = {Auction},
  cat = {C},
}

@TECHREPORT{bengio:1998:udem,
  author = {Bengio, Samy and Bengio, Yoshua and Robert, Jacques and B{\'{e}}langer, Gilles},
  title = {Stochastic Learning of Strategic Equilibria for Auctions},
  number = {1119},
  year = {1998},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  address = {Montr{\'{e}}al (QC) Canada},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bengio_1998_udem.pdf},
  abstract = {This paper presents a new application of stochastic adaptive learning algorithms to the computation of strategic equilibria in auctions.
The proposed approach addresses the problems of tracking a moving target and balancing exploration (of action space) versus exploitation (of better modeled regions of action space). Neural networks are used to represent a stochastic decision model for each bidder. Experiments confirm the correctness and usefulness of the approach.},
  topics = {Auction},
  cat = {T},
}

@INPROCEEDINGS{bengio:1999:snowbird,
  author = {Bengio, Yoshua and Latendresse, Simon and Dugas, Charles},
  title = {Gradient-Based Learning of Hyper-Parameters},
  booktitle = {Learning Conference},
  year = {1999},
  address = {Snowbird, Utah, USA},
  topics = {ModelSelection},
  cat = {C},
}

@INPROCEEDINGS{bengio:1999:titration,
  author = {Bengio, Yoshua and Brault, J.-J. and Major, Fran{\c c}ois and Neal, R. and Pigeon, Steven},
  title = {Learning Algorithms for Sorting Compounds from Titration Curves},
  booktitle = {Symposium on New Perspectives for Computer-Aided Drug Design},
  year = {1999},
  address = {Montr{\'{e}}al, Qu{\'{e}}bec, Canada},
  topics = {Speech},
  cat = {C},
}

@ARTICLE{bengio:2000:ieeetrnn,
  author = {Bengio, Samy and Bengio, Yoshua},
  title = {Taking on the Curse of Dimensionality in Joint Distributions Using Neural Networks},
  journal = {IEEE Transactions on Neural Networks},
  volume = {11},
  number = {3},
  year = {2000},
  pages = {550--557},
  note = {(Special issue on Data Mining and Knowledge Discovery)},
  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions.
The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables (thus reducing significantly the number of parameters). Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
  topics = {HighDimensional,Unsupervised,Mining},
  cat = {J},
}

@INPROCEEDINGS{bengio:2000:nips,
  author = {Bengio, Yoshua and Bengio, Samy},
  title = {Modeling High-Dimensional Discrete Data with Multi-Layer Neural Networks},
  year = {2000},
  pages = {400--406},
  crossref = {NIPS12},
  abstract = {The curse of dimensionality is severe when modeling high-dimensional discrete data: the number of possible combinations of the variables explodes exponentially. In this paper we propose a new architecture for modeling high-dimensional data that requires resources (parameters and computations) that grow only at most as the square of the number of variables, using a multi-layer neural network to represent the joint distribution of the variables as the product of conditional distributions. The neural network can be interpreted as a graphical model without hidden random variables, but in which the conditional distributions are tied through the hidden units. The connectivity of the neural network can be pruned by using dependency tests between the variables.
Experiments on modeling the distribution of several discrete data sets show statistically significant improvements over other methods such as naive Bayes and comparable Bayesian networks, and show that significant improvements can be obtained by pruning the network.},
  topics = {HighDimensional,Unsupervised},
  cat = {C},
}

@ARTICLE{bengio:2003,
  author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Vincent, Pascal and Jauvin, Christian},
  title = {A Neural Probabilistic Language Model},
  volume = {3},
  year = {2003},
  pages = {1137--1155},
  crossref = {JMLR},
  abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge.
We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
  topics = {Markov,Unsupervised,Language},
  cat = {J},
}

@TECHREPORT{bengio:socs-1990,
  author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
  title = {Global Optimization of a Neural Network - Hidden {Markov} Model Hybrid},
  number = {TR-SOCS-90.22},
  year = {1990},
  institution = {School of Computer Science, McGill University},
  address = {Montr{\'{e}}al (QC) Canada},
  topics = {Markov},
  cat = {T},
}

@INPROCEEDINGS{bengioc:1994:acfas,
  author = {Bengio, Yoshua and {LeCun}, Yann},
  title = {Reconnaissance de mots manuscrits avec r{\'{e}}seaux de neurones et mod{\`{e}}les de {Markov}},
  booktitle = {Actes du soixante-deuxi{\`{e}}me congr{\`{e}}s de l'Association Canadienne Fran{\c c}aise pour l'Avancement des Sciences, colloque sur l'apprentissage et les r{\'{e}}seaux de neurones artificiels},
  year = {1994},
  topics = {Markov,Speech},
  cat = {C},
}

@TECHREPORT{Bengio_Bottou92,
  author = {Bengio, Yoshua and Bottou, {L{\'{e}}on}},
  title = {A New Approach to Estimating Probability Density Functions with Artificial Neural Networks},
  number = {TR-92.02},
  year = {1992},
  institution = {Massachusetts Institute of Technology, Dept.
Brain and Cognitive Sciences},
  topics = {HighDimensional},
  cat = {T},
}

@INCOLLECTION{bengio_extension_nips_2003,
  author = {Bengio, Yoshua and Paiement, Jean-Fran{\c c}ois and Vincent, Pascal and Delalleau, Olivier and Le Roux, Nicolas and Ouimet, Marie},
  keywords = {dimensionality reduction, eigenfunctions learning, {Isomap}, kernel {PCA}, locally linear embedding, Nystr{\"o}m formula, spectral methods},
  title = {Out-of-Sample Extensions for {LLE}, {Isomap}, {MDS}, {Eigenmaps}, and Spectral Clustering},
  year = {2004},
  address = {Cambridge, MA},
  crossref = {NIPS16},
  abstract = {Several unsupervised learning algorithms based on an eigendecomposition provide either an embedding or a clustering only for given training points, with no straightforward extension for out-of-sample examples short of recomputing eigenvectors. This paper provides a unified framework for extending Local Linear Embedding ({LLE}), {Isomap}, Laplacian {Eigenmaps}, Multi-Dimensional Scaling (for dimensionality reduction) as well as for Spectral Clustering. This framework is based on seeing these algorithms as learning eigenfunctions of a data-dependent kernel. Numerical experiments show that the generalizations performed have a level of error comparable to the variability of the embedding algorithms due to the choice of training data.},
  topics = {HighDimensional,Kernel,Unsupervised},
  cat = {C},
}

@ARTICLE{Bengio_Gingras98a,
  author = {Bengio, Yoshua and Gingras, Fran{\c c}ois and Goulard, Bernard and Lina, Jean-Marc},
  title = {{Gaussian} Mixture Densities for Classification of Nuclear Power Plant Data},
  journal = {Computers and Artificial Intelligence},
  volume = {17},
  number = {2--3},
  year = {1998},
  pages = {189--209},
  abstract = {In this paper we are concerned with the application of learning algorithms to the classification of reactor states in nuclear plants.
Two aspects must be considered, (1) some types of events (e.g., abnormal or rare) will not appear in the data set, but the system should be able to detect them, (2) not only classification of signals but also their interpretation are important for nuclear plant monitoring. We address both issues with a mixture of mixtures of Gaussians in which some parameters are shared to reflect the similar signals observed in different states of the reactor. An {EM} algorithm for these shared Gaussian mixtures is presented. Experimental results on nuclear plant data demonstrate the advantages of the proposed approach with respect to the above two points.},
  topics = {Mining},
  cat = {J},
}

@ARTICLE{Bengio_Gingras98b,
  author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
  title = {Handling Asynchronous or Missing Financial Data with Recurrent Networks},
  journal = {International Journal of Computational Intelligence and Organizations},
  volume = {1},
  number = {3},
  year = {1998},
  pages = {154--163},
  abstract = {An important issue with many sequential data analysis problems, such as those encountered in financial data sets, is that different variables are known at different frequencies, at different times (asynchronicity), or are sometimes missing. To address this issue we propose to use recurrent networks with feedback into the input units, based on two fundamental ideas. The first motivation is that the filled-in value of the missing variable may not only depend in complicated ways on the value of this variable in the past of the sequence but also on the current and past values of other variables. The second motivation is that, for the purpose of making predictions or taking decisions, it is not always necessary to fill in the best possible value of the missing variables. In fact, it is sufficient to fill in a value which helps the system make better predictions or decisions.
The advantages of this approach are demonstrated through experiments on several tasks.},
  topics = {Finance,Missing},
  cat = {J},
}

@INPROCEEDINGS{Bengio_icassp90,
  author = {Bengio, Yoshua and Cardin, Regis and De Mori, Renato and Normandin, Yves},
  title = {A Hybrid Coder for Hidden {Markov} Models Using a Recurrent Neural Network},
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
  year = {1990},
  pages = {537--540},
  address = {Albuquerque, NM},
  topics = {Markov,Speech},
  cat = {C},
}

@INPROCEEDINGS{Bengio_LeCun94,
  author = {Bengio, Yoshua and {LeCun}, Yann and Henderson, Donnie},
  title = {Globally Trained Handwritten Word Recognizer using Spatial Representation, Space Displacement Neural Networks and Hidden {Markov} Models},
  year = {1994},
  pages = {937--944},
  crossref = {NIPS6},
  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm. Words are then coded into low resolution annotated images where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
  topics = {Speech},
  cat = {C},
}

@ARTICLE{Bengio_LeCun95,
  author = {Bengio, Yoshua and {LeCun}, Yann and Nohl, Craig and Burges, Chris},
  title = {{LeRec}: A {NN}/{HMM} Hybrid for On-Line Handwriting Recognition},
  journal = {Neural Computation},
  volume = {7},
  number = {6},
  year = {1995},
  pages = {1289--1303},
  abstract = {We introduce a new approach for on-line recognition of handwritten words written in unconstrained mixed style. The preprocessor performs a word-level normalization by fitting a model of the word structure using the {EM} algorithm.
Words are then coded into low resolution annotated images where each pixel contains information about trajectory direction and curvature. The recognizer is a convolution network which can be spatially replicated. From the network output, a hidden {Markov} model produces word scores. The entire system is globally trained to minimize word-level errors.},
  topics = {PriorKnowledge,Speech},
  cat = {J},
}

@ARTICLE{Bengio_prel92,
  author = {Bengio, Yoshua and Gori, Marco and De Mori, Renato},
  title = {Learning the Dynamic Nature of Speech with Back-propagation for Sequences},
  journal = {Pattern Recognition Letters},
  volume = {13},
  number = {5},
  year = {1992},
  pages = {375--385},
  note = {(Special issue on Artificial Neural Networks)},
  topics = {Speech},
  cat = {J},
}

@ARTICLE{Bengio_trnn92,
  author = {Bengio, Yoshua and De Mori, Renato and Flammia, Giovanni and Kompe, Ralf},
  title = {Global Optimization of a Neural Network-Hidden {Markov} Model Hybrid},
  journal = {IEEE Transactions on Neural Networks},
  volume = {3},
  number = {2},
  year = {1992},
  pages = {252--259},
  topics = {Markov},
  cat = {J},
}

@TECHREPORT{Bergstra+2009,
  author = {Bergstra, James and Desjardins, Guillaume and Lamblin, Pascal and Bengio, Yoshua},
  month = apr,
  title = {Quadratic Polynomials Learn Better Image Features},
  number = {1337},
  year = {2009},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {The affine-sigmoidal hidden unit (of the form $\sigma(ax+b)$) is a crude predictor of neuron response in visual area V1. More descriptive models of V1 have been advanced that are no more computationally expensive, yet artificial neural network research continues to focus on networks of affine-sigmoidal models.
This paper identifies two qualitative differences between the affine-sigmoidal hidden unit and a particular recent model of V1 response: a) the presence of a low-rank quadratic term in the argument to $\sigma$, and b) the use of a gentler non-linearity than the $\tanh$ or logistic sigmoid. We evaluate these model ingredients by training single-layer neural networks to solve three image classification tasks. We experimented with fully-connected hidden units, as well as locally-connected units and convolutional units that more closely mimic the function and connectivity of the visual system. On all three tasks, both the quadratic interactions and the gentler non-linearity lead to significantly better generalization. The advantage of quadratic units was strongest in conjunction with sparse and convolutional hidden units.}
}

@INPROCEEDINGS{Bergstra+al-NIPS2011,
  author = {Bergstra, James and Bardenet, R{\'{e}}my and Bengio, Yoshua and K{\'{e}}gl, Bal{\'{a}}zs},
  title = {Algorithms for Hyper-Parameter Optimization},
  booktitle = {NIPS'2011},
  year = {2011}
}

@INPROCEEDINGS{bergstra+al:2010-scipy,
  author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and Turian, Joseph and Warde-Farley, David and Bengio, Yoshua},
  month = jun,
  title = {{Theano}: a {CPU} and {GPU} Math Expression Compiler},
  booktitle = {Proceedings of the Python for Scientific Computing Conference ({SciPy})},
  year = {2010},
  location = {Austin, TX},
  note = {Oral Presentation},
  abstract = {Theano is a compiler for mathematical expressions in Python that combines the convenience of NumPy's syntax with the speed of optimized native machine language. The user composes mathematical expressions in a high-level description that mimics NumPy's syntax and semantics, while being statically typed and functional (as opposed to imperative). These expressions allow Theano to provide symbolic differentiation.
Before performing computation, Theano optimizes the choice of expressions, translates them into C++ (or CUDA for GPU), compiles them into dynamically loaded Python modules, all automatically. Common machine learning algorithms implemented with Theano are from $1.6\times$ to $7.5\times$ faster than competitive alternatives (including those implemented with C/C++, NumPy/SciPy and MATLAB) when compiled for the CPU and between $6.5\times$ and $44\times$ faster when compiled for the GPU. This paper illustrates how to use Theano, outlines the scope of the compiler, provides benchmarks on both CPU and GPU processors, and explains its overall design.}
}

@MISC{bergstra+al:2010-sharcnet,
  author = {Bergstra, James and Bengio, Yoshua},
  month = may,
  title = {{GPU} Programming with {Theano}},
  year = {2010},
  howpublished = {{SHARCNET} Research Day},
  note = {Oral}
}

@MISC{bergstra+al:2010snowbird,
  author = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'{e}}d{\'{e}}ric and Lamblin, Pascal and Turian, Joseph and Desjardins, Guillaume and Pascanu, Razvan and Erhan, Dumitru and Delalleau, Olivier and Bengio, Yoshua},
  month = apr,
  title = {Deep Learning on {GPU}s with {Theano}},
  howpublished = {The Learning Workshop},
  year = {2010},
  location = {Snowbird, Utah},
  note = {Oral}
}

@INPROCEEDINGS{bergstra+all-Theano-NIPS2011,
  author = {Bergstra, James and Bastien, Fr{\'{e}}d{\'{e}}ric and Breuleux, Olivier and Lamblin, Pascal and Pascanu, Razvan and Delalleau, Olivier and Desjardins, Guillaume and Warde-Farley, David and Goodfellow, Ian J. and Bergeron, Arnaud and Bengio, Yoshua},
  title = {{Theano}: Deep Learning on {GPU}s with {Python}},
  booktitle = {Big Learn workshop, NIPS'11},
  year = {2011},
  abstract = {In this paper, we present Theano, a framework in the Python programming language for defining, optimizing and evaluating expressions involving high-level operations on tensors. Theano offers most of NumPy's functionality, but adds automatic symbolic differentiation, GPU support, and faster expression evaluation.
Theano is a general mathematical tool, but it was developed with the goal of facilitating research in deep learning. The Deep Learning Tutorials introduce recent advances in deep learning, and showcase how Theano makes such algorithms compact, elegant, and fast.}
}

@ARTICLE{bergstra+bengio+louradour:2011,
  author = {Bergstra, James and Bengio, Yoshua and Louradour, Jerome},
  month = mar,
  title = {Suitability of {V1} Energy Models for Object Classification},
  journal = {Neural Computation},
  volume = {23},
  number = {3},
  year = {2011},
  pages = {774--790}
}

@INPROCEEDINGS{Bergstra+Bengio-2009,
  author = {Bergstra, James and Bengio, Yoshua},
  month = dec,
  title = {Slow, Decorrelated Features for Pretraining Complex Cell-like Networks},
  year = {2009},
  pages = {99--107},
  publisher = {MIT Press},
  url = {http://books.nips.cc/papers/files/nips22/NIPS2009_0933.pdf},
  crossref = {NIPS22}
}

@ARTICLE{Bergstra+Bengio-2012,
  author = {Bergstra, James and Bengio, Yoshua},
  month = feb,
  title = {Random Search for Hyper-Parameter Optimization},
  journal = {Journal of Machine Learning Research},
  volume = {13},
  year = {2012},
  pages = {281--305},
  abstract = {Grid search and manual search are the most widely used strategies for hyper-parameter optimization. This paper shows empirically and theoretically that randomly chosen trials are more efficient for hyper-parameter optimization than trials on a grid. Empirical evidence comes from a comparison with a large previous study that used grid search and manual search to configure neural networks and deep belief networks. Compared with neural networks configured by a pure grid search, we find that random search over the same domain is able to find models that are as good or better within a small fraction of the computation time. Granting random search the same computational budget, random search finds better models by effectively searching a larger, less promising configuration space.
Compared with deep belief networks configured by a thoughtful combination of manual search and grid search, purely random search over the same 32-dimensional configuration space found statistically equal performance on four of seven data sets, and superior performance on one of seven. A Gaussian process analysis of the function from hyper-parameters to validation set performance reveals that for most data sets only a few of the hyper-parameters really matter, but that different hyper-parameters are important on different data sets. This phenomenon makes grid search a poor choice for configuring algorithms for new data sets. Our analysis casts some light on why recent ``High Throughput'' methods achieve surprising success---they appear to search through a large number of hyper-parameters because most hyper-parameters do not matter much. We anticipate that growing interest in large hierarchical models will place an increasing burden on techniques for hyper-parameter optimization; this work shows that random search is a natural baseline against which to judge progress in the development of adaptive (sequential) hyper-parameter optimization algorithms.}
}

@MISC{bergstra+bengio:2011snowbird,
  author = {Bergstra, James and Bengio, Yoshua},
  month = apr,
  title = {Random Search for Hyper-parameter Optimization},
  year = {2011},
  howpublished = {The Learning Workshop, Fort Lauderdale FL.},
  note = {(Oral)}
}

@ARTICLE{bergstra+casagrande+erhan+eck+kegl:2006,
  author = {Bergstra, James and Casagrande, Norman and Erhan, Dumitru and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
  month = dec,
  title = {Aggregate Features and {AdaBoost} for Music Classification},
  journal = {Machine Learning},
  volume = {65},
  year = {2006},
  pages = {473--484},
  issn = {0885-6125},
  abstract = {We present an algorithm that predicts musical genre and artist from an audio waveform.
Our method uses the ensemble learner ADABOOST to select from a set of audio features that have been extracted from segmented audio and then aggregated. Our classifier proved to be the most effective method for genre classification at the recent MIREX 2005 international contests in music information extraction, and the second-best method for recognizing artists. This paper describes our method in detail, from feature extraction to song classification, and presents an evaluation of our method on three genre databases and two artist-recognition databases. Furthermore, we present evidence collected from a variety of popular features and classifiers that the technique of classifying features aggregated over segments of audio is better than classifying either entire songs or individual short-timescale features.},
  PDF = {papers/2006_ml_draft.pdf},
  SOURCE = {OwnPublication},
}

@INPROCEEDINGS{bergstra+lacoste+eck:2006,
  author = {Bergstra, James and Lacoste, Alexandre and Eck, Douglas},
  month = oct,
  title = {Predicting Genre Labels for Artists using {FreeDB}},
  booktitle = {Proc.
7th International Conference on Music Information Retrieval ({ISMIR})},
  year = {2006},
  pages = {85--88},
  publisher = {University of Victoria},
  location = {Victoria, BC, Canada},
  SOURCE = {OwnPublication},
  PDF = {papers/2006_ismir_freedb.pdf},
}

@INPROCEEDINGS{bergstra+mandel+eck:2010,
  author = {Bergstra, James and Mandel, Michael and Eck, Douglas},
  month = aug,
  title = {Scalable Genre and Tag Prediction with Spectral Covariance},
  booktitle = {Proceedings of the 11th International Society for Music Information Retrieval Conference ({ISMIR})},
  year = {2010},
  pages = {507--512},
  location = {Utrecht, The Netherlands}
}

@MASTERSTHESIS{Bergstra-Msc-2006,
  author = {Bergstra, James},
  keywords = {apprentissage statistique, classification de musique par genre, extraction de caract{\'{e}}ristiques sonores, recherche d'information musicale},
  title = {Algorithms for Classifying Recorded Music by Genre},
  year = {2006},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Ce m{\'{e}}moire traite le probl{\`{e}}me de la classification automatique de signaux musicaux par genre. Dans un premier temps, je pr{\'{e}}sente une technique utilisant l'apprentissage machine pour classifier des statistiques extraites sur des segments du signal sonore. Malgr{\'{e}} le fait que cette technique a d{\'{e}}j{\`{a}} {\'{e}}t{\'{e}} explor{\'{e}}e, mon m{\'{e}}moire est le premier {\`{a}} investiguer l'influence de la longueur et de la quantit{\'{e}} de ces segments sur le taux de classification. J'explore {\'{e}}galement l'importance d'avoir des segments contigus dans le temps. Les segments d'une {\`{a}} trois secondes apportent une meilleure performance, mais pour ce faire, ils doivent {\^{e}}tre suffisamment nombreux. Il peut m{\^{e}}me {\^{e}}tre utile d'augmenter la quantit{\'{e}} de segments jusqu'{\`{a}} ce qu'ils se chevauchent.
Dans les m{\^{e}}mes exp{\'{e}}riences, je pr{\'{e}}sente une formulation alternative des descripteurs d'audio nomm{\'{e}}e Mel-frequency Cepstral Coefficient (MFCC) qui am{\`{e}}ne un taux de classification de 81 \% sur un jeu de donn{\'{e}}es pour lequel la meilleure performance publi{\'{e}}e est de 71 \%. Cette m{\'{e}}thode de segmentation des chansons, ainsi que cette formulation alternative, ont pour but d'am{\'{e}}liorer l'algorithme gagnant du concours de classification de genre de MIREX 2005, d{\'{e}}velopp{\'{e}} par Norman Casagrande et moi. Ces exp{\'{e}}riences sont un approfondissement du travail entam{\'{e}} par Bergstra et al. [2006a], qui d{\'{e}}crit l'algorithme gagnant de ce concours. Dans un deuxi{\`{e}}me temps, je pr{\'{e}}sente une m{\'{e}}thode qui utilise FreeDB, une base de donn{\'{e}}es d'information sur les albums, pour attribuer {\`{a}} un artiste une distribution de probabilit{\'{e}} sur son genre. Avec une petite base de donn{\'{e}}es, faite {\`{a}} la main, je montre qu'il y a une haute corr{\'{e}}lation entre cette distribution et l'{\'{e}}tiquette de genre traditionnel. Bien qu'il reste {\`{a}} d{\'{e}}montrer que cette m{\'{e}}thode est utile pour organiser une collection de musique, ce r{\'{e}}sultat sugg{\`{e}}re qu'on peut maintenant {\'{e}}tiqueter de grandes bases de musique automatiquement {\`{a}} un faible co{\^{u}}t, et par cons{\'{e}}quent de poursuivre plus facilement la recherche en classification {\`{a}} grande {\'{e}}chelle. Ce travail sera publi{\'{e}} comme Bergstra et al.
[2006b] {\`{a}} ISMIR 2006.}
}

@PHDTHESIS{Bergstra-Phd-2011,
  author = {Bergstra, James},
  month = jun,
  title = {Incorporating Complex Cells into Neural Networks for Pattern Classification},
  year = {2011},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
}

% Conference poster; both the abstract URL and the DOI are recorded.
@INPROCEEDINGS{bergstra:2010cosyne,
  author = {Bergstra, James and Bengio, Yoshua and Lamblin, Pascal and Desjardins, Guillaume and Louradour, Jerome},
  month = feb,
  title = {Image classification with complex cell neural networks},
  booktitle = {Computational and systems neuroscience (COSYNE)},
  year = {2010},
  location = {Salt Lake City},
  note = {Poster},
  url = {http://www.frontiersin.org/conferences/individual_abstract_listing.php?conferid=770&pap=3626&ind_abs=1&pg=335},
  doi = {10.3389/conf.fnins.2010.03.00334}
}

@INPROCEEDINGS{biaslearn:2000:ijcnn,
  author = {Ghosn, Joumana and Bengio, Yoshua},
  title = {Bias Learning, Knowledge Sharing},
  booktitle = {International Joint Conference on Neural Networks 2000},
  volume = {I},
  year = {2000},
  pages = {9--14},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/ijcnn_manifold.pdf},
  abstract = {Biasing the hypothesis space of a learner has been shown to improve generalisation performances. Methods for achieving this goal have been proposed, that range from deriving and introducing a bias into a learner to automatically learning the bias. In the latter case, most methods learn the bias by simultaneously training several related tasks derived from the same domain and imposing constraints on their parameters. We extend some of the ideas presented in this field and describe a new model that parameterizes the parameters of each task as a function of an affine manifold defined in parameter space and a point lying on the manifold.
An analysis of variance on a class of learning tasks is performed that shows some significantly improved performances when using the model.},
  topics = {MultiTask},
  cat = {C},
}

% Journal name spelled as in the other IEEE TNN entries of this file.
@ARTICLE{biaslearn:2003:tnn,
  author = {Ghosn, Joumana and Bengio, Yoshua},
  title = {Bias Learning, Knowledge Sharing},
  journal = {IEEE Transactions on Neural Networks},
  volume = {14},
  number = {4},
  year = {2003},
  pages = {748--765},
  abstract = {Biasing properly the hypothesis space of a learner has been shown to improve generalization performance. Methods for achieving this goal have been proposed, that range from designing and introducing a bias into a learner to automatically learning the bias. Multitask learning methods fall into the latter category. When several related tasks derived from the same domain are available, these methods use the domain-related knowledge coded in the training examples of all the tasks as a source of bias. We extend some of the ideas presented in this field and describe a new approach that identifies a family of hypotheses, represented by a manifold in hypothesis space, that embodies domain-related knowledge. This family is learned using training examples sampled from a group of related tasks. Learning models trained on these tasks are only allowed to select hypotheses that belong to the family. We show that the new approach encompasses a large variety of families which can be learned.
A statistical analysis on a class of related tasks is performed that shows significantly improved performances when using this approach.},
  topics = {MultiTask},
  cat = {J},
}

@MASTERSTHESIS{Boisvert-Mcs-2005,
  author = {Boisvert, Maryse},
  keywords = {Algorithme {EM}, D{\'{e}}composition en valeurs singuli{\`{e}}res, D{\'{e}}sambigu{\"{\i}}sation s{\'{e}}mantique, Mod{\`{e}}les graphiques, WordNet},
  title = {R{\'{e}}duction de dimension pour mod{\`{e}}les graphiques probabilistes appliqu{\'{e}}s {\`{a}} la d{\'{e}}sambiguisation s{\'{e}}mantique},
  year = {2005},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

% The acronym in the title is brace-protected so sentence-casing styles keep it upper case.
@INPROCEEDINGS{bonneville98,
  author = {Bonneville, Martin and Meunier, Jean and Bengio, Yoshua and Soucy, Jean-Paul},
  title = {Support Vector Machines for Improving the classification of Brain {PET} Images},
  booktitle = {SPIE Medical Imaging},
  year = {1998},
  address = {San Diego},
  topics = {Kernel},
  cat = {C},
}

@INPROCEEDINGS{bordes-aaai-2011,
  author = {Bordes, Antoine and Weston, Jason and Collobert, Ronan and Bengio, Yoshua},
  title = {Learning Structured Embeddings of Knowledge Bases},
  booktitle = {AAAI 2011},
  year = {2011},
  abstract = {Many Knowledge Bases (KBs) are now readily available and encompass colossal quantities of information thanks to either a long-term funding effort (e.g. WordNet, OpenCyc) or a collaborative process (e.g. Freebase, DBpedia). However, each of them is based on a different rigorous symbolic framework which makes it hard to use their data in other systems. It is unfortunate because such rich structured knowledge might lead to a huge leap forward in many other areas of AI like natural language processing (word-sense disambiguation, natural language understanding, ...), vision (scene classification, image semantic annotation, ...) or collaborative filtering.
In this paper, we present a learning process based on an innovative neural network architecture designed to embed any of these symbolic representations into a more flexible continuous vector space in which the original knowledge is kept and enhanced. These learnt embeddings would allow data from any KB to be easily used in recent machine learning methods for prediction and information retrieval. We illustrate our method on WordNet and Freebase and also present a way to adapt it to knowledge extraction from raw text. Erratum: https://www.hds.utc.fr/~bordesan/dokuwiki/doku.php?id=en:aaai11_erratum}
}

@INPROCEEDINGS{Bordes-et-al-AISTATS2012,
  author = {Bordes, Antoine and Glorot, Xavier and Weston, Jason and Bengio, Yoshua},
  title = {Joint Learning of Words and Meaning Representations for Open-Text Semantic Parsing},
  booktitle = {Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS)},
  year = {2012}
}

@ARTICLE{Bordes-et-al-LSML2013,
  author = {Bordes, Antoine and Glorot, Xavier and Weston, Jason and Bengio, Yoshua},
  title = {A Semantic Matching Energy Function for Learning with Multi-relational Data},
  journal = {Machine Learning: Special Issue on Learning Semantics},
  year = {2013}
}

% Workshop talk sharing its title with the AAAI 2011 paper; kept under its own key.
@MISC{Bordes-et-al-LW2011,
  author = {Bordes, Antoine and Weston, Jason and Collobert, Ronan and Bengio, Yoshua},
  title = {Learning Structured Embeddings of Knowledge Bases},
  year = {2011},
  howpublished = {The Learning Workshop (oral)}
}

% Venue fields come from the crossref'd entry.
@INPROCEEDINGS{Bottou+Bengio95,
  author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua},
  title = {Convergence Properties of the {K}-Means Algorithm},
  year = {1995},
  pages = {585--592},
  crossref = {NIPS7},
  abstract = {This paper studies the convergence properties of the well known K-Means clustering algorithm. The K-Means algorithm can be described either as a gradient descent algorithm or by slightly extending the mathematics of the {EM} algorithm to this hard threshold case.
We show that the K-Means algorithm actually minimizes the quantization error using the very fast Newton algorithm.},
  topics = {Unsupervised},
  cat = {C},
}

% Names are given in "Last, First Middle" form so BibTeX takes "Howard" as the
% surname; "G. Howard, Paul" would make "G. Howard" the surname.
@ARTICLE{bottou-98,
  author = {Bottou, {L{\'{e}}on} and Haffner, Patrick and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
  month = jul,
  title = {High Quality Document Image Compression with {DjVu}},
  journal = {Journal of Electronic Imaging},
  volume = {7},
  number = {3},
  year = {1998},
  pages = {410--425},
  topics = {Compression},
  cat = {J},
}

% The corporate editor is double-braced so it is treated as one indivisible name.
@INPROCEEDINGS{Bottou-dcc98,
  author = {Bottou, {L{\'{e}}on} and Howard, Paul G. and Bengio, Yoshua},
  editor = {{IEEE Computer Society}},
  title = {The Z-Coder Adaptive Binary Coder},
  booktitle = {Data Compression Conference},
  year = {1998},
  url = {http://leon.bottou.org/papers/bottou-howard-bengio-98},
  topics = {Compression},
  cat = {C},
}

@INPROCEEDINGS{bottou-lecun-bengio-97,
  author = {Bottou, {L{\'{e}}on} and {LeCun}, Yann and Bengio, Yoshua},
  title = {Global Training of Document Processing Systems using Graph Transformer Networks},
  booktitle = {Proc. of Computer Vision and Pattern Recognition},
  year = {1997},
  pages = {490--494},
  publisher = {IEEE},
  address = {Puerto-Rico},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/bottou-lecun-bengio-97.pdf},
  topics = {PriorKnowledge,Speech},
  cat = {C},
}

@TECHREPORT{bottou96TR,
  author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and {LeCun}, Yann},
  month = jun,
  title = {Document analysis with transducers},
  number = {Technical Memorandum HA615600-960701-01TM},
  year = {1996},
  institution = {AT\&T Labs},
  address = {New-Jersey, USA},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/transducer-tm.ps.gz},
  topics = {HighDimensional},
  cat = {T},
}

@TECHREPORT{bottou97TR,
  author = {Bottou, {L{\'{e}}on} and Bengio, Yoshua and Howard, Paul G.},
  month = jul,
  title = {Z-Coder: A Fast Adaptive Binary Arithmetic Coder},
  number = {Technical Memorandum HA615600-970721-02TM},
  year = {1997},
  institution = {AT\&T Labs},
  address = {New-Jersey, USA},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/zcoder-tm.ps.gz},
  topics = {Compression},
  cat = {T},
}

% NOTE(review): the key says 2007 but the year field says 2009 -- confirm which is right.
% Typographic ligatures are spelled out as plain ASCII letters.
@MASTERSTHESIS{Bouchard-Msc-2007,
  author = {Bouchard, Lysiane},
  keywords = {auditory cortex, fMRI, linear classifier, logistic regression, na{\"{\i}}ve bayesian gaussian model, neuroimaging, spectro-temporal modulation, support vectors machine},
  title = {Analyse par apprentissage automatique des r{\'{e}}ponses fMRI du cortex auditif {\`{a}} des modulations spectro-temporelles.},
  year = {2009},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {The application of linear machine learning classifiers to the analysis of brain imaging data (fMRI) has led to several interesting breakthroughs in recent years. These classifiers combine the responses of the voxels to detect and categorize different brain states. They allow a more agnostic analysis than conventional fMRI analysis that systematically treats weak and distributed patterns as unwanted noise. In this project, we use such classifiers to validate an hypothesis concerning the encoding of sounds in the human brain. More precisely, we attempt to locate neurons tuned to spectral and temporal modulations in sound. We use fMRI recordings of brain responses of subjects listening to 49 different spectro-temporal modulations. The analysis of fMRI data through linear classifiers is not yet a standard procedure in this field. Thus, an important objective of this project, in the long term, is the development of new machine learning algorithms specialized for neuroimaging data. For these reasons, an important part of the experiments is dedicated to studying the behaviour of the classifiers.
We are mainly interested in 3 standard linear classifiers, namely the support vectors machine algorithm (linear), the logistic regression algorithm (regularized) and the na{\"{\i}}ve bayesian gaussian model (shared variances).}
}

@PHDTHESIS{Boufaden-Phd-2005,
  author = {Boufaden, Narj{\`{e}}s},
  title = {Extraction d'information {\`{a}} partir de transcriptions de conversations t{\'{e}}l{\'{e}}phoniques sp{\'{e}}cialis{\'{e}}es},
  year = {2005},
  school = {Universit{\'{e}} de Montr{\'{e}}al, D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnel}
}

% A single paper with authors and a booktitle, so it is typed as a paper in
% proceedings rather than as a whole proceedings volume.
@INPROCEEDINGS{Boulanger-Bengio-Vincent-ISMIR-2013,
  author = {Boulanger-Lewandowski, Nicolas and Bengio, Yoshua and Vincent, Pascal},
  month = nov,
  title = {Audio Chord Recognition with Recurrent Neural Networks},
  booktitle = {Proc. ISMIR 14},
  year = {2013},
  abstract = {In this paper, we present an audio chord recognition system based on a recurrent neural network. The audio features are obtained from a deep neural network optimized with a combination of chromagram targets and chord information, and aggregated over different time scales. Contrarily to other existing approaches, our system incorporates acoustic and musicological models under a single training objective. We devise an efficient algorithm to search for the global mode of the output distribution while taking long-term dependencies into account. The resulting method is competitive with state-of-the-art approaches on the MIREX dataset in the major/minor prediction task.}
}

@INPROCEEDINGS{Boulanger-et-al-ICASSP-2013,
  author = {Boulanger-Lewandowski, Nicolas and Bengio, Yoshua and Vincent, Pascal},
  keywords = {polyphonic transcription, recurrent neural network, Restricted {Boltzmann} Machine, Sequence transduction},
  title = {High-dimensional Sequence Transduction},
  booktitle = {Proc.
ICASSP 38},
  year = {2013},
  abstract = {We investigate the problem of transforming an input sequence into a high-dimensional output sequence in order to transcribe polyphonic audio music into symbolic notation. We introduce a probabilistic model based on a recurrent neural network that is able to learn realistic output distributions given the input and we devise an efficient algorithm to search for the global mode of that distribution. The resulting method produces musically plausible transcriptions even under high levels of noise and drastically outperforms previous state-of-the-art approaches on five datasets of synthesized sounds and real recordings, approximately halving the test error rate.}
}

@INPROCEEDINGS{Boulanger-et-al-ICASSP2014,
  author = {Boulanger-Lewandowski, Nicolas and Droppo, Jasha and Seltzer, Mike and Yu, Dong},
  title = {Phone Sequence Modeling with Recurrent Neural Networks},
  booktitle = {Proc. ICASSP 39},
  year = {2014},
  abstract = {In this paper, we investigate phone sequence modeling with recurrent neural networks in the context of speech recognition. We introduce a hybrid architecture that combines a phonetic model with an arbitrary frame-level acoustic model and we propose efficient algorithms for training, decoding and sequence alignment. We evaluate the advantage of our phonetic model on the TIMIT and Switchboard-mini datasets in complementarity to a powerful context-dependent deep neural network (DNN) acoustic classifier and a higher-level 3-gram language model. Consistent improvements of 2--10\% in phone accuracy and 3\% in word error rate suggest that our approach can readily replace {HMM}s in current state-of-the-art systems.}
}

% The acronym in the title is brace-protected against sentence-casing styles.
@INPROCEEDINGS{Boulanger-et-al-ICASSP2014b,
  author = {Boulanger-Lewandowski, Nicolas and Mysore, Gautham J. and Hoffman, Matthew},
  title = {Exploiting Long-Term Temporal Dependencies in {NMF} Using Recurrent Neural Networks with Application to Source Separation},
  booktitle = {Proc.
ICASSP 39},
  year = {2014},
  abstract = {This paper seeks to exploit high-level temporal information during feature extraction from audio signals via non-negative matrix factorization. Contrary to existing approaches that impose local temporal constraints, we train powerful recurrent neural network models to capture long-term temporal dependencies and event co-occurrence in the data. This gives our method the ability to ``fill in the blanks'' in a smart way during feature extraction from complex audio mixtures, an ability very useful for a number of audio applications. We apply these ideas to source separation problems.}
}

% Venue fields come from the crossref'd entry.
@INPROCEEDINGS{Boulanger-et-al-ICML2012,
  author = {Boulanger-Lewandowski, Nicolas and Bengio, Yoshua and Vincent, Pascal},
  title = {Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription},
  year = {2012},
  crossref = {ICML12},
  abstract = {We investigate the problem of modeling symbolic sequences of polyphonic music in a completely general piano-roll representation. We introduce a probabilistic model based on distribution estimators conditioned on a recurrent neural network that is able to discover temporal dependencies in high-dimensional sequences. Our approach outperforms many traditional models of polyphonic music on a variety of realistic datasets. We show how our musical language model can serve as a symbolic prior to improve the accuracy of polyphonic transcription.}
}

@PHDTHESIS{BoulangerL-Phd-2014,
  author = {Boulanger-Lewandowski, Nicolas},
  month = apr,
  title = {Modeling High-Dimensional Audio Sequences with Recurrent Neural Networks},
  year = {2014},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {This thesis studies models of high-dimensional sequences based on recurrent neural networks ({RNN}s) and their application to music and speech.
While in principle {RNN}s can represent the long-term dependencies and complex temporal dynamics present in real-world sequences such as video, audio and natural language, they have not been used to their full potential since their introduction by Rumelhart et al. (1986a) due to the difficulty to train them efficiently by gradient-based optimization.
In recent years, the successful application of Hessian-free optimization and other advanced training techniques motivated an increase of their use in many state-of-the-art systems.
The work of this thesis is part of this development.
The main idea is to exploit the power of {RNN}s to learn a probabilistic description of sequences of symbols, i.e. high-level information associated with observed signals, that in turn can be used as a prior to improve the accuracy of information retrieval.
For example, by modeling the evolution of note patterns in polyphonic music, chords in a harmonic progression, phones in a spoken utterance, or individual sources in an audio mixture, we can improve significantly the accuracy of polyphonic transcription, chord recognition, speech recognition and audio source separation respectively.
The practical application of our models to these tasks is detailed in the last four articles presented in this thesis.
In the first article, we replace the output layer of an {RNN} with conditional restricted {Boltzmann} machines to describe much richer multimodal output distributions.
In the second article, we review and develop advanced techniques to train {RNN}s.
In the last four articles, we explore various ways to combine our symbolic models with deep networks and non-negative matrix factorization algorithms, namely using products of experts, input/output architectures, and generative frameworks that generalize hidden {Markov} models.
We also propose and analyze efficient inference procedures for those models, such as greedy chronological search, high-dimensional beam search, dynamic programming-like pruned beam search and gradient descent. Finally, we explore issues such as label bias, teacher forcing, temporal smoothing, regularization and pre-training.}
}

% Words that the abstract's source had split at line ends are rejoined.
@INPROCEEDINGS{BoulangerLewandowski+al-ISMIR-2012,
  author = {Boulanger-Lewandowski, Nicolas and Bengio, Yoshua and Vincent, Pascal},
  month = oct,
  title = {Discriminative Non-negative Matrix Factorization for Multiple Pitch Estimation},
  booktitle = {Proceedings of the 13th International Society for Music Information Retrieval Conference},
  year = {2012},
  address = {Porto, Portugal},
  abstract = {In this paper, we present a supervised method to improve the multiple pitch estimation accuracy of the non-negative matrix factorization (NMF) algorithm. The idea is to extend the sparse NMF framework by incorporating pitch information present in time-aligned musical scores in order to extract features that enforce the separability between pitch labels. We introduce two discriminative criteria that maximize inter-class scatter and quantify the predictive potential of a given decomposition using logistic regressors. Those criteria are applied to both the latent variable and the deterministic autoencoder views of NMF, and we devise efficient update rules for each. We evaluate our method on three polyphonic datasets of piano recordings and orchestral instrument mixes.
Both models greatly enhance the quality of the basis spectra learned by NMF and the accuracy of multiple pitch estimation.}
}

% Dashes and quotation marks in the abstract use LaTeX ASCII forms, like the
% accent escapes used throughout this file.
@TECHREPORT{Breuleux+al-TR-2010,
  author = {Breuleux, Olivier and Bengio, Yoshua and Vincent, Pascal},
  title = {Unlearning for Better Mixing},
  number = {1349},
  year = {2010},
  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
  abstract = {Two learning algorithms were recently proposed -- Herding and Fast Persistent Contrastive Divergence (FPCD) -- which share the following interesting characteristic: they exploit changes in the model parameters while sampling in order to escape modes and mix better, during the sampling process that is part of the learning algorithm. We first justify such approaches as ways to escape modes while approximately keeping the same asymptotic distribution of the {Markov} chain. We then extend FPCD using an idea borrowed from Herding in order to obtain a pure sampling algorithm and show empirically that this FPCD-sampler yields substantially better samples than Gibbs sampling. Because these algorithms entangle the model and the sampling algorithm and we want to evaluate both (but particularly how well the sampling schemes mix), it is not always easy to evaluate them, so we propose a ``black-box'' approach based on how well and how quickly the samples generated by a model ``cover'' the test set examples.
We empirically study these algorithms and variations with this perspective and these new evaluation tools in order to better understand their strengths and limitations.}
}

@ARTICLE{Breuleux+Bengio-2011,
  author = {Breuleux, Olivier and Bengio, Yoshua and Vincent, Pascal},
  month = aug,
  title = {Quickly Generating Representative Samples from an {RBM}-Derived Process},
  journal = {Neural Computation},
  volume = {23},
  number = {8},
  year = {2011},
  pages = {2053--2073}
}

@MASTERSTHESIS{breuleux2010,
  author = {Breuleux, Olivier},
  month = feb,
  title = {{\'{E}}chantillonnage dynamique de champs markoviens},
  year = {2010},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

% The entry carries both a location and an address field; both spell the
% territory the same way.
@INPROCEEDINGS{Carreau+Bengio-2007,
  author = {Carreau, Julie and Bengio, Yoshua},
  month = mar,
  title = {A Hybrid {Pareto} Model for Conditional Density Estimation of Asymmetric Fat-Tail Data},
  booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AISTATS'07)},
  year = {2007},
  publisher = {Omnipress},
  location = {Puerto Rico},
  address = {San Juan, Puerto Rico},
  abstract = {We propose an estimator for the conditional density p(Y|X) that can adapt for asymmetric heavy tails which might depend on X. Such estimators have important applications in finance and insurance. We draw from Extreme Value Theory the tools to build a hybrid unimodal density having a parameter controlling the heaviness of the upper tail. This hybrid is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. We use this hybrid in a multi-modal mixture in order to obtain a nonparametric density estimator that can easily adapt for heavy tailed data. To obtain a conditional density estimator, the parameters of the mixture estimator can be seen as functions of X and these functions learned.
We show experimentally that this approach better models the conditional density in terms of likelihood than compared competing algorithms : conditional mixture models with other types of components and multivariate nonparametric models.},
  date = {21-24}
}

% The conditional density is written P(Y|X = x), matching the p(Y|X) notation
% of the companion AISTATS entry.
@ARTICLE{Carreau+Bengio-2009,
  author = {Carreau, Julie and Bengio, Yoshua},
  title = {A Hybrid {Pareto} Mixture for Conditional Asymmetric Fat-Tailed Distribution},
  journal = {IEEE Transactions on Neural Networks},
  volume = {20},
  number = {7},
  year = {2009},
  pages = {1087--1101},
  issn = {1045-9227},
  abstract = {In many cases, we observe some variables X that contain predictive information over a scalar variable of interest Y, with (X,Y) pairs observed in a training set. We can take advantage of this information to estimate the conditional density P(Y|X = x). In this paper, we propose a conditional mixture model with hybrid {Pareto} components to estimate P(Y|X = x). The hybrid {Pareto} is a Gaussian whose upper tail has been replaced by a generalized {Pareto} tail. A third parameter, in addition to the location and spread parameters of the Gaussian, controls the heaviness of the upper tail. Using the hybrid {Pareto} in a mixture model results in a nonparametric estimator that can adapt to multimodality, asymmetry, and heavy tails. A conditional density estimator is built by modeling the parameters of the mixture estimator as functions of X. We use a neural network to implement these functions. Such conditional density estimators have important applications in many domains such as finance and insurance.
We show experimentally that this novel approach better models the conditional density in terms of likelihood, compared to competing algorithms: conditional mixture models with other types of components and a classical kernel-based nonparametric model.}
}

@ARTICLE{Carreau+Bengio-extreme-2009,
  author = {Carreau, Julie and Bengio, Yoshua},
  title = {A Hybrid {Pareto} Model for Asymmetric Fat-Tailed Data: the univariate case},
  journal = {Extremes},
  volume = {12},
  number = {1},
  year = {2009},
  pages = {53--76},
  abstract = {Density estimators that can adapt to asymmetric heavy tails are required in many applications such as finance and insurance. Extreme Value Theory (EVT) has developed principled methods based on asymptotic results to estimate the tails of most distributions. However, the finite sample approximation might introduce a severe bias in many cases. Moreover, the full range of the distribution is often needed, not only the tail area. On the other hand, non-parametric methods, while being powerful where data are abundant, fail to extrapolate properly in the tail area. We put forward a non-parametric density estimator that brings together the strengths of non-parametric density estimation and of EVT. A hybrid {Pareto} distribution that can be used in a mixture model is proposed to extend the generalized {Pareto} (GP) to the whole real axis. Experiments on simulated data show the following. On one hand, the mixture of hybrid {Pareto}s converges faster in terms of log-likelihood and provides good estimates of the tail of the distributions when compared with other density estimators including the GP distribution. On the other hand, the mixture of hybrid {Pareto}s offers an alternate way to estimate the tail index which is comparable to the one estimated with the standard GP methodology.
The mixture of hybrids is also evaluated on the Danish fire insurance data set.}
}

% The school is spelled out in full, as in the other thesis entries of this file.
@PHDTHESIS{Carreau-PhD-2007,
  author = {Carreau, Julie},
  keywords = {density estimation, extreme values, generalized {Pareto} distribution, heavy-tailed distribution, mixture of distributions, neural networks},
  title = {Mod{\`{e}}les {Pareto} hybrides pour distributions asym{\'{e}}triques et {\`{a}} queues lourdes},
  year = {2007},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {We put forward a class of density estimators that can adapt to asymmetric, multi-modal and heavy-tailed distributions. Such distributions occur in many application domains such as finance and insurance. Mixture of gaussians are flexible non-parametric density estimators that have good approximation properties when the number of components is well chosen with respect to the training set size. However, those models are performing poorly on heavy-tailed data because few observations occur in the tail area. To solve this problem, we resort to extreme value theory where methods based on sound parametric assumptions have been developed to enable extrapolation beyond the range of the observations. More precisely, we build on the PoT method that was developed in hydrology where PoT stands for "Peaks-over-Threshold". The observations exceeding a given threshold are modeled by the generalized {Pareto} distribution. This distribution can approximate arbitrarily well the tail of most distributions. We build a new distribution, the hybrid {Pareto}, by stitching together a truncated Normal and a generalized {Pareto} distribution. We impose continuity constraints at the junction point. The hybrid {Pareto} is thus a smooth distribution that can be used in a mixture model. The behavior of the upper tail of the hybrid is similar to the behavior of the generalized {Pareto} tail. Moreover, the threshold inherent in the PoT methodology can now be defined implicitly as the junction point of the component with the heaviest tail.
This component also determines the tail index of the mixture. Hence, the hybrid {Pareto} mixture offers an alternate way to estimate the tail index associated with heavy-tailed data. In several applications, information that has predictive power on the variable of interest is available. In that case, we want to model the conditional density of Y given X, the vector containing predictive information. When the distribution of Y given X is asymmetric, multi-modal and heavy-tailed, we propose to use a mixture of hybrid {Pareto}s whose parameters are functions of X. Those functions are implemented by means of a neural network with one hidden layer. Neural networks are non-parametric models that can, in principle, approximate any continuous function. Experiments on artificial and real data sets show that the hybrid {Pareto} mixture, unconditional and conditional, outperforms other density estimators in terms of log-likelihood.}
}

% Booktitles are single-braced; only the acronyms are brace-protected.
@INPROCEEDINGS{casagrande+eck+kegl:icmc2005,
  author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
  title = {Geometry in Sound: A Speech/Music Audio Classifier Inspired by an Image Classifier},
  booktitle = {Proceedings of the International Computer Music Conference ({ICMC})},
  year = {2005},
  pages = {207--210},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_icmc_casagrande.pdf},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@INPROCEEDINGS{casagrande+eck+kegl:ismir2005,
  author = {Casagrande, Norman and Eck, Douglas and K{\'{e}}gl, Bal{\'{a}}zs},
  title = {Frame-Level Audio Feature Extraction using {A}da{B}oost},
  booktitle = {Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)},
  year = {2005},
  pages = {345--350},
  address = {London: University of London},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir_casagrande.pdf},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@PROCEEDINGS{ccai2006,
  editor = {Lamontagne, Luc and Marchand, Mario},
  title = {Advances
in Artificial Intelligence, 19th Conference of the Canadian Society for Computational Studies of Intelligence, Canadian AI 2006, Qu{\'{e}}bec City, Qu{\'{e}}bec, Canada, June 7-9, 2006, Proceedings},
  booktitle = {Canadian Conference on AI},
  series = {Lecture Notes in Computer Science},
  volume = {4013},
  year = {2006},
  publisher = {Springer}
}

% Page ranges use the double hyphen (en dash).
@INPROCEEDINGS{Chapados+Bengio-2006,
  author = {Chapados, Nicolas and Bengio, Yoshua},
  title = {The K Best-Paths Approach to Approximate Dynamic Programming with Application to Portfolio Optimization},
  booktitle = {AI06},
  year = {2006},
  pages = {491--502}
}

@INPROCEEDINGS{Chapados+Bengio-2007,
  author = {Chapados, Nicolas and Bengio, Yoshua},
  month = jun,
  title = {Forecasting Commodity Contract Spreads with {G}aussian Process},
  booktitle = {13th International Conference on Computing in Economics and Finance},
  year = {2007},
  abstract = {We introduce a functional representation of time series which allows forecasts to be performed over an unspecified horizon with progressively-revealed information sets. By virtue of using Gaussian processes, a complete covariance matrix between forecasts at several time-steps is available. This information is put to use in an application to actively trade price spreads between commodity futures contracts. The approach delivers impressive out-of-sample risk-adjusted returns after transaction costs on a portfolio of 30 spreads.}
}

% NOTE(review): the key says 2008 but the year field says 2007 -- confirm which is right.
@ARTICLE{Chapados+Bengio-2008-JOC,
  author = {Chapados, Nicolas and Bengio, Yoshua},
  title = {Noisy K Best-Paths for Approximate Dynamic Programming with Application to Portfolio Optimization},
  journal = {Journal of Computers},
  volume = {2},
  number = {1},
  year = {2007},
  pages = {12--19},
  abstract = {We describe a general method to transform a non-Markovian sequential decision problem into a supervised learning problem using a K-best-paths algorithm.
We consider an application in financial portfolio management where we can train a controller to directly optimize a Sharpe Ratio (or other risk-averse non-additive) utility function. We illustrate the approach by demonstrating experimental results using a kernel-based controller architecture that would not normally be considered in traditional reinforcement learning or approximate dynamic programming. We further show that using a non-additive criterion (incremental Sharpe Ratio) yields a noisy K-best-paths extraction problem, that can give substantially improved performance.}
}

@MASTERSTHESIS{Chapados-Msc-2000,
  author = {Chapados, Nicolas},
  month = jan,
  title = {Crit{\`{e}}res d'optimisation d'algorithmes d'apprentissage en gestion de portefeuille},
  year = {2000},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@INPROCEEDINGS{chapados2000,
  author = {Chapados, Nicolas and Bengio, Yoshua},
  title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
  booktitle = {Computational Finance 2000},
  year = {2000},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/compfin2000_final.pdf},
  abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training.
We show that committees using both paradigms are significantly outperforming the benchmark market performance.}, topics={Finance},cat={C}, }

% Journal version of the VaR asset-allocation paper; the acronym is brace-protected and hyphenated as a compound.
@ARTICLE{chapados:2001,
  author = {Chapados, Nicolas and Bengio, Yoshua},
  title = {Cost Functions and Model Combination for {VaR}-Based Asset Allocation Using Neural Networks},
  journal = {IEEE Transactions on Neural Networks},
  volume = {12},
  number = {4},
  year = {2001},
  pages = {890--906},
  abstract = {We introduce an asset-allocation framework based on the active control of the value-at-risk of the portfolio. Within this framework, we compare two paradigms for making the allocation using neural networks. The first one uses the network to make a forecast of asset behavior, in conjunction with a traditional mean-variance allocator for constructing the portfolio. The second paradigm uses the network to directly make the portfolio allocation decisions. We consider a method for performing soft input variable selection, and show its considerable utility. We use model combination (committee) methods to systematize the choice of hyperparameters during training. We show that committees using both paradigms are significantly outperforming the benchmark market performance.},
  topics = {Finance},
  cat = {J},
}

% Journal and volume data are inherited through the crossref parent.
@ARTICLE{chapados:2003,
  author = {Bengio, Yoshua and Chapados, Nicolas},
  title = {Extensions to Metric-Based Model Selection},
  year = {2003},
  crossref = {JMLR},
  abstract = {Metric-based methods have recently been introduced for model selection and regularization, often yielding very significant improvements over the alternatives tried (including cross-validation). All these methods require unlabeled data over which to compare functions and detect gross differences in behavior away from the training points. We introduce three new extensions of the metric model selection methods and apply them to feature selection. 
The first extension takes advantage of the particular case of time-series data in which the task involves prediction with a horizon h. The idea is to use at t the h unlabeled examples that precede t for model selection. The second extension takes advantage of the different error distributions of cross-validation and the metric methods: cross-validation tends to have a larger variance and is unbiased. A hybrid combining the two model selection methods is rarely beaten by any of the two methods. The third extension deals with the case when unlabeled data is not available at all, using an estimated input density. Experiments are described to study these extensions in the context of capacity control and feature subset selection.}, topics={ModelSelection,Finance},cat={J}, }

% NOTE(review): no volume or pages are recorded for this article - add them once confirmed.
@ARTICLE{chapelle:2001,
  author = {Chapelle, Olivier and Vapnik, Vladimir and Bengio, Yoshua},
  title = {Model Selection for Small Sample Regression},
  journal = {Machine Learning},
  year = {2001},
  abstract = {Model selection is an important ingredient of many machine learning algorithms, in particular when the sample size is small, in order to strike the right trade-off between overfitting and underfitting. Previous classical results for linear regression are based on an asymptotic analysis. We present a new penalization method for performing model selection for regression that is appropriate even for small samples. Our penalization is based on an accurate estimator of the ratio of the expected training error and the expected generalization error, in terms of the expected eigenvalues of the input covariance matrix.},
  topics = {ModelSelection},
  cat = {J},
}

@INCOLLECTION{chapter-eval-longterm-2001,
  author = {Schmidhuber, Juergen and Hochreiter, Sepp and Bengio, Yoshua},
  editor = {Kolen, J. 
and Kremer, S.}, title = {Evaluating Benchmark Problems by Random Guessing}, booktitle = {Field Guide to Dynamical Recurrent Networks}, year = {2001}, publisher = {IEEE Press}, topics={LongTerm},cat={B}, }

% Book-chapter reprint of the document-recognition paper.
@INCOLLECTION{chapter-gradient-document-2001,
  author = {{LeCun}, Yann and Bottou, {L{\'{e}}on} and Bengio, Yoshua and Haffner, Patrick},
  editor = {Haykin, S. and Kosko, B.},
  title = {Gradient-Based Learning Applied to Document Recognition},
  booktitle = {Intelligent Signal Processing},
  year = {2001},
  pages = {306--351},
  publisher = {IEEE Press},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/lecun-01a.pdf},
  abstract = {Multilayer Neural Networks trained with a backpropagation algorithm constitute the best example of a successful Gradient-Based Learning technique. Given an appropriate network architecture, Gradient-Based Learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional Neural Networks, that are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation, recognition, and language modeling. A new learning paradigm, called Graph Transformer Networks (GTN), allows such multi-module systems to be trained globally using Gradient-Based methods so as to minimize an overall performance measure. Two systems for on-line handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of Graph Transformer Networks. A Graph Transformer Network for reading bank check is also described. 
It uses Convolutional Neural Network character recognizers combined with a global training technique to provide record accuracy on business and personal checks. It is deployed commercially and reads several million checks per day.}, topics={PriorKnowledge,Speech},cat={B}, }

@INCOLLECTION{chapter-gradient-flow-2001,
  author = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo},
  editor = {Kolen, J. and Kremer, S.},
  title = {Gradient Flow in Recurrent Nets: the Difficulty of Learning Long-Term Dependencies},
  booktitle = {Field Guide to Dynamical Recurrent Networks},
  year = {2001},
  publisher = {IEEE Press},
  topics = {LongTerm},
  cat = {B},
}

% Emphasis in the abstract uses the semantic emph macro instead of the obsolete font switch.
@INPROCEEDINGS{chemero+eck:1999,
  author = {Chemero, T. and Eck, Douglas},
  title = {An Exploration of Representational Complexity via Coupled Oscillators},
  booktitle = {Proceedings of the Tenth Midwest Artificial Intelligence and Cognitive Science Society},
  year = {1999},
  publisher = {MIT Press},
  address = {Cambridge, Mass.},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/1999_chemero.pdf},
  abstract = {We note some inconsistencies in a view of representation which takes \emph{decoupling} to be of key importance. We explore these inconsistencies using examples of representational vehicles taken from coupled oscillator theory and suggest a new way to reconcile \emph{coupling} with \emph{absence}. Finally, we tie these views to a teleological definition of representation.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% The journal is stored under its full name; abbreviation is left to the bibliography style.
@ARTICLE{ChemInfModel2006,
  author = {Erhan, Dumitru and {L'Heureux}, Pierre-Jean and Yue, Shi Yi and Bengio, Yoshua},
  title = {Collaborative Filtering on a Family of Biological Targets},
  journal = {Journal of Chemical Information and Modeling},
  volume = {46},
  number = {2},
  year = {2006},
  pages = {626--635},
  abstract = {Building a QSAR model of a new biological target for which few screening data are available is a statistical challenge. However, the new target may be part of a bigger family, for which we have more screening data. 
Collaborative filtering or, more generally, multi-task learning, is a machine learning approach that improves the generalization performance of an algorithm by using information from related tasks as an inductive bias. We use collaborative filtering techniques for building predictive models that link multiple targets to multiple examples. The more commonalities between the targets, the better the multi-target model that can be built. We show an example of a multi-target neural network that can use family information to produce a predictive model of an undersampled target. We evaluate JRank, a kernel-based method designed for collaborative filtering. We show their performance on compound prioritization for an HTS campaign and the underlying shared representation between targets. JRank outperformed the neural network both in the single- and multi-target models.}, topics={Bioinformatic,MultiTask},cat={J}, }

% All accented letters in the author list are LaTeX escapes, so the entry stays 7-bit clean for classic BibTeX.
@TECHREPORT{Chung-et-al-TR2014,
  author = {Chung, Junyoung and G{\"{u}}l{\c{c}}ehre, {\c{C}}a{\u{g}}lar and Cho, Kyunghyun and Bengio, Yoshua},
  title = {Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling},
  number = {Arxiv report 1412.3555},
  year = {2014},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  note = {Presented at the Deep Learning workshop at NIPS2014}
}

@TECHREPORT{collobert:2001:rr01-12,
  author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
  title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
  number = {12},
  year = {2001},
  institution = {IDIAP},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/IDIAP-RR-01-12.ps},
  abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. 
The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.}, topics={Kernel},cat={T}, }

% Journal version of the IDIAP report; the title matches the report's wording.
% NOTE(review): volume and pages are not recorded - add them once confirmed.
@ARTICLE{collobert:2002,
  author = {Collobert, Ronan and Bengio, Samy and Bengio, Yoshua},
  title = {A Parallel Mixture of {SVM}s for Very Large Scale Problems},
  journal = {Neural Computation},
  year = {2002},
  abstract = {Support Vector Machines ({SVM}s) are currently the state-of-the-art models for many classification problems but they suffer from the complexity of their training algorithm which is at least quadratic with respect to the number of examples. Hence, it is hopeless to try to solve real-life problems having more than a few hundreds of thousands examples with {SVM}s. The present paper proposes a new mixture of {SVM}s that can be easily implemented in parallel and where each {SVM} is trained on a small subset of the whole dataset. Experiments on a large benchmark dataset (Forest) yielded significant time improvement (time complexity appears empirically to locally grow linearly with the number of examples). In addition, and that is a surprise, a significant improvement in generalization was observed.},
  topics = {HighDimensional,Kernel},
  cat = {J},
}

% A chapter with its own authors inside an edited volume, hence an incollection entry.
@INCOLLECTION{collobert:2002:book,
  author = {Collobert, Ronan and Bengio, Yoshua and Bengio, Samy},
  editor = {Lee, S. W. 
and Verri, A.},
  title = {Scaling Large Learning Problems with Hard Parallel Mixtures},
  booktitle = {Pattern Recognition with Support Vector Machines},
  series = {Lecture Notes in Computer Science},
  volume = {2388},
  year = {2002},
  publisher = {Springer-Verlag},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/2002_mixtures_svm.pdf},
  abstract = {A challenge for statistical learning is to deal with large data sets, e.g. in data mining. Popular learning algorithms such as Support Vector Machines have training time at least quadratic in the number of examples: they are hopeless to solve problems with a million examples. We propose a ``hard parallelizable mixture'' methodology which yields significantly reduced training time through modularization and parallelization: the training data is iteratively partitioned by a ``gater'' model in such a way that it becomes easy to learn an ``expert'' model separately in each region of the partition. A probabilistic extension and the use of a set of generative models allows representing a gater so that all pieces of the model are locally trained. For {SVM}s, time complexity appears empirically to locally grow linearly with the number of examples, while generalization performance can be enhanced. 
For the probabilistic version of the algorithm, the iterative algorithm provably goes down in a cost function that is an upper bound on the negative log-likelihood.}, topics={Kernel},cat={B}, }

% Software copyright records; the year field holds a range written with the double-hyphen form.
@MISC{copyright-CTAI,
  author = {Bengio, Yoshua and Ducharme, R{\'{e}}jean and Dorion, Christian},
  title = {Commodity Trading Advisor Index},
  year = {2004--2009},
  howpublished = {copyright, and commercialized software license.}
}

@MISC{copyright-PLearn,
  author = {Vincent, Pascal and Bengio, Yoshua},
  title = {{PLearn}, a {C++} Machine Learning Library},
  year = {1998--2009},
  howpublished = {copyright, public domain license.},
  url = {http://www.plearn.org}
}

@ARTICLE{Cosi90,
  author = {Cosi, Piero and Bengio, Yoshua and De Mori, Renato},
  title = {Phonetically-based multi-layered networks for acoustic property extraction and automatic speech recognition},
  journal = {Speech Communication},
  volume = {9},
  number = {1},
  year = {1990},
  pages = {15--30},
  topics = {PriorKnowledge,Speech},
  cat = {J},
}

@TECHREPORT{courville+al-TR2014,
  author = {Courbariaux, Matthieu and Bengio, Yoshua and David, Jean-Pierre},
  month = dec,
  title = {Low precision arithmetic for deep learning},
  number = {Arxiv report 1412.7024},
  year = {2014},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://arxiv.org/abs/1412.7024},
  abstract = {We simulate the training of a set of state of the art neural networks, the Maxout networks (Goodfellow et al., 2013a), on three benchmark datasets: the MNIST, CIFAR10 and SVHN, with three distinct arithmetics: floating point, fixed point and dynamic fixed point. For each of those datasets and for each of those arithmetics, we assess the impact of the precision of the computations on the final error of the training. We find that very low precision computation is sufficient not just for running trained networks but also for training them. 
For example, almost state-of-the-art results were obtained on most datasets with 10 bits for computing activations and gradients, and 12 bits for storing updated parameters.} }

@INPROCEEDINGS{Courville+al-2011,
  author = {Courville, Aaron and Bergstra, James and Bengio, Yoshua},
  month = jun,
  title = {Unsupervised Models of Images by Spike-and-Slab {RBM}s},
  year = {2011},
  crossref = {ICML11}
}

@MISC{courville+bergstra+bengio:2010,
  author = {Courville, Aaron and Bergstra, James and Bengio, Yoshua},
  title = {The Spike and Slab Restricted {Boltzmann} Machine},
  year = {2010},
  howpublished = {Deep Learning and Unsupervised Feature Learning Workshop, {NIPS}},
  url = {http://deeplearningworkshopnips2010.files.wordpress.com/2010/11/nips2010_workshop_ssrbm.pdf}
}

% The third editor's surname carries an acute accent on a dotless i.
@INPROCEEDINGS{courville+bergstra+bengio:2011aistats,
  author = {Courville, Aaron and Bergstra, James and Bengio, Yoshua},
  editor = {Gordon, G. and Dunson, D. and Dud{\'{\i}}k, M.},
  month = apr,
  title = {A Spike and Slab Restricted {Boltzmann} Machine},
  booktitle = {Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics},
  series = {JMLR W{\&}CP},
  volume = {15},
  year = {2011},
  note = {Recipient of People's Choice Award}
}

@MISC{courville+bergstra+bengio:2011snowbird,
  author = {Courville, Aaron and Bergstra, James and Bengio, Yoshua},
  month = apr,
  title = {A Spike and Slab {RBM} Approach to Modeling Natural Images},
  year = {2011},
  howpublished = {The Learning Workshop, Fort Lauderdale, FL.},
  note = {(Oral)}
}

% Fields with no recorded value are omitted rather than stored empty.
@INCOLLECTION{courville+eck+bengio:nips2009,
  author = {Courville, Aaron and Eck, Douglas and Bengio, Yoshua},
  title = {An Infinite Factor Model Hierarchy Via a Noisy-Or Mechanism},
  booktitle = {Neural Information Processing Systems Conference (NIPS) 22},
  year = {2009},
  pages = {405--413},
  url = {http://books.nips.cc/papers/files/nips22/NIPS2009_1100.pdf},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@ARTICLE{courville-al-ieee14, 
author = {Courville, Aaron and Desjardins, Guillaume and Bergstra, James and Bengio, Yoshua},
  title = {The Spike-and-Slab {RBM} and Extensions to Discrete and Sparse Data Distributions},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {36},
  number = {9},
  year = {2014},
  pages = {1874--1887},
}

@INPROCEEDINGS{Dauphin+al-2011,
  author = {Dauphin, Yann and Glorot, Xavier and Bengio, Yoshua},
  month = jun,
  title = {Large-Scale Learning of Embeddings with Reconstruction Sampling},
  year = {2011},
  crossref = {ICML11}
}

% The apostrophe in the venue label is plain ASCII.
@INPROCEEDINGS{Dauphin+Bengio-ICLR2013,
  author = {Dauphin, Yann and Bengio, Yoshua},
  title = {Big neural networks waste capacity},
  booktitle = {ICLR'2013 workshops track (oral presentation), arXiv: 1301.3583},
  year = {2013}
}

@INPROCEEDINGS{Dauphin+Bengio-NIPS2013,
  author = {Dauphin, Yann and Bengio, Yoshua},
  title = {Stochastic Ratio Matching of {RBM}s for Sparse High-Dimensional Inputs},
  year = {2013},
  crossref = {NIPS26}
}

% NOTE(review): the key names Dauphin but the first listed author is Rifai.
@INPROCEEDINGS{Dauphin-et-al-NIPS2011,
  author = {Rifai, Salah and Dauphin, Yann and Vincent, Pascal and Bengio, Yoshua and Muller, Xavier},
  title = {The Manifold Tangent Classifier},
  booktitle = {NIPS'2011},
  year = {2011},
  note = {Student paper award.}
}

@INPROCEEDINGS{davies+plumbley+eck:waspaa2009,
  author = {Davies, M. and Plumbley, M. and Eck, Douglas},
  title = {Towards a musical beat emphasis function},
  booktitle = {Proceedings of IEEE WASPAA},
  year = {2009},
  organization = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
  address = {New Paltz, NY},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@INPROCEEDINGS{Delalleau+al-2005,
  author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
  editor = {Cowell, Robert G. 
and Ghahramani, Zoubin},
  month = jan,
  title = {Efficient Non-Parametric Function Induction in Semi-Supervised Learning},
  booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics (AISTATS'05)},
  year = {2005},
  pages = {96--103},
  publisher = {Society for Artificial Intelligence and Statistics},
  location = {Savannah Hotel, Barbados},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/semisup_aistats2005.pdf},
  abstract = {There has been an increase of interest for semi-supervised learning recently, because of the many datasets with large amounts of unlabeled examples and only a few labeled ones. This paper follows up on proposed nonparametric algorithms which provide an estimated continuous label for the given unlabeled examples. First, it extends them to function induction algorithms that minimize a regularization criterion applied to an out-of-sample example, and happen to have the form of Parzen windows regressors. This allows to predict test labels without solving again a linear system of dimension $n$ (the number of unlabeled and labeled training examples), which can cost $O(n^3)$. Second, this function induction procedure gives rise to an efficient approximation of the training process, reducing the linear system to be solved to $m \ll n$ unknowns, using only a subset of $m$ examples. An improvement of $O(n^2/m^2)$ in time can thus be obtained. 
Comparative experiments are presented, showing the good performance of the induction formula and approximation algorithm.}, topics={Unsupervised},cat={C}, }

% Complexity expressions in the abstract are typeset in math mode, with the squared terms written as exponents.
@INCOLLECTION{Delalleau+al-ssl-2006,
  author = {Delalleau, Olivier and Bengio, Yoshua and Le Roux, Nicolas},
  editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
  title = {Large-Scale Algorithms},
  booktitle = {Semi-Supervised Learning},
  year = {2006},
  pages = {333--341},
  publisher = {{MIT} Press},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/delalleau_ssl.pdf},
  abstract = {In Chapter 11, it is shown how a number of graph-based semi-supervised learning algorithms can be seen as the minimization of a specific cost function, leading to a linear system with $n$ equations and unknowns (with $n$ the total number of labeled and unlabeled examples). Solving such a linear system will in general require on the order of $O(kn^2)$ time and $O(kn)$ memory (for a sparse graph where each data point has $k$ neighbors), which can be prohibitive on large datasets (especially if $k = n$, i.e. the graph is dense). We present in this chapter a subset selection method that can be used to reduce the original system to one of size $m \ll n$. The idea is to solve for the labels of a subset $S$ of $X$ of only $m$ points, while still retaining information from the rest of the data by approximating their label with a linear combination of the labels in $S$ (using the induction formula presented in Chapter 11). This leads to an algorithm whose computational requirements scale as $O(m^2 n)$ and memory requirements as $O(m^2)$, thus allowing one to take advantage of significantly bigger unlabeled datasets than with the original algorithms.},
  cat = {B},
  topics = {Unsupervised},
}

@INPROCEEDINGS{Delalleau+Bengio-2011,
  author = {Delalleau, Olivier and Bengio, Yoshua},
  title = {Shallow vs. 
Deep Sum-Product Networks},
  year = {2011},
  pages = {666--674},
  crossref = {NIPS24},
  abstract = {We investigate the representational power of sum-product networks (computation networks analogous to neural networks, but whose individual units compute either products or weighted sums), through a theoretical analysis that compares deep (multiple hidden layers) vs. shallow (one hidden layer) architectures. We prove there exist families of functions that can be represented much more efficiently with a deep network than with a shallow one, i.e. with substantially fewer hidden units. Such results were not available until now, and contribute to motivate recent research involving learning of deep sum-product networks, and more generally motivate research in Deep Learning.}
}

@ARTICLE{Delalleau-et-al-CIAIG2012,
  author = {Delalleau, Olivier and Contal, Emile and Thibodeau-Laufer, Eric and Chandias Ferrari, Raul and Bengio, Yoshua and Zhang, Frank},
  keywords = {first person shooters, game balance, Matchmaking, neural networks, player satisfaction},
  month = sep,
  title = {Beyond Skill Rating: Advanced Matchmaking in Ghost Recon Online},
  journal = {IEEE Transactions on Computational Intelligence and AI in Games},
  volume = {4},
  number = {3},
  year = {2012},
  pages = {167--177},
  issn = {1943-068X},
}

% The proper noun in the title is brace-protected against sentence-casing styles.
@TECHREPORT{Delalleau-et-al-TR2012,
  author = {Delalleau, Olivier and Courville, Aaron and Bengio, Yoshua},
  title = {Efficient {EM} Training of {Gaussian} Mixtures with Missing Data},
  number = {Arxiv report 1209.0521},
  year = {2012},
  institution = {Universit{\'{e}} de Montr{\'{e}}al}
}

% The school name uses the same spelling as the other theses in this file.
@PHDTHESIS{Delalleau-Phd-2012,
  author = {Delalleau, Olivier},
  month = mar,
  title = {Apprentissage machine efficace : th{\'{e}}orie et pratique},
  year = {2012},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@INCOLLECTION{DeMori90a,
  author = {De Mori, Renato and Bengio, Yoshua and Cosi, Piero},
  editor = {Mohr, R. and Pavlidis, T. 
and Sanfeliu, A.},
  title = {On the use of an ear model and multi-layer networks for automatic speech recognition},
  booktitle = {Structural Pattern Analysis},
  year = {1990},
  publisher = {World Scientific},
  topics = {PriorKnowledge,Speech},
  cat = {B},
}

% Proper nouns in the title are protected as whole words.
@INPROCEEDINGS{Desjardins+al-2010,
  author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
  month = may,
  title = {Tempered {Markov} Chain {Monte} {Carlo} for training of Restricted {Boltzmann} Machine},
  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
  volume = {9},
  year = {2010},
  pages = {145--152},
  location = {Chia Laguna Resort, Sardinia, Italy},
  abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. 
We find both through visualization of samples and measures of likelihood on a toy dataset that it helps both sampling and learning.} }

% NOTE(review): this entry sets booktitle and also a crossref; the local booktitle takes precedence over the inherited one.
@INPROCEEDINGS{Desjardins+al-NIPS2011,
  author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
  title = {On Tracking The Partition Function},
  booktitle = {NIPS'2011},
  year = {2011},
  crossref = {NIPS24}
}

% Departmental technical report 1327.
@TECHREPORT{Desjardins-2008,
  author = {Desjardins, Guillaume and Bengio, Yoshua},
  keywords = {Convolutional Architectures, Deep Networks, RBM, Vision},
  title = {Empirical Evaluation of Convolutional {RBMs} for Vision},
  number = {1327},
  year = {2008},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Convolutional Neural Networks ({CNN}) have had great success in machine learning tasks involving vision and represent one of the early successes of deep networks. Local receptive fields and weight sharing make their architecture ideally suited for vision tasks by helping to enforce a prior based on our knowledge of natural images. This same prior could also be applied to recent developments in the field of deep networks, in order to tailor these new architectures for artificial vision. In this context, we show how the Restricted {Boltzmann} Machine (RBM), the building block of Deep Belief Networks (DBN), can be adapted to operate in a convolutional manner. We compare their performance to standard fully-connected RBMs on a simple visual learning task and show that the convolutional RBMs (CRBMs) converge to smaller values of the negative likelihood function. 
Our experiments also indicate that CRBMs are more efficient than standard RBMs trained on small image patches, with the CRBMs having faster convergence.} }

@TECHREPORT{Desjardins-arXiv-2012,
  author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
  month = mar,
  title = {On Training Deep {Boltzmann} Machines},
  number = {Arxiv report 1203.4416v1},
  year = {2012},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://arxiv.org/pdf/1203.4416v1}
}

% NOTE(review): the abstract keeps a numeric citation marker from the paper's own reference list.
@INPROCEEDINGS{Desjardins-et-al-ICLR2013,
  author = {Desjardins, Guillaume and Pascanu, Razvan and Courville, Aaron and Bengio, Yoshua},
  title = {Metric-Free Natural Gradient for Joint-Training of {Boltzmann} Machines},
  booktitle = {International Conference on Learning Representations (ICLR'2013)},
  year = {2013},
  abstract = {This paper introduces the Metric-Free Natural Gradient (MFNG) algorithm for training {Boltzmann} Machines. Similar in spirit to the Hessian-Free method of Martens [8], our algorithm belongs to the family of truncated Newton methods and exploits an efficient matrix-vector product to avoid explicitly storing the natural gradient metric $L$. This metric is shown to be the expected second derivative of the log-partition function (under the model distribution), or equivalently, the variance of the vector of partial derivatives of the energy function. 
We evaluate our method on the task of joint-training a 3-layer Deep {Boltzmann} Machine and show that MFNG does indeed have faster per-epoch convergence compared to Stochastic Maximum Likelihood with centering, though wall-clock performance is currently not competitive.} }

% Departmental technical report 1345.
@TECHREPORT{Desjardins-tech-2009,
  author = {Desjardins, Guillaume and Courville, Aaron and Bengio, Yoshua},
  keywords = {CD, PCD, RBM, simulated tempering, tempered MCMC, unsupervised learning},
  month = oct,
  title = {Tempered {Markov} Chain Monte Carlo for training of Restricted {Boltzmann} Machines},
  number = {1345},
  year = {2009},
  institution = {D{\'{e}}partement d'Informatique et de Recherche Op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Alternating Gibbs sampling is the most common scheme used for sampling from Restricted {Boltzmann} Machines (RBM), a crucial component in deep architectures such as Deep Belief Networks. However, we find that it often does a very poor job of rendering the diversity of modes captured by the trained model. We suspect that this hinders the advantage that could in principle be brought by training algorithms relying on Gibbs sampling for uncovering spurious modes, such as the Persistent Contrastive Divergence algorithm. To alleviate this problem, we explore the use of tempered {Markov} Chain Monte-Carlo for sampling in RBMs. 
We find both through visualization of samples and measures of likelihood that it helps both sampling and learning.} }

% NOTE(review): the two DIRO entries carry a placeholder year of -1 and look like crossref parents - confirm before citing them directly.
@TECHREPORT{DIRO-short,
  title = {DIRO},
  year = {-1},
  institution = {DIRO, Universit{\'{e}} de Montr{\'{e}}al},
  address = {Montr{\'{e}}al (QC) Canada}
}

@TECHREPORT{DIRO-shorter,
  title = {DIRO},
  year = {-1},
  institution = {DIRO, UdeM}
}

% No abstract is recorded for this article.
@ARTICLE{Dugas+al-2011,
  author = {Dugas, Charles and Chapados, Nicolas and Ducharme, R{\'{e}}jean and Saint-Mleux, Xavier and Vincent, Pascal},
  title = {A High-Order Feature Synthesis and Selection Algorithm Applied to Insurance Risk Modelling},
  journal = {International Journal of Business Intelligence and Data Mining},
  volume = {6},
  number = {3},
  year = {2011},
  pages = {237--258}
}

@ARTICLE{Dugas+Bengio-2009,
  author = {Dugas, Charles and Bengio, Yoshua and B{\'{e}}lisle, Fran{\c{c}}ois and Nadeau, Claude and Garcia, Ren{\'{e}}},
  month = jun,
  title = {Incorporating Functional Knowledge in Neural Networks},
  journal = {Journal of Machine Learning Research},
  volume = {10},
  year = {2009},
  pages = {1239--1262},
  abstract = {Incorporating prior knowledge of a particular task into the architecture of a learning algorithm can greatly improve generalization performance. We study here a case where we know that the function to be learned is non-decreasing in its two arguments and convex in one of them. For this purpose we propose a class of functions similar to multi-layer neural networks but (1) that has those properties, (2) is a universal approximator of Lipschitz functions with these and other properties. We apply this new class of functions to the task of modelling the price of call options. 
Experiments show improvements on regressing the price of call options using the new types of function classes that incorporate the a priori constraints.} }

% Doctoral thesis with a French title.
@PHDTHESIS{Dugas-Phd-2003,
  author = {Dugas, Charles},
  title = {Les algorithmes d'apprentissage appliqu{\'{e}}s aux risques financiers},
  year = {2003},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

% Article in CAS Forum, volume 1, number 1.
@ARTICLE{dugas:2003,
  author = {Dugas, Charles and Bengio, Yoshua and Chapados, Nicolas and Vincent, Pascal and Denoncourt, Germain and Fournier, Christian},
  title = {Statistical Learning Algorithms Applied to Automobile Insurance Ratemaking},
  journal = {CAS Forum},
  volume = {1},
  number = {1},
  year = {2003},
  pages = {179--214},
  abstract = {We recently conducted a research project for a large North American automobile insurer. This study was the most exhaustive ever undertaken by this particular insurer and lasted over an entire year. We analyzed the discriminating power of each variable used for ratemaking. We analyzed the performance of several models within five broad categories: linear regressions, generalized linear models, decision trees, neural networks and support vector machines. In this paper, we present the main results of this study. We qualitatively compare models and show how neural networks can represent high-order nonlinear dependencies with a small number of parameters, each of which is estimated on a large proportion of the data, thus yielding low variance. We thoroughly explain the purpose of the nonlinear sigmoidal transforms which are at the very heart of neural networks' performances. The main numerical result is a statistically significant reduction in the out-of-sample mean-squared error using the neural network model and our ability to substantially reduce the median premium by charging more to the highest risks. This in turn can translate into substantial savings and financial benefits for an insurer. 
We hope this paper goes a long way towards convincing actuaries to include neural networks within their set of modeling tools for ratemaking.},
  topics = {Finance,Mining},
  cat = {J},
}

% Inherits its proceedings data from the NIPS20 parent through crossref.
@INPROCEEDINGS{eck+bertinmahieux+lamere+green:nips2007,
  author   = {Eck, Douglas and Lamere, Paul and Bertin-Mahieux, Thierry and Green, Stephen},
  editor   = {Platt, John and Kolen, J. and Singer, Yoram and Roweis, S.},
  title    = {Automatic Generation of Social Tags for Music Recommendation},
  year     = {2008},
  address  = {Cambridge, MA},
  crossref = {NIPS20},
  source   = {OwnPublication}
}

@INPROCEEDINGS{eck+bertinmahieux+lamere:ismir2007,
  author    = {Eck, Douglas and Bertin-Mahieux, Thierry and Lamere, Paul},
  title     = {Autotagging music using supervised machine learning},
  booktitle = {{Proceedings of the 8th International Conference on Music Information Retrieval ({ISMIR} 2007)}},
  year      = {2007},
  source    = {OwnPublication},
}

@INPROCEEDINGS{eck+casagrande:ismir2005,
  author     = {Eck, Douglas and Casagrande, Norman},
  title      = {Finding Meter in Music Using an Autocorrelation Phase Matrix and Shannon Entropy},
  booktitle  = {{Proceedings of the 6th International Conference on Music Information Retrieval ({ISMIR} 2005)}},
  year       = {2005},
  pages      = {504--509},
  address    = {London: University of London},
  url        = {http://www.iro.umontreal.ca/~eckdoug/papers/2005_ismir.pdf},
  source     = {OwnPublication},
  sourcetype = {Conference},
}

@INCOLLECTION{eck+gasser+port:2000,
  author    = {Eck, Douglas and Gasser, M. and Port, Robert},
  editor    = {Desain, P. and Windsor, L.},
  title     = {Dynamics and Embodiment in Beat Induction},
  booktitle = {{Rhythm Perception and Production}},
  year      = {2000},
  pages     = {157--170},
  publisher = {Swets and Zeitlinger},
  address   = {Lisse, The Netherlands},
  url       = {http://www.iro.umontreal.ca/~eckdoug/papers/2000_rppw.pdf},
  abstract  = {We provide an argument for using dynamical systems theory in the domain of beat induction.
We motivate the study of beat induction and to relate beat induction to the more general study of human rhythm cognition. In doing so we compare a dynamical, embodied approach to a symbolic (traditional AI) one, paying particular attention to how the modeling approach brings with it tacit assumptions about what is being modeled. Please note that this is a philosophy paper about research that was, at the time of writing, very much in progress.},
  source = {OwnPublication},
  sourcetype = {Chapter},
}

% The exporter's empty editor field was dropped here; an empty field
% carries no data and BibTeX treats it the same as a missing one.
@INPROCEEDINGS{eck+gasser:1996,
  author = {Eck, Douglas and Gasser, M.},
  title = {Perception of Simple Rhythmic Patterns in a Network of Oscillators},
  booktitle = {{The Proceedings of the Eighteenth Annual Conference of the Cognitive Science Society}},
  year = {1996},
  publisher = {Lawrence Erlbaum Associates},
  address = {New Jersey},
  abstract = {This paper is concerned with the complex capacity to recognize and reproduce rhythmic patterns. While this capacity has not been well investigated, in broad qualitative terms it is clear that people can learn to identify and produce recurring patterns defined in terms of sequences of beats of varying intensity and rests: the rhythms behind waltzes, reels, sambas, etc. Our short term goal is a model which is "hard-wired" with knowledge of a set of such patterns. Presented with a portion of one of the patterns or a label for a pattern, the model should reproduce the pattern and continue to do so when the input is turned off. Our long-term goal is a model which can learn to adjust the connection strengths which implement particular patterns as it is exposed to input patterns.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@TECHREPORT{eck+graves+schmidhuber:tr-speech2003,
  author = {Eck, Douglas and Graves, A.
and Schmidhuber, Juergen},
  month = may,
  title = {A New Approach to Continuous Speech Recognition Using {LSTM} Recurrent Neural Networks},
  number = {IDSIA-14-03},
  year = {2003},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {This paper presents an algorithm for continuous speech recognition built from two Long Short-Term Memory ({LSTM}) recurrent neural networks. A first {LSTM} network performs frame-level phone probability estimation. A second network maps these phone predictions onto words. In contrast to {HMM}s, this allows greater exploitation of long-timescale correlations. Simulation results are presented for a hand-segmented subset of the "Numbers-95" database. These results include isolated phone prediction, continuous frame-level phone prediction and continuous word prediction. We conclude that despite its early stage of development, our new model is already competitive with existing approaches on certain aspects of speech recognition and promising on others, warranting further research.},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% NOTE(review): address repeats the url value (with LaTeX break hints);
% kept as exported because address-printing styles would otherwise lose it.
@TECHREPORT{eck+lapalme:2008,
  author = {Eck, Douglas and Lapalme, J.},
  title = {Learning Musical Structure Directly from Sequences of Music},
  number = {1300},
  year = {2008},
  institution = {Universit{\'{e}} de Montr{\'{e}}al DIRO},
  address = {http://www.iro.umontreal.ca/\-\~{}eckdoug/papers/tr1300.pdf},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/tr1300.pdf},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% The exporter's empty volume field was dropped (no data).
@INPROCEEDINGS{eck+schmidhuber:icann2002,
  author = {Eck, Douglas and Schmidhuber, Juergen},
  editor = {Dorronsoro, J.},
  title = {Learning The Long-Term Structure of the Blues},
  booktitle = {{Artificial Neural Networks -- ICANN 2002 (Proceedings)}},
  year = {2002},
  pages = {284--289},
  publisher = {Springer},
  address = {Berlin},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_icannMusic.pdf},
  abstract = {In general music composed by recurrent neural networks
({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, they have been unable to learn an entire musical form and use that knowledge to guide composition. In this study, we describe model details and present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and some listeners believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
  source     = {OwnPublication},
  sourcetype = {Conference},
}

% IEEE NNSP 2002 workshop paper on LSTM blues improvisation.
@INPROCEEDINGS{eck+schmidhuber:ieee2002,
  author    = {Eck, Douglas and Schmidhuber, Juergen},
  editor    = {Bourlard, H.},
  title     = {Finding Temporal Structure in Music: Blues Improvisation with {LSTM} Recurrent Networks},
  booktitle = {Neural Networks for Signal Processing XII, Proceedings of the 2002 IEEE Workshop},
  year      = {2002},
  pages     = {747--756},
  publisher = {IEEE},
  address   = {New York},
  url       = {http://www.iro.umontreal.ca/~eckdoug/papers/2002_ieee.pdf},
  abstract  = {Few types of signal streams are as ubiquitous as music. Here we consider the problem of extracting essential ingredients of music signals, such as well-defined global temporal structure in the form of nested periodicities (or {\em meter}). Can we construct an adaptive signal processing device that learns by example how to generate new instances of a given musical style? Because recurrent neural networks can in principle learn the temporal structure of a signal, they are good candidates for such a task. Unfortunately, music composed by standard recurrent neural networks ({RNN}s) often lacks global coherence. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure.
Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and learning of context sensitive languages. In the current study we show that {LSTM} is also a good mechanism for learning to compose music. We present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and we believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% Page range written with the BibTeX en-dash form (--).
@ARTICLE{eck+scott:2005,
  author = {Eck, Douglas and Scott, S. K.},
  title = {Editorial: New Research in Rhythm Perception and Production},
  journal = {Music Perception},
  volume = {22},
  number = {3},
  year = {2005},
  pages = {371--388},
  source = {OwnPublication},
  sourcetype = {Other},
}

@MISC{eck+scott:editor2005,
  author = {Eck, Douglas and Scott, S. K.},
  title = {Music Perception},
  year = {2005},
  note = {Guest Editor, Special Issue on Rhythm Perception and Production, 22(3)},
  source = {OwnPublication},
  sourcetype = {Other},
}

% The exporter's empty editor field was dropped (no data).
@INPROCEEDINGS{eck:1999,
  author = {Eck, Douglas},
  title = {Learning Simple Metrical Preferences in a Network of {F}itzhugh-{N}agumo Oscillators},
  booktitle = {{The Proceedings of the Twenty-First Annual Conference of the Cognitive Science Society}},
  year = {1999},
  publisher = {Lawrence Erlbaum Associates},
  address = {New Jersey},
  abstract = {Hebbian learning is used to train a network of oscillators to prefer periodic signals of pulses over aperiodic signals. Target signals consisted of metronome-like voltage pulses with varying amounts of inter-onset noise injected. (with 0\% noise yielding a periodic signal and more noise yielding more and more aperiodic signals.)
The oscillators---piecewise-linear approximations (Abbott, 1990) to Fitzhugh-Nagumo oscillators---are trained using mean phase coherence as an objective function. Before training a network is shown to readily synchronize with signals having wide range of noise. After training on a series of noise-free signals, a network is shown to only synchronize with signals having little or no noise. This represents a bias towards periodicity and is explained by strong positive coupling connections between oscillators having harmonically-related periods.},
  source     = {OwnPublication},
  sourcetype = {Conference},
}

% Workshop talk. The opt* fields are exporter placeholders whose value is
% the two-character string "" (unknown field names, so styles ignore them).
@UNPUBLISHED{eck:bramsworkshop2004,
  author     = {Eck, Douglas},
  title      = {Challenges for Machine Learning in the Domain of Music},
  year       = {2004},
  note       = {BRAMS Workshop on Brain and Music, Montreal Neurological Institute},
  abstract   = {Slides and musical examples available on request.},
  source     = {OwnPublication},
  sourcetype = {Workshop},
  optkey     = {""},
  optmonth   = {""},
  optannote  = {""},
}

@PHDTHESIS{eck:diss,
  author   = {Eck, Douglas},
  title    = {{Meter Through Synchrony: Processing Rhythmical Patterns with Relaxation Oscillators}},
  year     = {2000},
  school   = {Indiana University, Bloomington, IN, www.idsia.ch/\-\~{}doug/\-publications.html},
  abstract = {This dissertation uses a network of relaxation oscillators to beat along with temporal signals. Relaxation oscillators exhibit interspersed slow-fast movement and model a wide array of biological oscillations. The model is built up gradually: first a single relaxation oscillator is exposed to rhythms and shown to be good at finding downbeats in them. Then large networks of oscillators are mutually coupled in an exploration of their internal synchronization behavior. It is demonstrated that appropriate weights on coupling connections cause a network to form multiple pools of oscillators having stable phase relationships. This is a promising first step towards networks that can recreate a rhythmical pattern from memory.
In the full model, a coupled network of relaxation oscillators is exposed to rhythmical patterns. It is shown that the network finds downbeats in patterns while continuing to exhibit good internal stability. A novel non-dynamical model of downbeat induction called the Normalized Positive (NP) clock model is proposed, analyzed, and used to generate comparison predictions for the oscillator model. The oscillator model compares favorably to other dynamical approaches to beat induction such as adaptive oscillators. However, the relaxation oscillator model takes advantage of intrinsic synchronization stability to allow the creation of large coupled networks. This research lays the groundwork for a long-term research goal, a robotic arm that responds to rhythmical signals by tapping along. It also opens the door to future work in connectionist learning of long rhythmical patterns.},
  source = {OwnPublication},
  sourcetype = {Thesis},
}

% The exporter's empty volume field was dropped (no data).
@INPROCEEDINGS{eck:icann2001,
  author = {Eck, Douglas},
  editor = {Dorffner, Georg},
  title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
  booktitle = {{Artificial Neural Networks -- ICANN 2001 (Proceedings)}},
  year = {2001},
  pages = {1239--1247},
  publisher = {Springer},
  address = {Berlin},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_icann.pdf},
  abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity.
We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% The exporter's empty editor field was dropped (no data).
@INPROCEEDINGS{eck:icassp2007,
  author = {Eck, Douglas},
  title = {Beat Tracking Using an Autocorrelation Phase Matrix},
  booktitle = {{Proceedings of the 2007 International Conference on Acoustics, Speech and Signal Processing (ICASSP)}},
  year = {2007},
  pages = {1313--1316},
  publisher = {IEEE Signal Processing Society},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2007_icassp.pdf},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% Page range written with the BibTeX en-dash form (--).
@INPROCEEDINGS{eck:icmpc2004,
  author = {Eck, Douglas},
  editor = {Lipscomb, S. D. and Ashley, R. and Gjerdingen, R. O. and Webster, P.},
  title = {A Machine-Learning Approach to Musical Sequence Induction That Uses Autocorrelation to Bridge Long Timelags},
  booktitle = {{The Proceedings of the Eighth International Conference on Music Perception and Cognition ({ICMPC}8)}},
  year = {2004},
  pages = {542--543},
  publisher = {Causal Productions},
  address = {Adelaide},
  abstract = {One major challenge in using statistical sequence learning methods in the domain of music lies in bridging the long timelags that separate important musical events. Consider, for example, the chord changes that convey the basic structure of a pop song. A sequence learner that cannot predict chord changes will almost certainly not be able to generate new examples in a musical style or to categorize songs by style. Yet, it is surprisingly difficult for a sequence learner to bridge the long timelags necessary to identify when a chord change will occur and what its new value will be. This is the case because chord changes can be separated by dozens or hundreds of intervening notes. One could solve this problem by treating chords as being special (as did Mozer, NIPS 1991).
But this is impractical---it requires chords to be labeled specially in the dataset, limiting the applicability of the model to non-labeled examples---and furthermore does not address the general issue of nested temporal structure in music. I will briefly describe this temporal structure (known commonly as "meter") and present a model that uses to its advantage an assumption that sequences are metrical. The model consists of an autocorrelation-based filtration that estimates online the most likely metrical tree (i.e. the frequency and phase of beat, measure, phrase \&etc.) and uses that to generate a series of sequences varying at different rates. These sequences correspond to each level in the hierarchy. Multiple learners can be used to treat each series separately and their predictions can be combined to perform composition and categorization. I will present preliminary results that demonstrate the usefulness of this approach. Time permitting I will also compare the model to alternate approaches.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% ICMPC9 (2006) short paper; the editor list continues on the following line.
@INPROCEEDINGS{eck:icmpc2006,
  author = {Eck, Douglas},
  editor = {Baroni, M. and Addessi, A. R. and Caterina, R.
and Costa, M.},
  title = {Beat Induction Using an Autocorrelation Phase Matrix},
  booktitle = {The Proceedings of the 9th International Conference on Music Perception and Cognition ({ICMPC9})},
  year = {2006},
  pages = {931--932},
  publisher = {Causal Productions},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% Workshop talk. The opt* fields are exporter placeholders whose value is
% the two-character string "" (unknown field names, so styles ignore them).
@UNPUBLISHED{eck:irisworkshop2004,
  author = {Eck, Douglas},
  title = {Using Autocorrelation to Bridge Long Timelags when Learning Sequences of Music},
  year = {2004},
  note = {IRIS 2004 Machine Learning Workshop, Ottawa, Canada},
  abstract = {Slides and musical examples available on request.},
  source = {OwnPublication},
  sourcetype = {Workshop},
  optkey = {""},
  optmonth = {""},
  optannote = {""},
}

@ARTICLE{eck:jnmr2001,
  author = {Eck, Douglas},
  title = {A Positive-Evidence Model for Rhythmical Beat Induction},
  journal = {Journal of New Music Research},
  volume = {30},
  number = {2},
  year = {2001},
  pages = {187--200},
  abstract = {The Normalized Positive (NPOS) model is a rule-based model that predicts downbeat location and pattern complexity in rhythmical patterns. Though derived from several existing models, the NPOS model is particularly effective at making correct predictions while at the same time having low complexity. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model.
Special attention is paid to the model's ability to account for the effects of musical experience on beat induction.},
  source     = {OwnPublication},
  sourcetype = {Journal},
}

% Workshop talks below carry opt* exporter placeholders whose value is the
% two-character string "" (unknown field names, so styles ignore them).
@UNPUBLISHED{eck:mipsworkshop2004,
  author     = {Eck, Douglas},
  title      = {Bridging Long Timelags in Music},
  year       = {2004},
  note       = {NIPS 2004 Workshop on Music and Machine Learning (MIPS), Whistler, British Columbia},
  abstract   = {Slides and musical examples available on request.},
  source     = {OwnPublication},
  sourcetype = {Workshop},
  optkey     = {""},
  optmonth   = {""},
  optannote  = {""},
}

@ARTICLE{eck:mp2006,
  author     = {Eck, Douglas},
  title      = {Finding Long-Timescale Musical Structure with an Autocorrelation Phase Matrix},
  journal    = {Music Perception},
  volume     = {24},
  number     = {2},
  year       = {2006},
  pages      = {167--176},
  source     = {OwnPublication},
  sourcetype = {Journal},
}

@UNPUBLISHED{eck:nipsworkshop2003,
  author     = {Eck, Douglas},
  title      = {Time-warped hierarchical structure in music and speech: A sequence prediction challenge},
  year       = {2003},
  note       = {NIPS 2003 Workshop on Recurrent Neural Networks, Whistler, British Columbia},
  abstract   = {Slides and musical examples available on request.},
  source     = {OwnPublication},
  sourcetype = {Workshop},
  optkey     = {""},
  optmonth   = {""},
  optannote  = {""},
}

@UNPUBLISHED{eck:nipsworkshop2006,
  author     = {Eck, Douglas},
  title      = {Generating music sequences with an echo state network},
  year       = {2006},
  note       = {NIPS 2006 Workshop on Echo State Networks and Liquid State Machines},
  abstract   = {Slides and musical examples available on request.},
  source     = {OwnPublication},
  sourcetype = {Workshop},
  optkey     = {""},
  optmonth   = {""},
  optannote  = {""},
}

@UNPUBLISHED{eck:nipsworkshop2007,
  author     = {Eck, Douglas},
  title      = {Measuring and modeling musical expression},
  year       = {2007},
  note       = {NIPS 2007 Workshop on Music, Brain and Cognition},
  source     = {OwnPublication},
  sourcetype = {Workshop},
  optkey     = {""},
  optmonth   = {""},
  optannote  = {""},
}

@ARTICLE{eck:psyres2002,
  author     = {Eck, Douglas},
  title      = {Finding Downbeats with a Relaxation Oscillator},
journal = {Psychol. Research}, volume = {66}, number = {1}, year = {2002}, pages = {18--25}, abstract = {A relaxation oscillator model of neural spiking dynamics is applied to the task of finding downbeats in rhythmical patterns. The importance of downbeat discovery or {\em beat induction} is discussed, and the relaxation oscillator model is compared to other oscillator models. In a set of computer simulations the model is tested on 35 rhythmical patterns from Povel \& Essens (1985). The model performs well, making good predictions in 34 of 35 cases. In an analysis we identify some shortcomings of the model and relate model behavior to dynamical properties of relaxation oscillators.}, source={OwnPublication}, sourcetype={Journal}, } @UNPUBLISHED{eck:rppw2005, author = {Eck, Douglas}, title = {Meter and Autocorrelation}, year = {2005}, note = {{10th Rhythm Perception and Production Workshop (RPPW), Alden Biesen, Belgium}}, source={OwnPublication}, sourcetype={Workshop}, } @TECHREPORT{eck:tr-music2002, author = {Eck, Douglas and Schmidhuber, Juergen}, month = {March}, title = {A First Look at Music Composition using {LSTM} Recurrent Neural Networks}, number = {IDSIA-07-02}, year = {2002}, institution = {IDSIA}, address = {www.idsia.ch/\-techrep.html}, abstract = {In general music composed by recurrent neural networks ({RNN}s) suffers from a lack of global structure. Though networks can learn note-by-note transition probabilities and even reproduce phrases, attempts at learning an entire musical form and using that knowledge to guide composition have been unsuccessful. The reason for this failure seems to be that {RNN}s cannot keep track of temporally distant events that indicate global music structure. Long Short-Term Memory ({LSTM}) has succeeded in similar domains where other {RNN}s have failed, such as timing \& counting and CSL learning. In the current study I show that {LSTM} is also a good mechanism for learning to compose music. 
I compare this approach to previous attempts, with particular focus on issues of data representation. I present experimental results showing that {LSTM} successfully learns a form of blues music and is able to compose novel (and I believe pleasing) melodies in that style. Remarkably, once the network has found the relevant structure it does not drift from it: {LSTM} is able to play the blues with good timing and proper structure as long as one is willing to listen. {\em Note: This is a more complete version of the 2002 ICANN submission Learning the Long-Term Structure of the Blues.}},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% month uses the standard three-letter macro so styles can localise it.
@TECHREPORT{eck:tr-npos2000,
  author = {Eck, Douglas},
  month = oct,
  title = {A Positive-Evidence Model for Classifying Rhythmical Patterns},
  number = {IDSIA-09-00},
  year = {2000},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {The Normalized Positive (NPOS) model is a novel matching model that predicts downbeat location and pattern complexity in rhythmical patterns. Though similar models report success, the NPOS model is particularly effective at making these predictions while at the same time being theoretically and mathematically simple. In this paper, the details of the model are explored and a comparison is made to existing models. Several datasets are used to examine the complexity predictions of the model.
Special attention is paid to the model's ability to account for the effects of musical experience on rhythm perception.\\ {\em Note: See the 2001 Journal of New Music Research paper "A Positive-Evidence Model for Rhythmical Beat Induction" for a newer version of this paper.}},
  ps = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-09-00.ps.gz},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% month uses the standard three-letter macros so styles can localise them.
@TECHREPORT{eck:tr-oscnet2001,
  author = {Eck, Douglas},
  month = feb,
  title = {A Network of Relaxation Oscillators that Finds Downbeats in Rhythms},
  number = {IDSIA-06-01},
  year = {2001},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {A network of relaxation oscillators is used to find downbeats in rhythmical patterns. In this study, a novel model is described in detail. Its behavior is tested by exposing it to patterns having various levels of rhythmic complexity. We analyze the performance of the model and relate its success to previous work dealing with fast synchrony in coupled oscillators. \\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
  ps = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-06-01.ps.gz},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

@TECHREPORT{eck:tr-tracking2000,
  author = {Eck, Douglas},
  month = oct,
  title = {Tracking Rhythms with a Relaxation Oscillator},
  number = {IDSIA-10-00},
  year = {2000},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {A number of biological and mechanical processes are typified by a continued slow accrual and fast release of energy. A nonlinear oscillator exhibiting this slow-fast behavior is called a relaxation oscillator and is used to model, for example, human heartbeat pacemaking and neural action potential. Similar limit cycle oscillators are used to model a wider range of behaviors including predator-prey relationships and synchrony in animal populations such as fireflies.
Though nonlinear limit-cycle oscillators have been successfully applied to beat induction, relaxation oscillators have received less attention. In this work we offer a novel and effective relaxation oscillator model of beat induction. We outline the model in detail and provide a perturbation analysis of its response to external stimuli. In a series of simulations we expose the model to patterns from Experiment 1 of Povel \& Essens (1985). We then examine the beat assignments of the model. Although the overall performance of the model is very good, there are shortcomings. We believe that a network of mutually-coupled oscillators will address many of these shortcomings, and we suggest an appropriate course for future research.\\ {\em Note: See the 2001 {\em Psychological Research} article "Finding Downbeats with a Relaxation Oscillator" for a revised but less detailed version of this paper.}},
  ps = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-10-00.ps.gz},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% month uses the standard three-letter macro so styles can localise it.
@TECHREPORT{eck:tr-tracking2002,
  author = {Eck, Douglas},
  month = oct,
  title = {Real-Time Musical Beat Induction with Spiking Neural Networks},
  number = {IDSIA-22-02},
  year = {2002},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to perform automatically. We will present a model of beat induction that uses a spiking neural network as the underlying synchronization mechanism. This approach has some advantages over existing methods; it runs online, responds at many levels in the metrical hierarchy, and produces good results on performed music (Beatles piano performances encoded as MIDI).
In this paper the model is described in some detail and simulation results are discussed.},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% The abstract quotes the metre with TeX quote ligatures; a backslash before
% a double quote is the umlaut accent command in LaTeX, not an escaped quote.
% The exporter's empty address field was dropped (no data).
@UNPUBLISHED{eck:verita2002,
  author = {Eck, Douglas},
  title = {Real Time Beat Induction with Spiking Neurons},
  year = {2002},
  note = {{Music, Motor Control and the Mind: Symposium at Monte Verita, May}},
  abstract = {Beat induction is best described by analogy to the activities of hand clapping or foot tapping, and involves finding important metrical components in an auditory signal, usually music. Though beat induction is intuitively easy to understand it is difficult to define and still more difficult to model. I will discuss an approach to beat induction that uses a network of spiking neurons to synchronize with periodic components in a signal at many timescales. Through a competitive process, groups of oscillators embodying a particular metrical interpretation (e.g. ``4/4'') are selected from the network and used to track the pattern. I will compare this model to other approaches including a traditional symbolic AI system (Dixon 2001), and one based on Bayesian statistics (Cemgil et al, 2001). Finally I will present performance results of the network on a set of MIDI-recorded piano performances of Beatles songs collected by the Music, Mind, Machine Group, NICI, University of Nijmegen (see Cemgil et al, 2001 for more details or http://www.nici.kun.nl/mmm).},
  source = {OwnPublication},
  sourcetype = {Workshop},
}

% Inherits its proceedings data from the NIPS8 parent through crossref.
@INPROCEEDINGS{ElHihi+Bengio-nips8,
  author = {El Hihi, Salah and Bengio, Yoshua},
  title = {Hierarchical Recurrent Neural Networks for Long-Term Dependencies},
  year = {1996},
  crossref = {NIPS8},
  abstract = {We have already shown that extracting long-term dependencies from sequential data is difficult, both for deterministic dynamical systems such as recurrent networks, and probabilistic models such as hidden {Markov} models ({HMM}s) or input/output hidden {Markov} models ({IOHMM}s).
In practice, to avoid this problem, researchers have used domain specific a-priori knowledge to give meaning to the hidden or state variables representing past context. In this paper we propose to use a more general type of a-priori knowledge, namely that the temporal dependencies are structured hierarchically. This implies that long-term dependencies are represented by variables with a long time scale. This principle is applied to a recurrent network which includes delays and multiple time scales. Experiments confirm the advantages of such structures. A similar approach is proposed for {HMM}s and {IOHMM}s.},
  topics = {LongTerm},
  cat = {C},
}

% No journal field here: the entry points at the JMLR parent through crossref.
@ARTICLE{Erhan+al-2010,
  author   = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy},
  month    = feb,
  title    = {Why Does Unsupervised Pre-training Help Deep Learning?},
  volume   = {11},
  year     = {2010},
  pages    = {625--660},
  crossref = {JMLR},
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants, with impressive results obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks involve an unsupervised learning component, usually in an unsupervised pre-training phase. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. The main question investigated here is the following: why does unsupervised pre-training work and why does it work so well? Answering these questions is important if learning in deep architectures is to be further improved. We propose several explanatory hypotheses and test them through extensive simulations. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.
The experiments confirm and clarify the advantage of unsupervised pre-training. The results suggest that unsupervised pre-training guides the learning towards basins of attraction of minima that are better in terms of the underlying data distribution; the evidence from these results supports a regularization explanation for the effect of pre-training.}
}

% Page range written with the BibTeX en-dash form (--).
@INPROCEEDINGS{Erhan-aistats-2010,
  author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua and Vincent, Pascal},
  month = may,
  title = {Why Does Unsupervised Pre-training Help Deep Learning?},
  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
  volume = {9},
  year = {2010},
  pages = {201--208},
  location = {Chia Laguna Resort, Sardinia, Italy},
  abstract = {Much recent research has been devoted to learning algorithms for deep architectures such as Deep Belief Networks and stacks of auto-encoder variants with impressive results being obtained in several areas, mostly on vision and language datasets. The best results obtained on supervised learning tasks often involve an unsupervised learning component, usually in an unsupervised pre-training phase. The main question investigated here is the following: why does unsupervised pre-training work so well? Through extensive experimentation, we explore several possible explanations discussed in the literature including its action as a regularizer (Erhan et al. 2009) and as an aid to optimization (Bengio et al. 2007). Our results build on the work of Erhan et al. 2009, showing that unsupervised pre-training appears to play predominantly a regularization role in subsequent supervised training.
However our results in an online setting, with a virtually unlimited data stream, point to a somewhat more nuanced interpretation of the roles of optimization and regularization in the unsupervised pre-training effect.} }

% French-language abstract; grave accents are written as {\`{a}} / {\`{e}}.
@MASTERSTHESIS{Erhan-MSc,
  author = {Erhan, Dumitru},
  keywords = {Apprentissage multit{\^{a}}che, Filtrage collaboratif, M{\'{e}}thodes {\`{a}} noyaux, QSAR, R{\'{e}}seaux de neurones},
  title = {Collaborative filtering techniques for drug discovery},
  year = {2006},
  school = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {Cette th{\`{e}}se examine le probl{\`{e}}me d'apprendre plusieurs t{\^{a}}ches simultan{\'{e}}ment, afin de transf{\'{e}}rer les connaissances apprises {\`{a}} une nouvelle t{\^{a}}che. Si on suppose que les t{\^{a}}ches partagent une repr{\'{e}}sentation et qu'il est possible de d{\'{e}}couvrir cette repr{\'{e}}sentation efficacement, cela peut nous servir {\`{a}} construire un meilleur mod{\`{e}}le de la nouvelle t{\^{a}}che. Il existe plusieurs variantes de cette m{\'{e}}thode: transfert inductif, apprentissage multit{\^{a}}che, filtrage collaboratif etc. Nous avons {\'{e}}valu{\'{e}} plusieurs algorithmes d'apprentissage supervis{\'{e}} pour d{\'{e}}couvrir des repr{\'{e}}sentations partag{\'{e}}es parmi les t{\^{a}}ches d{\'{e}}finies dans un probl{\`{e}}me de chimie computationnelle. Nous avons formul{\'{e}} le probl{\`{e}}me dans un cadre d'apprentissage automatique, fait l'analogie avec les algorithmes standards de filtrage collaboratif et construit les hypoth{\`{e}}ses g{\'{e}}n{\'{e}}rales qui devraient {\^{e}}tre test{\'{e}}es pour valider l'utilisation des algorithmes multit{\^{a}}che. Nous avons aussi {\'{e}}valu{\'{e}} la performance des algorithmes d'apprentissage utilis{\'{e}}s et d{\'{e}}montrons qu'il est, en effet, possible de trouver une repr{\'{e}}sentation partag{\'{e}}e pour le probl{\`{e}}me consid{\'{e}}r{\'{e}}. 
Du point de vue th{\'{e}}orique, notre apport est une modification d'un algorithme standard---les machines {\`{a}} vecteurs de support---qui produit des r{\'{e}}sultats comparables aux meilleurs algorithmes disponibles et qui utilise {\`{a}} fond les concepts de l'apprentissage multit{\^{a}}che. Du point de vue pratique, notre apport est l'utilisation de notre algorithme par les compagnies pharmaceutiques dans leur d{\'{e}}couverte de nouveaux m{\'{e}}dicaments.}
}

@PHDTHESIS{Erhan-Phd-2010,
  author = {Erhan, Dumitru},
  keywords = {artificial neural networks, deep architectures, machine learning, unsupervised learning, visualization},
  month = oct,
  title = {Understanding deep architectures and the effect of unsupervised pre-training},
  year = {2010},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

@TECHREPORT{Erhan-vis-techreport-2010,
  author = {Erhan, Dumitru and Courville, Aaron and Bengio, Yoshua},
  month = oct,
  title = {Understanding Representations Learned in Deep Architectures},
  number = {1355},
  year = {2010},
  institution = {Universit{\'{e}} de Montr{\'{e}}al/DIRO},
  abstract = {Deep architectures have demonstrated state-of-the-art performance in a variety of settings, especially with vision datasets. Deep learning algorithms are based on learning several levels of representation of the input. Beyond test-set performance, there is a need for qualitative comparisons of the solutions learned by various deep architectures, focused on those learned representations. One of the goals of our research is to improve tools for finding good qualitative interpretations of high level features learned by such models. We also seek to gain insight into the invariances learned by deep networks. To this end, we contrast and compare several techniques for finding such interpretations. We applied our techniques on Stacked Denoising Auto-Encoders and Deep Belief Networks, trained on several vision datasets. 
We show that consistent filter-like interpretation is possible and simple to accomplish at the unit level. The tools developed make it possible to analyze deep models in more depth and accomplish the tracing of invariance manifolds for each of the hidden units. We hope that such techniques will allow researchers in deep architectures to understand more of how and why deep architectures work.} }

% Abstract uses plain ASCII "fi"/"fl" (no PDF ligature code points) so classic 8-bit BibTeX handles it.
@INPROCEEDINGS{Erhan2009,
  author = {Erhan, Dumitru and Manzagol, Pierre-Antoine and Bengio, Yoshua and Bengio, Samy and Vincent, Pascal},
  keywords = {Deep Networks},
  month = apr,
  title = {The Difficulty of Training Deep Architectures and the effect of Unsupervised Pre-Training},
  year = {2009},
  pages = {153--160},
  crossref = {xAISTATS2009},
  abstract = {Whereas theoretical work suggests that deep architectures might be more efficient at representing highly-varying functions, training deep architectures was unsuccessful until the recent advent of algorithms based on unsupervised pretraining. Even though these new algorithms have enabled training deep models, many questions remain as to the nature of this difficult learning problem. Answering these questions is important if learning in deep architectures is to be further improved. We attempt to shed some light on these questions through extensive simulations. The experiments confirm and clarify the advantage of unsupervised pre-training. They demonstrate the robustness of the training procedure with respect to the random initialization, the positive effect of pre-training in terms of optimization and its role as a regularizer. We empirically show the influence of pre-training with respect to architecture depth, model capacity, and number of training examples.}
}

@ARTICLE{gasser+eck+port:1999,
  author = {Gasser, M. 
and Eck, Douglas and Port, Robert},
  title = {Meter as Mechanism: A Neural Network Model that Learns Metrical patterns},
  journal = {Connection Science},
  volume = {11},
  number = {2},
  year = {1999},
  pages = {187--216},
  abstract = {One kind of prosodic structure that apparently underlies both music and some examples of speech production is meter. Yet detailed measurements of the timing of both music and speech show that the nested periodicities that define metrical structure can be quite noisy in time. What kind of system could produce or perceive such variable metrical timing patterns? And what would it take to be able to store and reproduce particular metrical patterns from long-term memory? We have developed a network of coupled oscillators that both produces and perceives patterns of pulses that conform to particular meters. In addition, beginning with an initial state with no biases, it can learn to prefer the particular meter that it has been previously exposed to.},
  own = {Have},
  source = {OwnPublication},
  sourcetype = {Journal},
}

% Technical-report precursor of the Connection Science article above.
@TECHREPORT{gasser+eck+port:tr-1996,
  author = {Gasser, M. and Eck, Douglas and Port, Robert},
  title = {Meter as Mechanism: A Neural Network that Learns Metrical Patterns},
  number = {180},
  year = {1996},
  institution = {Indiana University Cognitive Science Program},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

@INPROCEEDINGS{gasser+eck:1996,
  author = {Gasser, M. and Eck, Douglas},
  title = {Representing Rhythmic Patterns in a Network of Oscillators},
  booktitle = {The Proceedings of the International Conference on Music Perception and Cognition},
  number = {4},
  year = {1996},
  pages = {361--366},
  publisher = {Lawrence Erlbaum Associates},
  address = {New Jersey},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/1996_gasser_icmpc.pdf},
  abstract = {This paper describes an evolving computational model of the perception and production of simple rhythmic patterns. 
The model consists of a network of oscillators of different resting frequencies which couple with input patterns and with each other. Oscillators whose frequencies match periodicities in the input tend to become activated. Metrical structure is represented explicitly in the network in the form of clusters of oscillators whose frequencies and phase angles are constrained to maintain the harmonic relationships that characterize meter. Rests in rhythmic patterns are represented by explicit rest oscillators in the network, which become activated when an expected beat in the pattern fails to appear. The model makes predictions about the relative difficulty of patterns and the effect of deviations from periodicity in the input.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@INPROCEEDINGS{gers+eck+schmidhuber:icann2001,
  author = {Gers, F. A. and Eck, Douglas and Schmidhuber, Juergen},
  editor = {Dorffner, Georg},
  title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
  booktitle = {Artificial Neural Networks -- {ICANN} 2001 (Proceedings)},
  year = {2001},
  pages = {669--676},
  publisher = {Springer},
  address = {Berlin},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2001_gers_icann.pdf},
  abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@TECHREPORT{gers+eck+schmidhuber:tr-2000,
  author = {Gers, F. A. 
and Eck, Douglas and Schmidhuber, Juergen},
  month = dec,
  title = {Applying {LSTM} to Time Series Predictable Through Time-Window Approaches},
  number = {IDSIA-22-00},
  year = {2000},
  institution = {IDSIA},
  url = {http://www.idsia.ch/techrep.html},
  abstract = {Long Short-Term Memory ({LSTM}) is able to solve many time series tasks unsolvable by feed-forward networks using fixed size time windows. Here we find that {LSTM}'s superiority does {\em not} carry over to certain simpler time series tasks solvable by time window approaches: the Mackey-Glass series and the Santa Fe {FIR} laser emission series (Set A). This suggests to use {LSTM} only when simpler traditional approaches fail.\\ {\em Note: See the 2001 ICANN conference proceeding by the same title for a newer version of this paper.}},
  ps = {ftp://ftp.idsia.ch/pub/techrep/IDSIA-22-00.ps.gz},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% NOTE(review): the report-index URL was previously stored in the address field; the institution's city is not recorded here.

@INPROCEEDINGS{gers+perez+eck+schmidhuber:esann2002,
  author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
  title = {{DEKF-LSTM}},
  booktitle = {Proceedings of the 10th European Symposium on Artificial Neural Networks, ESANN 2002},
  year = {2002},
  source = {OwnPublication},
  sourcetype = {Conference},
}

@INPROCEEDINGS{gers+perez+eck+schmidhuber:icannA2002,
  author = {Gers, F. A. and Perez-Ortiz, J. A. and Eck, Douglas and Schmidhuber, Juergen},
  editor = {Dorronsoro, J.},
  title = {Learning Context Sensitive Languages with {LSTM} Trained with {Kalman} Filters},
  booktitle = {Artificial Neural Networks -- {ICANN} 2002 (Proceedings)},
  year = {2002},
  pages = {655--660},
  publisher = {Springer},
  address = {Berlin},
  abstract = {Unlike traditional recurrent neural networks, the Long Short-Term Memory ({LSTM}) model generalizes well when presented with training sequences derived from regular and also simple nonregular languages. 
Our novel combination of {LSTM} and the decoupled extended Kalman filter, however, learns even faster and generalizes even better, requiring only the 10 shortest exemplars $n \leq 10$ of the context sensitive language $a^n b^n c^n$ to deal correctly with values of $n$ up to 1000 and more. Even when we consider the relatively high update complexity per timestep, in many cases the hybrid offers faster learning than {LSTM} by itself.}, source={OwnPublication}, sourcetype={Conference}, }

@PHDTHESIS{Ghosn-Phd-2003,
  author = {Ghosn, Joumana},
  title = {Apprentissage multi-t{\^{a}}ches et partage de connaissances},
  year = {2003},
  school = {Universit{\'{e}} de Montr{\'{e}}al}
}

% Proceedings title is inherited through the crossref.
@INPROCEEDINGS{ghosn97,
  author = {Ghosn, Joumana and Bengio, Yoshua},
  title = {Multi-Task Learning for Stock Selection},
  year = {1997},
  pages = {946--952},
  publisher = {MIT Press, Cambridge, MA},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/multitask-nips97.pdf},
  crossref = {NIPS9},
  abstract = {Artificial Neural Networks can be used to predict future returns of stocks in order to take financial decisions. Should one build a separate network for each stock or share the same network for all the stocks? In this paper we also explore other alternatives, in which some layers are shared and others are not shared. When the prediction of future returns for different stocks are viewed as different tasks, sharing some parameters across stocks is a form of multi-task learning. 
In a series of experiments with Canadian stocks, we obtain yearly returns that are more than 14\% above various benchmarks.}, topics={MultiTask,Finance},cat={C}, }

@TECHREPORT{Gingras-asynchronous-TR96,
  author = {Gingras, Fran{\c c}ois and Bengio, Yoshua},
  title = {Handling asynchronous or missing financial data with recurrent networks},
  number = {1020},
  year = {1996},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  topics = {Finance,Missing},
  cat = {T},
}

@TECHREPORT{Gingras-financial-TR99,
  author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
  title = {On Out-of-Sample Statistics for Financial Time-Series},
  number = {2585},
  year = {1999},
  institution = {D{\'{e}}partement d'informatique et recherche op{\'{e}}rationnelle, Universit{\'{e}} de Montr{\'{e}}al},
  topics = {Comparative,Finance},
  cat = {T},
}

@INPROCEEDINGS{gingras2000,
  author = {Gingras, Fran{\c c}ois and Bengio, Yoshua and Nadeau, Claude},
  title = {On Out-of-Sample Statistics for Time-Series},
  booktitle = {Computational Finance 2000},
  year = {2000},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/out-err-cf2000.pdf},
  abstract = {This paper studies an out-of-sample statistic for time-series prediction that is analogous to the widely used R2 in-sample statistic. We propose and study methods to estimate the variance of this out-of-sample statistic. We suggest that the out-of-sample statistic is more robust to distributional and asymptotic assumptions behind many tests for in-sample statistics. Furthermore we argue that it may be more important in some cases to choose a model that generalizes as well as possible rather than choose the parameters that are closest to the true parameters. Comparative experiments are performed on a financial time-series (daily and monthly returns of the TSE300 index). 
The experiments are performed for varying prediction horizons and we study the relation between predictability (out-of-sample R2), variability of the out-of-sample R2 statistic, and the prediction horizon.},
  topics = {Comparative,Finance},
  cat = {C},
}

@INPROCEEDINGS{Glorot+al-AI-2011,
  author = {Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
  month = apr,
  title = {Deep Sparse Rectifier Neural Networks},
  booktitle = {JMLR W\&CP: Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2011)},
  year = {2011},
  abstract = {While logistic sigmoid neurons are more biologically plausible than hyperbolic tangent neurons, the latter work better for training multi-layer neural networks. This paper shows that rectifying neurons are an even better model of biological neurons and yield equal or better performance than hyperbolic tangent networks in spite of the hard non-linearity and non-differentiability at zero, creating sparse representations with true zeros, which seem remarkably suitable for naturally sparse data. Even though they can take advantage of semi-supervised setups with extra-unlabeled data, deep rectifier networks can reach their best performance without requiring any unsupervised pre-training on purely supervised tasks with large labeled datasets. Hence, these results can be seen as a new milestone in the attempts at understanding the difficulty in training deep but purely supervised neural networks, and closing the performance gap between neural networks learnt with and without unsupervised pre-training.}
}

% NOTE(review): the abstract of the next entry describes the Unsupervised and Transfer
% Learning Challenge, not domain adaptation for sentiment classification; its volume and
% pages may belong to that other paper as well. Verify all three against the ICML 2011
% proceedings before citing.
@INPROCEEDINGS{Glorot+al-ICML-2011,
  author = {Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
  month = jun,
  title = {Domain Adaptation for Large-Scale Sentiment Classification: A Deep Learning Approach},
  volume = {27},
  year = {2011},
  pages = {97--110},
  crossref = {ICML11},
  abstract = {Learning good representations from a large set of unlabeled data is a particularly challenging task. 
Recent work (see ? for a review) shows that training deep architectures is a good way to extract such representations, by extracting and disentangling gradually higher-level factors of variation characterizing the input distribution. In this paper, we describe different kinds of layers we trained for learning representations in the setting of the Unsupervised and Transfer Learning Challenge. The strategy of our team won the final phase of the challenge. It combined and stacked different one-layer unsupervised learning algorithms, adapted to each of the five datasets of the competition. This paper describes that strategy and the particular one-layer learning algorithms feeding a simple linear classifier with a tiny number of labeled training samples (1 to 64 per class).} }

@INPROCEEDINGS{Glorot-et-al-ICLR-2013,
  author = {Glorot, Xavier and Bordes, Antoine and Weston, Jason and Bengio, Yoshua},
  month = may,
  title = {A Semantic Matching Energy Function for Learning with Multi-relational Data},
  booktitle = {1st International Conference on Learning Representations (ICLR) (workshop poster)},
  year = {2013},
  location = {Scottsdale, USA}
}

% Workshop poster; the former abstract field only repeated the author list and was dropped.
@MISC{Glorot-et-al-NIPS-DLUFL2010,
  author = {Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua},
  title = {Deep Sparse Rectifier Networks},
  year = {2010},
  howpublished = {NIPS*2010 Workshop on Deep Learning and Unsupervised Feature Learning (poster)}
}

@INPROCEEDINGS{GlorotAISTATS2010,
  author = {Glorot, Xavier and Bengio, Yoshua},
  month = may,
  title = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {JMLR W\&CP: Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AISTATS 2010)},
  volume = {9},
  year = {2010},
  pages = {249--256},
  location = {Chia Laguna Resort, Sardinia, Italy},
  abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then 
several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.} }

% arXiv report on the contests; the entry Goodfeli-et-al-ICONIP-2013 below has the same title and author list.
@MISC{Goodfeli-et-al-2013,
  author = {Goodfellow, Ian J. 
and Erhan, Dumitru and Carrier, Pierre-Luc and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and Zhou, Yingbo and Ramaiah, Chetan and Feng, Fangxiang and Li, Ruifan and Wang, Xiaojie and Athanasakis, Dimitris and Shawe-Taylor, John and Milakov, Maxim and Park, John and Ionescu, Radu and Popescu, Marius and Grozea, Cristian and Bergstra, James and Xie, Jingjing and Romaszko, Lukasz and Xu, Bing and Chuang, Zhang and Bengio, Yoshua},
  keywords = {competition, dataset, representation learning},
  title = {Challenges in Representation Learning: A report on three machine learning contests},
  year = {2013},
  institution = {Unicer},
  url = {http://arxiv.org/abs/1307.0414},
  abstract = {The ICML 2013 Workshop on Challenges in Representation Learning focused on three challenges: the black box learning challenge, the facial expression recognition challenge, and the multimodal learning challenge. We describe the datasets created for these challenges and summarize the results of the competitions. We provide suggestions for organizers of future challenges and some comments on what kind of knowledge can be gained from machine learning competitions. http://deeplearning.net/icml2013-workshop-competition}
}

@INPROCEEDINGS{Goodfeli-et-al-ICONIP-2013,
  author = {Goodfellow, Ian J. 
and Erhan, Dumitru and Carrier, Pierre-Luc and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and Zhou, Yingbo and Ramaiah, Chetan and Feng, Fangxiang and Li, Ruifan and Wang, Xiaojie and Athanasakis, Dimitris and Shawe-Taylor, John and Milakov, Maxim and Park, John and Ionescu, Radu and Popescu, Marius and Grozea, Cristian and Bergstra, James and Xie, Jingjing and Romaszko, Lukasz and Xu, Bing and Chuang, Zhang and Bengio, Yoshua},
  title = {Challenges in Representation Learning: A report on three machine learning contests},
  booktitle = {International Conference On Neural Information Processing},
  year = {2013}
}

@ARTICLE{Goodfeli-et-al-TPAMI-Deep-PrePrint-2013,
  author = {Goodfellow, Ian J. and Courville, Aaron and Bengio, Yoshua},
  month = aug,
  title = {Scaling up spike-and-slab models for unsupervised feature learning},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume = {35},
  number = {8},
  year = {2013},
  pages = {1902--1914},
  issn = {0162-8828},
  abstract = {We describe the use of two spike-and-slab models for modeling real-valued data, with an emphasis on their applications to object recognition. The first model, which we call spike-and-slab sparse coding (S3C), is a pre-existing model for which we introduce a faster approximate inference algorithm. We introduce a deep variant of S3C which we call the partially directed deep {Boltzmann} machine (PD-DBM) and extend our S3C inference algorithm for use on this model. We describe learning procedures for each. We demonstrate that our inference procedure for S3C enables scaling the model to unprecedentedly large problem sizes, and demonstrate that using S3C as a feature extractor results in very good object recognition performance, particularly when the number of labeled examples is low. 
We show that the PD-DBM generates better samples than its shallow counterpart, and that unlike DBMs or DBNs, the PD-DBM may be trained successfully without greedy layerwise training.} }

% arXiv report; an INPROCEEDINGS entry with the same title, Goodfellow-et-al-ICML2013, also exists in this file.
@TECHREPORT{Goodfeli-et-al-TR2013,
  author = {Goodfellow, Ian J. and Warde-Farley, David and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua},
  month = feb,
  title = {Maxout Networks},
  number = {Arxiv report 1302.4389},
  year = {2013},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://arxiv.org/abs/1302.4389},
  abstract = {We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN.}
}

@INPROCEEDINGS{Goodfellow+al-ICLR2014,
  author = {Goodfellow, Ian J. and Mirza, Mehdi and Xiao, Da and Courville, Aaron and Bengio, Yoshua},
  title = {An Empirical Investigation of Catastrophic Forgetting in Gradient-Based Neural Networks},
  year = {2014},
  crossref = {ICLR2014-conf},
  abstract = {Catastrophic forgetting is a problem faced by many machine learning models and algorithms. When trained on one task, then trained on a second task, many machine learning models ``forget'' how to perform the first task. This is widely believed to be a serious problem for neural networks. Here, we investigate the extent to which the catastrophic forgetting problem occurs for modern neural networks, comparing both established and recent gradient-based training algorithms and activation functions. 
We also examine the effect of the relationship between the first task and the second task on catastrophic forgetting. We find that it is always best to train using the dropout algorithm---the dropout algorithm is consistently best at adapting to the new task, remembering the old task, and has the best tradeoff curve between these two extremes. We find that different tasks and relationships between tasks result in very different rankings of activation function performance. This suggests the choice of activation function should always be cross-validated.}
}

@INPROCEEDINGS{Goodfellow+al-ICML2012,
  author = {Goodfellow, Ian J. and Courville, Aaron and Bengio, Yoshua},
  title = {Large-Scale Feature Learning With Spike-and-Slab Sparse Coding},
  year = {2012},
  crossref = {ICML12}
}

@INPROCEEDINGS{goodfellow+all-NIPS2011,
  author = {Goodfellow, Ian J. and Courville, Aaron and Bengio, Yoshua},
  title = {Spike-and-Slab Sparse Coding for Unsupervised Feature Discovery},
  booktitle = {NIPS Workshop on Challenges in Learning Hierarchical Models},
  year = {2011}
}

@TECHREPORT{Goodfellow-et-al-ARXIV2014,
  author = {Goodfellow, Ian J. and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  title = {Generative Adversarial Networks},
  number = {Arxiv report 1406.2661},
  year = {2014},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  url = {http://arxiv.org/abs/1406.2661},
  abstract = {We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. 
In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to 1/2 everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any {Markov} chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.} }

@MISC{Goodfellow-et-al-ICLRWorkshop2013,
  author = {Goodfellow, Ian J. and Courville, Aaron and Bengio, Yoshua},
  title = {Joint Training Deep {Boltzmann} Machines for Classification},
  year = {2013},
  howpublished = {ICLR 2013 workshops track}
}

% Proceedings title is inherited through the crossref.
@INPROCEEDINGS{Goodfellow-et-al-ICML2013,
  author = {Goodfellow, Ian J. and Warde-Farley, David and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua},
  editor = {Dasgupta, Sanjoy and McAllester, David},
  title = {Maxout Networks},
  year = {2013},
  pages = {1319--1327},
  crossref = {ICML13},
  abstract = {We consider the problem of designing models to leverage a recently introduced approximate model averaging technique called dropout. We define a simple new model called maxout (so named because its output is the max of a set of inputs, and because it is a natural companion to dropout) designed to both facilitate optimization by dropout and improve the accuracy of dropout's fast approximate model averaging technique. We empirically verify that the model successfully accomplishes both of these tasks. We use maxout and dropout to demonstrate state of the art classification performance on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN.}
}

@INPROCEEDINGS{Goodfellow-et-al-NIPS2013,
  author = {Goodfellow, Ian J. 
and Mirza, Mehdi and Courville, Aaron and Bengio, Yoshua},
  month = dec,
  title = {Multi-Prediction Deep {Boltzmann} Machines},
  year = {2013},
  crossref = {NIPS26}
}

@TECHREPORT{Goodfellow-TR2010,
  author = {Goodfellow, Ian J.},
  title = {Technical Report: Multidimensional, Downsampled Convolution for Autoencoders},
  year = {2010},
  institution = {Universit{\'{e}} de Montr{\'{e}}al},
  abstract = {This technical report describes discrete convolution with a multidimensional kernel. Convolution implements matrix multiplication by a sparse matrix with several elements constrained to be equal to each other. To implement a convolutional autoencoder, the gradients of this operation, the transpose of this operation, and the gradients of the transpose are all needed. When using standard convolution, each of these supplementary operations can be described as a convolution on slightly modified arguments. When the output is implicitly downsampled by moving the kernel in more than one pixel at each step, we must define two new operations in order to compute all of the necessary values.}
}

% The acronym in the title is braced so sentence-casing styles keep it upper-case.
@INPROCEEDINGS{Gori89,
  author = {Gori, Marco and Bengio, Yoshua and De Mori, Renato},
  title = {{BPS}: a learning algorithm for capturing the dynamic nature of speech},
  booktitle = {International Joint Conference on Neural Networks},
  volume = {2},
  year = {1989},
  pages = {417--424},
  publisher = {IEEE, New York},
  address = {Washington, DC},
  topics = {Speech},
  cat = {C},
}

@INCOLLECTION{Grandvalet+Bengio-ssl-2006,
  author = {Grandvalet, Yves and Bengio, Yoshua},
  editor = {Chapelle, Olivier and {Sch{\"{o}}lkopf}, Bernhard and Zien, Alexander},
  title = {Entropy Regularization},
  booktitle = {Semi-Supervised Learning},
  year = {2006},
  pages = {151--168},
  publisher = {{MIT} Press},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/entropy_regularization_2006.pdf},
  abstract = {The problem of semi-supervised induction consists in learning a decision rule from labeled and unlabeled data. 
This task can be undertaken by discriminative methods, provided that learning criteria are adapted consequently. In this chapter, we motivate the use of entropy regularization as a means to benefit from unlabeled data in the framework of maximum a posteriori estimation. The learning criterion is derived from clearly stated assumptions and can be applied to any smoothly parametrized model of posterior probabilities. The regularization scheme favors low density separation, without any modeling of the density of input features. The contribution of unlabeled data to the learning criterion induces local optima, but this problem can be alleviated by deterministic annealing. For well-behaved models of posterior probabilities, deterministic annealing {EM} provides a decomposition of the learning problem in a series of concave subproblems. Other approaches to the semi-supervised problem are shown to be close relatives or limiting cases of entropy regularization. A series of experiments illustrates the good behavior of the algorithm in terms of performance and robustness with respect to the violation of the postulated low density separation assumption. The minimum entropy solution benefits from unlabeled data and is able to challenge mixture models and manifold learning in a number of situations.}, cat={B},topics={Unsupervised}, }

@INPROCEEDINGS{graves+eck+schmidhuber:bio-adit2004,
  author = {Graves, A. and Eck, Douglas and Beringer, N. and Schmidhuber, Juergen},
  title = {Biologically Plausible Speech Recognition with {LSTM} Neural Nets},
  booktitle = {Proceedings of the First Int'l Workshop on Biologically Inspired Approaches to Advanced Information Technology (Bio-ADIT)},
  year = {2004},
  pages = {127--136},
  url = {http://www.iro.umontreal.ca/~eckdoug/papers/2004_bioadit.pdf},
  abstract = {Long Short-Term Memory ({LSTM}) recurrent neural networks ({RNN}s) are local in space and time and closely related to a biological model of memory in the prefrontal cortex. 
Not only are they more biologically plausible than previous artificial {RNN}s, they also outperformed them on many artificially generated sequential processing tasks. This encouraged us to apply {LSTM} to more realistic problems, such as the recognition of spoken digits. Without any modification of the underlying algorithm, we achieved results comparable to state-of-the-art Hidden {Markov} Model ({HMM}) based recognisers on both the {TIDIGITS} and TI46 speech corpora. We conclude that {LSTM} should be further investigated as a biologically plausible basis for a bottom-up, neural net-based approach to speech recognition.},
  source = {OwnPublication},
  sourcetype = {Conference},
}

% IDSIA technical report; month uses the predefined three-letter macro so styles can localise it.
% NOTE(review): the address field holds a web location rather than a city -- consider moving it to url once the scheme is confirmed.
@TECHREPORT{graves+eck+schmidhuber:tr-digits2003,
  author = {Graves, A. and Eck, Douglas and Schmidhuber, Juergen},
  month = may,
  title = {Comparing {LSTM} Recurrent Networks and Spiking Recurrent Networks on the Recognition of Spoken Digits},
  number = {IDSIA-13-03},
  year = {2003},
  institution = {IDSIA},
  address = {www.idsia.ch/\-techrep.html},
  abstract = {One advantage of spiking recurrent neural networks ({SNN}s) is an ability to categorise data using a synchrony-based latching mechanism. This is particularly useful in problems where timewarping is encountered, such as speech recognition. Differentiable recurrent neural networks ({RNN}s) by contrast fail at tasks involving difficult timewarping, despite having sequence learning capabilities superior to {SNN}s. In this paper we demonstrate that Long Short-Term Memory ({LSTM}) is an {RNN} capable of robustly categorizing timewarped speech data, thus combining the most useful features of both paradigms. We compare its performance to {SNN}s on two variants of a spoken digit identification task, using data from an international competition. The first task (described in Nature (Nadis 2003)) required the categorisation of spoken digits with only a single training exemplar, and was specifically designed to test robustness to timewarping.
Here {LSTM} performed better than all the {SNN}s in the competition. The second task was to predict spoken digits using a larger training set. Here {LSTM} greatly outperformed an {SNN}-like model found in the literature. These results suggest that {LSTM} has a place in domains that require the learning of large timewarped datasets, such as automatic speech recognition.},
  source = {OwnPublication},
  sourcetype = {TechReport},
}

% ICLR 2013 paper; every accented letter in the author name is written as a LaTeX escape
% so the entry stays pure ASCII like the rest of the name.
@INPROCEEDINGS{Gulcehre+Bengio-ICLR2013,
  author = {G{\"{u}}l{\c c}ehre, {\c C}a{\u{g}}lar and Bengio, Yoshua},
  title = {Knowledge Matters: Importance of Prior Information for Optimization},
  booktitle = {International Conference on Learning Representations (ICLR'2013)},
  year = {2013},
  abstract = {We explore the effect of introducing prior information into the intermediate level of neural networks for a learning task on which all the state-of-the-art machine learning algorithms tested failed to learn. We motivate our work from the hypothesis that humans learn such intermediate concepts from other individuals via a form of supervision or guidance using a curriculum. The experiments we have conducted provide positive evidence in favor of this hypothesis. In our experiments, a two-tiered MLP architecture is trained on a dataset with 64x64 binary inputs images, each image with three sprites. The final task is to decide whether all the sprites are the same or one of them is different. Sprites are pentomino tetris shapes and they are placed in an image with different locations using scaling and rotation transformations. The first part of the two-tiered MLP is pre-trained with intermediate-level targets being the presence of sprites at each location, while the second part takes the output of the first part as input and predicts the final task's target binary event.
The two-tiered MLP architecture, with a few tens of thousand examples, was able to learn the task perfectly, whereas all other algorithms (include unsupervised pre-training, but also traditional algorithms like {SVM}s, decision trees and boosting) all perform no better than chance. We hypothesize that the optimization difficulty involved when the intermediate pre-training is not performed is due to the {\em composition} of two highly non-linear tasks. Our findings are also consistent with hypotheses on cultural learning inspired by the observations of optimization problems with deep learning, presumably because of effective local minima.}
}

% Author names use "Last, First" form; the middle initial belongs with the given name
% ("Howard, Paul G."), otherwise BibTeX treats "G. Howard" as the surname.
@INPROCEEDINGS{haffner-98,
  author = {Haffner, Patrick and Bottou, {L{\'{e}}on} and Howard, Paul G. and Simard, Patrice and Bengio, Yoshua and {LeCun}, Yann},
  title = {Browsing through High Quality Document Images with {DjVu}},
  booktitle = {Proc. of Advances in Digital Libraries 98},
  year = {1998},
  pages = {309--318},
  publisher = {IEEE},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/haffner-98.ps.gz},
  topics = {HighDimensional},
  cat = {C},
}

% ISMIR 2009 paper on instrument-class identification.
@INPROCEEDINGS{Hamel+al-2009,
  author = {Hamel, Philippe and Wood, Sean and Eck, Douglas},
  title = {Automatic Identification of Instrument Classes in Polyphonic and Poly-Instrument Audio},
  booktitle = {10th International Society for Music Information Retrieval Conference},
  year = {2009},
  pages = {399--404},
  location = {Kobe, Japan},
  url = {http://ismir2009.ismir.net/proceedings/PS3-2.pdf},
  abstract = {We present and compare several models for automatic identification of instrument classes in polyphonic and poly-instrument audio. The goal is to be able to identify which categories of instrument (Strings, Woodwind, Guitar, Piano, etc.) are present in a given audio example. We use a machine learning approach to solve this task. We constructed a system to generate a large database of musically relevant poly-instrument audio.
Our database is generated from hundreds of instruments classified in 7 categories. Musical audio examples are generated by mixing multi-track MIDI files with thousands of instrument combinations. We compare three different classifiers : a Support Vector Machine ({SVM}), a Multilayer Perceptron (MLP) and a Deep Belief Network (DBN). We show that the DBN tends to outperform both the {SVM} and the MLP in most cases.}
}

% ISMIR 2010 paper on learning audio features with deep belief networks.
@INPROCEEDINGS{Hamel+al:2010,
  author = {Hamel, Philippe and Eck, Douglas},
  month = aug,
  title = {Learning features from music audio with deep belief networks},
  booktitle = {Proceedings of the 11th {I}nternational {S}ociety for {M}usic {I}nformation {R}etrieval {C}onference ({ISMIR})},
  year = {2010},
  pages = {339--344},
  location = {Utrecht, The Netherlands},
  abstract = {Feature extraction is a crucial part of many MIR tasks. In this work, we present a system that can automatically extract relevant features from audio for a given task. The feature extraction system consists of a Deep Belief Network (DBN) on Discrete Fourier Transforms (DFTs) of the audio. We then use the activations of the trained network as inputs for a non-linear Support Vector Machine ({SVM}) classifier. In particular, we learned the features to solve the task of genre recognition. The learned features perform significantly better than MFCCs. Moreover, we obtain a classification accuracy of 84.3\% on the Tzanetakis dataset, which compares favorably against state-of-the-art genre classifiers using frame-based features. We also applied these same features to the task of auto-tagging.
The autotaggers trained with our features performed better than those that were trained with timbral and temporal features.}
}

% ISMIR 2011 paper. The booktitle carries no leading "In" (bibliography styles add it)
% and uses a plain ASCII apostrophe.
@INPROCEEDINGS{Hamel-et-al-ISMIR2011,
  author = {Hamel, Philippe and Lemieux, Simon and Bengio, Yoshua and Eck, Douglas},
  title = {Temporal pooling and multiscale learning for automatic annotation and ranking of music audio},
  booktitle = {Proceedings of the 12th International Conference on Music Information Retrieval (ISMIR'11)},
  year = {2011},
  location = {Miami, FL, USA},
  abstract = {This paper analyzes some of the challenges in performing automatic annotation and ranking of music audio, and proposes a few improvements. First, we motivate the use of principal component analysis on the mel-scaled spectrum. Secondly, we present an analysis of the impact of the selection of pooling functions for summarization of the features over time. We show that combining several pooling functions improves the performance of the system. Finally, we introduce the idea of multiscale learning. By incorporating these ideas in our model, we obtained state-of-the-art performance on the Magnatagatune dataset.}
}

% Workshop talk and poster; the medium is described in howpublished.
@MISC{Hugo+al-snowbird-2007,
  author = {Larochelle, Hugo and Bengio, Yoshua and Erhan, Dumitru},
  title = {Generalization to a zero-data task: an empirical study},
  year = {2007},
  howpublished = {Talk and poster presented at the Learning Workshop (Snowbird), San Juan, Puerto Rico, 2007}
}

% IJCNN 2000 paper on gradient-based hyper-parameter optimization.
@INPROCEEDINGS{hyper:2000:ijcnn,
  author = {Bengio, Yoshua},
  title = {Continuous Optimization of Hyper-Parameters},
  booktitle = {International Joint Conference on Neural Networks 2000},
  volume = {I},
  year = {2000},
  pages = {305--310},
  url = {http://www.iro.umontreal.ca/~lisa/pointeurs/hyper-ijcnn2000.pdf},
  abstract = {Many machine learning algorithms can be formulated as the minimization of a training criterion which involves a hyper-parameter. This hyper-parameter is usually chosen by trial and error with a model selection criterion.
In this paper we present a methodology to optimize several hyper-parameters, based on the computation of the gradient of a model selection criterion with respect to the hyper-parameters. In the case of a quadratic training criterion, the gradient of the selection criterion with respect to the hyper-parameters is efficiently computed by back-propagating through a Cholesky decomposition. In the more general case, we show that the implicit function theorem can be used to derive a formula for the hyper-parameter gradient involving second derivatives of the training criterion.},
  topics = {ModelSelection},
  cat = {C},
}

% Venue stub entries follow: they carry no author, only venue-level fields.
% The "-short" / "-shorter" key suffixes differ in how abbreviated the venue name is.
% NOTE(review): presumably these are crossref targets and year = {-1} is an export
% placeholder -- confirm before changing any of these values.
@INPROCEEDINGS{ICLR2014-conf-short,
  title = {International Conference on Learning Representations 2014},
  booktitle = {ICLR'2014 (Conference Track)},
  year = {-1}
}

@INPROCEEDINGS{ICLR2014-conf-shorter,
  title = {International Conference on Learning Representations 2014},
  booktitle = {ICLR'2014},
  year = {-1}
}

@TECHREPORT{ICLR2014-workshop-short,
  month = apr,
  title = {International Conference on Learning Representations 2014(workshop)},
  year = {-1},
  institution = {ICLR'2014(workshop)}
}

@TECHREPORT{ICLR2014-workshop-shorter,
  title = {International Conference on Learning Representations 2014(workshop)},
  year = {-1},
  institution = {ICLR 2014 (workshop)}
}

@INPROCEEDINGS{ICML01,
  editor = {Brodley, Carla E. and Danyluk, Andrea Pohoreckyj},
  title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
  booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML01-short,
  editor = {Brodley, Carla E.
and Danyluk, Andrea Pohoreckyj},
  title = {Proceedings of the Eighteenth International Conference on Machine Learning (ICML'01)},
  booktitle = {ICML'01},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

% ICML venue stub entries: full, "-short" and "-shorter" variants per year.
% NOTE(review): year = {-1} is kept exactly as found; it looks like an export placeholder -- confirm.
@INPROCEEDINGS{ICML01-shorter,
  title = {ICML'01},
  booktitle = {ICML'01},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML02,
  editor = {Sammut, Claude and Hoffmann, Achim G.},
  title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
  booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML02-short,
  editor = {Sammut, Claude and Hoffmann, Achim G.},
  title = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML'02)},
  booktitle = {ICML'02},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

@INPROCEEDINGS{ICML02-shorter,
  title = {ICML'02},
  booktitle = {ICML'02},
  year = {-1},
  publisher = {Morgan Kaufmann}
}

% The ordinal is spelled "Twentieth", matching the Eighteenth / Nineteenth / Twenty-first
% wording used by the neighbouring years.
@INPROCEEDINGS{ICML03,
  editor = {Fawcett, Tom and Mishra, Nina},
  title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
  booktitle = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
  year = {-1},
  publisher = {AAAI Press}
}

@INPROCEEDINGS{ICML03-short,
  editor = {Fawcett, Tom and Mishra, Nina},
  title = {Proceedings of the Twentieth International Conference on Machine Learning (ICML'03)},
  booktitle = {ICML'03},
  year = {-1},
  publisher = {AAAI Press}
}

@INPROCEEDINGS{ICML03-shorter,
  title = {ICML'03},
  booktitle = {ICML'03},
  year = {-1},
  publisher = {AAAI Press}
}

@INPROCEEDINGS{ICML04,
  editor = {Brodley, Carla E.},
  title = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
  booktitle = {Proceedings of the Twenty-first International Conference on Machine Learning (ICML'04)},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML04-short,
  editor = {Brodley, Carla E.},
  title = {Proceedings of the
Twenty-first International Conference on Machine Learning (ICML'04)},
  booktitle = {ICML'04},
  year = {-1},
  publisher = {ACM}
}

% ICML venue stub entries ("-short" / "-shorter" variants per year).
% NOTE(review): year = {-1} is kept exactly as found; it looks like an export placeholder -- confirm.
@INPROCEEDINGS{ICML04-shorter,
  title = {ICML'04},
  booktitle = {ICML'04},
  year = {-1},
  publisher = {ACM}
}

% Editor surname is the two-word "De Raedt"; in "Last, First" form it goes before the comma.
@INPROCEEDINGS{ICML05-short,
  editor = {De Raedt, Luc and Wrobel, Stefan},
  title = {Proceedings of the Twenty-second International Conference on Machine Learning (ICML'05)},
  booktitle = {ICML'05},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML05-shorter,
  title = {ICML'05},
  booktitle = {ICML'05},
  year = {-1},
  publisher = {ACM}
}

% Ordinal spelled "Twenty-third", consistent with Twenty-first / Twenty-second in the adjacent years.
@INPROCEEDINGS{ICML06-short,
  editor = {Cohen, William W. and Moore, Andrew},
  title = {Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
  booktitle = {ICML'06},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML06-shorter,
  title = {ICML'06},
  booktitle = {ICML'06},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML07-short,
  editor = {Ghahramani, Zoubin},
  title = {Proceedings of the 24th International Conference on Machine Learning (ICML'07)},
  booktitle = {ICML'07},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML07-shorter,
  title = {ICML'07},
  booktitle = {ICML'07},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML08-short,
  editor = {Cohen, William W.
and McCallum, Andrew and Roweis, Sam T.},
  title = {Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08)},
  booktitle = {ICML'08},
  year = {-1},
  publisher = {ACM}
}

% ICML venue stub entries ("-short" / "-shorter" variants per year).
% NOTE(review): year = {-1} is kept exactly as found; it looks like an export placeholder -- confirm.
@INPROCEEDINGS{ICML08-shorter,
  title = {ICML'08},
  booktitle = {ICML'08},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML09-short,
  editor = {Bottou, {L{\'{e}}on} and Littman, Michael},
  title = {Proceedings of the Twenty-sixth International Conference on Machine Learning (ICML'09)},
  booktitle = {ICML'09},
  year = {-1},
  publisher = {ACM}
}

@INPROCEEDINGS{ICML09-shorter,
  title = {ICML'09},
  booktitle = {ICML'09},
  year = {-1},
  publisher = {ACM}
}

% Ordinal spelled "Twenty-seventh", consistent with Twenty-fifth / Twenty-sixth above.
% NOTE(review): this proceedings stub is typed as an article with the venue in journal;
% left as-is because other entries may inherit the journal field -- confirm before retyping.
@ARTICLE{ICML10,
  title = {Proceedings of the Twenty-seventh International Conference on Machine Learning (ICML'10)},
  journal = {Proceedings of the Twenty-seventh International Conference on Machine Learning (ICML'10)},
  year = {-1}
}

@INPROCEEDINGS{ICML10-short, title = {Proceedings of the Twenty-seven International Conference on Machine Learning (ICML'10)}, booktitle = {ICML10}, year = {-1}