msml21.bib

@Proceedings{msml21,
    booktitle = {Proceedings of the 2nd Mathematical and Scientific Machine Learning Conference},
    name = {Mathematical and Scientific Machine Learning},
    shortname = {msml},
    editor = {Joan Bruna and Jan Hesthaven and Lenka Zdeborova},
    volume = {145},
    year = {2021},
    start = {2021-08-16},
    end = {2021-08-19},
    published = {2022-04-30},
    conference_url = {https://msml21.github.io/},
    address = {Virtual Conference}
}

@InProceedings{adcock21,
	title = {Deep Neural Networks Are Effective At Learning High-Dimensional Hilbert-Valued Functions From Limited Data},
	author = {Adcock, Ben and Brugiapaglia, Simone and Dexter, Nick and Morage, Sebastian},
	pages = {1-36},
	abstract = {The accurate approximation of scalar-valued functions from sample points is a key task in mathematical modelling and computational science. Recently, machine learning techniques based on Deep Neural Networks (DNNs) have begun to emerge as promising tools for function approximation in scientific computing problems, with some impressive results achieved on problems where the dimension of the underlying data or problem domain is large. In this work, we broaden this perspective by focusing on the approximation of functions that are \textit{Hilbert-valued}, i.e.\ they take values in a separable, but typically infinite-dimensional, Hilbert space. This problem arises in many science and engineering problems, in particular those involving the solution of parametric Partial Differential Equations (PDEs). Such problems are challenging for three reasons. First, pointwise samples are expensive to acquire. Second, the domain of the function is usually high dimensional, and third, the range lies in a Hilbert space. Our contributions are twofold. First, we present a novel result on DNN training for holomorphic functions with so-called \textit{hidden anisotropy}. This result introduces a DNN training procedure and a full theoretical analysis with explicit guarantees on the error and sample complexity. This error bound is explicit in the three key errors occurred in the approximation procedure: the best approximation error, the measurement error and the physical discretization error. Our result shows that there exists a procedure (albeit a non-standard one) for learning Hilbert-valued functions via DNNs that performs as well as, but no better than current best-in-class schemes. It therefore gives a benchmark lower bound for how well methods DNN training can perform on such problems. Second, we examine whether better performance can be achieved in practice through different types of architectures and training. We provide preliminary numerical results illustrating the practical performance of DNNs on Hilbert-valued functions arising as solutions to parametric PDEs. We consider different parameters, modify the DNN architecture to achieve better and competitive results and compare these to current best-in-class schemes. }
}

@InProceedings{agazzi21,
	title = {Temporal-difference learning with nonlinear function approximation: lazy training and mean field regimes},
	author = {Agazzi, Andrea and Lu, Jianfeng},
	pages = {37-74},
	abstract = { We discuss the approximation of the value function for infinite-horizon discounted Markov Reward Processes (MRP) with wide neural networks trained with the Temporal-Difference (TD) learning algorithm. We first consider this problem under a certain scaling of the approximating function, leading to a regime called lazy training. In this regime, which arises naturally, implicit in the initialization of the neural network, the parameters of the model vary only slightly during the learning process, resulting in approximately linear behavior of the model. Both in the under- and over-parametrized frameworks, we prove exponential convergence to local, respectively global minimizers of the TD learning algorithm in the lazy training regime. We then compare the above scaling with the alternative mean-field scaling, where the approximately linear behavior of the model is lost. In this nonlinear, mean-field regime we prove that all fixed points of the dynamics in parameter space are global minimizers. We finally give examples of our convergence results in the case of models that diverge if trained with non-lazy TD learning.}
}

@InProceedings{aghazadeh21,
	title = {BEAR: Sketching BFGS Algorithm for Ultra-High Dimensional Feature Selection in Sublinear Memory},
	author = {Aghazadeh, Amirali and Gupta, Vipul and DeWeese, Alex and Koyluoglu, Ozan and Ramchandran, Kannan},
	pages = {75-92},
	abstract = {We consider feature selection for applications in machine learning where the dimensionality of the data is so large that it exceeds the working memory of the (local) computing machine. Unfortunately, current large-scale sketching algorithms show poor memory-accuracy trade-off in selecting features in high dimensions due to the irreversible collision and accumulation of the stochastic gradient noise in the sketched domain. Here, we develop a second-order feature selection algorithm, called BEAR, which avoids the extra collisions by efficiently storing the second-order stochastic gradients of the celebrated Broyden-Fletcher-Goldfarb-Shannon (BFGS) algorithm in Count Sketch, using a memory cost that grows sublinearly with the size of the feature vector. BEAR reveals an unexplored advantage of second-order optimization for memory-constrained high-dimensional gradient sketching. Our extensive experiments on several real-world data sets from genomics to language processing demonstrate that BEAR requires up to three orders of magnitude less memory space to achieve the same classification accuracy compared to the first-order sketching algorithms with a comparable run time. Our theoretical analysis further proves the global convergence of BEAR with $\mathcal{O}(1/t)$ rate in $t$ iterations of the sketched algorithm. }
}

@InProceedings{alsup21,
	title = {Multilevel Stein variational gradient descent with applications to Bayesian inverse problems},
	author = {Alsup, Terrence and Venturi, Luca and Peherstorfer, Benjamin},
	pages = {93-117},
	abstract = { This work presents a multilevel variant of Stein variational gradient descent to more efficiently sample from target distributions. The key ingredient is a sequence of distributions with growing fidelity and costs that converges to the target distribution of interest. For example, such a sequence of distributions is given by a hierarchy of ever finer discretization levels of the forward model in Bayesian inverse problems. The proposed multilevel Stein variational gradient descent moves most of the iterations to lower, cheaper levels with the aim of requiring only a few iterations on the higher, more expensive levels when compared to the traditional, single-level Stein variational gradient descent variant that uses the highest-level distribution only. Under certain assumptions, in the mean-field limit, the error of the proposed multilevel Stein method decays by a log factor faster than the error of the single-level counterpart with respect to computational costs. Numerical experiments with Bayesian inverse problems show speedups of more than one order of magnitude of the proposed multilevel Stein method compared to the single-level variant that uses the highest level only.}
}

@InProceedings{balestriero21,
	title = {Interpretable and Learnable Super-Resolution Time-Frequency Representation},
	author = {Balestriero, Randall and Glotin, Herve and Baranuik, Richard},
	pages = {118-152},
	abstract = {We develop a novel interpretable and learnable time-frequency representation (TFR) that produces a super-resolved quadratic signal representation for time-series analysis; the proposed TFR is a Gaussian filtering of the Wigner-Ville (WV) transform of a signal parametrized with a few inter- pretable parameters. Our approach has two main hallmarks. First, by varying the filters applied onto the WV, our new TFR can interpolate between known TFRs such as spectrograms, wavelet transforms, and chirplet transforms. Beyond that, our representation can also reach perfect time and frequency localization, hence super-resolution; this generalizes standard TFRs whose resolu- tion is limited by the Heisenberg uncertainty principle. Second, our proposed TFR is interpretable thanks to an explicit low-dimensional and physical parametrization of the WV Gaussian filtering. We demonstrate that our approach enables us to learn highly adapted TFRs and is able to tackle a range of large-scale classification tasks, where we reach higher performance compared to baseline and learned TFRs. Ours is to the best of our knowledge the first learnable TFR that can contin- uously interpolate between super-resolution representation and commonly employed TFRs based on a few learnable parameters and which preserves full interpretability of the produced TFR, even after learning. }
}

@InProceedings{bandeira21,
	title = {Average-Case Integrality Gap for Non-Negative Principal Component Analysis},
	author = {Bandeira, Afonso and Kunisky, Dmitriy and Wein, Alexander},
	pages = {153-171},
	abstract = { Montanari and Richard (2015) asked whether a natural semidefinite programming (SDP) relaxation can effectively optimize $\bx^{\top}\bW \bx$ over $\|\bx\| = 1$ with $x_i \geq 0$ for all coordinates $i$, where $\bW \in \RR^{n \times n}$ is drawn from the Gaussian orthogonal ensemble (GOE) or a spiked matrix model.
In small numerical experiments, this SDP appears to be \emph{tight} for the GOE, producing a rank-one optimal matrix solution aligned with the optimal vector $\bx$.
We prove, however, that as $n \to \infty$ the SDP is not tight, and certifies an upper bound asymptotically no better than the simple spectral bound $\lambda_{\max}(\bW)$ on this objective function.
We also provide evidence, using tools from recent literature on hypothesis testing with low-degree polynomials, that no subexponential-time certification algorithm can improve on this behavior.
Finally, we present further numerical experiments estimating how large $n$ would need to be before this limiting behavior becomes evident, providing a cautionary example against extrapolating asymptotics of SDPs in high dimension from their efficacy in small ``laptop scale'' computations.}
}

@InProceedings{boyarski21,
	title = {Spectral Geometric Matrix Completion},
	author = {Boyarski, Amit and Vedula, Sanketh and Bronstein, Alex},
	pages = {172-196},
	abstract = { Deep Matrix Factorization (DMF) is an emerging approach to the problem of matrix completion. Recent works have established that gradient descent applied to a DMF model induces an implicit regularization on the rank of the recovered matrix. In this work we interpret the DMF model through the lens of spectral geometry. This allows us to incorporate explicit regularization without breaking the DMF structure, thus enjoying the best of both worlds. In particular, we focus on matrix completion problems with underlying geometric or topological relations between the rows and/or columns. Such relations are prevalent in matrix completion problems that arise in many applications, such as recommender systems and drug-target interaction. Our contributions enable DMF models to exploit these relations, and make them competitive on real benchmarks, while exhibiting one of the first successful applications of deep linear networks.}
}

@InProceedings{cosentino21,
	title = {Deep Autoencoders:
From Understanding to Generalization Guarantees},
	author = {Cosentino, Romain and Balestriero, Randall and Baranuik, Richard and Aazhang, Behnaam},
	pages = {197-222},
	abstract = { A big mystery in deep learning continues to be the ability of methods to generalize when the number of model parameters is larger than the number of training examples. In this work, we take a step towards a better understanding of the underlying phenomena of Deep Autoencoders (AEs), a mainstream deep learning solution for learning compressed, interpretable, and structured data representations. In particular, we interpret how AEs approximate the data manifold by exploiting their continuous piecewise affine structure. Our reformulation of AEs provides new insights into their mapping, reconstruction guarantees, as well as an interpretation of commonly used regularization techniques. We leverage these findings to derive two new regularizations that enable AEs to capture the inherent symmetry in the data. Our regularizations leverage recent advances in the group of transformation learning to enable AEs to better approximate the data manifold without explicitly defining the group underlying the manifold. Under the assumption that the symmetry of the data can be explained by a Lie group, we prove that the regularizations ensure the generalization of the corresponding AEs. A range of experimental evaluations demonstrate that our methods outperform other state-of-the-art regularization techniques.}
}

@InProceedings{douglas21,
	title = {Numerical Calabi-Yau metrics from holomorphic networks},
	author = {Douglas, Michael and Lakshminarasimhan, Subramanian and Qi, Yidi},
	pages = {223-252},
	abstract = {We propose machine learning inspired methods for computing numerical Calabi-Yau (Ricci flat Ka ̈hler) metrics, and implement them using Tensorflow/Keras. We compare them with previous work, and find that they are far more accurate for manifolds with little or no symmetry. We also discuss issues such as overparameterization and choice of optimization methods. }
}

@InProceedings{e21a,
	title = {Some observations on high-dimensional partial differential equations with Barron data},
	author = {E, Weinan and Wojtowytsch, Stephan},
	pages = {253-269},
	abstract = { We use explicit representation formulas to show that solutions to certain partial differential equa- tions lie in Barron spaces or multilayer spaces if the PDE data lie in such function spaces. Conse- quently, these solutions can be represented efficiently using artificial neural networks, even in high dimension. Conversely, we present examples in which the solution fails to lie in the function space associated to a neural network under consideration.}
}

@InProceedings{e21b,
	title = {On the emergence of simplex symmetry in the final and penultimate layers of neural network classifiers},
	author = {E, Weinan and Wojtowytsch, Stephan},
	pages = {270-290},
	abstract = {A recent numerical study observed that neural network classifiers enjoy a large degree of symmetry in the penultimate layer. Namely, if $h(x) = Af(x) +b$ where $A$ is a linear map and $f$ is the output of the penultimate layer of the network (after activation), then all data points $x_{i, 1}, \dots, x_{i, N_i}$ in a class $C_i$ are mapped to a single point $y_i$ by $f$ and the points $y_i$ are located at the vertices of a regular $k-1$-dimensional \sw{standard simplex} in a high-dimensional Euclidean space.

We explain this observation analytically in toy models for highly expressive deep neural networks. In complementary examples, we demonstrate rigorously that even the final output of the classifier $h$ is not uniform over data samples from a class $C_i$ if $h$ is a shallow network (or if the deeper layers do not bring the data samples into a convenient geometric configuration). }
}

@InProceedings{feinauer21,
	title = {Reconstruction of Pairwise Interactions using Energy-Based Models},
	author = {Feinauer, Christoph and Lucibello, Carlo},
	pages = {291-313},
	abstract = {Pairwise models like the Ising model or the generalized Potts model have found many successful applications in fields like physics, biology, and economics. Closely connected is the problem of inverse statistical mechanics, where the goal is to infer the parameters of such models given observed data. An open problem in this field is the question of how to train these models in the case where the data contain additional higher-order interactions that are not present in the pairwise model. In this work, we propose an approach based on Energy-Based Models and pseudolikelihood maximization to address these complications: we show that hybrid models, which combine a pairwise model and a neural network, can lead to significant improvements in the reconstruction of pairwise interactions. We show these improvements to hold consistently when compared to a standard approach using only the pairwise model and to an approach using only a neural network. This is in line with the general idea that simple interpretable models and complex black-box models are not necessarily a dichotomy: interpolating these two classes of models can allow to keep some advantages of both. }
}

@InProceedings{ganassali21,
	title = {Sharp threshold for alignment of graph databases with Gaussian weights},
	author = {Ganassali, Luca},
	pages = {314-335},
	abstract = {We study the fundamental limits for reconstruction in weighted graph (or matrix) database alignment. We consider a model of two graphs where $\pi^*$ is a planted uniform permutation and all pairs of edge weights $(A_{i,j}, B_{\pi^*(i),\pi^*(j)})_{1 \leq i<j \leq n}$ are i.i.d. pairs of Gaussian variables with zero mean, unit variance and correlation parameter $\rho \in [0,1]$. We prove that there is a sharp threshold for exact recovery of $\pi^*$: if $n \rho^2 \geq (4+\eps) \log n + \omega(1)$ for some $\eps>0$, there is an estimator  $\hat{\pi}$ -- namely the MAP estimator -- based on the observation of databases $A,B$ that achieves exact reconstruction with high probability. Conversely, if $n \rho^2 \leq 4 \log n - \log \log n - \omega(1)$, then any estimator $\hat{\pi}$ verifies $\hat{\pi}=\pi$ with probability $o(1)$.

		This result shows that the information-theoretic threshold for exact recovery is the same as the one obtained for detection in a recent work by \cite{Wu20}: in other words, for Gaussian weighted graph alignment, the problem of reconstruction is not more difficult than that of detection. Though the reconstruction task was already well understood for vector-shaped database alignment (that is taking signal of the form $(u_i, v_{\pi^*(i)})_{1 \leq i\leq n}$ where $(u_i, v_{\pi^*(i)})$ are i.i.d. pairs in $\dR^{d_u} \times \dR^{d_v}$), its formulation for graph (or matrix) databases brings a drastically different problem for which the hard phase is conjectured to be wide.

		The proofs build upon the analysis of the MAP estimator and the second moment method, together with the study of the correlation structure of energies of permutations. }
}

@InProceedings{gao21,
	title = {Deep Generative Learning via Euler Particle Transport},
	author = {Gao, Yuan and Huang, Jian and Jiao, Yuling and Liu, Jin and Lu, Xiliang and Yang, Zhijian},
	pages = {336-368},
	abstract = {We propose an Euler particle transport (EPT) approach to generative learning. EPT is motivated by the problem of constructing an optimal transport map from a reference distribution to a target distribution characterized by the Monge-Ampe`re equation. Interpreting the infinitesimal linearization of the Monge-Ampe`re equation from the perspective of gradient flows in measure spaces leads to a stochastic McKean-Vlasov equation. We use the forward Euler method to solve this equation. The resulting forward Euler map pushes forward a reference distribution to the target. This map is the composition of a sequence of simple residual maps, which are computationally stable and easy to train. The key task in training is the estimation of the density ratios or differences that determine the residual maps. We estimate the density ratios based on the Bregman divergence with a gradient penalty using deep density-ratio fitting. We show that the proposed density-ratio estimators do not suffer from the “curse of dimensionality” if data is supported on a lower-dimensional manifold. Numerical experiments with multi-mode synthetic datasets and comparisons with the existing methods on real benchmark datasets support our theoretical results and demonstrate the effectiveness of the proposed method. }
}

@InProceedings{gispen21,
	title = {Ground States of Quantum Many Body Lattice Models via Reinforcement Learning},
	author = {Gispen, Willem and Lamacraft, Austen},
	pages = {369-385},
	abstract = {We introduce reinforcement learning (RL) formulations of the problem of finding the ground state of a many-body quantum mechanical model defined on a lattice. We show that stoquastic Hamilto- nians – those without a sign problem – have a natural decomposition into stochastic dynamics and a potential representing a reward function. The mapping to RL is developed for both continuous and discrete time, based on a generalized Feynman–Kac formula in the former case and a stochastic representation of the Schro ̈dinger equation in the latter. We discuss the application of this mapping to the neural representation of quantum states, spelling out the advantages over approaches based on direct representation of the wavefunction of the system. }
}

@InProceedings{goh21,
	title = {Solving Bayesian Inverse Problems via Variational Autoencoders},
	author = {Goh, Hwan and Sheriffdeen, Sheroze and Wittmer, Jonathan and Bui-Thanh, Tan},
	pages = {386-425},
	abstract = {In recent years, the field of machine learning has made phenomenal progress in the pursuit of simu- lating real-world data generation processes. One notable example of such success is the variational autoencoder (VAE). In this work, with a small shift in perspective, we leverage and adapt VAEs for a different purpose: uncertainty quantification (UQ) in scientific inverse problems. We intro- duce UQ-VAE: a flexible, adaptive, hybrid data/model-constrained framework for training neural networks capable of rapid modelling of the posterior distribution representing the unknown param- eter of interest. Specifically, from divergence-based variational inference, our framework is derived such that most of the information usually present in scientific inverse problems is fully utilized in the training procedure. Additionally, this framework includes an adjustable hyperparameter that allows selection of the notion of distance between the posterior model and the target distribution. This introduces more flexibility in controlling how optimization directs the learning of the posterior model. Further, this framework possesses an inherent adaptive optimization property that emerges through the learning of the posterior uncertainty. Numerical results for an elliptic PDE-constrained Bayesian inverse problem are provided to verify the proposed framework. }
}

@InProceedings{goldt21,
	title = {The Gaussian equivalence of generative models for learning with shallow neural networks},
	author = {Goldt, Sebastian and Loureiro, Bruno and Reeves, Galen and Krzakala, Florent and Mezard, Marc and Zdeborova, Lenka},
	pages = {426-471},
	abstract = {Understanding the impact of data structure on the computational tractability of learning is a key challenge for the theory of neural networks. Many theoretical works do not explicitly model training data, or assume that inputs are drawn component-wise independently from some simple probability distribution. Here, we go beyond this simple paradigm by studying the performance of neural networks trained on data drawn from pre-trained generative models. This is possible due to a Gaussian equivalence stating that the key metrics of interest, such as the training and test errors, can be fully captured by an appropriately chosen Gaussian model. We provide three strands of rigorous, analytical and numerical evidence corroborating this equivalence. First, we establish rigorous conditions for the Gaussian equivalence to hold in the case of single-layer generative models, as well as deterministic rates for convergence in distribution. Second, we leverage this equivalence to derive a closed set of equations describing the generalisation performance of two widely studied machine learning problems: two-layer neural networks trained using one-pass stochastic gradient descent, and full-batch pre-learned features or kernel methods. Finally, we perform experiments demonstrating how our theory applies to deep, pre-trained generative models. These results open a viable path to the theoretical study of machine learning models with realistic data. }
}

@InProceedings{phillips21,
	title = {Orientation-Preserving Vectorized Distance Between Curves},
	author = {Phillips, Jeff and Pourmahmood-Aghababa, Hasan},
	pages = {472-496},
	abstract = {We introduce an orientation-preserving landmark-based distance for continuous curves, which can be viewed as an alternative to the Fre ́chet or Dynamic Time Warping distances. This measure re- tains many of the properties of those measures, and we prove some relations, but can be interpreted as a Euclidean distance in a particular vector space. Hence it is significantly easier to use, faster for general nearest neighbor queries, and allows easier access to classification results than those measures. It is based on the signed distance function to the curves or other objects from a fixed set of landmark points. We also prove new stability properties with respect to the choice of landmark points, and along the way introduce a concept called signed local feature size (slfs) which param- eterizes these notions. Slfs explains the complexity of shapes such as non-closed curves where the notion of local orientation is in dispute – but is more general than the well-known concept of (unsigned) local feature size, and is for instance infinite for closed simple curves. Altogether, this work provides a novel, simple, and powerful method for oriented shape similarity and analysis. }
}

@InProceedings{huang21,
	title = {Adversarial Robustness of Stabilized Neural ODE Might be from Obfuscated Gradients},
	author = {Huang, Yifei and Yu, Yaodong and Zhang, Hongyang and Ma, Yi and Yao, Yuan},
	pages = {497-515},
	abstract = {In this paper we introduce a provably stable architecture for Neural Ordinary Differential Equations (ODEs) which achieves non-trivial adversarial robustness under white-box adversarial attacks even when the network is trained naturally. For most existing defense methods withstanding strong white-box attacks, to improve robustness of neural networks, they need to be trained adversarially, hence have to strike a trade-off between natural accuracy and adversarial robustness.
Inspired by dynamical system theory, we design a stabilized neural ODE network named SONet whose ODE blocks are skew-symmetric and proved to be input-output stable. With natural training, SONet can achieve comparable robustness with the state-of-the-art adversarial defense methods, without sacrificing natural accuracy.
Even replacing only the first layer of a ResNet by such a ODE block can exhibit further improvement in robustness, e.g., under PGD-20 ($\ell_\infty=0.031$) attack on CIFAR-10 dataset, it achieves 91.57\% and natural accuracy and 62.35\% robust accuracy, while a counterpart architecture of ResNet trained with TRADES achieves natural and robust accuracy 76.29\% and 45.24\%, respectively. To understand possible reasons behind this surprisingly good result, we further explore the possible mechanism underlying such . We show that the adaptive stepsize numerical ODE solvers, such as adaptive HEUN2, BOSH3, and DOPRI5, have a gradient masking effect that fails the PGD attacks which are sensitive to gradient information of training loss; on the other hand, they cannot fool the CW attack of robust gradients and the SPSA attack that is gradient-free. This provides a new explanation that the adversarial robustness of ODE-based networks mainly comes from the obfuscated gradients in numerical ODE solvers with adaptive step sizes. (Source codes: \url{https://github.com/silkylove/SONet}; \url{https://github.com/yao-lab/SONet}) }
}

@InProceedings{lawrence21,
	title = {Phase Retrieval with Holography and Untrained Priors: Tackling the Challenges of Low-Photon Nanoscale Imaging},
	author = {Lawrence, Hannah and Barmherzig, David and Li, Henry and Eickenberg, Michael and Gabrie, Marylou},
	pages = {516-567},
	abstract = {Phase retrieval is the inverse problem of recovering a signal from magnitude-only Fourier measure- ments, and underlies numerous imaging modalities, such as Coherent Diffraction Imaging (CDI). A variant of this setup, known as holography, includes a reference object that is placed adjacent to the specimen of interest before measurements are collected. The resulting inverse problem, known as holographic phase retrieval, is well-known to have improved problem conditioning relative to the original. This innovation, i.e. Holographic CDI, becomes crucial at the nanoscale, where imaging specimens such as viruses, proteins, and crystals require low-photon measurements. This data is highly corrupted by Poisson shot noise, and often lacks low-frequency content as well. In this work, we introduce a dataset-free deep learning framework for holographic phase retrieval adapted to these challenges. The key ingredients of our approach are the explicit and flexible incorporation of the physical forward model into an automatic differentiation procedure, the Poisson log-likelihood objective function, and an optional untrained deep image prior. We perform extensive evaluation under realistic conditions. Compared to competing classical methods, our method recovers signal from higher noise levels and is more resilient to suboptimal reference design, as well as to large missing regions of low frequencies in the observations. Finally, we show that these properties carry over to experimental data acquired on optical wavelengths. To the best of our knowledge, this is the first work to consider a dataset-free machine learning approach for holographic phase retrieval. }
}

@InProceedings{zhai21,
	title = {A deep learning method for solving Fokker-Planck equations},
	author = {Zhai, Jiayu and Dobson, Matthew and Li, Yao},
	pages = {568-597},
	abstract = {The time evolution of the probability distribution of a stochastic differential equation follows the Fokker-Planck equation, which usually has an unbounded, high-dimensional domain. Inspired by Li (2019), we propose a mesh-free Fokker-Planck solver, in which the solution to the Fokker-Planck equation is now represented by a neural network. The presence of the differential operator in the loss function improves the accuracy of the neural network representation and reduces the demand of data in the training process. Several high dimensional numerical examples are demonstrated. }
}

@InProceedings{li21,
	title = {A semigroup method for high dimensional committor functions based on neural network},
	author = {Li, Haoya and Khoo, Yuehaw and Ren, Yinuo and Ying, Lexing},
	pages = {598-618},
	abstract = {
This paper proposes a new method based on neural networks for computing the high-dimensional committor functions that satisfy Fokker-Planck equations. Instead of working with partial differ- ential equations, the new method works with an integral formulation based on the semigroup of the differential operator. The variational form of the new formulation is then solved by param- eterizing the committor function as a neural network. There are two major benefits of this new approach. First, stochastic gradient descent type algorithms can be applied in the training of the committor function without the need of computing any mixed second order derivatives. Moreover, unlike the previous methods that enforce the boundary conditions through penalty terms, the new method takes into account the boundary conditions automatically. Numerical results are provided to demonstrate the performance of the proposed method. }
}

@InProceedings{lin21a,
	title = {Decentralized Multi-Agents by Imitation of a Centralized Controller},
	author = {Lin, Alex Tong and Debord, Mark and Estabridis, Katia and Hewer, Gary and Montufar, Guido and Osher, Stanley},
	pages = {619-651},
	abstract = {We consider a multi-agent reinforcement learning problem where each agent seeks to maximize a shared reward while interacting with other agents, and they may or may not be able to communicate. Typically the agents do not have access to other agent policies and thus each agent is situated in a non-stationary and partially-observable environment. In order to obtain multi-agents that act in a decentralized manner, we introduce a novel algorithm under the popular framework of centralized training, but decentralized execution. This training framework first obtains solutions to a multi- agent problem with a single centralized joint-space learner, which is then used to guide imitation learning for independent decentralized multi-agents. This framework has the flexibility to use any reinforcement learning algorithm to obtain the expert as well as any imitation learning algorithm to obtain the decentralized agents. This is in contrast to other multi-agent learning algorithms that, for example, can require more specific structures. We present some theoretical bounds for our method, and we show that one can obtain decentralized solutions to a multi-agent problem through imitation learning. }
}

@InProceedings{lin21b,
	title = {A Data Driven Method for Computing Quasipotentials},
	author = {Lin, Bo and Li, Qianxiao and Ren, Weiqing},
	pages = {652-670},
	abstract = {The quasipotential is a natural generalization of the concept of energy functions to non-equilibrium systems. In the analysis of rare events in stochastic dynamics, it plays a central role in charac- terizing the statistics of transition events and the likely transition paths. However, computing the quasipotential is challenging, especially in high dimensional dynamical systems where a global landscape is sought. Traditional methods based on the dynamic programming principle or path space minimization tend to suffer from the curse of dimensionality. In this paper, we propose a simple and efficient machine learning method to resolve this problem. The key idea is to learn an orthogonal decomposition of the vector field that drives the dynamics, from which one can identify the quasipotential. We demonstrate on various example systems that our method can effectively compute quasipotential landscapes without requiring spatial discretization or solving path-space optimization problems. Moreover, the method is purely data driven in the sense that only observed trajectories of the dynamics are required for the computation of the quasipotential. These prop- erties make it a promising method to enable the general application of quasipotential analysis to dynamical systems away from equilibrium. }
}

@InProceedings{ma21,
	title = {A Qualitative Study of the Dynamic Behavior for Adaptive Gradient Algorithms},
	author = {Ma, Chao and Wu, Lei and E, Weinan},
	pages = {671-692},
	abstract = {The dynamic behavior of RMSprop and Adam algorithms is studied through a combination of careful numerical experiments and theoretical explanations. Three types of qualitative features are observed in the training loss curve: fast initial convergence, oscillations, and large spikes in the late phase. The sign gradient descent (signGD) flow, which is the limit of Adam when taking the learning rate to 0 while keeping the momentum parameters fixed, is used to explain the fast initial convergence. For the late phase of Adam, three different types of qualitative patterns are observed depending on the choice of the hyper-parameters: oscillations, spikes, and divergence. In particular, Adam converges much smoother and even faster when the values of the two momentum factors are close to each other. This observation is particularly important for scientific computing tasks, for which the training process usually proceeds into the high precision regime. }
}

@InProceedings{maillard21,
	title = {Construction of optimal spectral methods in phase retrieval},
	author = {Maillard, Antoine and Krzakala, Florent and Lu, Yue M. and Zdeborova, Lenka},
	pages = {693-720},
	abstract = {We consider the \emph{phase retrieval} problem, in which the observer wishes to recover a $n$-dimensional real or complex signal $\bX^\star$ from
        the (possibly noisy) observation
        of $|\bPhi \bX^\star|$, in which $\bPhi$ is a matrix of size $m \times n$. We consider a \emph{high-dimensional} setting where $n,m \to \infty$ with $m/n = \mathcal{O}(1)$, and
        a large class of (possibly correlated) random matrices $\bPhi$ and observation channels.
        Spectral methods are a powerful tool to obtain approximate observations of the signal $\bX^\star$ which can be then used as initialization
        for a subsequent algorithm, at a low
        computational cost.
        In this paper, we extend and unify previous results and approaches on spectral methods for the phase retrieval problem.
        More precisely, we combine the linearization of message-passing algorithms and the analysis of the \emph{Bethe Hessian}, a classical tool of
        statistical physics.
        Using this toolbox, we show how to derive optimal spectral methods for arbitrary channel noise and right-unitarily invariant matrix $\bPhi$, in an automated manner
        (i.e.\ with no optimization over any hyperparameter or preprocessing function). }
}

@InProceedings{rabbani21,
	title = {Practical and Fast Momentum-Based Power Methods},
	author = {Rabbani, Tahseen and Jain, Apollo and Rajkumar, Arjun and Huang, Furong},
	pages = {721-756},
	abstract = {The power method is a classical algorithm with broad applications in machine learning tasks, in- cluding streaming PCA, spectral clustering, and low-rank matrix approximation. The distilled pur- pose of the vanilla power method is to determine the largest eigenvalue (in absolute modulus) and its eigenvector of a matrix. A momentum-based scheme can be used to accelerate the power method, but achieving an optimal convergence rate with existing algorithms critically relies on additional spectral information that is unavailable at run-time, and sub-optimal initializations can result in di- vergence. In this paper, we provide a pair of novel momentum-based power methods, which we call the delayed momentum power method (DMPower) and a streaming variant, the delayed momentum streaming method (DMStream). Our methods leverage inexact deflation and are capable of achiev- ing near-optimal convergence with far less restrictive hyperparameter requirements. We provide convergence analyses for both algorithms through the lens of perturbation theory. Further, we ex- perimentally demonstrate that DMPower routinely outperforms the vanilla power method and that both algorithms match the convergence speed of an oracle running existing accelerated methods with perfect spectral knowledge. }
}

@InProceedings{rotskoff21,
	title = {Active Importance Sampling for Variational Objectives Dominated by Rare Events: Consequences for Optimization and Generalization},
	author = {Rotskoff, Grant M and Mitchell, Andrew R and Vanden-Eijnden, Eric},
	pages = {757-780},
	abstract = {Deep neural networks, when optimized with sufficient data, provide accurate representations of high-dimensional functions; in contrast, function approximation techniques that have predominated in scientific computing do not scale well with dimensionality. As a result, many high-dimensional sampling and approximation problems once thought intractable are being revisited through the lens of machine learning. While the promise of unparalleled accuracy may suggest a renaissance for applications that require parameterizing representations of complex systems, in many applications gathering sufficient data to develop such a representation remains a significant challenge. Here we introduce an approach that combines rare events sampling techniques with neural network train- ing to optimize objective functions that are dominated by rare events. We show that importance sampling reduces the asymptotic variance of the solution to a learning problem, suggesting benefits for generalization. We study our algorithm in the context of solving high-dimensional PDEs that admit a variational formulation, a problem with applications in statistical physics and implications in machine learning theory. Our numerical experiments demonstrate that we can successfully learn even with the compounding difficulties of high-dimension and rare data.}
}

@InProceedings{rudi21,
	title = {Parameter Estimation with Dense and Convolutional Neural Networks Applied to the FitzHugh–Nagumo ODE},
	author = {Rudi, Johann and Bessac, Julie and Lenzi, Amanda},
	pages = {781-808},
	abstract = {Machine learning algorithms have been successfully used to approximate nonlinear maps under weak assumptions on the structure and properties of the maps. We present deep neural networks using dense and convolutional layers to solve an inverse problem, where we seek to estimate param- eters of a FitzHugh–Nagumo model, which consists of a nonlinear system of ordinary differential equations (ODEs). We employ the neural networks to approximate reconstruction maps for model parameter estimation from observational data, where the data comes from the solution of the ODE and takes the form of a time series representing dynamically spiking membrane potential of a bio- logical neuron. We target this dynamical model because of the computational challenges it poses in an inference setting, namely, having a highly nonlinear and nonconvex data misfit term and permit- ting only weakly informative priors on parameters. These challenges cause traditional optimization to fail and alternative algorithms to exhibit large computational costs. We quantify the prediction errors of model parameters obtained from the neural networks and investigate the effects of net- work architectures with and without the presence of noise in observational data. We generalize our framework for neural network-based reconstruction maps to simultaneously estimate ODE parame- ters and parameters of autocorrelated observational noise. Our results demonstrate that deep neural networks have the potential to estimate parameters in dynamical models and stochastic processes, and they are capable of predicting parameters accurately for the FitzHugh–Nagumo model.}
}

@InProceedings{saglietti21,
	title = {Solvable Model for Inheriting the Regularization through Knowledge Distillation},
	author = {Saglietti, Luca and Zdeborova, Lenka},
	pages = {809-846},
	abstract = {In recent years the empirical success of transfer learning with neural networks has stimulated an increasing interest in obtaining a theoretical understanding of its core properties. Knowledge Dis- tillation where a smaller neural network is trained using the outputs of a larger neural network is a particularly interesting case of transfer learning. In the present work, we introduce a statistical physics framework that allows an analytic characterization of the properties of knowledge distil- lation (KD) in shallow neural networks. Focusing the analysis on a solvable model that exhibits a non-trivial generalization gap, we investigate the effectiveness of KD. We are able to show that, through KD, the regularization properties of the larger teacher model can be inherited by the smaller student and that the yielded generalization performance is closely linked to and limited by the op- timality of the teacher. Finally, we analyze the double descent phenomenology that can arise in the considered KD setting. }
}

@InProceedings{bollinger21,
	title = {Reduced Order Modeling using Shallow ReLU Networks with Grassmann Layers},
	author = {Bollinger, Kayla and Schaeffer, Hayden},
	pages = {847-867},
	abstract = {This paper presents a nonlinear model reduction method for systems of equations using a structured neural network. The neural network takes the form of a “three-layer” network with the first layer constrained to lie on the Grassmann manifold and the first activation function set to identity, while the remaining network is a standard two-layer ReLU neural network. The Grassmann layer deter- mines the reduced basis for the input space, while the remaining layers approximate the nonlinear input-output system. The training alternates between learning the reduced basis and the nonlin- ear approximation, and is shown to be more effective than fixing the reduced basis and training the network only. An additional benefit of this approach is, for data that lie on low-dimensional subspaces, that the number of parameters in the network does not need to be large. We show that our method can be applied to scientific problems in the data-scarce regime, which is typically not well-suited for neural networks approximations. Examples include reduced order modeling for nonlinear dynamical systems and several aerospace engineering problems. }
}

@InProceedings{seleznova21,
	title = {Analyzing Finite Neural Networks: Can We Trust Neural Tangent Kernel Theory?},
	author = {Seleznova, Mariia and Kutyniok, Gitta},
	pages = {868-895},
	abstract = {Neural Tangent Kernel (NTK) theory is widely used to study the dynamics of infinitely-wide deep neural networks (DNNs) under gradient descent. But do the results for infinitely-wide networks give us hints about the behavior of real finite-width ones? In this paper, we study empirically when NTK theory is valid in practice for fully-connected ReLU and sigmoid DNNs. We find out that whether a network is in the NTK regime depends on the hyperparameters of random initialization and the network’s depth. In particular, NTK theory does not explain the behavior of sufficiently deep networks initialized so that their gradients explode as they propagate through the network’s layers: the kernel is random at initialization and changes significantly during training in this case, contrary to NTK theory. On the other hand, in the case of vanishing gradients, DNNs are in the the NTK regime but become untrainable rapidly with depth. We also describe a framework to study generalization properties of DNNs, in particular the variance of network’s output function, by means of NTK theory and discuss its limits.}
}

@InProceedings{thorpe21,
	title = {Robust Certification for Laplace Learning on Geometric Graphs},
	author = {Thorpe, Matthew and Wang, Bao},
	pages = {896-920},
	abstract = {Graph Laplacian (GL)-based semi-supervised learning is one of the most used approaches for clas- sifying nodes in a graph. Understanding and certifying the adversarial robustness of machine learn- ing (ML) algorithms has attracted large amounts of attention from different research communities due to its crucial importance in many security-critical applied domains. There is great interest in the theoretical certification of adversarial robustness for popular ML algorithms. In this paper, we provide the first adversarial robust certification for the GL classifier. More precisely we quanti- tatively bound the difference in the classification accuracy of the GL classifier before and after an adversarial attack. Numerically, we validate our theoretical certification results and show that lever- aging existing adversarial defenses for the k-nearest neighbor classifier can remarkably improve the robustness of the GL classifier. }
}

@InProceedings{tirer21,
	title = {Kernel-Based Smoothness Analysis of Residual Networks},
	author = {Tirer, Tom and Bruna, Joan and Giryes, Raja},
	pages = {921-954},
	abstract = {A major factor in the success of deep neural networks is the use of sophisticated architectures rather than the classical multilayer perceptron (MLP). Residual networks (ResNets) stand out among these powerful modern architectures. Previous works focused on the optimization advantages of deep ResNets over deep MLPs. In this paper, we show another distinction between the two models, namely, a tendency of ResNets to promote smoother interpolations than MLPs. We analyze this phenomenon via the neural tangent kernel (NTK) approach. First, we compute the NTK for a considered ResNet model and prove its stability during gradient descent training. Then, we show by various evaluation methodologies that for ReLU activations the NTK of ResNet, and its kernel regression results, are smoother than the ones of MLP. The better smoothness observed in our analysis may explain the better generalization ability of ResNets and the practice of moderately attenuating the residual blocks. }
}

@InProceedings{xu21,
	title = {Dynamic Algorithms for Online Multiple Testing},
	author = {Xu, Ziyu and Ramdas, Aaditya},
	pages = {955-986},
	abstract = {We derive new algorithms for online multiple testing that provably control false discovery exceedance (FDX) while achieving orders of magnitude more power than previous methods. This statistical advance is enabled by the development of new algorithmic ideas: earlier algorithms are more “static” while our new ones allow for the dynamical adjustment of testing levels based on the amount of wealth the algorithm has accumulated. We demonstrate that our algorithms achieve higher power in a variety of synthetic experiments. We also prove that SupLORD can provide error control for both FDR and FDX, and controls FDR at stopping times. Stopping times are particularly important as they permit the experimenter to end the experiment arbitrarily early while maintaining desired control of the FDR. SupLORD is the first non-trivial algorithm, to our knowledge, that can control FDR at stopping times in the online setting. }
}

@InProceedings{xuan21,
	title = {Optimal Policies for a Pandemic: A Stochastic Game Approach and a Deep Learning Algorithm},
	author = {Xuan, Yao and Balkin, Robert and Han, Jiequn and Hu, Ruimeng and Ceniceros, Hector D},
	pages = {987-1012},
	abstract = {Game theory has been an effective tool in the control of disease spread and in suggesting optimal policies at both individual and area levels. In this paper, we propose a multi-region SEIR model based on stochastic differential game theory, aiming to formulate optimal regional policies for infectious diseases. Specifically, we enhance the standard epidemic SEIR model by taking into account the social and health policies issued by multiple region planners. This enhancement makes the model more realistic and powerful. However, it also introduces a formidable computational challenge due to the high dimensionality of the solution space brought by the presence of multiple regions. This significant numerical difficulty of the model structure motivates us to generalize the deep fictitious algorithm introduced in [Han and Hu, MSML2020, pp.221–245, PMLR, 2020] and develop an improved algorithm to overcome the curse of dimensionality. We apply the proposed model and algorithm to study the COVID-19 pandemic in three states: New York, New Jersey and Pennsylvania. The model parameters are estimated from real data posted by the Centers for Disease Control and Prevention (CDC). We are able to show the effects of the lockdown/travel ban policy on the spread of COVID-19 for each state and how their policies affect each other. }
}

@InProceedings{yang21,
	title = {Generalization and Memorization: The Bias Potential Model},
	author = {Yang, Hongkang and E, Weinan},
	pages = {1013-1043},
	abstract = {Models for learning probability distributions such as generative models and density estimators be- have quite differently from models for learning functions. One example is found in the memo- rization phenomenon, namely the ultimate convergence to the empirical distribution, that occurs in generative adversarial networks (GANs). For this reason, the issue of generalization is more subtle than that for supervised learning. For the bias potential model, we show that dimension- independent generalization accuracy is achievable if early stopping is adopted, despite that in the long term, the model either memorizes the samples or diverges. }
}

@InProceedings{yao21,
	title = {Noise-Robust End-to-End Quantum Control using Deep Autoregressive Policy Networks},
	author = {Yao, Jiahao and Kottering, Paul and Gundlach, Hans and Lin, Lin and Bukov, Marin},
	pages = {1044-1081},
	abstract = {Variational quantum eigensolvers have recently received increased attention, as they enable the use of quantum computing devices to find solutions to complex problems, such as the ground energy and ground state of strongly-correlated quantum many-body systems. In many applications, it is the optimization of both continuous and discrete parameters that poses a formidable challenge. Using reinforcement learning (RL), we present a hybrid policy gradient algorithm capable of simul- taneously optimizing continuous and discrete degrees of freedom in an uncertainty-resilient way. The hybrid policy is modeled by a deep autoregressive neural network to capture causality. We employ the algorithm to prepare the ground state of the nonintegrable quantum Ising model in a unitary process, parametrized by a generalized quantum approximate optimization ansatz: the RL agent solves the discrete combinatorial problem of constructing the optimal sequences of unitaries out of a predefined set and, at the same time, it optimizes the continuous durations for which these unitaries are applied. We demonstrate the noise-robust features of the agent by considering three sources of uncertainty: classical and quantum measurement noise, and errors in the control unitary durations. Our work exhibits the beneficial synergy between reinforcement learning and quantum control. }
}

@InProceedings{zhang21,
	title = {Implicit Form Neural Network for Learning Scalar Hyperbolic Conservation Laws},
	author = {Zhang, Xiaoping and Cheng, Tao and Ju, Lili},
	pages = {1082-1098},
	abstract = {
Conservation laws are critical to our understanding of the physical world. Numerical solution of hyperbolic conservation laws has been a very important and challenging task with some difficul- ties such as nonphysical solutions, solution discontinuities and shock wave capturing, and a large amount of conventional numerical solvers of finite difference, finite volume and discontinuous Galerkin types have been developed in the past decades. Along with the booming and great suc- cess of deep learning in computer vision and natural language processing, many excellent works also have emerged for their application to PDE related scientific problems in recent years. In this paper, we propose a deep learning method for solving scalar hyperbolic conservation laws. More specifically, we design a neural network model in the unsupervised fashion, called “IFNN”, based on a special implicit form for the solution of the problem. The proposed IFNN does not directly process the target PDEs, instead it deals with the nonlinear equations characterizing implicitly the solution of the PDEs. Numerical experiments and performance comparisons of our IFNN with the well-known PINN are performed on two well-known problems under various boundary conditions, the inviscid Burgers’ equation and the Lighthill-Whitham-Richards (LWR) model for traffic flow, and the results show that IFNN can significantly outperform PINN in capturing the shock waves. }
}

@InProceedings{zhu21a,
	title = {Borrowing From the Future: Addressing Double Sampling in Model-free Control},
	author = {Zhu, Yuhua and Izzo, Zachary and Ying, Lexing},
	pages = {1099-1136},
	abstract = {In model-free reinforcement learning, the temporal difference method is an important algorithm but might become unstable when combined with nonlinear function approximations. Bellman residual minimization with stochastic gradient descent (SGD) is stable but suffers from the double sampling problem: given the current state, two independent samples for the next state are required, but often only one sample is available. Recently, the borrowing-from-the-future (BFF) algorithm was intro- duced in (Zhu et al., 2020) to address this issue for policy evaluation. The main idea is to borrow extra randomness from the future to approximately re-sample the next state when the underlying dynamics of the problem are sufficiently smooth. This paper extends the BFF algorithm to action- value function based model-free control. We prove that BFF is close to unbiased SGD when the underlying dynamics vary slowly with respect to actions. We confirm our theoretical findings with numerical simulations. }
}

@InProceedings{zhu21b,
	title = {Hessian-Aided Random Perturbation (HARP) Using Noisy Zeroth-Order Queries},
	author = {Zhu, Jingyi},
	pages = {1137-1160},
	abstract = {In stochastic optimization problems using noisy zeroth-order (ZO) oracles only, the randomized counterpart of Kiefer-Wolfowitz-type method is widely used to estimate the gradient. Existing al- gorithms generate the randomized perturbation from a zero-mean unit-covariance distribution. In contrast, this work considers the generalization where the perturbations may have non-isotropic covariance matrix constructed from the ZO queries. We propose to feed the Hessian-inverse ap- proximation into the covariance of the random perturbation, so it is dubbed as Hessian-Aided Ran- dom Perturbation (HARP). HARP collects two or more (depending on the specific estimator form) zeroth-order queries per iteration to form approximations for both the gradient and the Hessian. We show the almost surely convergence and derive the convergence rate for HARP under stan- dard assumptions. We demonstrate, with theoretical guarantees and numerical experiments, that HARP is less sensitive to ill-conditioning and more query-efficient than other gradient approxima- tion schemes with isotropic-covariance random perturbation }
}

@InProceedings{zhu21c,
	title = {Hessian Estimation via Stein's Identity in Black-Box Problems},
	author = {Zhu, Jingyi},
	pages = {1161-1178},
	abstract = {
When only the noisy zeroth-order (ZO) oracle is available, stochastic approximation algorithms are popular for estimating the root of the multivariate gradient equation. Inspired by Stein’s identity, this work establishes a novel Hessian approximation scheme. We compare it with second-order si- multaneous perturbation stochastic approximation (2SPSA) algorithm (Spall, 2000). On the basis of the almost sure convergence guarantee with the same convergence rate, 2SPSA requires four ZO queries, while ours requires three instead. Moreover, 2SPSA requires two statistically independent perturbations and two differencing stepsizes, while ours requires generating one perturbation vec- tor and tuning one differencing stepsize only. Besides, the weighting mechanism for the Hessian estimate is generalized and the smoothness restriction on the loss function is relaxed compared to 2SPSA. Finally, we present numerical support for the reduced per-iteration ZO query complexity. }
}