% AISTATS2019.bib
% Bibliography for the Proceedings of the 22nd International Conference on
% Artificial Intelligence and Statistics (AISTATS 2019), PMLR volume 89.
@Proceedings{AISTATS-2019,
booktitle = {Proceedings of Machine Learning Research},
name = {The 22nd International Conference on Artificial Intelligence and Statistics},
editor = {Kamalika Chaudhuri and Masashi Sugiyama},
volume = {89},
year = {2019},
start = {2019-04-16},
end = {2019-04-18},
published = {2019-04-11},
url = {https://www.aistats.org},
location = {Naha, Okinawa, Japan},
shortname = {AISTATS}
}
@InProceedings{pedregosa19a,
title = {Proximal Splitting Meets Variance Reduction},
author = {Pedregosa, Fabian and Fatras, Kilian and Casotto, Mattia},
pages = {1-10},
abstract = {Despite the rise to fame of stochastic variance reduced methods like SAGA and ProxSVRG, their use in non-smooth optimization is still limited to a few simple cases. Existing methods require to compute the proximal operator of the non-smooth term at each iteration, which, for complex penalties like the total variation, overlapping group lasso or trend filtering, is an iterative process that becomes unfeasible for moderately large problems. In this work we propose and analyze VRTOS, a variance-reduced method to solve problems with an arbitrary number of non-smooth terms. Like other variance reduced methods, it only requires to evaluate one gradient per iteration and converges with a constant step size, and so is ideally suited for large scale applications. Unlike existing variance reduced methods, it admits multiple non-smooth terms whose proximal operator only needs to be evaluated once per iteration. We provide a convergence rate analysis for the proposed methods that achieves the same asymptotic rate as their full gradient variants and illustrate its computational advantage on 4 different large scale datasets.}
}
@InProceedings{geng19a,
title = {Optimal Noise-Adding Mechanism in Additive Differential Privacy},
author = {Geng, Quan and Ding, Wei and Guo, Ruiqi and Kumar, Sanjiv},
pages = {11-20},
abstract = {We derive the optimal $(0, \delta)$-differentially private query-output independent noise-adding mechanism for single real-valued query function under a general cost-minimization framework. Under a mild technical condition, we show that the optimal noise probability distribution is a uniform distribution with a probability mass at the origin. We explicitly derive the optimal noise distribution for general $\ell^p$ cost functions, including $\ell^1$ (for noise magnitude) and $\ell^2$ (for noise power) cost functions, and show that the probability concentration on the origin occurs when $\delta > \frac{p}{p+1}$. Our result demonstrates an improvement over the existing Gaussian mechanisms by a factor of two and three for $(0,\delta)$-differential privacy in the high privacy regime in the context of minimizing the noise magnitude and noise power, and the gain is more pronounced in the low privacy regime. Our result is consistent with the existing result for $(0,\delta)$-differential privacy in the discrete setting, and identifies a probability concentration phenomenon in the continuous setting.}
}
@InProceedings{neykov19a,
title = {Tossing Coins Under Monotonicity},
author = {Neykov, Matey},
pages = {21-30},
abstract = {This paper considers the following problem: we are given n coin tosses of coins with monotone increasing probability of getting heads (success). We study the performance of the monotone constrained likelihood estimate, which is equivalent to the estimate produced by isotonic regression. We derive adaptive and non-adaptive bounds on the performance of the isotonic estimate, i.e., we demonstrate that for some probability vectors the isotonic estimate converges much faster than in general. As an application of this framework we propose a two step procedure for the binary monotone single index model, which consists of running LASSO and consequently running an isotonic regression. We provide thorough numerical studies in support of our claims.}
}
@InProceedings{neykov19b,
title = {Gaussian Regression with Convex Constraints},
author = {Neykov, Matey},
pages = {31-38},
abstract = {The focus of this paper is the linear model with Gaussian design under convex constraints. Specifically, we study the performance of the constrained least squares estimate. We derive two general results characterizing its performance - one requiring a tangent cone structure, and one which holds in a general setting. We use our general results to analyze three functional shape constrained problems where the signal is generated from an underlying Lipschitz, monotone or convex function. In each of the examples we show specific classes of functions which achieve fast adaptive estimation rates, and we also provide non-adaptive estimation rates which hold for any function. Our results demonstrate that the Lipschitz, monotone and convex constraints allow one to analyze regression problems even in high-dimensional settings where the dimension may scale as the square or fourth degree of the sample size respectively.}
}
@InProceedings{cardoso19a,
title = {Risk-Averse Stochastic Convex Bandit},
author = {Cardoso, Adrian Rivera and Xu, Huan},
pages = {39-47},
abstract = {Motivated by applications in clinical trials and finance, we study the problem of online convex optimization (with bandit feedback) where the decision maker is risk-averse. We provide two algorithms to solve this problem. The first one is a descent-type algorithm which is easy to implement. The second algorithm, which combines the ellipsoid method and a center point device, achieves (almost) optimal regret bounds with respect to the number of rounds. To the best of our knowledge this is the first attempt to address risk-aversion in the online convex bandit problem.}
}
@InProceedings{dedieu19a,
title = {Error bounds for sparse classifiers in high-dimensions},
author = {Dedieu, Antoine},
pages = {48-56},
abstract = {We prove an L2 recovery bound for a family of sparse estimators defined as minimizers of some empirical loss functions -- which include hinge loss and logistic loss. More precisely, we achieve an upper-bound for coefficients estimation scaling as $(k^*/n)\log(p/k^*)$: n,p is the size of the design matrix and $k^*$ the dimension of the theoretical loss minimizer. This is done under standard assumptions, for which we derive stronger versions of a cone condition and a restricted strong convexity. Our bound holds with high probability and in expectation and applies to an L1-regularized estimator and to a recently introduced Slope estimator, which we generalize for classification problems. Slope presents the advantage of adapting to unknown sparsity. Thus, we propose a tractable proximal algorithm to compute it and assess its empirical performance. Our results match the best existing bounds for classification and regression problems.}
}
@InProceedings{bellot19a,
title = {Boosting Transfer Learning with Survival Data from Heterogeneous Domains},
author = {Bellot, Alexis and van der Schaar, Mihaela},
pages = {57-65},
abstract = {Survival models derived from health care data are an important support to inform critical screening and therapeutic decisions. Most models however, do not generalize to populations outside the marginal and conditional distribution assumptions for which they were derived. This presents a significant barrier to the deployment of machine learning techniques into wider clinical practice as most medical studies are data scarce, especially for the analysis of time-to-event outcomes. In this work we propose a survival prediction model that is able to improve predictions on a small data domain of interest - such as a local hospital - by leveraging related data from other domains - such as data from other hospitals. We construct an ensemble of weak survival predictors which iteratively adapt the marginal distributions of the source and target data such that similar source patients contribute to the fit and ultimately improve predictions on target patients of interest. This represents the first boosting-based transfer learning algorithm in the survival analysis literature. We demonstrate the performance and utility of our algorithm on synthetic and real healthcare data collected at various locations.}
}
@InProceedings{bauer19a,
title = {Resampled Priors for Variational Autoencoders},
author = {Bauer, Matthias and Mnih, Andriy},
pages = {66-75},
abstract = {We propose Learned Accept/Reject Sampling (LARS), a method for constructing richer priors using rejection sampling with a learned acceptance function. This work is motivated by recent analyses of the VAE objective, which pointed out that commonly used simple priors can lead to underfitting. As the distribution induced by LARS involves an intractable normalizing constant, we show how to estimate it and its gradients efficiently. We demonstrate that LARS priors improve VAE performance on several standard datasets both when they are learned jointly with the rest of the model and when they are fitted to a pretrained model. Finally, we show that LARS can be combined with existing methods for defining flexible priors for an additional boost in performance.}
}
@InProceedings{hirt19a,
title = {Scalable Bayesian Learning for State Space Models using Variational Inference with SMC Samplers},
author = {Hirt, Marcel and Dellaportas, Petros},
pages = {76-86},
abstract = {We present a scalable approach to performing approximate fully Bayesian inference in generic state space models. The proposed method is an alternative to particle MCMC that provides fully Bayesian inference of both the dynamic latent states and the static parameters of the model. We build up on recent advances in computational statistics that combine variational methods with sequential Monte Carlo sampling and we demonstrate the advantages of performing full Bayesian inference over the static parameters rather than just performing variational EM approximations. We illustrate how our approach enables scalable inference in multivariate stochastic volatility models and self-exciting point process models that allow for flexible dynamics in the latent intensity function.}
}
@InProceedings{zhang19a,
title = {Scalable Thompson Sampling via Optimal Transport},
author = {Zhang, Ruiyi and Wen, Zheng and Chen, Changyou and Fang, Chen and Yu, Tong and Carin, Lawrence},
pages = {87-96},
abstract = {Thompson sampling (TS) is a class of algorithms for sequential decision-making, which requires maintaining a posterior distribution over a reward model. However, calculating exact posterior distributions is intractable for all but the simplest models. Consequently, how to computationally-efficiently approximate a posterior distribution is a crucial problem for scalable TS with complex models, such as neural networks. In this paper, we use distribution optimization techniques to approximate the posterior distribution, solved via Wasserstein gradient flows. Based on the framework, a principled particle-optimization algorithm is developed for TS to approximate the posterior efficiently. Our approach is scalable and does not make explicit distribution assumptions on posterior approximations. Extensive experiments on both synthetic data and large-scale real data demonstrate the superior performance of the proposed methods.}
}
@InProceedings{pierson19a,
title = {Inferring Multidimensional Rates of Aging from Cross-Sectional Data},
author = {Pierson, Emma and Koh, Pang Wei and Hashimoto, Tatsunori and Koller, Daphne and Leskovec, Jure and Eriksson, Nick and Liang, Percy},
pages = {97-107},
abstract = {Modeling how individuals evolve over time is a fundamental problem in the natural and social sciences. However, existing datasets are often cross-sectional with each individual observed only once, making it impossible to apply traditional time-series methods. Motivated by the study of human aging, we present an interpretable latent-variable model that learns temporal dynamics from cross-sectional data. Our model represents each individual's features over time as a nonlinear function of a low-dimensional, linearly-evolving latent state. We prove that when this nonlinear function is constrained to be order-isomorphic, the model family is identifiable solely from cross-sectional data provided the distribution of time-independent variation is known. On the UK Biobank human health dataset, our model reconstructs the observed data while learning interpretable rates of aging associated with diseases, mortality, and aging risk factors.}
}
@InProceedings{du19a,
title = {Interaction Detection with Bayesian Decision Tree Ensembles},
author = {Du, Junliang and Linero, Antonio R.},
pages = {108-117},
abstract = {Methods based on Bayesian decision tree ensembles have proven valuable in constructing high-quality predictions, and are particularly attractive in certain settings because they encourage low-order interaction effects. Despite adapting to the presence of low-order interactions for prediction purposes, we show that Bayesian decision tree ensembles are generally anti-conservative for the purpose of conducting interaction detection. We address this problem by introducing Dirichlet process forests (DP-Forests), which leverage the presence of low-order interactions by clustering the trees so that trees within the same cluster focus on detecting a specific interaction. We show on both simulated and benchmark data that DP-Forests perform well relative to existing interaction detection techniques for detecting low-order interactions, attaining very low false-positive and false-negative rates while maintaining the same performance for prediction using a comparable computational budget.}
}
@InProceedings{barnes19a,
title = {On the Interaction Effects Between Prediction and Clustering},
author = {Barnes, Matt and Dubrawski, Artur},
pages = {118-126},
abstract = {Machine learning systems increasingly depend on pipelines of multiple algorithms to provide high quality and well structured predictions. This paper argues interaction effects between clustering and prediction (e.g. classification, regression) algorithms can cause subtle adverse behaviors during cross-validation that may not be initially apparent. In particular, we focus on the problem of estimating the out-of-cluster (OOC) prediction loss given an approximate clustering with probabilistic error rate $p_0$. Traditional cross-validation techniques exhibit significant empirical bias in this setting, and the few attempts to estimate and correct for these effects are intractable on larger datasets. Further, no previous work has been able to characterize the conditions under which these empirical effects occur, and if they do, what properties they have. We precisely answer these questions by providing theoretical properties which hold in various settings, and prove that expected out-of-cluster loss behavior rapidly decays with even minor clustering errors. Fortunately, we are able to leverage these same properties to construct hypothesis tests and scalable estimators necessary for correcting the problem. Empirical results on benchmark datasets validate our theoretical results and demonstrate how scaling techniques provide solutions to new classes of problems.}
}
@InProceedings{lin19a,
title = {Towards a Theoretical Understanding of Hashing-Based Neural Nets},
author = {Lin, Yibo and Song, Zhao and Yang, Lin F.},
pages = {127-137},
abstract = {Parameter reduction has been a popular topic in deep learning due to the ever- increasing size of deep neural network models and the need to train and run deep neural nets on resource limited machines. Despite many efforts in this area, there were no rigorous theoretical guarantees on why existing neural net compression methods should work. In this paper, we provide provable guarantees on some hashing-based parameter reduction methods in neural nets. First, we introduce a neural net compression scheme based on random linear sketching (which is usually implemented efficiently via hashing), and show that the sketched (smaller) network is able to approximate the original network on all input data coming from any smooth well-conditioned low-dimensional manifold. The sketched network can also be trained directly via back-propagation. Next, we study the previously proposed HashedNets architecture and show that the optimization landscape of one-hidden-layer HashedNets has a local strong convexity property similar to a normal fully connected neural network. Together with the initialization algorithm developed in [51], this implies that the parameters in HashedNets can be provably recovered. We complement our theoretical results with some empirical verification.}
}
@InProceedings{zhou19a,
title = {Faster First-Order Methods for Stochastic Non-Convex Optimization on Riemannian Manifolds},
author = {Zhou, Pan and Yuan, Xiao-Tong and Feng, Jiashi},
pages = {138-147},
abstract = {SPIDER (Stochastic Path Integrated Differential EstimatoR) is an efficient gradient estimation technique developed for non-convex stochastic optimization. Although having been shown to attain nearly optimal computational complexity bounds, the SPIDER-type methods are limited to linear metric spaces. In this paper, we introduce the Riemannian SPIDER (R-SPIDER) method as a novel nonlinear-metric extension of SPIDER for efficient non-convex optimization on Riemannian manifolds. We prove that for finite-sum problems with $n$ components, R-SPIDER converges to an $\epsilon$-accuracy stationary point within $\mathcal{O}\big(\min\big(n+\frac{\sqrt{n}}{\epsilon^2},\frac{1}{\epsilon^3}\big)\big)$ stochastic gradient evaluations, which is sharper in magnitude than the prior Riemannian first-order methods. For online optimization, R-SPIDER is shown to converge with $\mathcal{O}\big(\frac{1}{\epsilon^3}\big)$ complexity which is, to the best of our knowledge, the first non-asymptotic result for online Riemannian optimization. Especially, for gradient dominated functions, we further develop a variant of R-SPIDER and prove its linear convergence rate. Numerical results demonstrate the computational efficiency of the proposed methods.}
}
@InProceedings{zhou19b,
title = {LF-PPL: A Low-Level First Order Probabilistic Programming Language for Non-Differentiable Models},
author = {Zhou, Yuan and Gram-Hansen, Bradley J and Kohn, Tobias and Rainforth, Tom and Yang, Hongseok and Wood, Frank},
pages = {148-157},
abstract = {We develop a new Low-level, First-order Probabilistic Programming Language~(LF-PPL) suited for models containing a mix of continuous, discrete, and/or piecewise-continuous variables. The key success of this language and its compilation scheme is in its ability to automatically distinguish parameters the density function is discontinuous with respect to, while further providing runtime checks for boundary crossings. This enables the introduction of new inference engines that are able to exploit gradient information, while remaining efficient for models which are not everywhere differentiable. We demonstrate this ability by incorporating a discontinuous Hamiltonian Monte Carlo (DHMC) inference engine that is able to deliver automated and efficient inference for non-differentiable models. Our system is backed up by a mathematical formalism that ensures that any model expressed in this language has a density with measure zero discontinuities to maintain the validity of the inference engine.}
}
@InProceedings{park19a,
title = {Identifiability of Generalized Hypergeometric Distribution (GHD) Directed Acyclic Graphical Models},
author = {Park, Gunwoong and Park, Hyewon},
pages = {158-166},
abstract = {We introduce a new class of identifiable DAG models where the conditional distribution of each node given its parents belongs to a family of generalized hypergeometric distributions (GHD). A family of generalized hypergeometric distributions includes a lot of discrete distributions such as the binomial, Beta-binomial, negative binomial, Poisson, hyper-Poisson, and many more. We prove that if the data are drawn from the new class of DAG models, one can fully identify the graph structure. We further present a reliable and polynomial-time algorithm that recovers the graph from finitely many data. We show through theoretical results and numerical experiments that our algorithm is statistically consistent in high-dimensional settings (p > n) if the indegree of the graph is bounded, and out-performs state-of-the-art DAG learning algorithms.}
}
@InProceedings{titsias19a,
title = {Unbiased Implicit Variational Inference},
author = {Titsias, Michalis K. and Ruiz, Francisco},
pages = {167-176},
abstract = {We develop unbiased implicit variational inference (UIVI), a method that expands the applicability of variational inference by defining an expressive variational family. UIVI considers an implicit variational distribution obtained in a hierarchical manner using a simple reparameterizable distribution whose variational parameters are defined by arbitrarily flexible deep neural networks. Unlike previous works, UIVI directly optimizes the evidence lower bound (ELBO) rather than an approximation to the ELBO. We demonstrate UIVI on several models, including Bayesian multinomial logistic regression and variational autoencoders, and show that UIVI achieves both tighter ELBO and better predictive performance than existing approaches at a similar computational cost.}
}
@InProceedings{kuzborskij19a,
title = {Efficient Linear Bandits through Matrix Sketching},
author = {Kuzborskij, Ilja and Cella, Leonardo and Cesa-Bianchi, Nicol{\`o}},
pages = {177-185},
abstract = {We prove that two popular linear contextual bandit algorithms, OFUL and Thompson Sampling, can be made efficient using Frequent Directions, a deterministic online sketching technique. More precisely, we show that a sketch of size $m$ allows a $\mathcal{O}(md)$ update time for both algorithms, as opposed to $\Omega(d^2)$ required by their non-sketched versions in general (where $d$ is the dimension of context vectors). This computational speedup is accompanied by regret bounds of order $(1+\varepsilon_m)^{3/2}d\sqrt{T}$ for OFUL and of order $\big((1+\varepsilon_m)d\big)^{3/2}\sqrt{T}$ for Thompson Sampling, where $\varepsilon_m$ is bounded by the sum of the tail eigenvalues not covered by the sketch. In particular, when the selected contexts span a subspace of dimension at most $m$, our algorithms have a regret bound matching that of their slower, non-sketched counterparts. Experiments on real-world datasets corroborate our theoretical results.}
}
@InProceedings{rowland19a,
title = {Orthogonal Estimation of Wasserstein Distances},
author = {Rowland, Mark and Hron, Jiri and Tang, Yunhao and Choromanski, Krzysztof and Sarlos, Tamas and Weller, Adrian},
pages = {186-195},
abstract = {Wasserstein distances are increasingly used in a wide variety of applications in machine learning. Sliced Wasserstein distances form an important subclass which may be estimated efficiently through one-dimensional sorting operations. In this paper, we propose a new variant of sliced Wasserstein distance, study the use of orthogonal coupling in Monte Carlo estimation of Wasserstein distances and draw connections with stratified sampling, and evaluate our approaches experimentally in a range of large-scale experiments in generative modelling and reinforcement learning.}
}
@InProceedings{du19b,
title = {Linear Convergence of the Primal-Dual Gradient Method for Convex-Concave Saddle Point Problems without Strong Convexity},
author = {Du, Simon S. and Hu, Wei},
pages = {196-205},
abstract = {We consider the convex-concave saddle point problem $\min_{x}\max_{y} f(x)+y^\top A x-g(y)$ where $f$ is smooth and convex and $g$ is smooth and strongly convex. We prove that if the coupling matrix $A$ has full column rank, the vanilla primal-dual gradient method can achieve linear convergence even if $f$ is not strongly convex. Our result generalizes previous work which either requires $f$ and $g$ to be quadratic functions or requires proximal mappings for both $f$ and $g$. We adopt a novel analysis technique that in each iteration uses a "ghost" update as a reference, and show that the iterates in the primal-dual gradient method converge to this "ghost" sequence. Using the same technique we further give an analysis for the primal-dual stochastic variance reduced gradient method for convex-concave saddle point problems with a finite-sum structure.}
}
@InProceedings{sakaue19a,
title = {Greedy and IHT Algorithms for Non-convex Optimization with Monotone Costs of Non-zeros},
author = {Sakaue, Shinsaku},
pages = {206-215},
abstract = {Non-convex optimization methods, such as greedy-style algorithms and iterative hard thresholding (IHT), for $\ell_0$-constrained minimization have been extensively studied thanks to their high empirical performances and strong guarantees. However, few works have considered non-convex optimization with general non-zero patterns; this is unfortunate since various non-zero patterns are quite common in practice. In this paper, we consider the case where non-zero patterns are specified by monotone set functions. We first prove an approximation guarantee of a cost-benefit greedy (CBG) algorithm by using the {\it weak submodularity} of the problem. We then consider an IHT-style algorithm, whose projection step uses CBG, and prove its convergence guarantee. We also provide many applications and experimental results that confirm the advantages of the algorithms introduced.}
}
@InProceedings{lang19a,
title = {Block Stability for MAP Inference},
author = {Lang, Hunter and Sontag, David and Vijayaraghavan, Aravindan},
pages = {216-225},
abstract = {Recent work (Lang et al., 2018) has shown that some popular approximate MAP inference algorithms perform very well when the input instance is stable. The simplest stability condition assumes that the MAP solution does not change at all when some of the pairwise potentials are adversarially perturbed. Unfortunately, this strong condition does not seem to hold in practice. We introduce a significantly more relaxed condition that only requires portions of an input instance to be stable. Under this block stability condition, we prove that the pairwise LP relaxation is persistent on the stable blocks. We complement our theoretical results with an evaluation of real-world examples from computer vision, and we find that these instances have large stable regions.}
}
@InProceedings{yang19a,
title = {A Stein--Papangelou Goodness-of-Fit Test for Point Processes},
author = {Yang, Jiasen and Rao, Vinayak and Neville, Jennifer},
pages = {226-235},
abstract = {Point processes provide a powerful framework for modeling the distribution and interactions of events in time or space. Their flexibility has given rise to a variety of sophisticated models in statistics and machine learning, yet model diagnostic and criticism techniques remain underdeveloped. In this work, we propose a general Stein operator for point processes based on the Papangelou conditional intensity function. We then establish a kernel goodness-of-fit test by defining a Stein discrepancy measure for general point processes. Notably, our test also applies to non-Poisson point processes whose intensity functions contain intractable normalization constants due to the presence of complex interactions among points. We apply our proposed test to several point process models, and show that it outperforms a two-sample test based on the maximum mean discrepancy.}
}
@InProceedings{choromanski19a,
title = {KAMA-NNs: Low-dimensional Rotation Based Neural Networks},
author = {Choromanski, Krzysztof and Pacchiano, Aldo and Pennington, Jeffrey and Tang, Yunhao},
pages = {236-245},
abstract = {We present new architectures for feedforward neural networks built from products of learned or random low-dimensional rotations that offer substantial space compression and computational speedups in comparison to the unstructured baselines. Models using them are also competitive with the baselines and often, due to imposed orthogonal structure, outperform baselines accuracy-wise. We propose to use our architectures in two settings. We show that in the non-adaptive scenario (random neural networks) they lead to asymptotically more accurate, space-efficient and faster estimators of the so-called PNG-kernels (for any activation function defining the PNG). This generalizes several recent theoretical results about orthogonal estimators (e.g. orthogonal JLTs, orthogonal estimators of angular kernels and more). In the adaptive setting we propose efficient algorithms for learning products of low-dimensional rotations and show how our architectures can be used to improve space and time complexity of state of the art reinforcement learning (RL) algorithms (e.g. PPO, TRPO). Here they offer up to 7x compression of the network in comparison to the unstructured baselines and outperform reward-wise state of the art structured neural networks offering similar computational gains and based on low displacement rank matrices.}
}
@InProceedings{berthet19a,
title = {Statistical Windows in Testing for the Initial Distribution of a Reversible {Markov} Chain},
author = {Berthet, Quentin and Kanade, Varun},
pages = {246--255},
abstract = {We study the problem of hypothesis testing between two discrete distributions, where we only have access to samples after the action of a known reversible Markov chain, playing the role of noise. We derive instance-dependent minimax rates for the sample complexity of this problem, and show how its dependence in time is related to the spectral properties of the Markov chain. We show that there exists a wide statistical window, in terms of sample complexity for hypothesis testing between different pairs of initial distributions. We illustrate these results in several concrete examples.}
}
@InProceedings{tassarotti19a,
title = {Sketching for Latent {Dirichlet}-Categorical Models},
author = {Tassarotti, Joseph and Tristan, Jean-Baptiste and Wick, Michael},
pages = {256--265},
abstract = {Recent work has explored transforming data sets into smaller, approximate summaries in order to scale Bayesian inference. We examine a related problem in which the parameters of a Bayesian model are very large and expensive to store in memory, and propose more compact representations of parameter values that can be used during inference. We focus on a class of graphical models that we refer to as latent Dirichlet-Categorical models, and show how a combination of two sketching algorithms known as count-min sketch and approximate counters provide an efficient representation for them. We show that this sketch combination -- which, despite having been used before in NLP applications, has not been previously analyzed -- enjoys desirable properties. We prove that for this class of models, when the sketches are used during Markov Chain Monte Carlo inference, the equilibrium of sketched MCMC converges to that of the exact chain as sketch parameters are tuned to reduce the error rate.}
}
@InProceedings{ardywibowo19a,
title = {Adaptive Activity Monitoring with Uncertainty Quantification in Switching {Gaussian} Process Models},
author = {Ardywibowo, Randy and Zhao, Guang and Wang, Zhangyang and Mortazavi, Bobak and Huang, Shuai and Qian, Xiaoning},
pages = {266--275},
abstract = {Emerging wearable sensors have enabled the unprecedented ability to continuously monitor human activities for healthcare purposes. However, with so many ambient sensors collecting different measurements, it becomes important not only to maintain good monitoring accuracy, but also low power consumption to ensure sustainable monitoring. This power-efficient sensing scheme can be achieved by deciding which group of sensors to use at a given time, requiring an accurate characterization of the trade-off between sensor energy usage and the uncertainty in ignoring certain sensor signals while monitoring. To address this challenge in the context of activity monitoring, we have designed an adaptive activity monitoring framework. We first propose a switching Gaussian process to model the observed sensor signals emitting from the underlying activity states. To efficiently compute the Gaussian process model likelihood and quantify the context prediction uncertainty, we propose a block circulant embedding technique and use Fast Fourier Transforms (FFT) for inference. By computing the Bayesian loss function tailored to switching Gaussian processes, an adaptive monitoring procedure is developed to select features from available sensors that optimize the trade-off between sensor power consumption and the prediction performance quantified by state prediction entropy. We demonstrate the effectiveness of our framework on the popular benchmark of UCI Human Activity Recognition using Smartphones.}
}
@InProceedings{iyer19a,
title = {Near Optimal Algorithms for Hard Submodular Programs with Discounted Cooperative Costs},
author = {Iyer, Rishabh and Bilmes, Jeffrey},
pages = {276--285},
abstract = {In this paper, we investigate a class of submodular problems which in general are very hard. These include minimizing a submodular cost function under combinatorial constraints, which include cuts, matchings, paths, etc., optimizing a submodular function under submodular cover and submodular knapsack constraints, and minimizing a ratio of submodular functions. All these problems appear in several real world problems but have hardness factors of $\Omega(\sqrt{n})$ for general submodular cost functions. We show how we can achieve constant approximation factors when we restrict the cost functions to low rank sums of concave over modular functions. A wide variety of machine learning applications are very naturally modeled via this subclass of submodular functions. Our work therefore provides a tighter connection between theory and practice by enabling theoretically satisfying guarantees for a rich class of expressible, natural, and useful submodular cost models. We empirically demonstrate the utility of our models on real world problems of cooperative image matching and sensor placement with cooperative costs.}
}
@InProceedings{garber19a,
title = {Fast Stochastic Algorithms for Low-rank and Nonsmooth Matrix Problems},
author = {Garber, Dan and Kaplan, Atara},
pages = {286--294},
abstract = {Composite convex optimization problems which include both a nonsmooth term and a low-rank promoting term have important applications in machine learning and signal processing, such as when one wishes to recover an unknown matrix that is simultaneously low-rank and sparse. However, such problems are highly challenging to solve in large-scale: the low-rank promoting term prohibits efficient implementations of proximal methods for composite optimization and even simple subgradient methods. On the other hand, methods which are tailored for low-rank optimization, such as conditional gradient-type methods, which are often applied to a smooth approximation of the nonsmooth objective, are slow since their runtime scales with both the large Lipschitz parameter of the smoothed gradient vector and with $1/\epsilon$, where $\epsilon$ is the target accuracy. In this paper we develop efficient algorithms for \textit{stochastic} optimization of a strongly-convex objective which includes both a nonsmooth term and a low-rank promoting term. In particular, to the best of our knowledge, we present the first algorithm that enjoys all following critical properties for large-scale problems: i) (nearly) optimal sample complexity, ii) each iteration requires only a single \textit{low-rank} SVD computation, and iii) overall number of thin-SVD computations scales only with $\log{1/\epsilon}$ (as opposed to $\textrm{poly}(1/\epsilon)$ in previous methods). We also give an algorithm for the closely-related finite-sum setting. We empirically demonstrate our results on the problem of recovering a simultaneously low-rank and sparse matrix.}
}
@InProceedings{garber19b,
title = {Logarithmic Regret for Online Gradient Descent Beyond Strong Convexity},
author = {Garber, Dan},
pages = {295--303},
abstract = {Hoffman's classical result gives a bound on the distance of a point from a convex and compact polytope in terms of the magnitude of violation of the constraints. Recently, several results showed that Hoffman's bound can be used to derive strongly-convex-like rates for first-order methods for \textit{offline} convex optimization of curved, though not strongly convex, functions, over polyhedral sets. In this work, we use this classical result for the first time to obtain faster rates for \textit{online convex optimization} over polyhedral sets with curved convex, though not strongly convex, loss functions. We show that under several reasonable assumptions on the data, the standard \textit{Online Gradient Descent} algorithm guarantees logarithmic regret. To the best of our knowledge, the only previous algorithm to achieve logarithmic regret in the considered settings is the \textit{Online Newton Step} algorithm which requires quadratic (in the dimension) memory and at least quadratic runtime per iteration, which greatly limits its applicability to large-scale problems. In particular, our results hold for \textit{semi-adversarial} settings in which the data is a combination of an arbitrary (adversarial) sequence and a stochastic sequence, which might provide reasonable approximation for many real-world sequences, or under a natural assumption that the data is low-rank. We demonstrate via experiments that the regret of OGD is indeed comparable to that of ONS (and even far better) on curved though not strongly-convex losses.}
}
@InProceedings{hanzely19a,
title = {Accelerated Coordinate Descent with Arbitrary Sampling and Best Rates for Minibatches},
author = {Hanzely, Filip and Richtarik, Peter},
pages = {304--312},
abstract = {Accelerated coordinate descent is a widely popular optimization algorithm due to its efficiency on large-dimensional problems. It achieves state-of-the-art complexity on an important class of empirical risk minimization problems. In this paper we design and analyze an accelerated coordinate descent (\texttt{ACD}) method which in each iteration updates a random subset of coordinates according to an arbitrary but fixed probability law, which is a parameter of the method. While mini-batch variants of \texttt{ACD} are more popular and relevant in practice, there is no importance sampling for \texttt{ACD} that outperforms the standard uniform mini-batch sampling. Through insights enabled by our general analysis, we design new importance sampling for mini-batch \texttt{ACD} which significantly outperforms previous state-of-the-art minibatch \texttt{ACD} in practice. We prove a rate that is at most $\mathcal{O}(\sqrt{\tau})$ times worse than the rate of minibatch \texttt{ACD} with uniform sampling, but can be $\mathcal{O}(n/\tau)$ times better, where $\tau$ is the minibatch size. Since in modern supervised learning training systems it is standard practice to choose $\tau \ll n$, and often $\tau=\mathcal{O}(1)$, our method can lead to dramatic speedups. Lastly, we obtain similar results for minibatch nonaccelerated \texttt{CD} as well, achieving improvements on previous best rates.}
}
@InProceedings{mukhoty19a,
title = {Globally-convergent Iteratively Reweighted Least Squares for Robust Regression Problems},
author = {Mukhoty, Bhaskar and Gopakumar, Govind and Jain, Prateek and Kar, Purushottam},
pages = {313--322},
abstract = {We provide the first global model recovery results for the IRLS (iteratively reweighted least squares) heuristic for robust regression problems. IRLS is known to offer excellent performance, despite bad initializations and data corruption, for several parameter estimation problems. Existing analyses of IRLS frequently require careful initialization, thus offering only local convergence guarantees. We remedy this by proposing augmentations to the basic IRLS routine that not only offer guaranteed global recovery, but in practice also outperform state-of-the-art algorithms for robust regression. Our routines are more immune to hyperparameter misspecification in basic regression tasks, as well as applied tasks such as linear-armed bandit problems. Our theoretical analyses rely on a novel extension of the notions of strong convexity and smoothness to weighted strong convexity and smoothness, and establishing that sub-Gaussian designs offer bounded weighted condition numbers. These notions may be useful in analyzing other algorithms as well.}
}
@InProceedings{hollocou19a,
title = {Modularity-based Sparse Soft Graph Clustering},
author = {Hollocou, Alexandre and Bonald, Thomas and Lelarge, Marc},
pages = {323--332},
abstract = {Clustering is a central problem in machine learning for which graph-based approaches have proven their efficiency. In this paper, we study a relaxation of the modularity maximization problem, well-known in the graph partitioning literature. A solution of this relaxation gives to each element of the dataset a probability to belong to a given cluster, whereas a solution of the standard modularity problem is a partition. We introduce an efficient optimization algorithm to solve this relaxation, that is both memory efficient and local. Furthermore, we prove that our method includes, as a special case, the Louvain optimization scheme, a state-of-the-art technique to solve the traditional modularity problem. Experiments on both synthetic and real-world data illustrate that our approach provides meaningful information on various types of data.}
}
@InProceedings{jankowiak19a,
title = {Pathwise Derivatives for Multivariate Distributions},
author = {Jankowiak, Martin and Karaletsos, Theofanis},
pages = {333--342},
abstract = {We exploit the link between the transport equation and derivatives of expectations to construct efficient pathwise gradient estimators for multivariate distributions. We focus on two main threads. First, we use null solutions of the transport equation to construct adaptive control variates that can be used to construct gradient estimators with reduced variance. Second, we consider the case of multivariate mixture distributions. In particular we show how to compute pathwise derivatives for mixtures of multivariate Normal distributions with arbitrary means and diagonal covariances. We demonstrate in a variety of experiments in the context of variational inference that our gradient estimators can outperform other methods, especially in high dimensions.}
}
@InProceedings{liu19a,
title = {Distributed Inexact {Newton}-type Pursuit for Non-convex Sparse Learning},
author = {Liu, Bo and Yuan, Xiao-Tong and Wang, Lezi and Liu, Qingshan and Huang, Junzhou and Metaxas, Dimitris N.},
pages = {343--352},
abstract = {In this paper, we present a sample distributed greedy pursuit method for non-convex sparse learning under cardinality constraint. Given the training samples uniformly randomly partitioned across multiple machines, the proposed method alternates between local inexact sparse minimization of a Newton-type approximation and centralized global results aggregation. Theoretical analysis shows that for a general class of convex functions with Lipschitz continuous Hessian, the method converges linearly with contraction factor scaling inversely to the local data size; whilst the communication complexity required to reach desirable statistical accuracy scales logarithmically with respect to the number of machines for some popular statistical learning models. For nonconvex objective functions, up to a local estimation error, our method can be shown to converge to a local stationary sparse solution with sub-linear communication complexity. Numerical results demonstrate the efficiency and accuracy of our method when applied to large-scale sparse learning tasks including deep neural nets pruning.}
}
@InProceedings{chang19a,
title = {Vine copula structure learning via {Monte Carlo} tree search},
author = {Chang, Bo and Pan, Shenyi and Joe, Harry},
pages = {353--361},
abstract = {Monte Carlo tree search (MCTS) has been widely adopted in various game and planning problems. It can efficiently explore a search space with guided random sampling. In statistics, vine copulas are flexible multivariate dependence models that adopt vine structures, which are based on a hierarchy of trees to express conditional dependence, and bivariate copulas on the edges of the trees. The vine structure learning problem has been challenging due to the large search space. To tackle this problem, we propose a novel approach to learning vine structures using MCTS. The proposed method has significantly better performance over the existing methods under various experimental setups.}
}
@InProceedings{dong19a,
title = {Blind Demixing via {Wirtinger} Flow with Random Initialization},
author = {Dong, Jialin and Shi, Yuanming},
pages = {362--370},
abstract = {This paper concerns the problem of demixing a series of source signals from the sum of bilinear measurements. This problem spans diverse areas such as communication, imaging processing, machine learning, etc. However, semidefinite programming for blind demixing is prohibitive to large-scale problems due to high computational complexity and storage cost. Although several efficient algorithms have been developed recently that enjoy the benefits of fast convergence rates and even regularization free, they still call for spectral initialization. To find simple initialization approach that works equally well as spectral initialization, we propose to solve blind demixing problem via Wirtinger flow with random initialization, which yields a natural implementation. To reveal the efficiency of this algorithm, we provide the global convergence guarantee concerning randomly initialized Wirtinger flow for blind demixing. Specifically, it shows that with sufficient samples, the iterates of randomly initialized Wirtinger flow can enter a local region that enjoys strong convexity and strong smoothness within a few iterations at the first stage. At the second stage, iterates of randomly initialized Wirtinger flow further converge linearly to the ground truth.}
}
@InProceedings{hiranandani19a,
title = {Performance Metric Elicitation from Pairwise Classifier Comparisons},
author = {Hiranandani, Gaurush and Boodaghians, Shant and Mehta, Ruta and Koyejo, Oluwasanmi},
pages = {371--379},
abstract = {Given a binary prediction problem, which performance metric should the classifier optimize? We address this question by formalizing the problem of Metric Elicitation. The goal of metric elicitation is to discover the performance metric of a practitioner, which reflects her innate rewards (costs) for correct (incorrect) classification. In particular, we focus on eliciting binary classification performance metrics from pairwise feedback, where a practitioner is queried to provide relative preference between two classifiers. By exploiting key geometric properties of the space of confusion matrices, we obtain provably query efficient algorithms for eliciting linear and linear-fractional performance metrics. We further show that our method is robust to feedback and finite sample noise.}
}
@InProceedings{jung19a,
title = {Analysis of Network {Lasso} for Semi-Supervised Regression},
author = {Jung, Alexander and Vesselinova, Natalia},
pages = {380--387},
abstract = {We apply network Lasso to semi-supervised regression problems involving network-structured data. This approach lends quite naturally to highly scalable learning algorithms in the form of message passing over an empirical graph which represents the network structure of the data. By using a simple non-parametric regression model, which is motivated by a clustering hypothesis, we provide an analysis of the estimation error incurred by network Lasso. This analysis reveals conditions on the network structure and the available training data which guarantee network Lasso to be accurate. Remarkably, the accuracy of network Lasso is related to the existence of sufficiently large network flows over the empirical graph. Thus, our analysis reveals a connection between network Lasso and maximum network flow problems.}
}
@InProceedings{kargas19a,
title = {Learning Mixtures of Smooth Product Distributions: Identifiability and Algorithm},
author = {Kargas, Nikos and Sidiropoulos, Nicholas D.},
pages = {388--396},
abstract = {We study the problem of learning a mixture model of non-parametric product distributions. The problem of learning a mixture model is that of finding the component distributions along with the mixing weights using observed samples generated from the mixture. The problem is well-studied in the parametric setting, i.e., when the component distributions are members of a parametric family - such as Gaussian distributions. In this work, we focus on multivariate mixtures of non-parametric product distributions and propose a two-stage approach which recovers the component distributions of the mixture under a smoothness condition. Our approach builds upon the identifiability properties of the canonical polyadic (low-rank) decomposition of tensors, in tandem with Fourier and Shannon-Nyquist sampling staples from signal processing. We demonstrate the effectiveness of the approach on synthetic and real datasets.}
}
@InProceedings{shen19a,
title = {Robust Matrix Completion from Quantized Observations},
author = {Shen, Jie and Awasthi, Pranjal and Li, Ping},
pages = {397--407},
abstract = {1-bit matrix completion refers to the problem of recovering a real-valued low-rank matrix from a small fraction of its sign patterns. In many real-world applications, however, the observations are not only highly quantized, but also grossly corrupted. In this work, we consider the noisy statistical model where each observed entry can be flipped with some probability after quantization. We propose a simple maximum likelihood estimator which is shown to be robust to the sign flipping noise. In particular, we prove an upper bound on the statistical error, showing that with overwhelming probability $n = O(poly(1-2E[\tau])^{-2} rd \log d)$ samples are sufficient for accurate recovery, where $r$ and $d$ are the rank and dimension of the underlying matrix respectively, and $\tau \in [0, 1/2)$ is a random variable that parameterizes the sign flipping noise. Furthermore, a lower bound is established showing that the obtained sample complexity is near-optimal for prevalent statistical models. Finally, we substantiate our theoretical findings with a comprehensive study on synthetic and realistic data sets, and demonstrate the state-of-the-art performance.}
}
@InProceedings{mariet19a,
title = {Foundations of Sequence-to-Sequence Modeling for Time Series},
author = {Mariet, Zelda and Kuznetsov, Vitaly},
pages = {408--417},
abstract = {The availability of large amounts of time series data, paired with the performance of deep-learning algorithms on a broad class of problems, has recently led to significant interest in the use of sequence-to-sequence models for time series forecasting. We provide the first theoretical analysis of this time series forecasting framework. We include a comparison of sequence-to-sequence modeling to classical time series models, and as such our theory can serve as a quantitative guide for practitioners choosing between different modeling methodologies.}
}
@InProceedings{cao19a,
title = {Nearly Optimal Adaptive Procedure with Change Detection for Piecewise-Stationary Bandit},
author = {Cao, Yang and Wen, Zheng and Kveton, Branislav and Xie, Yao},
pages = {418--427},
abstract = {Multi-armed bandit (MAB) is a class of online learning problems where a learning agent aims to maximize its expected cumulative reward while repeatedly selecting to pull arms with unknown reward distributions. We consider a scenario where the reward distributions may change in a piecewise-stationary fashion at unknown time steps. We show that by incorporating a simple change-detection component with classic UCB algorithms to detect and adapt to changes, our so-called M-UCB algorithm can achieve nearly optimal regret bound on the order of $O(\sqrt{MKT\log T})$, where $T$ is the number of time steps, $K$ is the number of arms, and $M$ is the number of stationary segments. Comparison with the best available lower bound shows that our M-UCB is nearly optimal in $T$ up to a logarithmic factor. We also compare M-UCB with the state-of-the-art algorithms in numerical experiments using a public Yahoo! dataset and a real-world digital marketing dataset to demonstrate its superior performance.}
}
@InProceedings{zhao19a,
title = {An Optimal Algorithm for Stochastic Three-Composite Optimization},
author = {Zhao, Renbo and Haskell, William B. and Tan, Vincent Y. F.},
pages = {428--437},
abstract = {We develop an optimal primal-dual first-order algorithm for a class of stochastic three-composite convex minimization problems. The convergence rate of our method not only improves upon the existing methods, but also matches a lower bound derived for all first-order methods that solve this problem. We extend our proposed algorithm to solve a composite stochastic program with any finite number of nonsmooth functions. In addition, we generalize an optimal stochastic alternating direction method of multipliers (SADMM) algorithm proposed for the two-composite case to solve this problem, and establish its connection to our optimal primal-dual algorithm. We perform extensive numerical experiments on a variety of machine learning applications to demonstrate the superiority of our method vis-{\`a}-vis the state-of-the-art.}
}
@InProceedings{cheung19a,
title = {A {Thompson} Sampling Algorithm for Cascading Bandits},
author = {Cheung, Wang Chi and Tan, Vincent and Zhong, Zixin},
pages = {438--447},
abstract = {We design and analyze TS-Cascade, a Thompson sampling algorithm for the cascading bandit problem. In TS-Cascade, Bayesian estimates of the click probability are constructed using a univariate Gaussian; this leads to a more efficient exploration procedure vis-{\`a}-vis existing UCB-based approaches. We also incorporate the empirical variance of each item's click probability into the Bayesian updates. These two novel features allow us to prove an expected regret bound of the form $\tilde{O}(\sqrt{KLT})$ where $L$ and $K$ are the number of ground items and the number of items in the chosen list respectively and $T\ge L$ is the number of Thompson sampling update steps. This matches the state-of-the-art regret bounds for UCB-based algorithms. More importantly, it is the first theoretical guarantee on a Thompson sampling algorithm for any stochastic combinatorial bandit problem model with partial feedback. Empirical experiments demonstrate superiority of TS-Cascade compared to existing UCB-based procedures in terms of the expected cumulative regret and the time complexity.}
}
@InProceedings{wu19a,
title = {Lifelong Optimization with Low Regret},
author = {Wu, Yi-Shan and Wang, Po-An and Lu, Chi-Jen},
pages = {448--456},
abstract = {In this work, we study a problem arising from two lines of works: online optimization and lifelong learning. In the problem, there is a sequence of tasks arriving sequentially, and within each task, we have to make decisions one after one and then suffer corresponding losses. The tasks are related as they share some common representation, but they are different as each requires a different predictor on top of the representation. As learning a representation is usually costly in lifelong learning scenarios, the goal is to learn it continuously through time across different tasks, making the learning of later tasks easier than previous ones. We provide such learning algorithms with good regret bounds which can be seen as natural generalization of prior works on online optimization.}
}
@InProceedings{pandit19a,
title = {Sparse Multivariate {Bernoulli} Processes in High Dimensions},
author = {Pandit, Parthe and Sahraee-Ardakan, Mojtaba and Amini, Arash and Rangan, Sundeep and Fletcher, Alyson K},
pages = {457--466},
abstract = {We consider the problem of estimating the parameters of a multivariate Bernoulli process with auto-regressive feedback in the high-dimensional setting where the number of samples available is much less than the number of parameters. This problem arises in learning interconnections of networks of dynamical systems with spiking or binary valued data. We also allow the process to depend on its past up to a lag p, for a general $p \geq 1$, allowing for more realistic modeling in many applications. We propose and analyze an $\ell_1$-regularized maximum likelihood (ML) estimator under the assumption that the parameter tensor is approximately sparse. Rigorous analysis of such estimators is made challenging by the dependent and non-Gaussian nature of the process as well as the presence of the nonlinearities and multi-level feedback. We derive precise upper bounds on the mean-squared estimation error in terms of the number of samples, dimensions of the process, the lag $p$ and other key statistical properties of the model. The ideas presented can be used in the rigorous high-dimensional analysis of regularized $M$-estimators for other sparse nonlinear and non-Gaussian processes with long-range dependence.}
}
@InProceedings{zimmert19a,
title = {An Optimal Algorithm for Stochastic and Adversarial Bandits},
author = {Zimmert, Julian and Seldin, Yevgeny},
pages = {467--475},
abstract = {We derive an algorithm that achieves the optimal (up to constants) pseudo-regret in both adversarial and stochastic multi-armed bandits without prior knowledge of the regime and time horizon. The algorithm is based on online mirror descent with Tsallis entropy regularizer. We provide a complete characterization of such algorithms and show that Tsallis entropy with power $\alpha = 1/2$ achieves the goal. In addition, the proposed algorithm enjoys improved regret guarantees in two intermediate regimes: the moderately contaminated stochastic regime defined by Seldin and Slivkins [22] and the stochastically constrained adversary studied by Wei and Luo [26]. The algorithm also obtains adversarial and stochastic optimality in the utility-based dueling bandit setting. We provide empirical evaluation of the algorithm demonstrating that it outperforms Ucb1 and Exp3 in stochastic environments. In certain adversarial regimes the algorithm significantly outperforms Ucb1 and Thompson Sampling, which exhibit close to linear regret.}
}
@InProceedings{kleinegesse19a,
title = {Efficient {Bayesian} Experimental Design for Implicit Models},
author = {Kleinegesse, Steven and Gutmann, Michael U.},
pages = {476--485},
abstract = {Bayesian experimental design involves the optimal allocation of resources in an experiment, with the aim of optimising cost and performance. For implicit models, where the likelihood is intractable but sampling from the model is possible, this task is particularly difficult and therefore largely unexplored. This is mainly due to technical difficulties associated with approximating posterior distributions and utility functions. We devise a novel experimental design framework for implicit models that improves upon previous work in two ways. First, we use the mutual information between parameters and data as the utility function, which has previously not been feasible. We achieve this by utilising Likelihood-Free Inference by Ratio Estimation (LFIRE) to approximate posterior distributions, instead of the traditional approximate Bayesian computation or synthetic likelihood methods. Secondly, we use Bayesian optimisation in order to solve the optimal design problem, as opposed to the typically used grid search or sampling-based methods. We find that this increases efficiency and allows us to consider higher design dimensions.}
}
@InProceedings{adolphs19a,
title = {Local Saddle Point Optimization: A Curvature Exploitation Approach},
author = {Adolphs, Leonard and Daneshmand, Hadi and Lucchi, Aurelien and Hofmann, Thomas},
pages = {486--495},
abstract = {Gradient-based optimization methods are the most popular choice for finding local optima for classical minimization and saddle point problems. Here, we highlight a systemic issue of gradient dynamics that arise for saddle point problems, namely the presence of undesired stable stationary points that are no local optima. We propose a novel optimization approach that exploits curvature information in order to escape from these undesired stationary points. We prove that different optimization methods, including gradient method and Adagrad, equipped with curvature exploitation can escape non-optimal stationary points. We also provide empirical results on common saddle point problems which confirm the advantage of using curvature exploitation.}
}
@InProceedings{marx19a,
title = {Testing Conditional Independence on Discrete Data using Stochastic Complexity},
author = {Marx, Alexander and Vreeken, Jilles},
pages = {496--505},
abstract = {Testing for conditional independence is a core aspect of constraint-based causal discovery. Although commonly used tests are perfect in theory, they often fail to reject independence in practice---especially when conditioning on multiple variables. We focus on discrete data and propose a new test based on the notion of algorithmic independence that we instantiate using stochastic complexity. Amongst others, we show that our proposed test, SCI, is an asymptotically unbiased as well as L2 consistent estimator for conditional mutual information (CMI). Further, we show that SCI can be reformulated to find a sensible threshold for CMI that works well on limited samples. Empirical evaluation shows that SCI has a lower type II error than commonly used tests. As a result, we obtain a higher recall when we use SCI in causal discovery algorithms, without compromising the precision.}
}
@InProceedings{staib19a,
title = {Distributionally Robust Submodular Maximization},
author = {Staib, Matthew and Wilder, Bryan and Jegelka, Stefanie},
pages = {506--516},
abstract = {Submodular functions have applications throughout machine learning, but in many settings, we do not have direct access to the underlying function f. We focus on stochastic functions that are given as an expectation of functions over a distribution P. In practice, we often have only a limited set of samples $f_i$ from P. The standard approach indirectly optimizes f by maximizing the sum of $f_i$. However, this ignores generalization to the true (unknown) distribution. In this paper, we achieve better performance on the actual underlying function f by directly optimizing a combination of bias and variance. Algorithmically, we accomplish this by showing how to carry out distributionally robust optimization (DRO) for submodular functions, providing efficient algorithms backed by theoretical guarantees which leverage several novel contributions to the general theory of DRO. We also show compelling empirical evidence that DRO improves generalization to the unknown stochastic submodular function.}
}
@InProceedings{zhu19a,
title = {A Robust Zero-Sum Game Framework for Pool-based Active Learning},
author = {Zhu, Dixian and Li, Zhe and Wang, Xiaoyu and Gong, Boqing and Yang, Tianbao},
pages = {517--526},
abstract = {In this paper, we present a novel robust zero- sum game framework for pool-based active learning grounded on advanced statistical learning theory. Pool-based active learning usually consists of two components, namely, learning of a classifier given labeled data and querying of unlabeled data for labeling. Most previous studies on active learning consider these as two separate tasks and propose various heuristics for selecting important unlabeled data for labeling, which may render the selection of unlabeled examples sub-optimal for minimizing the classification error. In contrast, the present work formulates active learning as a unified optimization framework for learning the classifier, i.e., the querying of labels and the learning of models are unified to minimize a common objective for statistical learning. In addition, the proposed method avoids the issues of many previous algorithms such as inefficiency, sampling bias and sensitivity to imbalanced data distribution. Besides theoretical analysis, we conduct extensive experiments on benchmark datasets and demonstrate the superior performance of the proposed active learning method compared with the state-of-the-art methods.}
}
@InProceedings{johansson19a,
title = {Support and Invertibility in Domain-Invariant Representations},
author = {Johansson, Fredrik D and Sontag, David and Ranganath, Rajesh},
pages = {527--536},
abstract = {Learning domain-invariant representations has become a popular approach to unsupervised domain adaptation and is often justified by invoking a particular suite of theoretical results. We argue that there are two significant flaws in such arguments. First, the results in question hold only for a fixed representation and do not account for information lost in non-invertible transformations. Second, domain invariance is often a far too strict requirement and does not always lead to consistent estimation, even under strong and favorable assumptions. In this work, we give generalization bounds for unsupervised domain adaptation that hold for any representation function by acknowledging the cost of non-invertibility. In addition, we show that penalizing distance between densities is often wasteful and propose a bound based on measuring the extent to which the support of the source domain covers the target domain. We perform experiments on well-known benchmarks that illustrate the short-comings of current standard practice.}
}
@InProceedings{aglietti19a,
title = {Efficient Inference in Multi-task {Cox} Process Models},
author = {Aglietti, Virginia and Damoulas, Theodoros and Bonilla, Edwin V.},
pages = {537--546},
abstract = {We generalize the log Gaussian Cox process (LGCP) framework to model multiple correlated point data jointly. The observations are treated as realizations of multiple LGCPs, whose log intensities are given by linear combinations of latent functions drawn from Gaussian process priors. The combination coefficients are also drawn from Gaussian processes and can incorporate additional dependencies. We derive closed-form expressions for the moments of the intensity functions and develop an efficient variational inference algorithm that is orders of magnitude faster than competing deterministic and stochastic approximations of multivariate LGCPs, coregionalization models, and multi-task permanental processes. Our approach outperforms these benchmarks in multiple problems, offering the current state of the art in modeling multivariate point processes.}
}
@InProceedings{laude19a,
title = {Optimization of Inf-Convolution Regularized Nonconvex Composite Problems},
author = {Laude, Emanuel and Wu, Tao and Cremers, Daniel},
pages = {547--556},
abstract = {In this work, we consider nonconvex composite problems that involve inf-convolution with a Legendre function, which gives rise to an anisotropic generalization of the proximal mapping and Moreau-envelope. In a convex setting such problems can be solved via alternating minimization of a splitting formulation, where the consensus constraint is penalized with a Legendre function. In contrast, for nonconvex models it is in general unclear that this approach yields stationary points to the infimal convolution problem. To this end we analytically investigate local regularity properties of the Moreau-envelope function under prox-regularity, which allows us to establish the equivalence between stationary points of the splitting model and the original inf-convolution model. We apply our theory to characterize stationary points of the penalty objective, which is minimized by the elastic averaging SGD (EASGD) method for distributed training, showing that perfect consensus between the workers is attainable via a finite penalty parameter. Numerically, we demonstrate the practical relevance of the proposed approach on the important task of distributed training of deep neural networks.}
}
@InProceedings{li19a,
title = {On Connecting Stochastic Gradient {MCMC} and Differential Privacy},
author = {Li, Bai and Chen, Changyou and Liu, Hao and Carin, Lawrence},
pages = {557--566},
abstract = {Concerns related to data security and confidentiality have been raised when applying machine learning to real-world applications. Differential privacy provides a principled and rigorous privacy guarantee for machine learning models. While it is common to inject noise to design a model satisfying a required differential-privacy property, it is generally hard to balance the trade-off between privacy and utility. We show that stochastic gradient Markov chain Monte Carlo (SG-MCMC) -- a class of scalable Bayesian posterior sampling algorithms -- satisfies strong differential privacy, when carefully chosen stepsizes are employed. We develop theory on the performance of the proposed differentially-private SG-MCMC method. We conduct experiments to support our analysis, and show that a standard SG-MCMC sampler with minor modification can reach state-of-the-art performance in terms of both privacy and utility on Bayesian learning.}
}
@InProceedings{carter19a,
title = {What made you do this? Understanding black-box decisions with sufficient input subsets},
author = {Carter, Brandon and Mueller, Jonas and Jain, Siddhartha and Gifford, David},
pages = {567--576},
abstract = {Local explanation frameworks aim to rationalize particular decisions made by a black-box prediction model. Existing techniques are often restricted to a specific type of predictor or based on input saliency, which may be undesirably sensitive to factors unrelated to the model's decision making process. We instead propose sufficient input subsets that identify minimal subsets of features whose observed values alone suffice for the same decision to be reached, even if all other input feature values are missing. General principles that globally govern a model's decision-making can also be revealed by searching for clusters of such input patterns across many data points. Our approach is conceptually straightforward, entirely model-agnostic, simply implemented using instance-wise backward selection, and able to produce more concise rationales than existing techniques. We demonstrate the utility of our interpretation method on various neural network models trained on text, image, and genomic data.}
}
@InProceedings{wang19a,
title = {Computation Efficient Coded Linear Transform},
author = {Wang, Sinong and Liu, Jiashang and Shroff, Ness and Yang, Pengyu},
pages = {577--585},
abstract = {In large-scale distributed linear transform problems, coded computation plays an important role to reduce the delay caused by slow machines. However, existing coded schemes could end up destroying the significant sparsity that exists in large-scale machine learning problems, and in turn increase the computational delay. In this paper, we propose a coded computation strategy, referred to as diagonal code, that achieves the optimum recovery threshold and the optimum computation load. Furthermore, by leveraging the ideas from random proposal graph theory, we design a random code that achieves a constant computation load, which significantly outperforms the existing best known result. We apply our schemes to the distributed gradient descent problem and demonstrate the advantage of the approach over current fastest coded schemes.}
}
@InProceedings{mangoubi19a,
title = {Mixing of {Hamiltonian Monte Carlo} on strongly log-concave distributions 2: Numerical integrators},
author = {Mangoubi, Oren and Smith, Aaron},
pages = {586--595},
abstract = {We obtain quantitative bounds on the mixing properties of the Hamiltonian Monte Carlo (HMC) algorithm with target distribution in d-dimensional Euclidean space, showing that HMC mixes quickly whenever the target log-distribution is strongly concave and has Lipschitz gradients. We use a coupling argument to show that the popular leapfrog implementation of HMC can sample approximately from the target distribution in a number of gradient evaluations which grows like $d^{1/2}$ with the dimension and grows at most polynomially in the strong convexity and Lipschitz-gradient constants. Our results significantly extend and improve on the dimension dependence of previous quantitative bounds on the mixing of HMC and of the unadjusted Langevin algorithm in this setting.}
}
@InProceedings{lee19a,
title = {Temporal Quilting for Survival Analysis},
author = {Lee, Changhee and Zame, William and Alaa, Ahmed and van der Schaar, Mihaela},
pages = {596--605},
abstract = {The importance of survival analysis in many disciplines (especially in medicine) has led to the development of a variety of approaches to modeling the survival function. Models constructed via various approaches offer different strengths and weaknesses in terms of discriminative performance and calibration, but no one model is best across all datasets or even across all time horizons within a single dataset. Because we require both good calibration and good discriminative performance over different time horizons, conventional model selection and ensemble approaches are not applicable. This paper develops a novel approach that combines the collective intelligence of different underlying survival models to produce a valid survival function that is well-calibrated and offers superior discriminative performance at different time horizons. Empirical results show that our approach provides significant gains over the benchmarks on a variety of real-world datasets.}
}
@InProceedings{blondel19a,
title = {Learning Classifiers with {Fenchel-Young} Losses: Generalized Entropies, Margins, and Algorithms},
author = {Blondel, Mathieu and Martins, Andre and Niculae, Vlad},
pages = {606--615},
abstract = {This paper studies Fenchel-Young losses, a generic way to construct convex loss functions from a regularization function. We analyze their properties in depth, showing that they unify many well-known loss functions and allow to create useful new ones easily. Fenchel-Young losses constructed from a generalized entropy, including the Shannon and Tsallis entropies, induce predictive probability distributions. We formulate conditions for a generalized entropy to yield losses with a separation margin, and probability distributions with sparse support. Finally, we derive efficient algorithms, making Fenchel-Young losses appealing both in theory and practice.}
}
@InProceedings{li19b,
title = {On Target Shift in Adversarial Domain Adaptation},
author = {Li, Yitong and Murias, Michael and Major, Samantha and Dawson, Geraldine and Carlson, David},
pages = {616--625},
abstract = {Discrepancy between training and testing domains is a fundamental problem in the generalization of machine learning techniques. Recently, several approaches have been proposed to learn domain invariant feature representations through adversarial deep learning. However, label shift, where the percentage of data in each class is different between domains, has received less attention. Label shift naturally arises in many contexts, especially in behavioral studies where the behaviors are freely chosen. In this work, we propose a method called Domain Adversarial nets for Target Shift (DATS) to address label shift while learning a domain invariant representation. This is accomplished by using distribution matching to estimate label proportions in a blind test set. We extend this framework to handle multiple domains by developing a scheme to upweight source domains most similar to the target domain. Empirical results show that this framework performs well under large label shift in synthetic and real experiments, demonstrating the practical importance.}
}
@InProceedings{schmit19a,
title = {Optimal Testing in the Experiment-rich Regime},
author = {Schmit, Sven and Shah, Virag and Johari, Ramesh},
pages = {626--633},
abstract = {Motivated by the widespread adoption of large-scale A/B testing in industry, we propose a new experimentation framework for the setting where potential experiments are abundant (i.e., many hypotheses are available to test), and observations are costly; we refer to this as the experiment-rich regime. Such scenarios require the experimenter to internalize the opportunity cost of assigning a sample to a particular experiment. We fully characterize the optimal policy and give an algorithm to compute it. Furthermore, we develop a simple heuristic that also provides intuition for the optimal policy. We use simulations based on real data to compare both the optimal algorithm and the heuristic to other natural alternative experimental design frameworks. In particular, we discuss the paradox of power: high-powered ``classical'' tests can lead to highly inefficient sampling in the experiment-rich regime.}
}
@InProceedings{roberts19a,
title = {Reversible Jump Probabilistic Programming},
author = {Roberts, David A and Gallagher, Marcus and Taimre, Thomas},
pages = {634--643},
abstract = {In this paper we present a method for automatically deriving a Reversible Jump Markov chain Monte Carlo sampler from probabilistic programs that specify the target and proposal distributions. The main challenge in automatically deriving such an inference procedure, in comparison to deriving a generic Metropolis-Hastings sampler, is in calculating the Jacobian adjustment to the proposal acceptance ratio. To achieve this, our approach relies on the interaction of several different components, including automatic differentiation, transformation inversion, and optimised code generation. We also present Stochaskell, a new probabilistic programming language embedded in Haskell, which provides an implementation of our method.}
}
@InProceedings{okuno19a,
title = {Graph Embedding with Shifted Inner Product Similarity and Its Improved Approximation Capability},
author = {Okuno, Akifumi and Kim, Geewook and Shimodaira, Hidetoshi},
pages = {644--653},
abstract = {We propose shifted inner-product similarity (SIPS), which is a novel yet very simple extension of the ordinary inner-product similarity (IPS) for neural-network based graph embedding (GE). In contrast to IPS, that is limited to approximating positive-definite (PD) similarities, SIPS goes beyond the limitation by introducing bias terms in IPS; we theoretically prove that SIPS is capable of approximating not only PD but also conditionally PD (CPD) similarities with many examples such as cosine similarity, negative Poincar{\'e} distance and negative Wasserstein distance. Since SIPS with sufficiently large neural networks learns a variety of similarities, SIPS alleviates the need for configuring the similarity function of GE. Approximation error rate is also evaluated, and experiments on two real-world datasets demonstrate that graph embedding using SIPS indeed outperforms existing methods.}
}
@InProceedings{feng19a,
title = {High-dimensional Mixed Graphical Model with Ordinal Data: Parameter Estimation and Statistical Inference},
author = {Feng, Huijie and Ning, Yang},
pages = {654--663},
abstract = {We consider parameter estimation and statistical inference of high-dimensional undirected graphical models for mixed data comprising both ordinal and continuous variables. We propose a flexible model called Latent Mixed Gaussian Copula Model that simultaneously deals with such mixed data by assuming that the observed ordinal variables are generated by latent variables. For parameter estimation, we introduce a convenient rank-based ensemble approach to estimate the latent correlation matrix, which can be subsequently applied to recover the latent graph structure. In addition, based on the ensemble estimator, we develop test statistics via a pseudo-likelihood approach to quantify the uncertainty associated with the low dimensional components of high-dimensional parameters. Our theoretical analysis shows the consistency of the estimator and asymptotic normality of the test statistic. Experiments on simulated and real gene expression data are conducted to validate our approach.}
}
@InProceedings{okuno19b,
title = {Robust Graph Embedding with Noisy Link Weights},
author = {Okuno, Akifumi and Shimodaira, Hidetoshi},
pages = {664--673},
abstract = {We propose $\beta$-graph embedding for robustly learning feature vectors from data vectors and noisy link weights. A newly introduced empirical moment $\beta$-score reduces the influence of contamination and robustly measures the difference between the underlying correct expected weights of links and the specified generative model. The proposed method is computationally tractable; we employ a minibatch-based efficient stochastic algorithm and prove that this algorithm locally minimizes the empirical moment $\beta$-score. We conduct numerical experiments on synthetic and real-world datasets.}
}
@InProceedings{yu19a,
title = {Exploring Fast and Communication-Efficient Algorithms in Large-Scale Distributed Networks},
author = {Yu, Yue and Wu, Jiaxiang and Huang, Junzhou},
pages = {674--683},
abstract = {The communication overhead has become a significant bottleneck in data-parallel network with the increasing of model size and data samples. In this work, we propose a new algorithm LPC-SVRG with quantized gradients and its acceleration ALPC-SVRG to effectively reduce the communication complexity while maintaining the same convergence as the unquantized algorithms. Specifically, we formulate the heuristic gradient clipping technique within the quantization scheme and show that unbiased quantization methods in related works [3, 33, 38] are special cases of ours. We introduce double sampling in the accelerated algorithm ALPC-SVRG to fully combine the gradients of full-precision and low-precision, and then achieve acceleration with fewer communication overhead. Our analysis focuses on the nonsmooth composite problem, which makes our algorithms more general. The experiments on linear models and deep neural networks validate the effectiveness of our algorithms.}
}
@InProceedings{zhang19b,
title = {Defending against Whitebox Adversarial Attacks via Randomized Discretization},
author = {Zhang, Yuchen and Liang, Percy},
pages = {684--693},
abstract = {Adversarial perturbations dramatically decrease the accuracy of state-of-the-art image classifiers. In this paper, we propose and analyze a simple and computationally efficient defense strategy: inject random Gaussian noise, discretize each pixel, and then feed the result into any pre-trained classifier. Theoretically, we show that our randomized discretization strategy reduces the KL divergence between original and adversarial inputs, leading to a lower bound on the classification accuracy of any classifier against any (potentially whitebox) $L_{\infty}$-bounded adversarial attack. Empirically, we evaluate our defense on adversarial examples generated by a strong iterative PGD attack. On ImageNet, our defense is more robust than adversarially-trained networks and the winning defenses of the NIPS 2017 Adversarial Attacks \& Defenses competition.}
}
@InProceedings{amari19a,
title = {{Fisher} Information and Natural Gradient Learning in Random Deep Networks},
author = {Amari, Shun-ichi and Karakida, Ryo and Oizumi, Masafumi},
pages = {694--702},
abstract = {The parameter space of a deep neural network is a Riemannian manifold, where the metric is defined by the Fisher information matrix. The natural gradient method uses the steepest descent direction in a Riemannian manifold, but it requires inversion of the Fisher matrix, however, which is practically difficult. The present paper uses statistical neurodynamical method to reveal the properties of the Fisher information matrix in a net of random connections. We prove that the Fisher information matrix is unit-wise block diagonal supplemented by small order terms of off-block-diagonal elements. We further prove that the Fisher information matrix of a single unit has a simple reduced form, a sum of a diagonal matrix and a rank 2 matrix of weight-bias correlations. We obtain the inverse of Fisher information explicitly. We then have an explicit form of the approximate natural gradient, without relying on the matrix inversion.}
}
@InProceedings{holland19a,
title = {Robust descent using smoothed multiplicative noise},
author = {Holland, Matthew J},
pages = {703--711},
abstract = {In this work, we propose a novel robust gradient descent procedure which makes use of a smoothed multiplicative noise applied directly to observations before constructing a sum of soft-truncated gradient coordinates. We show that the procedure has competitive theoretical guarantees, with the major advantage of a simple implementation that does not require an iterative sub-routine for robustification. Empirical tests reinforce the theory, showing more efficient generalization over a much wider class of data distributions.}
}
@InProceedings{holland19b,
title = {Classification using margin pursuit},
author = {Holland, Matthew J},
pages = {712--720},
abstract = {In this work, we study a new approach to optimizing the margin distribution realized by binary classifiers, in which the learner searches the hypothesis space in such a way that a pre-set margin level ends up being a distribution-robust estimator of the margin location. This procedure is easily implemented using gradient descent, and admits finite-sample bounds on the excess risk under unbounded inputs, yielding competitive rates under mild assumptions. Empirical tests on real-world benchmark data reinforce the basic principles highlighted by the theory.}
}
@InProceedings{bassily19a,
title = {Linear Queries Estimation with Local Differential Privacy},
author = {Bassily, Raef},
pages = {721--729},
abstract = {We study the problem of estimating a set of d linear queries with respect to some unknown distribution p over a domain $[J]$ based on a sensitive data set of n individuals under the constraint of local differential privacy. This problem subsumes a wide range of estimation tasks, e.g., distribution estimation and d-dimensional mean estimation. We provide new algorithms for both the offline (non-adaptive) and adaptive versions of this problem. In the offline setting, the set of queries are fixed before the algorithm starts. In the regime where $n < d^2/\log(J)$, our algorithms attain $L_2$ estimation error that is independent of d. For the special case of distribution estimation, we show that projecting the output estimate of an algorithm due to [Acharya et al. 2018] on the probability simplex yields an $L_2$ error that depends only sub-logarithmically on $J$ in the regime where $n < J^2/\log(J)$. Our bounds are within a factor of at most $(\log(J))^{1/4}$ from the optimal $L_2$ error. These results show the possibility of accurate estimation of linear queries in the high-dimensional settings under the $L_2$ error criterion. In the adaptive setting, the queries are generated over d rounds; one query at a time. In each round, a query can be chosen adaptively based on all the history of previous queries and answers. We give an algorithm for this problem with optimal $L_{\infty}$ estimation error (worst error in the estimated values for the queries w.r.t. the data distribution). Our bound matches a lower bound on the $L_{\infty}$ error in the offline version of this problem [Duchi et al. 2013].}
}
@InProceedings{dikov19a,
title = {{Bayesian} Learning of Neural Network Architectures},
author = {Dikov, Georgi and Bayer, Justin},
pages = {730--738},
abstract = {In this paper we propose a Bayesian method for estimating architectural parameters of neural networks, namely layer size and network depth. We do this by learning concrete distributions over these parameters. Our results show that regular networks with a learned structure can generalise better on small datasets, while fully stochastic networks can be more robust to parameter initialisation. The proposed method relies on standard neural variational learning and, unlike randomised architecture search, does not require a retraining of the model, thus keeping the computational overhead at minimum.}
}
@InProceedings{bollapragada19a,
title = {Nonlinear Acceleration of Primal-Dual Algorithms},
author = {Bollapragada, Raghu and Scieur, Damien and d'Aspremont, Alexandre},
pages = {739--747},
abstract = {We describe a convergence acceleration scheme for multi-step optimization algorithms. The extrapolated solution is written as a nonlinear average of the iterates produced by the original optimization algorithm. Our scheme does not need the underlying fixed-point operator to be symmetric, hence handles e.g. algorithms with momentum terms such as Nesterov's accelerated method, or primal-dual methods such as Chambolle-Pock. The weights are computed via a simple linear system and we analyze performance in both online and offline modes. We use Crouzeix's conjecture to show that acceleration is controlled by the solution of a Chebyshev problem on the numerical range of a nonsymmetric operator modelling the behavior of iterates near the optimum. Numerical experiments are detailed on image processing and logistic regression problems.}
}
@InProceedings{kazlauskaite19a,
title = {{Gaussian} Process Latent Variable Alignment Learning},
author = {Kazlauskaite, Ieva and Ek, Carl Henrik and Campbell, Neill},
pages = {748--757},
abstract = {We present a model that can automatically learn alignments between high-dimensional data in an unsupervised manner. Our proposed method casts alignment learning in a framework where both alignment and data are modelled simultaneously. Further, we automatically infer groupings of different types of sequences within the same dataset. We derive a probabilistic model built on non-parametric priors that allows for flexible warps while at the same time providing means to specify interpretable constraints. We demonstrate the efficacy of our approach with superior quantitative performance to the state-of-the-art approaches and provide examples to illustrate the versatility of our model in automatic inference of sequence groupings, absent from previous approaches, as well as easy specification of high level priors for different modalities of data.}
}
@InProceedings{lee19b,
title = {A {Bayesian} model for sparse graphs with flexible degree distribution and overlapping community structure},
author = {Lee, Juho and James, Lancelot and Choi, Seungjin and Caron, Francois},
pages = {758--767},
abstract = {We consider a non-projective class of inhomogeneous random graph models with interpretable parameters and a number of interesting asymptotic properties. Using the results of Bollob\'{a}s et al. (2007), we show that i) the class of models is sparse and ii) depending on the choice of the parameters, the model is either scale-free, with power-law exponent greater than 2, or with an asymptotic degree distribution which is power-law with exponential cut-off. We propose an extension of the model that can accommodate an overlapping community structure. Scalable posterior inference can be performed due to the specific choice of the link probability. We present experiments on five different real world networks with up to 100,000 nodes and edges, showing that the model can provide a good fit to the degree distribution and recovers well the latent community structure.}
}
@InProceedings{letarte19a,
title = {Pseudo-{Bayesian} Learning with Kernel {Fourier} Transform as Prior},
author = {Letarte, Ga{\"e}l and Morvant, Emilie and Germain, Pascal},
pages = {768--776},
abstract = {We revisit Rahimi and Recht (2007)'s kernel random Fourier features (RFF) method through the lens of the PAC-Bayesian theory. While the primary goal of RFF is to approximate a kernel, we look at the Fourier transform as a prior distribution over trigonometric hypotheses. It naturally suggests learning a posterior on these hypotheses. We derive generalization bounds that are optimized by learning a pseudo-posterior obtained from a closed-form expression. Based on this study, we consider two learning strategies: The first one finds a compact landmarks-based representation of the data where each landmark is given by a distribution-tailored similarity measure, while the second one provides a PAC-Bayesian justification to the kernel alignment method of Sinha and Duchi (2016).}
}
@InProceedings{ambrogioni19a,
title = {Forward Amortized Inference for Likelihood-Free Variational Marginalization},
author = {Ambrogioni, Luca and G{\"u}{\c{c}}l{\"u}, Umut and Berezutskaya, Julia and van den Borne, Eva and G{\"u}{\c{c}}l{\"u}t{\"u}rk, Ya{\v{g}}mur and Hinne, Max and Maris, Eric and van Gerven, Marcel},
pages = {777--786},
abstract = {In this paper, we introduce a new form of amortized variational inference by using the forward KL divergence in a joint-contrastive variational loss. The resulting forward amortized variational inference is a likelihood-free method as its gradient can be sampled without bias and without requiring any evaluation of either the model joint distribution or its derivatives. We prove that our new variational loss is optimized by the exact posterior marginals in the fully factorized mean-field approximation, a property that is not shared with the more conventional reverse KL inference. Furthermore, we show that forward amortized inference can be easily marginalized over large families of latent variables in order to obtain a marginalized variational posterior. We consider two examples of variational marginalization. In our first example we train a Bayesian forecaster for predicting a simplified chaotic model of atmospheric convection. In the second example we train an amortized variational approximation of a Bayesian optimal classifier by marginalizing over the model space. The result is a powerful meta-classification network that can solve arbitrary classification problems without further training.}
}
@InProceedings{ambrogioni19b,
title = {SpikeCaKe: Semi-Analytic Nonparametric Bayesian Inference for Spike-Spike Neuronal Connectivity},
author = {Ambrogioni, Luca and Ebel, Patrick and Hinne, Max and G{\"u}{\c{c}}l{\"u}, Umut and van Gerven, Marcel and Maris, Eric},
pages = {787--795},
abstract = {In this paper we introduce a semi-analytic variational framework for approximating the posterior of a Gaussian processes coupled through non-linear emission models. While the semi-analytic method can be applied to a large class of models, the present paper is devoted to the analysis of causal connectivity between biological spiking neurons. Estimating causal connectivity between spiking neurons from measured spike sequences is one of the main challenges of systems neuroscience. This semi-analytic method exploits the tractability of GP regression when the membrane potential is observed. The resulting posterior is then marginalized analytically in order to obtain the posterior of the response functions given the spike sequences alone. We validate our methods on both simulated data and real neuronal recordings.}
}
@InProceedings{huggins19a,
title = {Scalable Gaussian Process Inference with Finite-data Mean and Variance Guarantees},
author = {Huggins, Jonathan H and Campbell, Trevor and Kasprzak, Mikolaj and Broderick, Tamara},
pages = {796--805},
abstract = {Gaussian processes (GPs) offer a flexible class of priors for nonparametric Bayesian regression, but popular GP posterior inference methods are typically prohibitively slow or lack desirable finite-data guarantees on quality. We develop a scalable approach to approximate GP regression, with finite-data guarantees on the accuracy of our pointwise posterior mean and variance estimates. Our main contribution is a novel objective for approximate inference in the nonparametric setting: the preconditioned Fisher (pF) divergence. We show that unlike the Kullback--Leibler divergence (used in variational inference), the pF divergence bounds the 2-Wasserstein distance, which in turn provides tight bounds on the pointwise error of mean and variance estimates. We demonstrate that, for sparse GP likelihood approximations, we can minimize the pF divergence bounds efficiently. Our experiments show that optimizing the pF divergence bounds has the same computational requirements as variational sparse GPs while providing comparable empirical performance---in addition to our novel finite-data quality guarantees.}
}
@InProceedings{kohler19a,
title = {Exponential convergence rates for Batch Normalization: The power of length-direction decoupling in non-convex optimization},
author = {Kohler, Jonas and Daneshmand, Hadi and Lucchi, Aurelien and Hofmann, Thomas and Zhou, Ming and Neymeyr, Klaus},
pages = {806--815},
abstract = {Normalization techniques such as Batch Normalization have been applied very successfully for training deep neural networks. Yet, despite its apparent empirical benefits, the reasons behind the success of Batch Normalization are mostly hypothetical. We here aim to provide a more thorough theoretical understanding from a classical optimization perspective. Our main contribution towards this goal is the identification of various problem instances in the realm of machine learning where Batch Normalization can provably accelerate optimization. We argue that this acceleration is due to the fact that Batch Normalization splits the optimization task into optimizing length and direction of the parameters separately. This allows gradient-based methods to leverage a favourable global structure in the loss landscape that we prove to exist in Learning Halfspace problems and neural network training with Gaussian inputs. We thereby turn Batch Normalization from an effective practical heuristic into a provably converging algorithm for these settings. Furthermore, we substantiate our analysis with empirical evidence that suggests the validity of our theoretical results in a broader context.}
}
@InProceedings{shi19a,
title = {A new evaluation framework for topic modeling algorithms based on synthetic corpora},
author = {Shi, Hanyu and Gerlach, Martin and Diersen, Isabel and Downey, Doug and Amaral, Luis},
pages = {816--826},
abstract = {Topic models are in widespread use in natural language processing and beyond. Here, we propose a new framework for the evaluation of topic modeling algorithms based on synthetic corpora containing an unambiguously defined ground truth topic structure. The major innovation of our approach is the ability to quantify the agreement between the planted and inferred topic structures by comparing the assigned topic labels at the level of the tokens. In experiments, our approach yields novel insights about the relative strengths of topic models as corpus characteristics vary, and the first evidence of an ``undetectable phase'' for topic models when the planted structure is weak. We also establish the practical relevance of the insights gained for synthetic corpora by predicting the performance of topic modeling algorithms in classification tasks in real-world corpora.}
}
@InProceedings{szabo19a,
title = {On Kernel Derivative Approximation with Random Fourier Features},
author = {Szabo, Zoltan and Sriperumbudur, Bharath},
pages = {827--836},
abstract = {Random Fourier features (RFF) represent one of the most popular and wide-spread techniques in machine learning to scale up kernel algorithms. Despite the numerous successful applications of RFFs, unfortunately, quite little is understood theoretically on their optimality and limitations of their performance. Only recently, precise statistical-computational trade-offs have been established for RFFs in the approximation of kernel values, kernel ridge regression, kernel PCA and SVM classification. Our goal is to spark the investigation of optimality of RFF-based approximations in tasks involving not only function values but derivatives, which naturally lead to optimization problems with kernel derivatives. Particularly, in this paper, we focus on the approximation quality of RFFs for kernel derivatives and prove that the existing finite-sample guarantees can be improved exponentially in terms of the domain where they hold, using recent tools from unbounded empirical process theory. Our result implies that the same approximation guarantee is attainable for kernel derivatives using RFF as achieved for kernel values.}
}
@InProceedings{papamakarios19a,
title = {Sequential Neural Likelihood: Fast Likelihood-free Inference with Autoregressive Flows},
author = {Papamakarios, George and Sterratt, David and Murray, Iain},
pages = {837--848},
abstract = {We present Sequential Neural Likelihood (SNL), a new method for Bayesian inference in simulator models, where the likelihood is intractable but simulating data from the model is possible. SNL trains an autoregressive flow on simulated data in order to learn a model of the likelihood in the region of high posterior density. A sequential training procedure guides simulations and reduces simulation cost by orders of magnitude. We show that SNL is more robust, more accurate and requires less tuning than related neural-based methods, and we discuss diagnostics for assessing calibration, convergence and goodness-of-fit.}
}
@InProceedings{redko19a,
title = {Optimal Transport for Multi-source Domain Adaptation under Target Shift},
author = {Redko, Ievgen and Courty, Nicolas and Flamary, R{\'e}mi and Tuia, Devis},
pages = {849--858},
abstract = {In this paper, we tackle the problem of reducing discrepancies between multiple domains, i.e. multi-source domain adaptation, and consider it under the target shift assumption: in all domains we aim to solve a classification problem with the same output classes, but with different labels proportions. This problem, generally ignored in the vast majority of domain adaptation papers, is nevertheless critical in real-world applications, and we theoretically show its impact on the success of the adaptation. Our proposed method is based on optimal transport, a theory that has been successfully used to tackle adaptation problems in machine learning. The introduced approach, Joint Class Proportion and Optimal Transport (JCPOT), performs multi-source adaptation and target shift correction simultaneously by learning the class probabilities of the unlabeled target sample and the coupling allowing to align two (or more) probability distributions. Experiments on both synthetic and real-world data (satellite image pixel classification) task show the superiority of the proposed method over the state-of-the-art.}
}
@InProceedings{hyvarinen19a,
title = {Nonlinear ICA Using Auxiliary Variables and Generalized Contrastive Learning},
author = {Hyvarinen, Aapo and Sasaki, Hiroaki and Turner, Richard},
pages = {859--868},
abstract = {Nonlinear ICA is a fundamental problem for unsupervised representation learning, emphasizing the capacity to recover the underlying latent variables generating the data (i.e., identifiability). Recently, the very first identifiability proofs for nonlinear ICA have been proposed, leveraging the temporal structure of the independent components. Here, we propose a general framework for nonlinear ICA, which, as a special case, can make use of temporal structure. It is based on augmenting the data by an auxiliary variable, such as the time index, the history of the time series, or any other available information. We propose to learn nonlinear ICA by discriminating between true augmented data, or data in which the auxiliary variable has been randomized. This enables the framework to be implemented algorithmically through logistic regression, possibly in a neural network. We provide a comprehensive proof of the identifiability of the model as well as the consistency of our estimation method. The approach not only provides a general theoretical framework combining and generalizing previously proposed nonlinear ICA models and algorithms, but also brings practical advantages.}
}
@InProceedings{imaizumi19a,
title = {Deep Neural Networks Learn Non-Smooth Functions Effectively},
author = {Imaizumi, Masaaki and Fukumizu, Kenji},
pages = {869--878},
abstract = {We elucidate a theoretical reason that deep neural networks (DNNs) perform better than other models in some cases from the viewpoint of their statistical properties for non-smooth functions. While DNNs have empirically shown higher performance than other standard methods, understanding its mechanism is still a challenging problem. From an aspect of the statistical theory, it is known many standard methods attain the optimal rate of generalization errors for smooth functions in large sample asymptotics, and thus it has not been straightforward to find theoretical advantages of DNNs. This paper fills this gap by considering learning of a certain class of non-smooth functions, which was not covered by the previous theory. We derive the generalization error of estimators by DNNs with a ReLU activation, and show that convergence rates of the generalization by DNNs are almost optimal to estimate the non-smooth functions, while some of the popular models do not attain the optimal rate. In addition, our theoretical result provides guidelines for selecting an appropriate number of layers and edges of DNNs. We provide numerical experiments to support the theoretical results.}
}
@InProceedings{dev19a,
title = {Attenuating Bias in Word Vectors},
author = {Dev, Sunipa and Phillips, Jeff},
pages = {879--887},
abstract = {Word vector representations are well developed tools for various NLP and Machine Learning tasks and are known to retain significant semantic and syntactic structure of languages. But they are prone to carrying and amplifying bias which can perpetrate discrimination in various applications. In this work, we explore new simple ways to detect the most stereotypically gendered words in an embedding and remove the bias from them. We verify how names are masked carriers of gender bias and then use that as a tool to attenuate bias in embeddings. Further, we extend this property of names to show how names can be used to detect other types of bias in the embeddings such as bias based on race, ethnicity, and age.}
}
@InProceedings{liang19a,
title = {Fisher-Rao Metric, Geometry, and Complexity of Neural Networks},
author = {Liang, Tengyuan and Poggio, Tomaso and Rakhlin, Alexander and Stokes, James},
pages = {888--896},
abstract = {We study the relationship between geometry and capacity measures for deep neural networks from an invariance viewpoint. We introduce a new notion of capacity --- the Fisher-Rao norm --- that possesses desirable invariance properties and is motivated by Information Geometry. We discover an analytical characterization of the new capacity measure, through which we establish norm-comparison inequalities and further show that the new measure serves as an umbrella for several existing norm-based complexity measures. We discuss upper bounds on the generalization error induced by the proposed measure. Extensive numerical experiments on CIFAR-10 support our theoretical findings. Our theoretical analysis rests on a key structural lemma about partial derivatives of multi-layer rectifier networks.}
}
@InProceedings{hendrikx19a,
title = {Accelerated Decentralized Optimization with Local Updates for Smooth and Strongly Convex Objectives},
author = {Hendrikx, Hadrien and Bach, Francis and Massoulie, Laurent},
pages = {897--906},
abstract = {In this paper, we study the problem of minimizing a sum of smooth and strongly convex functions split over the nodes of a network in a decentralized fashion. We propose the algorithm ESDACD, a decentralized accelerated algorithm that only requires local synchrony. Its rate depends on the condition number $\kappa$ of the local functions as well as the network topology and delays. Under mild assumptions on the topology of the graph, ESDACD takes a time $O((\tau_{\max} + \Delta_{\max})\sqrt{{\kappa}/{\gamma}}\ln(\epsilon^{-1}))$ to reach a precision $\epsilon$ where $\gamma$ is the spectral gap of the graph, $\tau_{\max}$ the maximum communication delay and $\Delta_{\max}$ the maximum computation time. Therefore, it matches the rate of SSDA, which is optimal when $\tau_{\max} = \Omega\left(\Delta_{\max}\right)$. Applying ESDACD to quadratic local functions leads to an accelerated randomized gossip algorithm of rate $O( \sqrt{\theta_{\rm gossip}/n})$ where $\theta_{\rm gossip}$ is the rate of the standard randomized gossip. To the best of our knowledge, it is the first asynchronous algorithm with a provably improved rate of convergence of the second moment of the error. We illustrate these results with experiments in idealized settings.}
}
@InProceedings{liang19b,
title = {Interaction Matters: A Note on Non-asymptotic Local Convergence of Generative Adversarial Networks},
author = {Liang, Tengyuan and Stokes, James},
pages = {907--915},
abstract = {Motivated by the pursuit of a systematic computational and algorithmic understanding of Generative Adversarial Networks (GANs), we present a simple yet unified non-asymptotic local convergence theory for smooth two-player games, which subsumes several discrete-time gradient-based saddle point dynamics. The analysis reveals the surprising nature of the off-diagonal interaction term as both a blessing and a curse. On the one hand, this interaction term explains the origin of the slow-down effect in the convergence of Simultaneous Gradient Ascent (SGA) to stable Nash equilibria. On the other hand, for the unstable equilibria, exponential convergence can be proved thanks to the interaction term, for four modified dynamics proposed to stabilize GAN training: Optimistic Mirror Descent (OMD), Consensus Optimization (CO), Implicit Updates (IU) and Predictive Method (PM). The analysis uncovers the intimate connections among these stabilizing techniques, and provides detailed characterization on the choice of learning rate. As a by-product, we present a new analysis for OMD proposed in Daskalakis, Ilyas, Syrgkanis, and Zeng [2017] with improved rates.}
}
@InProceedings{chen19a,
title = {On Constrained Nonconvex Stochastic Optimization: A Case Study for Generalized Eigenvalue Decomposition},
author = {Chen, Zhehui and Li, Xingguo and Yang, Lin and Haupt, Jarvis and Zhao, Tuo},
pages = {916--925},
abstract = {We study constrained nonconvex optimization problems in machine learning and signal processing. It is well-known that these problems can be rewritten to a min-max problem in a Lagrangian form. However, due to the lack of convexity, their landscape is not well understood and how to find the stable equilibria of the Lagrangian function is still unknown. To bridge the gap, we study the landscape of the Lagrangian function. Further, we define a special class of Lagrangian functions. They enjoy the following two properties: 1. Equilibria are either stable or unstable (Formal definition in Section 2); 2.Stable equilibria correspond to the global optima of the original problem. We show that a generalized eigenvalue (GEV) problem, including canonical correlation analysis and other problems as special examples, belongs to the class. Specifically, we characterize its stable and unstable equilibria by leveraging an invariant group and symmetric property (more details in Section 3). Motivated by these neat geometric structures, we propose a simple, efficient, and stochastic primal-dual algorithm solving the online GEV problem. Theoretically, under sufficient conditions, we establish an asymptotic rate of convergence and obtain the first sample complexity result for the online GEV problem by diffusion approximations, which are widely used in applied probability. Numerical results are also provided to support our theory.}
}
@InProceedings{liu19b,
title = {Generalized Boltzmann Machine with Deep Neural Structure},
author = {Liu, Yingru and Xie, Dongliang and Wang, Xin},
pages = {926--934},
abstract = {Restricted Boltzmann Machine (RBM) is an essential component in many machine learning applications. As a probabilistic graphical model, RBM posits a shallow structure, which makes it less capable of modeling real-world applications. In this paper, to bridge the gap between RBM and artificial neural network, we propose an energy-based probabilistic model that is more flexible on modeling continuous data. By introducing the pair-wise inverse autoregressive flow into RBM, we propose two generalized continuous RBMs which contain deep neural network structure to more flexibly track the practical data distribution while still keeping the inference tractable. In addition, we extend the generalized RBM structures into sequential setting to better model the stochastic process of time series. Performance improvements on probabilistic modeling and representation learning are demonstrated by the experiments on diverse datasets.}
}
@InProceedings{zhang19c,
title = {Extreme Stochastic Variational Inference: Distributed Inference for Large Scale Mixture Models},
author = {Zhang, Jiong and Raman, Parameswaran and Ji, Shihao and Yu, Hsiang-Fu and Vishwanathan, S.V.N. and Dhillon, Inderjit},
pages = {935--943},
abstract = {Mixture of exponential family models are among the most fundamental and widely used statistical models. Stochastic variational inference (SVI), the state-of-the-art algorithm for parameter estimation in such models is inherently serial. Moreover, it requires the parameters to fit in the memory of a single processor; this poses serious limitations on scalability when the number of parameters is in billions. In this paper, we present extreme stochastic variational inference (ESVI), a distributed, asynchronous and lock-free algorithm to perform variational inference for mixture models on massive real world datasets. ESVI overcomes the limitations of SVI by requiring that each processor only access a subset of the data and a subset of the parameters, thus providing data and model parallelism simultaneously. Our empirical study demonstrates that ESVI not only outperforms VI and SVI in wallclock-time, but also achieves a better quality solution. To further speed up computation and save memory when fitting large number of topics, we propose a variant ESVI-TOPK which maintains only the top-k important topics. Empirically, we found that using top 25\% topics suffices to achieve the same accuracy as storing all the topics.}
}
@InProceedings{derezinski19a,
title = {Correcting the bias in least squares regression with volume-rescaled sampling},
author = {Derezinski, Michal and Warmuth, Manfred K. and Hsu, Daniel},
pages = {944--953},
abstract = {Consider linear regression where the examples are generated by an unknown distribution on $\mathbb{R}^d \times \mathbb{R}$. Without any assumptions on the noise, the linear least squares solution for any i.i.d. sample will typically be biased w.r.t. the least squares optimum over the entire distribution. However, we show that if an i.i.d. sample of any size k is augmented by a certain small additional sample, then the solution of the combined sample becomes unbiased. We show this when the additional sample consists of d points drawn jointly according to the input distribution rescaled by the squared volume spanned by the points. Furthermore, we propose algorithms to sample from this volume-rescaled distribution when the data distribution is only known through an i.i.d. sample.}
}
@InProceedings{katariya19a,
title = {Conservative Exploration using Interleaving},
author = {Katariya, Sumeet and Kveton, Branislav and Wen, Zheng and Potluru, Vamsi K},
pages = {954--963},
abstract = {In many practical problems, a learning agent may want to learn the best action in hindsight without ever taking a bad action, which is much worse than a default production action. In general, this is impossible because the agent has to explore unknown actions, some of which can be bad, to learn better actions. However, when the actions are structured, this is possible if the unknown action can be evaluated by interleaving it with the default action. We formalize this concept as learning in stochastic combinatorial semi-bandits with exchangeable actions. We design efficient learning algorithms for this problem, bound their n-step regret, and evaluate them on both synthetic and real-world problems. Our real-world experiments show that our algorithms can learn to recommend K most attractive movies without ever making disastrous recommendations, both overall and subject to a diversity constraint.}
}
@InProceedings{taghia19a,
title = {Conditionally Independent Multiresolution Gaussian Processes},
author = {Taghia, Jalil and Sch{\"o}n, Thomas},
pages = {964--973},
abstract = {The multiresolution Gaussian process (GP) has gained increasing attention as a viable approach towards improving the quality of approximations in GPs that scale well to large-scale data. Most of the current constructions assume full independence across resolutions. This assumption simplifies the inference, but it underestimates the uncertainties in transitioning from one resolution to another. This in turn results in models which are prone to overfitting in the sense of excessive sensitivity to the chosen resolution, and predictions which are non-smooth at the boundaries. Our contribution is a new construction which instead assumes conditional independence among GPs across resolutions. We show that relaxing the full independence assumption enables robustness against overfitting, and that it delivers predictions that are smooth at the boundaries. Our new model is compared against current state of the art on 2 synthetic and 9 real-world datasets. In most cases, our new conditionally independent construction performed favorably when compared against models based on the full independence assumption. In particular, it exhibits little to no signs of overfitting.}
}
@InProceedings{tarbouriech19a,
title = {Active Exploration in Markov Decision Processes},
author = {Tarbouriech, Jean and Lazaric, Alessandro},
pages = {974--982},
abstract = {We introduce the active exploration problem in Markov decision processes (MDPs). Each state of the MDP is characterized by a random value and the learner should gather samples to estimate the mean value of each state as accurately as possible. Similarly to active exploration in multi-armed bandit (MAB), states may have different levels of noise, so that the higher the noise, the more samples are needed. As the noise level is initially unknown, we need to trade off the exploration of the environment to estimate the noise and the exploitation of these estimates to compute a policy maximizing the accuracy of the mean predictions. We introduce a novel learning algorithm to solve this problem showing that active exploration in MDPs may be significantly more difficult than in MAB. We also derive a heuristic procedure to mitigate the negative effect of slowly mixing policies. Finally, we validate our findings on simple numerical simulations.}
}
@InProceedings{li19c,
title = {On the Convergence of Stochastic Gradient Descent with Adaptive Stepsizes},
author = {Li, Xiaoyu and Orabona, Francesco},
pages = {983--992},
abstract = {Stochastic gradient descent is the method of choice for large scale optimization of machine learning objective functions. Yet, its performance is greatly variable and heavily depends on the choice of the stepsizes. This has motivated a large body of research on adaptive stepsizes. However, there is currently a gap in our theoretical understanding of these methods, especially in the non-convex setting. In this paper, we start closing this gap: we theoretically analyze in the convex and non-convex settings a generalized version of the AdaGrad stepsizes. We show sufficient conditions for these stepsizes to achieve almost sure asymptotic convergence of the gradients to zero, proving the first guarantee for generalized AdaGrad stepsizes in the non-convex setting. Moreover, we show that these stepsizes allow to automatically adapt to the level of noise of the stochastic gradients in both the convex and non-convex settings, interpolating between O(1/T) and O(1/sqrt(T)), up to logarithmic terms.}
}
@InProceedings{li19d,
title = {Bandit Online Learning with Unknown Delays},
author = {Li, Bingcong and Chen, Tianyi and Giannakis, Georgios B.},
pages = {993--1002},
abstract = {This paper deals with bandit online learning, where feedback of unknown delay can emerge in non-stochastic multi-armed bandit (MAB) and bandit convex optimization (BCO) settings. MAB and BCO require only values of the objective function to become available through feedback, and are used to estimate the gradient appearing in the corresponding iterative algorithms. Since the challenging case of feedback with unknown delays prevents one from constructing the sought gradient estimates, existing MAB and BCO algorithms become intractable. Delayed exploration, exploitation, and exponential (DEXP3) iterations, along with delayed bandit gradient descent (DBGD) iterations are developed for MAB and BCO with unknown delays, respectively. Based on a unifying analysis framework, it is established that both DEXP3 and DBGD guarantee an $\tilde{\cal O}\big( \sqrt{K(T+D)} \big)$ regret, where $D$ denotes the delay accumulated over $T$ slots, and $K$ represents the number of arms in MAB or the dimension of decision variables in BCO. Numerical tests using both synthetic and real data validate DEXP3 and DBGD.}
}
@InProceedings{ma19a,
title = {Learning Invariant Representations with Kernel Warping},
author = {Ma, Yingyi and Ganapathiraman, Vignesh and Zhang, Xinhua},
pages = {1003--1012},
abstract = {Invariance is an effective prior that has been extensively used to bias supervised learning with a \emph{given} representation of data. In order to learn invariant representations, wavelet and scattering based methods ``hard code'' invariance over the \emph{entire} sample space, hence restricted to a limited range of transformations. Kernels based on Haar integration also work only on a \emph{group} of transformations. In this work, we break this limitation by designing a new representation learning algorithm that incorporates invariances \emph{beyond transformation}. Our approach, which is based on warping the kernel in a data-dependent fashion, is computationally efficient using random features, and leads to a deep kernel through multiple layers. We apply it to convolutional kernel networks and demonstrate its stability.}
}
@InProceedings{chen19b,
title = {$\beta^3$-IRT: A New Item Response Model and its Applications},
author = {Chen, Yu and Filho, Telmo Silva and Prudencio, Ricardo B and Diethe, Tom and Flach, Peter},
pages = {1013--1021},
abstract = {Item Response Theory (IRT) aims to assess latent abilities of respondents based on the correctness of their answers in aptitude test items with different difficulty levels. In this paper, we propose the $\beta^3$-IRT model, which models continuous responses and can generate a much enriched family of Item Characteristic Curves. In experiments we applied the proposed model to data from an online exam platform, and show our model outperforms a more standard 2PL-ND model on all datasets. Furthermore, we show how to apply $\beta^3$-IRT to assess the ability of machine learning classifiers. This novel application results in a new metric for evaluating the quality of the classifier's probability estimates, based on the inferred difficulty and discrimination of data instances.}
}
@InProceedings{schulam19a,
title = {Can You Trust This Prediction? Auditing Pointwise Reliability After Learning},
author = {Schulam, Peter and Saria, Suchi},
pages = {1022--1031},
abstract = {To use machine learning in high stakes applications (e.g. medicine), we need tools for building confidence in the system and evaluating whether it is reliable. Methods to improve model reliability often require new learning algorithms (e.g. using Bayesian inference to obtain uncertainty estimates). An alternative is to audit a model after it is trained. In this paper, we describe resampling uncertainty estimation (RUE), an algorithm to audit the pointwise reliability of predictions. Intuitively, RUE estimates the amount that a prediction would change if the model had been fit on different training data. The algorithm uses the gradient and Hessian of the model's loss function to create an ensemble of predictions. Experimentally, we show that RUE more effectively detects inaccurate predictions than existing tools for auditing reliability subsequent to training. We also show that RUE can create predictive distributions that are competitive with state-of-the-art methods like Monte Carlo dropout, probabilistic backpropagation, and deep ensembles, but does not depend on specific algorithms at train-time like these methods do.}
}
@InProceedings{karakida19a,
title = {Universal Statistics of {Fisher} Information in Deep Neural Networks: Mean Field Approach},
author = {Karakida, Ryo and Akaho, Shotaro and Amari, Shun-ichi},
pages = {1032--1041},
abstract = {The Fisher information matrix (FIM) is a fundamental quantity to represent the characteristics of a stochastic model, including deep neural networks (DNNs). The present study reveals novel statistics of FIM that are universal among a wide class of DNNs. To this end, we use random weights and large width limits, which enables us to utilize mean field theories. We investigate the asymptotic statistics of the FIM's eigenvalues and reveal that most of them are close to zero while the maximum eigenvalue takes a huge value. Because the landscape of the parameter space is defined by the FIM, it is locally flat in most dimensions, but strongly distorted in others. Moreover, we demonstrate the potential usage of the derived statistics in learning strategies. First, small eigenvalues that induce flatness can be connected to a norm-based capacity measure of generalization ability. Second, the maximum eigenvalue that induces the distortion enables us to quantitatively estimate an appropriately sized learning rate for gradient methods to converge.}
}
@InProceedings{hainline19a,
title = {Conditional Sparse $L_p$-norm Regression With Optimal Probability},
author = {Hainline, John and Juba, Brendan and Le, Hai S and Woodruff, David},
pages = {1042--1050},
abstract = {We consider the following conditional linear regression problem: the task is to identify both (i) a $k$-DNF condition $c$ and (ii) a linear rule $f$ such that the probability of $c$ is (approximately) at least some given bound $\mu$, and minimizing the $l_p$ loss of $f$ at predicting the target $z$ in the distribution conditioned on $c$. Thus, the task is to identify a portion of the distribution on which a linear rule can provide a good fit. Algorithms for this task are useful in cases where portions of the distribution are not modeled well by simple, learnable rules, but on other portions such rules perform well. The prior state-of-the-art for such algorithms could only guarantee finding a condition of probability $O(\mu/n^k)$ when a condition of probability $\mu$ exists, and achieved a $O(n^k)$-approximation to the target loss. Here, we give efficient algorithms for solving this task with a condition $c$ that nearly matches the probability of the ideal condition, while also improving the approximation to the target loss to a $\tilde{O}(n^{k/2})$ factor. We also give an algorithm for finding a $k$-DNF reference class for prediction at a given query point, that obtains a sparse regression fit that has loss within $O(n^k)$ of optimal among all sparse regression parameters and sufficiently large $k$-DNF reference classes containing the query point.}
}
@InProceedings{mondelli19a,
title = {On the Connection Between Learning Two-Layer Neural Networks and Tensor Decomposition},
author = {Mondelli, Marco and Montanari, Andrea},
pages = {1051--1060},
abstract = {We establish connections between the problem of learning a two-layer neural network and tensor decomposition. We consider a model with feature vectors $x$, $r$ hidden units with weights $w_i$ and output $y$, i.e., $y=\sum_{i=1}^r \sigma(w_i^{T} x)$, with activation functions given by low-degree polynomials. In particular, if $\sigma(x) = a_0+a_1x+a_3x^3$, we prove that no polynomial-time algorithm can outperform the trivial predictor that assigns to each example the response variable $E(y)$, when $d^{3/2} \ll r \ll d^2$. Our conclusion holds for a 'natural data distribution', namely standard Gaussian feature vectors $x$, and output distributed according to a two-layer neural network with random isotropic weights, and under a certain complexity-theoretic assumption on tensor decomposition. Roughly speaking, we assume that no polynomial-time algorithm can substantially outperform current methods for tensor decomposition based on the sum-of-squares hierarchy. We also prove generalizations of this statement for higher degree polynomial activations, and non-random weight vectors. Remarkably, several existing algorithms for learning two-layer networks with rigorous guarantees are based on tensor decomposition. Our results support the idea that this is indeed the core computational difficulty in learning such networks, under the stated generative model for the data. As a side result, we show that under this model learning the network requires accurate learning of its weights, a property that does not hold in a more general setting.}
}
@InProceedings{laforgue19a,
title = {Autoencoding any Data through Kernel Autoencoders},
author = {Laforgue, Pierre and Cl{\'e}men{\c{c}}on, St{\'e}phan and d'Alch{\'e}-Buc, Florence},
pages = {1061--1069},
abstract = {This paper investigates a novel algorithmic approach to data representation based on kernel methods. Assuming that the observations lie in a Hilbert space $X$, the introduced Kernel Autoencoder (KAE) is the composition of mappings from vector-valued Reproducing Kernel Hilbert Spaces (vv-RKHSs) that minimizes the expected reconstruction error. Beyond a first extension of the autoencoding scheme to possibly infinite dimensional Hilbert spaces, KAE further allows to autoencode any kind of data by choosing $X$ to be itself a RKHS. A theoretical analysis of the model is carried out, providing a generalization bound, and shedding light on its connection with Kernel Principal Component Analysis. The proposed algorithms are then detailed at length: they crucially rely on the form taken by the minimizers, revealed by a dedicated Representer Theorem. Finally, numerical experiments on both simulated data and real labeled graphs (molecules) provide empirical evidence of the KAE performances.}
}
@InProceedings{wu19b,
title = {Towards Understanding the Generalization Bias of Two Layer Convolutional Linear Classifiers with Gradient Descent},
author = {Wu, Yifan and Poczos, Barnabas and Singh, Aarti},
pages = {1070--1078},
abstract = {A major challenge in understanding the generalization of deep learning is to explain why (stochastic) gradient descent can exploit the network architecture to find solutions that have good generalization performance when using high capacity models. We find simple but realistic examples showing that this phenomenon exists even when learning linear classifiers --- between two linear networks with the same capacity, the one with a convolutional layer can generalize better than the other when the data distribution has some underlying spatial structure. We argue that this difference results from a combination of the convolution architecture, data distribution and gradient descent, all of which are necessary to be included in a meaningful analysis. We analyze the generalization performance as a function of data distribution and convolutional filter size, given gradient descent as the optimization algorithm, then interpret the results using concrete examples. Experimental results show that our analysis is able to explain what happens in our introduced examples.}
}
@InProceedings{cheung19b,
title = {Learning to Optimize under Non-Stationarity},
author = {Cheung, Wang Chi and Simchi-Levi, David and Zhu, Ruihao},
pages = {1079--1087},
abstract = {We introduce algorithms that achieve state-of-the-art dynamic regret bounds for non-stationary linear stochastic bandit setting. It captures natural applications such as dynamic pricing and ads allocation in a changing environment. We show how the difficulty posed by the non-stationarity can be overcome by a novel marriage between stochastic and adversarial bandits learning algorithms. Our main contributions are the tuned Sliding Window UCB (SW-UCB) algorithm with optimal dynamic regret, and the tuning free bandit-over-bandit (BOB) framework built on top of the SW-UCB algorithm with best (compared to existing literature) dynamic regret.}
}
@InProceedings{cucuringu19a,
title = {{SPONGE}: A generalized eigenproblem for clustering signed networks},
author = {Cucuringu, Mihai and Davies, Peter and Glielmo, Aldo and Tyagi, Hemant},
pages = {1088--1098},
abstract = {We introduce a principled and theoretically sound spectral method for k-way clustering in signed graphs, where the affinity measure between nodes takes either positive or negative values. Our approach is motivated by social balance theory, where the task of clustering aims to decompose the network into disjoint groups such that individuals within the same group are connected by as many positive edges as possible, while individuals from different groups are connected by as many negative edges as possible. Our algorithm relies on a generalized eigenproblem formulation inspired by recent work on constrained clustering. We provide theoretical guarantees for our approach in the setting of a signed stochastic block model, by leveraging tools from matrix perturbation theory and random matrix theory. An extensive set of numerical experiments on both synthetic and real data shows that our approach compares favorably with state-of-the-art methods for signed clustering, especially for large number of clusters and sparse measurement graphs.}
}
@InProceedings{zhang19d,
title = {Deep Neural Networks with Multi-Branch Architectures Are Intrinsically Less Non-Convex},
author = {Zhang, Hongyang and Shao, Junru and Salakhutdinov, Ruslan},
pages = {1099--1109},
abstract = {Several recently proposed architectures of neural networks such as ResNeXt, Inception, Xception, SqueezeNet and Wide ResNet are based on the designing idea of having multiple branches and have demonstrated improved performance in many applications. We show that one cause for such success is due to the fact that the multi-branch architecture is less non-convex in terms of duality gap. The duality gap measures the degree of intrinsic non-convexity of an optimization problem: smaller gap in relative value implies lower degree of intrinsic non-convexity. The challenge is to quantitatively measure the duality gap of highly non-convex problems such as deep neural networks. In this work, we provide strong guarantees of this quantity for two classes of network architectures. For the neural networks with arbitrary activation functions, multi-branch architecture and a variant of hinge loss, we show that the duality gap of both population and empirical risks shrinks to zero as the number of branches increases. This result sheds light on better understanding the power of over-parametrization where increasing the number of branches tends to make the loss surface less non-convex. For the neural networks with linear activation function and $\ell_2$ loss, we show that the duality gap of empirical risk is zero. Our two results work for arbitrary depths, while the analytical techniques might be of independent interest to non-convex optimization more broadly. Experiments on both synthetic and real-world datasets validate our results.}
}
@InProceedings{sun19a,
title = {Are we there yet? Manifold identification of gradient-related proximal methods},
author = {Sun, Yifan and Jeong, Halyun and Nutini, Julie and Schmidt, Mark},
pages = {1110--1119},
abstract = {In machine learning, models that generalize better often generate outputs that lie on a low-dimensional manifold. Recently, several works have separately shown finite-time manifold identification by some proximal methods. In this work we provide a unified view by giving a simple condition under which any proximal method using a constant step size can achieve finite-iteration manifold detection. For several key methods (FISTA, DRS, ADMM, SVRG, SAGA, and RDA) we give an iteration bound, characterized in terms of their variable convergence rate and a problem-dependent constant that indicates problem degeneracy. For popular models, this constant is related to certain data assumptions, which gives intuition as to when lower active set complexity may be expected in practice.}
}
@InProceedings{acharya19a,
title = {{Hadamard} Response: Estimating Distributions Privately, Efficiently, and with Little Communication},
author = {Acharya, Jayadev and Sun, Ziteng and Zhang, Huanyu},
pages = {1120--1129},
abstract = {We study the problem of estimating $k$-ary distributions under $\varepsilon$-local differential privacy. $n$ samples are distributed across users who send privatized versions of their sample to a central server. All previously known sample optimal algorithms require linear (in $k$) communication from each user in the high privacy regime $(\varepsilon=O(1))$, and run in time that grows as $n\cdot k$, which can be prohibitive for large domain size $k$. We propose Hadamard Response (HR), a local privatization scheme that requires no shared randomness and is symmetric with respect to the users. Our scheme has order optimal sample complexity for all $\varepsilon$, a communication of at most $\log k+2$ bits per user, and nearly linear running time of $\tilde{O}(n + k)$. Our encoding and decoding are based on Hadamard matrices and are simple to implement. The statistical performance relies on the coding theoretic aspects of Hadamard matrices, i.e., the large Hamming distance between the rows. An efficient implementation of the algorithm using the Fast Walsh-Hadamard transform gives the computational gains. We compare our approach with Randomized Response (RR), RAPPOR, and subset-selection mechanisms (SS), both theoretically, and experimentally. For $k=10000$, our algorithm runs about 100x faster than SS, and RAPPOR.}
}
@InProceedings{he19a,
title = {{XBART}: Accelerated {Bayesian} Additive Regression Trees},
author = {He, Jingyu and Yalov, Saar and Hahn, P. Richard},
pages = {1130--1138},
abstract = {Bayesian additive regression trees (BART) (Chipman et. al., 2010) is a powerful predictive model that often outperforms alternative models at out-of-sample prediction. BART is especially well-suited to settings with unstructured predictor variables and substantial sources of unmeasured variation as is typical in the social, behavioral and health sciences. This paper develops a modified version of BART that is amenable to fast posterior estimation. We present a stochastic hill climbing algorithm that matches the remarkable predictive accuracy of previous BART implementations, but is many times faster and less memory intensive. Simulation studies show that the new method is comparable in computation time and more accurate at function estimation than both random forests and gradient boosting.}
}
@InProceedings{giordano19a,
title = {A {Swiss Army} Infinitesimal Jackknife},
author = {Giordano, Ryan and Stephenson, William and Liu, Runjing and Jordan, Michael and Broderick, Tamara},
pages = {1139--1147},
abstract = {The error or variability of machine learning algorithms is often assessed by repeatedly refitting a model with different weighted versions of the observed data. The ubiquitous tools of cross-validation (CV) and the bootstrap are examples of this technique. These methods are powerful in large part due to their model agnosticism but can be slow to run on modern, large data sets due to the need to repeatedly re-fit the model. In this work, we use a linear approximation to the dependence of the fitting procedure on the weights, producing results that can be faster than repeated re-fitting by an order of magnitude. This linear approximation is sometimes known as the "infinitesimal jackknife" in the statistics literature, where it is mostly used as a theoretical tool to prove asymptotic results. We provide explicit finite-sample error bounds for the infinitesimal jackknife in terms of a small number of simple, verifiable assumptions. Our results apply whether the weights and data are stochastic or deterministic, and so can be used as a tool for proving the accuracy of the infinitesimal jackknife on a wide variety of problems. As a corollary, we state mild regularity conditions under which our approximation consistently estimates true leave k-out cross-validation for any fixed k. These theoretical results, together with modern automatic differentiation software, support the application of the infinitesimal jackknife to a wide variety of practical problems in machine learning, providing a "Swiss Army infinitesimal jackknife." We demonstrate the accuracy of our methods on a range of simulated and real datasets.}
}
@InProceedings{zhang19e,
title = {Online Multiclass Boosting with Bandit Feedback},
author = {Zhang, Daniel T. and Jung, Young Hun and Tewari, Ambuj},
pages = {1148--1156},
abstract = {We present online boosting algorithms for multiclass classification with bandit feedback, where the learner only receives feedback about the correctness of its prediction. We propose an unbiased estimate of the loss using a randomized prediction, allowing the model to update its weak learners with limited information. Using the unbiased estimate, we extend two full information boosting algorithms (Jung et al., 2017) to the bandit setting. We prove that the asymptotic error bounds of the bandit algorithms exactly match their full information counterparts. The cost of restricted feedback is reflected in the larger sample complexity. Experimental results also support our theoretical findings, and performance of the proposed models is comparable to that of an existing bandit boosting algorithm, which is limited to use binary weak learners.}
}
@InProceedings{gao19a,
title = {Auto-Encoding Total Correlation Explanation},
author = {Gao, Shuyang and Brekelmans, Rob and Steeg, Greg Ver and Galstyan, Aram},
pages = {1157--1166},
abstract = {Advances in unsupervised learning enable reconstruction and generation of samples from complex distributions, but this success is marred by the inscrutability of the representations learned. We propose an information-theoretic approach to characterizing disentanglement and dependence in representation learning using multivariate mutual information, also called total correlation. The principle of Total Cor-relation Ex-planation (CorEx) has motivated successful unsupervised learning applications across a variety of domains but under some restrictive assumptions. Here we relax those restrictions by introducing a flexible variational lower bound to CorEx. Surprisingly, we find this lower bound is equivalent to the one in variational autoencoders (VAE) under certain conditions. This information-theoretic view of VAE deepens our understanding of hierarchical VAE and motivates a new algorithm, AnchorVAE, that makes latent codes more interpretable through information maximization and enables generation of richer and more realistic samples.}
}
@InProceedings{jia19a,
title = {Towards Efficient Data Valuation Based on the {Shapley} Value},
author = {Jia, Ruoxi and Dao, David and Wang, Boxin and Hubis, Frances Ann and Hynes, Nick and G{\"u}rel, Nezihe Merve and Li, Bo and Zhang, Ce and Song, Dawn and Spanos, Costas J.},
pages = {1167--1176},
abstract = {{\em ``How much is my data worth?''} is an increasingly common question posed by organizations and individuals alike. An answer to this question could allow, for instance, fairly distributing profits among multiple data contributors and determining prospective compensation when data breaches happen. In this paper, we study the problem of \emph{data valuation} by utilizing the Shapley value, a popular notion of value which originated in cooperative game theory. The Shapley value defines a unique payoff scheme that satisfies many desiderata for the notion of data value. However, the Shapley value often requires \emph{exponential} time to compute. To meet this challenge, we propose a repertoire of efficient algorithms for approximating the Shapley value. We also demonstrate the value of each training instance for various benchmark datasets.}
}
@InProceedings{oliveira19a,
title = {{Bayesian} optimisation under uncertain inputs},
author = {Oliveira, Rafael and Ott, Lionel and Ramos, Fabio},
pages = {1177--1184},
abstract = {Bayesian optimisation (BO) has been a successful approach to optimise functions which are expensive to evaluate and whose observations are noisy. Classical BO algorithms, however, do not account for errors about the location where observations are taken, which is a common issue in problems with physical components. In these cases, the estimation of the actual query location is also subject to uncertainty. In this context, we propose an upper confidence bound (UCB) algorithm for BO problems where both the outcome of a query and the true query location are uncertain. The algorithm employs a Gaussian process model that takes probability distributions as inputs. Theoretical results are provided for both the proposed algorithm and a conventional UCB approach within the uncertain-inputs setting. Finally, we evaluate each method's performance experimentally, comparing them to other input noise aware BO approaches on simulated scenarios involving synthetic and real data.}
}
@InProceedings{ko19a,
title = {Optimal Minimization of the Sum of Three Convex Functions with a Linear Operator},
author = {Ko, Seyoon and Won, Joong-Ho},
pages = {1185--1194},
abstract = {We propose a class of optimal-rate primal-dual algorithms for minimization of the sum of three convex functions with a linear operator. We first establish the optimal convergence rates for solving the saddle-point reformulation of the problem only using first-order information under deterministic and stochastic settings, respectively. We then proceed to show that the proposed algorithm class achieves these rates. The studied algorithm class does not require matrix inversion and is simple to implement. To our knowledge, this is the first work to establish and attain the optimal rates for this class of problems with minimal assumptions. Numerical experiments show that our method outperforms state-of-the-art methods.}
}
@InProceedings{vaswani19a,
title = {Fast and Faster Convergence of SGD for Over-Parameterized Models and an Accelerated Perceptron},