-
Notifications
You must be signed in to change notification settings - Fork 52.4k
/
vhost.c
3013 lines (2594 loc) · 71.1 KB
/
vhost.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
* Copyright (C) 2006 Rusty Russell IBM Corporation
*
* Author: Michael S. Tsirkin <mst@redhat.com>
*
* Inspiration, some code, and most witty comments come from
* Documentation/virtual/lguest/lguest.c, by Rusty Russell
*
* Generic code for virtio server in host kernel.
*/
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/vhost_task.h>
#include <linux/interval_tree_generic.h>
#include <linux/nospec.h>
#include <linux/kcov.h>
#include "vhost.h"
/*
 * Module parameter: maximum number of memory regions in a memory map.
 * Registered with mode 0444; defaults to 64.
 */
static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
"Maximum number of memory regions in memory map. (default: 64)");
/*
 * Module parameter: entry limit that iotlb_alloc() hands to
 * vhost_iotlb_alloc(). Registered with mode 0444; defaults to 2048.
 */
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
"Maximum number of iotlb entries. (default: 2048)");
/*
 * Memory-table flag bit.
 * NOTE(review): its users are outside this part of the file - confirm the
 * exact meaning there before relying on it.
 */
enum {
VHOST_MEMORY_F_LOG = 0x1,
};
/*
 * Userspace address of the slot at index vq->num of the avail ring
 * (vhost_used_event) or of the used ring (vhost_avail_event), viewed as a
 * __virtio16.
 *
 * The macro argument is parenthesized so that any pointer expression
 * (e.g. "p + 1" or "&array[i]") expands correctly, not only a plain
 * identifier.
 */
#define vhost_used_event(vq) ((__virtio16 __user *)&(vq)->avail->ring[(vq)->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&(vq)->used->ring[(vq)->num])
#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
/*
 * Drop any explicit endianness override on @vq: user_be becomes true
 * exactly when virtio_legacy_is_little_endian() reports false.
 */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
	vq->user_be = !virtio_legacy_is_little_endian();
}
/* Record that userspace asked for a big-endian legacy ring on @vq. */
static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
{
	vq->user_be = true;
}
/* Record that userspace asked for a little-endian legacy ring on @vq. */
static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
{
	vq->user_be = false;
}
/*
 * Set the legacy ring endianness of @vq from a struct vhost_vring_state
 * supplied by userspace at @argp.
 *
 * Returns 0 on success, -EBUSY if vq->private_data is set, -EFAULT if the
 * request cannot be copied in, or -EINVAL if the requested value is neither
 * VHOST_VRING_LITTLE_ENDIAN nor VHOST_VRING_BIG_ENDIAN.
 */
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	struct vhost_vring_state request;

	if (vq->private_data)
		return -EBUSY;

	if (copy_from_user(&request, argp, sizeof(request)))
		return -EFAULT;

	switch (request.num) {
	case VHOST_VRING_BIG_ENDIAN:
		vhost_enable_cross_endian_big(vq);
		return 0;
	case VHOST_VRING_LITTLE_ENDIAN:
		vhost_enable_cross_endian_little(vq);
		return 0;
	default:
		return -EINVAL;
	}
}
/*
 * Report the endianness setting of @vq to userspace: writes a
 * struct vhost_vring_state holding @idx and vq->user_be to @argp.
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	struct vhost_vring_state reply = {
		.index = idx,
		.num = vq->user_be
	};

	return copy_to_user(argp, &reply, sizeof(reply)) ? -EFAULT : 0;
}
/*
 * Compute vq->is_le from the negotiated features and the user_be setting.
 *
 * Note for legacy virtio: user_be is initialized at reset time according
 * to the host endianness. Unless userspace sets an explicit endianness,
 * the result is native endian, as legacy virtio expects.
 */
static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	if (vhost_has_feature(vq, VIRTIO_F_VERSION_1))
		vq->is_le = true;
	else
		vq->is_le = !vq->user_be;
}
#else
/* Cross-endian legacy support is compiled out: nothing to reset. */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}
/*
 * Cross-endian legacy support is compiled out: the request is always
 * rejected with -ENOIOCTLCMD.
 */
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
	return -ENOIOCTLCMD;
}
/*
 * Cross-endian legacy support is compiled out: the query is always
 * rejected with -ENOIOCTLCMD.
 */
static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
				   int __user *argp)
{
	return -ENOIOCTLCMD;
}
/*
 * Without cross-endian support the ring is little endian when
 * VIRTIO_F_VERSION_1 was negotiated, otherwise it follows
 * virtio_legacy_is_little_endian().
 */
static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
	if (vhost_has_feature(vq, VIRTIO_F_VERSION_1))
		vq->is_le = true;
	else
		vq->is_le = virtio_legacy_is_little_endian();
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
/* Recompute vq->is_le for @vq; simply defers to vhost_init_is_le(). */
static void vhost_reset_is_le(struct vhost_virtqueue *vq)
{
	vhost_init_is_le(vq);
}
/*
 * On-stack context for a worker flush: a work item to queue on the worker
 * plus the completion that vhost_flush_work() signals once it has run.
 */
struct vhost_flush_struct {
/* Work item queued on the worker being flushed. */
struct vhost_work work;
/* Completed by vhost_flush_work(); the flusher waits on it. */
struct completion wait_event;
};
/*
 * Work function used for flushes: signals the completion embedded in the
 * vhost_flush_struct that contains @work.
 */
static void vhost_flush_work(struct vhost_work *work)
{
	struct vhost_flush_struct *flush =
		container_of(work, struct vhost_flush_struct, work);

	complete(&flush->wait_event);
}
/*
 * poll_table callback installed by vhost_poll_init(): remembers the wait
 * queue head in the owning vhost_poll and adds the poll's wait entry to it.
 */
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
			    poll_table *pt)
{
	struct vhost_poll *vpoll = container_of(pt, struct vhost_poll, table);

	vpoll->wqh = wqh;
	add_wait_queue(wqh, &vpoll->wait);
}
/*
 * Wait-queue callback installed by vhost_poll_init().
 *
 * Ignores wakeups whose events do not intersect poll->mask. Otherwise the
 * poll's work is queued through vhost_poll_queue() when the device uses a
 * worker, or its function is called directly when it does not.
 *
 * Always returns 0.
 */
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
			     void *key)
{
	struct vhost_poll *vpoll = container_of(wait, struct vhost_poll, wait);

	if (!(key_to_poll(key) & vpoll->mask))
		return 0;

	if (vpoll->dev->use_worker)
		vhost_poll_queue(vpoll);
	else
		vpoll->work.fn(&vpoll->work);

	return 0;
}
/*
 * Prepare @work for use: clear its VHOST_WORK_QUEUED flag and set @fn as
 * the function to run.
 */
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
	clear_bit(VHOST_WORK_QUEUED, &work->flags);
	work->fn = fn;
}
EXPORT_SYMBOL_GPL(vhost_work_init);
/*
 * Initialize @poll: bind it to @dev and @vq, record the event @mask it
 * reacts to, install the wait-queue and poll_table callbacks, and set @fn
 * as the work function. The poll starts out not attached to any wait queue.
 */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
		     __poll_t mask, struct vhost_dev *dev,
		     struct vhost_virtqueue *vq)
{
	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
	init_poll_funcptr(&poll->table, vhost_poll_func);

	poll->mask = mask;
	poll->dev = dev;
	poll->wqh = NULL;
	poll->vq = vq;

	vhost_work_init(&poll->work, fn);
}
EXPORT_SYMBOL_GPL(vhost_poll_init);
/*
 * Start polling @file: we add ourselves to the file's wait queue. The
 * caller must keep a reference to the file until after vhost_poll_stop()
 * has been called.
 *
 * Does nothing if @poll is already on a wait queue. Events that are already
 * pending are delivered right away through vhost_poll_wakeup().
 *
 * Returns 0 on success, or -EINVAL (after stopping the poll again) if the
 * file reports EPOLLERR.
 */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	__poll_t events;

	if (poll->wqh)
		return 0;

	events = vfs_poll(file, &poll->table);
	if (!events)
		return 0;

	vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(events));
	if (!(events & EPOLLERR))
		return 0;

	vhost_poll_stop(poll);
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);
/*
 * Stop polling a file: take @poll off the wait queue it was added to, if
 * any. After this function returns it is safe to drop the file reference.
 * You must also flush afterwards.
 */
void vhost_poll_stop(struct vhost_poll *poll)
{
	if (!poll->wqh)
		return;

	remove_wait_queue(poll->wqh, &poll->wait);
	poll->wqh = NULL;
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);
/*
 * Queue @work on @worker and wake the worker's task, unless the work is
 * already marked as queued.
 */
static void vhost_worker_queue(struct vhost_worker *worker,
			       struct vhost_work *work)
{
	/* Flag was already set: the work is queued, nothing more to do. */
	if (test_and_set_bit(VHOST_WORK_QUEUED, &work->flags))
		return;

	/*
	 * We can only add the work to the list after we're sure it was not
	 * in the list. test_and_set_bit() implies a memory barrier.
	 */
	llist_add(&work->node, &worker->work_list);
	vhost_task_wake(worker->vtsk);
}
/*
 * Queue @work on the worker currently attached to @vq.
 *
 * The worker pointer is looked up under rcu_read_lock(). Returns true if a
 * worker was attached and the work was handed to it, false if @vq has no
 * worker.
 */
bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
{
	struct vhost_worker *target;
	bool handed_off;

	rcu_read_lock();
	target = rcu_dereference(vq->worker);
	handed_off = !!target;
	if (handed_off)
		vhost_worker_queue(target, work);
	rcu_read_unlock();

	return handed_off;
}
EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
/**
 * __vhost_worker_flush - flush a worker
 * @worker: worker to flush
 *
 * Queues a flush work item on @worker and waits until it has run. Returns
 * immediately if the worker has no attachments or has been killed.
 *
 * The worker's mutex must be held by the caller. It is dropped while
 * waiting for the flush to complete and taken again before returning.
 */
static void __vhost_worker_flush(struct vhost_worker *worker)
{
struct vhost_flush_struct flush;
if (!worker->attachment_cnt || worker->killed)
return;
init_completion(&flush.wait_event);
vhost_work_init(&flush.work, vhost_flush_work);
vhost_worker_queue(worker, &flush.work);
/*
 * Drop mutex in case our worker is killed and it needs to take the
 * mutex to force cleanup.
 */
mutex_unlock(&worker->mutex);
wait_for_completion(&flush.wait_event);
mutex_lock(&worker->mutex);
}
/*
 * Flush @worker: takes the worker's mutex around __vhost_worker_flush(),
 * which requires it to be held.
 */
static void vhost_worker_flush(struct vhost_worker *worker)
{
	mutex_lock(&worker->mutex);
	__vhost_worker_flush(worker);
	mutex_unlock(&worker->mutex);
}
/* Flush every worker registered in @dev's worker xarray, one by one. */
void vhost_dev_flush(struct vhost_dev *dev)
{
	struct vhost_worker *each;
	unsigned long xa_index;

	xa_for_each(&dev->worker_xa, xa_index, each)
		vhost_worker_flush(each);
}
EXPORT_SYMBOL_GPL(vhost_dev_flush);
/*
 * A lockless hint for busy polling code to exit the loop.
 *
 * Returns true if @vq has a worker attached and that worker's work list is
 * not empty at the time of the check.
 */
bool vhost_vq_has_work(struct vhost_virtqueue *vq)
{
	struct vhost_worker *attached;
	bool pending;

	rcu_read_lock();
	attached = rcu_dereference(vq->worker);
	pending = attached && !llist_empty(&attached->work_list);
	rcu_read_unlock();

	return pending;
}
EXPORT_SYMBOL_GPL(vhost_vq_has_work);
/*
 * Queue @poll's work item on the worker of the virtqueue the poll belongs
 * to. Whether a worker was actually attached is not reported.
 */
void vhost_poll_queue(struct vhost_poll *poll)
{
	struct vhost_work *item = &poll->work;

	vhost_vq_work_queue(poll->vq, item);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);
/* Clear all VHOST_NUM_ADDRS entries of @vq's meta_iotlb array. */
static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
{
	int slot = 0;

	while (slot < VHOST_NUM_ADDRS)
		vq->meta_iotlb[slot++] = NULL;
}
/* Clear the meta_iotlb entries of every virtqueue of device @d. */
static void vhost_vq_meta_reset(struct vhost_dev *d)
{
	int idx = 0;

	while (idx < d->nvqs) {
		__vhost_vq_meta_reset(d->vqs[idx]);
		++idx;
	}
}
/*
 * Reset a call context: forget its ctx pointer and zero its irq bypass
 * producer.
 */
static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
{
	call_ctx->ctx = NULL;
	memset(&call_ctx->producer, 0, sizeof(struct irq_bypass_producer));
}
/*
 * Returns true when @vq has its avail, desc and used ring pointers set and
 * vhost_vq_access_ok() accepts it. The access check is only made once all
 * three pointers are present.
 */
bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
{
	if (!vq->avail || !vq->desc || !vq->used)
		return false;

	return vhost_vq_access_ok(vq);
}
EXPORT_SYMBOL_GPL(vhost_vq_is_setup);
/*
 * Put @vq back into its pristine state: no rings, no backend, no acked
 * features, no eventfds, no logging, no memory map and no worker.
 *
 * @dev is accepted for the callers' convenience but is not used here.
 */
static void vhost_vq_reset(struct vhost_dev *dev,
			   struct vhost_virtqueue *vq)
{
	/* Ring geometry and addresses. */
	vq->num = 1;
	vq->desc = NULL;
	vq->avail = NULL;
	vq->used = NULL;

	/* Ring positions and notification state. */
	vq->last_avail_idx = 0;
	vq->avail_idx = 0;
	vq->last_used_idx = 0;
	vq->signalled_used = 0;
	vq->signalled_used_valid = false;
	vq->used_flags = 0;

	/* Used-ring logging. */
	vq->log_used = false;
	vq->log_addr = -1ull;

	/* Backend and negotiated features. */
	vq->private_data = NULL;
	vq->acked_features = 0;
	vq->acked_backend_features = 0;

	vq->log_base = NULL;
	vq->error_ctx = NULL;
	vq->kick = NULL;
	vq->log_ctx = NULL;

	/*
	 * Endianness is derived from the (now cleared) features and the
	 * default user_be value, so this has to follow the resets above.
	 */
	vhost_disable_cross_endian(vq);
	vhost_reset_is_le(vq);

	vq->busyloop_timeout = 0;
	vq->umem = NULL;
	vq->iotlb = NULL;

	rcu_assign_pointer(vq->worker, NULL);
	vhost_vring_call_reset(&vq->call_ctx);
	__vhost_vq_meta_reset(vq);
}
/*
 * Worker function handed to vhost_task_create(): @data is the
 * struct vhost_worker.
 *
 * Takes everything currently on the worker's work list in one go and runs
 * the items in the order they were queued. Returns true if the list held
 * at least one item, false if it was empty.
 */
static bool vhost_run_work_list(void *data)
{
struct vhost_worker *worker = data;
struct vhost_work *work, *work_next;
struct llist_node *node;
node = llist_del_all(&worker->work_list);
if (node) {
__set_current_state(TASK_RUNNING);
/* llist_del_all() hands back newest-first; restore queueing order. */
node = llist_reverse_order(node);
/* make sure flag is seen after deletion */
smp_wmb();
llist_for_each_entry_safe(work, work_next, node, node) {
/* Clear the flag before running so the item can be queued again. */
clear_bit(VHOST_WORK_QUEUED, &work->flags);
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
cond_resched();
}
}
return !!node;
}
/*
 * Callback handed to vhost_task_create() together with
 * vhost_run_work_list(); @data is the struct vhost_worker.
 *
 * Marks the worker as killed, detaches it from every virtqueue of its
 * device that still points at it, and then runs whatever work is left on
 * its list. All of this happens under the worker's mutex.
 */
static void vhost_worker_killed(void *data)
{
struct vhost_worker *worker = data;
struct vhost_dev *dev = worker->dev;
struct vhost_virtqueue *vq;
int i, attach_cnt = 0;
mutex_lock(&worker->mutex);
/* Flush, attach and free paths check this flag under the same mutex. */
worker->killed = true;
for (i = 0; i < dev->nvqs; i++) {
vq = dev->vqs[i];
mutex_lock(&vq->mutex);
if (worker ==
rcu_dereference_check(vq->worker,
lockdep_is_held(&vq->mutex))) {
rcu_assign_pointer(vq->worker, NULL);
attach_cnt++;
}
mutex_unlock(&vq->mutex);
}
worker->attachment_cnt -= attach_cnt;
/* Only wait for RCU readers if a vq pointer was actually cleared. */
if (attach_cnt)
synchronize_rcu();
/*
 * Finish vhost_worker_flush calls and any other works that snuck in
 * before the synchronize_rcu.
 */
vhost_run_work_list(worker);
mutex_unlock(&worker->mutex);
}
/*
 * Free the indirect, log and heads buffers of @vq and clear the pointers,
 * so that calling this again on the same vq is harmless.
 */
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
	kfree(vq->indirect);
	kfree(vq->log);
	kfree(vq->heads);

	vq->indirect = NULL;
	vq->log = NULL;
	vq->heads = NULL;
}
/*
 * Helper to allocate iovec buffers for all vqs.
 *
 * Each vq gets an indirect array of UIO_MAXIOV entries plus log and heads
 * arrays of dev->iov_limit entries. If any allocation fails, the buffers of
 * the failing vq and of all vqs before it are freed again.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
	struct vhost_virtqueue *vq;
	int idx;

	for (idx = 0; idx < dev->nvqs; ++idx) {
		vq = dev->vqs[idx];
		vq->indirect = kmalloc_array(UIO_MAXIOV,
					     sizeof(*vq->indirect),
					     GFP_KERNEL);
		vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
					GFP_KERNEL);
		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
					  GFP_KERNEL);
		if (vq->indirect && vq->log && vq->heads)
			continue;

		/* Unwind: release this vq and every one set up before it. */
		while (idx >= 0)
			vhost_vq_free_iovecs(dev->vqs[idx--]);
		return -ENOMEM;
	}

	return 0;
}
/* Release the iovec buffers of every virtqueue of @dev. */
static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
	int idx = 0;

	while (idx < dev->nvqs) {
		vhost_vq_free_iovecs(dev->vqs[idx]);
		++idx;
	}
}
/*
 * Check whether a handler has used up its budget for this run.
 *
 * The budget is exhausted when @pkts has reached dev->weight, or when
 * dev->byte_weight is non-zero and @total_len has reached it. In that case
 * the vq's poll work is re-queued and true is returned, so the caller can
 * stop and resume later. Otherwise returns false.
 */
bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
			  int pkts, int total_len)
{
	struct vhost_dev *dev = vq->dev;
	bool bytes_spent = dev->byte_weight && total_len >= dev->byte_weight;
	bool pkts_spent = pkts >= dev->weight;

	if (!bytes_spent && !pkts_spent)
		return false;

	vhost_poll_queue(&vq->poll);
	return true;
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
/*
 * Size in bytes of an avail ring with @num entries: the structure with its
 * ring array, plus 2 extra bytes when VIRTIO_RING_F_EVENT_IDX is enabled
 * on @vq.
 */
static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
				   unsigned int num)
{
	size_t event_bytes = 0;

	if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
		event_bytes = 2;

	return size_add(struct_size(vq->avail, ring, num), event_bytes);
}
/*
 * Size in bytes of a used ring with @num entries: the structure with its
 * ring array, plus 2 extra bytes when VIRTIO_RING_F_EVENT_IDX is enabled
 * on @vq.
 */
static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
				  unsigned int num)
{
	size_t event_bytes = 0;

	if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
		event_bytes = 2;

	return size_add(struct_size(vq->used, ring, num), event_bytes);
}
/* Size in bytes of a descriptor table with @num entries. */
static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
				  unsigned int num)
{
	size_t entry_bytes = sizeof(*vq->desc);

	return entry_bytes * num;
}
/*
 * Initialize a vhost device and all of its virtqueues.
 *
 * @dev:         device to initialize
 * @vqs:         array of @nvqs virtqueue pointers owned by the caller
 * @nvqs:        number of virtqueues
 * @iov_limit:   number of entries later allocated for each vq's log and
 *               heads arrays
 * @weight:      packet budget checked by vhost_exceeds_weight()
 * @byte_weight: byte budget checked by vhost_exceeds_weight(); 0 disables
 *               the byte check
 * @use_worker:  whether work runs on vhost worker tasks
 * @msg_handler: stored in dev->msg_handler
 *
 * The device starts without an owner (dev->mm is NULL). Every vq is reset,
 * and vqs that have a handle_kick function get their poll set up for
 * EPOLLIN.
 */
void vhost_dev_init(struct vhost_dev *dev,
		    struct vhost_virtqueue **vqs, int nvqs,
		    int iov_limit, int weight, int byte_weight,
		    bool use_worker,
		    int (*msg_handler)(struct vhost_dev *dev, u32 asid,
				       struct vhost_iotlb_msg *msg))
{
	struct vhost_virtqueue *vq;
	int idx;

	dev->vqs = vqs;
	dev->nvqs = nvqs;
	mutex_init(&dev->mutex);

	dev->log_ctx = NULL;
	dev->umem = NULL;
	dev->iotlb = NULL;
	dev->mm = NULL;

	dev->iov_limit = iov_limit;
	dev->weight = weight;
	dev->byte_weight = byte_weight;
	dev->use_worker = use_worker;
	dev->msg_handler = msg_handler;

	init_waitqueue_head(&dev->wait);
	INIT_LIST_HEAD(&dev->read_list);
	INIT_LIST_HEAD(&dev->pending_list);
	spin_lock_init(&dev->iotlb_lock);
	xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);

	for (idx = 0; idx < dev->nvqs; ++idx) {
		vq = dev->vqs[idx];

		vq->log = NULL;
		vq->indirect = NULL;
		vq->heads = NULL;
		vq->dev = dev;
		mutex_init(&vq->mutex);
		vhost_vq_reset(dev, vq);

		if (!vq->handle_kick)
			continue;
		vhost_poll_init(&vq->poll, vq->handle_kick,
				EPOLLIN, dev, vq);
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_init);
/*
 * Caller should have device mutex.
 *
 * Returns 0 if the calling task's mm is the one that owns @dev, -EPERM
 * otherwise.
 */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
	/* Are you the owner? If not, I don't think you mean to do that */
	if (dev->mm != current->mm)
		return -EPERM;

	return 0;
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
/*
 * Caller should have device mutex.
 *
 * Returns true if @dev currently has an owner, i.e. dev->mm is set.
 */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
	return !!dev->mm;
}
EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
/*
 * Make the calling task's mm the owner of @dev. The reference taken here
 * is dropped again by vhost_detach_mm().
 */
static void vhost_attach_mm(struct vhost_dev *dev)
{
	if (!dev->use_worker) {
		/*
		 * A vDPA device does not use a worker thread, so there is
		 * no need to hold the address space of the mm. This helps
		 * to avoid a deadlock in the case of mmap(), which may hold
		 * a reference to the file and depends on the release method
		 * to remove the vma.
		 */
		dev->mm = current->mm;
		mmgrab(dev->mm);
		return;
	}

	/* No owner, become one */
	dev->mm = get_task_mm(current);
}
/*
 * Drop @dev's owner mm, if any, using the release call that matches how
 * vhost_attach_mm() took the reference: mmput() for worker-based devices,
 * mmdrop() otherwise.
 */
static void vhost_detach_mm(struct vhost_dev *dev)
{
	if (dev->mm) {
		if (dev->use_worker)
			mmput(dev->mm);
		else
			mmdrop(dev->mm);

		dev->mm = NULL;
	}
}
/*
 * Tear down @worker: remove it from @dev's worker xarray, stop its task
 * and free it. A NULL @worker is ignored. Warns if the worker still has
 * work on its list.
 */
static void vhost_worker_destroy(struct vhost_dev *dev,
				 struct vhost_worker *worker)
{
	if (worker) {
		WARN_ON(!llist_empty(&worker->work_list));
		xa_erase(&dev->worker_xa, worker->id);
		vhost_task_stop(worker->vtsk);
		kfree(worker);
	}
}
/*
 * Detach every virtqueue of @dev from its worker, then destroy all workers
 * still registered for the device. Does nothing for devices that do not
 * use workers.
 */
static void vhost_workers_free(struct vhost_dev *dev)
{
	struct vhost_worker *each;
	unsigned long n;

	if (!dev->use_worker)
		return;

	for (n = 0; n < dev->nvqs; n++)
		rcu_assign_pointer(dev->vqs[n]->worker, NULL);

	/*
	 * Free the default worker we created and clean up workers userspace
	 * created but couldn't clean up (it forgot or crashed).
	 */
	xa_for_each(&dev->worker_xa, n, each)
		vhost_worker_destroy(dev, each);

	xa_destroy(&dev->worker_xa);
}
/*
 * Allocate a worker for @dev, create and start its vhost task (named
 * "vhost-<pid of the caller>"), and register it in dev->worker_xa.
 *
 * Returns the new worker, or NULL if the allocation, the task creation or
 * the id allocation fails.
 */
static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
{
	char comm[TASK_COMM_LEN];
	struct vhost_worker *worker;
	struct vhost_task *task;
	u32 new_id;
	int err;

	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
	if (!worker)
		return NULL;

	worker->dev = dev;
	snprintf(comm, sizeof(comm), "vhost-%d", current->pid);

	task = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
				 worker, comm);
	if (!task)
		goto free_worker;

	mutex_init(&worker->mutex);
	init_llist_head(&worker->work_list);
	worker->kcov_handle = kcov_common_handle();
	worker->vtsk = task;

	vhost_task_start(task);

	err = xa_alloc(&dev->worker_xa, &new_id, worker, xa_limit_32b,
		       GFP_KERNEL);
	if (err < 0)
		goto stop_worker;
	worker->id = new_id;

	return worker;

stop_worker:
	vhost_task_stop(task);
free_worker:
	kfree(worker);
	return NULL;
}
/*
 * Caller must have device mutex.
 *
 * Point @vq at @worker. Nothing happens if @worker has already been
 * killed. If the vq had a previous worker, that worker's attachment count
 * is dropped; unless the vq was still idle (no backend and no kick), the
 * old worker is also flushed after an RCU grace period so that work queued
 * through the old pointer gets run.
 */
static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
struct vhost_worker *worker)
{
struct vhost_worker *old_worker;
mutex_lock(&worker->mutex);
if (worker->killed) {
mutex_unlock(&worker->mutex);
return;
}
/* Swap the pointer with both the new worker's and the vq's mutex held. */
mutex_lock(&vq->mutex);
old_worker = rcu_dereference_check(vq->worker,
lockdep_is_held(&vq->mutex));
rcu_assign_pointer(vq->worker, worker);
worker->attachment_cnt++;
if (!old_worker) {
mutex_unlock(&vq->mutex);
mutex_unlock(&worker->mutex);
return;
}
mutex_unlock(&vq->mutex);
mutex_unlock(&worker->mutex);
/*
 * Take the worker mutex to make sure we see the work queued from
 * device wide flushes which doesn't use RCU for execution.
 */
mutex_lock(&old_worker->mutex);
/* A killed worker has already been detached by vhost_worker_killed(). */
if (old_worker->killed) {
mutex_unlock(&old_worker->mutex);
return;
}
/*
 * We don't want to call synchronize_rcu for every vq during setup
 * because it will slow down VM startup. If we haven't done
 * VHOST_SET_VRING_KICK and not done the driver specific
 * SET_ENDPOINT/RUNNING then we can skip the sync since there will
 * not be any works queued for scsi and net.
 */
mutex_lock(&vq->mutex);
if (!vhost_vq_get_backend(vq) && !vq->kick) {
mutex_unlock(&vq->mutex);
old_worker->attachment_cnt--;
mutex_unlock(&old_worker->mutex);
/*
 * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID.
 * Warn if it adds support for multiple workers but forgets to
 * handle the early queueing case.
 */
WARN_ON(!old_worker->attachment_cnt &&
!llist_empty(&old_worker->work_list));
return;
}
mutex_unlock(&vq->mutex);
/* Make sure new vq queue/flush/poll calls see the new worker */
synchronize_rcu();
/* Make sure whatever was queued gets run */
__vhost_worker_flush(old_worker);
old_worker->attachment_cnt--;
mutex_unlock(&old_worker->mutex);
}
/*
 * Caller must have device mutex.
 *
 * Attach @vq to the worker whose id is info->worker_id.
 *
 * Returns 0 on success, -EINVAL if the device does not use workers, or
 * -ENODEV if no worker with exactly that id is registered.
 */
static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
				  struct vhost_vring_worker *info)
{
	struct vhost_dev *dev = vq->dev;
	unsigned long xa_index = info->worker_id;
	struct vhost_worker *target;

	if (!dev->use_worker)
		return -EINVAL;

	/* Only an entry whose id equals the requested one is accepted. */
	target = xa_find(&dev->worker_xa, &xa_index, UINT_MAX, XA_PRESENT);
	if (!target || target->id != info->worker_id)
		return -ENODEV;

	__vhost_vq_attach_worker(vq, target);
	return 0;
}
/*
 * Caller must have device mutex.
 *
 * Create a new worker for @dev and report its id in info->worker_id.
 *
 * Returns 0 on success or -ENOMEM if the worker could not be created.
 */
static int vhost_new_worker(struct vhost_dev *dev,
			    struct vhost_worker_state *info)
{
	struct vhost_worker *created = vhost_worker_create(dev);

	if (!created)
		return -ENOMEM;

	info->worker_id = created->id;
	return 0;
}
/*
 * Caller must have device mutex.
 *
 * Destroy the worker whose id is info->worker_id.
 *
 * Returns 0 on success, -ENODEV if no worker with exactly that id is
 * registered, or -EBUSY if the worker is still attached to a virtqueue or
 * has been killed.
 */
static int vhost_free_worker(struct vhost_dev *dev,
			     struct vhost_worker_state *info)
{
	unsigned long xa_index = info->worker_id;
	struct vhost_worker *victim;

	victim = xa_find(&dev->worker_xa, &xa_index, UINT_MAX, XA_PRESENT);
	if (!victim || victim->id != info->worker_id)
		return -ENODEV;

	mutex_lock(&victim->mutex);
	if (victim->attachment_cnt || victim->killed) {
		mutex_unlock(&victim->mutex);
		return -EBUSY;
	}

	/*
	 * A flush might have raced and snuck in before attachment_cnt was
	 * set to zero. Make sure flushes are flushed from the queue before
	 * freeing.
	 */
	__vhost_worker_flush(victim);
	mutex_unlock(&victim->mutex);

	vhost_worker_destroy(dev, victim);
	return 0;
}
/*
 * Read a virtqueue index (a u32 at the start of @argp) from userspace and
 * look up the matching virtqueue of @dev.
 *
 * On success stores the virtqueue in *@vq and the index in *@id and
 * returns 0. Returns the get_user() error if the index cannot be read, or
 * -ENOBUFS if it is not below dev->nvqs.
 */
static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
				  struct vhost_virtqueue **vq, u32 *id)
{
	u32 __user *user_idx = argp;
	u32 index;
	long err;

	err = get_user(index, user_idx);
	if (err < 0)
		return err;

	if (index >= dev->nvqs)
		return -ENOBUFS;

	/* Clamp the index for speculative execution before using it. */
	index = array_index_nospec(index, dev->nvqs);

	*vq = dev->vqs[index];
	*id = index;
	return 0;
}
/*
 * Caller must have device mutex.
 *
 * Handle the worker ioctls: VHOST_NEW_WORKER and VHOST_FREE_WORKER act on
 * the device, VHOST_ATTACH_VRING_WORKER and VHOST_GET_VRING_WORKER act on
 * the virtqueue whose index is read from @argp.
 *
 * Returns 0 on success; -EINVAL if the device does not use workers, has no
 * owner, or (for VHOST_GET_VRING_WORKER) the vq has no worker; the
 * vhost_dev_check_owner() error if the caller is not the owner; -EFAULT on
 * a failed user copy; -ENOIOCTLCMD for any other @ioctl; or the error of
 * the helper that carried out the request.
 */
long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
			void __user *argp)
{
	struct vhost_vring_worker ring_worker;
	struct vhost_worker_state state;
	struct vhost_worker *worker;
	struct vhost_virtqueue *vq;
	long ret;
	u32 idx;

	if (!dev->use_worker || !vhost_dev_has_owner(dev))
		return -EINVAL;

	ret = vhost_dev_check_owner(dev);
	if (ret)
		return ret;

	switch (ioctl) {
	/* dev worker ioctls */
	case VHOST_NEW_WORKER:
		ret = vhost_new_worker(dev, &state);
		if (!ret && copy_to_user(argp, &state, sizeof(state)))
			ret = -EFAULT;
		return ret;
	case VHOST_FREE_WORKER:
		if (copy_from_user(&state, argp, sizeof(state)))
			return -EFAULT;
		return vhost_free_worker(dev, &state);
	/* vring worker ioctls */
	case VHOST_ATTACH_VRING_WORKER:
	case VHOST_GET_VRING_WORKER:
		break;
	default:
		return -ENOIOCTLCMD;
	}

	ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
	if (ret)
		return ret;

	if (ioctl == VHOST_ATTACH_VRING_WORKER) {
		if (copy_from_user(&ring_worker, argp, sizeof(ring_worker)))
			return -EFAULT;
		return vhost_vq_attach_worker(vq, &ring_worker);
	}

	/* Only VHOST_GET_VRING_WORKER can get here. */
	worker = rcu_dereference_check(vq->worker,
				       lockdep_is_held(&dev->mutex));
	if (!worker)
		return -EINVAL;

	ring_worker.index = idx;
	ring_worker.worker_id = worker->id;
	if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
/*
 * Caller should have device mutex.
 *
 * Make the calling task the owner of @dev: take a reference on its mm,
 * allocate the per-vq iovec buffers and, for worker-based devices, create
 * the default worker and attach every virtqueue to it.
 *
 * Returns 0 on success, -EBUSY if the device already has an owner, or
 * -ENOMEM if the buffers or the worker could not be allocated. On failure
 * everything set up so far is undone.
 */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
	struct vhost_worker *worker;
	int err, idx;

	/* Is there an owner already? */
	if (vhost_dev_has_owner(dev))
		return -EBUSY;

	vhost_attach_mm(dev);

	err = vhost_dev_alloc_iovecs(dev);
	if (err)
		goto err_iovecs;

	if (!dev->use_worker)
		return 0;

	/*
	 * This should be done last, because vsock can queue work before
	 * VHOST_SET_OWNER so it simplifies the failure path below since we
	 * don't have to worry about vsock queueing while we free the worker.
	 */
	worker = vhost_worker_create(dev);
	if (!worker) {
		err = -ENOMEM;
		goto err_worker;
	}

	for (idx = 0; idx < dev->nvqs; idx++)
		__vhost_vq_attach_worker(dev->vqs[idx], worker);

	return 0;

err_worker:
	vhost_dev_free_iovecs(dev);
err_iovecs:
	vhost_detach_mm(dev);
	return err;
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
/*
 * Allocate an IOTLB limited to max_iotlb_entries entries, created with
 * VHOST_IOTLB_FLAG_RETIRE. Returns whatever vhost_iotlb_alloc() returns.
 */
static struct vhost_iotlb *iotlb_alloc(void)
{
	struct vhost_iotlb *table;

	table = vhost_iotlb_alloc(max_iotlb_entries, VHOST_IOTLB_FLAG_RETIRE);
	return table;
}
/*
 * Allocate the IOTLB that a later vhost_dev_reset_owner() call installs as
 * the device's memory map. Returns the result of iotlb_alloc().
 */
struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
{
	struct vhost_iotlb *fresh = iotlb_alloc();

	return fresh;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
/*
 * Caller should have device mutex.
 *
 * Clean up @dev and install @umem as the memory map of the device and of
 * each of its virtqueues.
 */
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
{
	int idx = 0;

	vhost_dev_cleanup(dev);
	dev->umem = umem;

	/*
	 * We don't need VQ locks below since vhost_dev_cleanup makes sure
	 * VQs aren't running.
	 */
	while (idx < dev->nvqs) {
		dev->vqs[idx]->umem = umem;
		++idx;
	}
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
void vhost_dev_stop(struct vhost_dev *dev)
{
int i;
for (i = 0; i < dev->nvqs; ++i) {
if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
vhost_poll_stop(&dev->vqs[i]->poll);