/******************************************************************************
* arch/x86/mm/shadow/multi.c
*
* Simple, mostly-synchronous shadow page tables.
* Parts of this code are Copyright (c) 2006 by XenSource Inc.
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman
* Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
/* Allow uniquely identifying static symbols in the 3 generated objects. */
asm(".file \"" __OBJECT_FILE__ "\"");
#include <xen/types.h>
#include <xen/mm.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/perfc.h>
#include <xen/domain_page.h>
#include <xen/iocap.h>
#include <xsm/xsm.h>
#include <asm/page.h>
#include <asm/current.h>
#include <asm/shadow.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/cacheattr.h>
#include <asm/mtrr.h>
#include <asm/guest_pt.h>
#include <public/sched.h>
#include "private.h"
#include "types.h"
/* THINGS TO DO LATER:
*
* TEARDOWN HEURISTICS
* Also: have a heuristic for when to destroy a previous paging-mode's
* shadows. When a guest is done with its start-of-day 32-bit tables
* and reuses the memory, we want to drop those shadows. Start with
* shadows in a page in two modes as a hint, but beware of clever tricks
* like reusing a pagetable for both PAE and 64-bit during boot...
*
* PAE LINEAR MAPS
* Rework shadow_get_l*e() to have the option of using map_domain_page()
* instead of linear maps. Add appropriate unmap_l*e calls in the users.
* Then we can test the speed difference made by linear maps. If the
* map_domain_page() version is OK on PAE, we could maybe allow a lightweight
* l3-and-l2h-only shadow mode for PAE PV guests that would allow them
* to share l2h pages again.
*
* PSE disabled / PSE36
* We don't support any modes other than PSE enabled, PSE36 disabled.
* Neither of those would be hard to change, but we'd need to be able to
* deal with shadows made in one mode and used in another.
*/
#define FETCH_TYPE_PREFETCH 1
#define FETCH_TYPE_DEMAND 2
#define FETCH_TYPE_WRITE 4
typedef enum {
ft_prefetch = FETCH_TYPE_PREFETCH,
ft_demand_read = FETCH_TYPE_DEMAND,
ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
} fetch_type_t;
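/* Note that these are bit flags rather than distinct values: ft_demand_write
 * carries both the DEMAND and WRITE bits, so code such as _sh_propagate()
 * can test (ft & FETCH_TYPE_WRITE) to tell demand writes apart from reads
 * and prefetches. */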
extern const char *const fetch_type_names[];
#if defined(DEBUG_TRACE_DUMP) && CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
const char *const fetch_type_names[] = {
[ft_prefetch] = "prefetch",
[ft_demand_read] = "demand read",
[ft_demand_write] = "demand write",
};
#endif
/**************************************************************************/
/* Hash table mapping from guest pagetables to shadows
*
* Normal case: maps the mfn of a guest page to the mfn of its shadow page.
* FL1s: maps the *gfn* of the start of a superpage to the mfn of a
* shadow L1 which maps its "splinters".
*/
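/* (A shadowed superpage has no guest L1 to key the hash on, which is why
 * FL1 shadows are keyed on the superpage's starting gfn rather than on a
 * guest mfn; the FL1 itself is an ordinary shadow L1 holding the individual
 * "splinter" entries for that superpage.) */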
static inline mfn_t
get_fl1_shadow_status(struct domain *d, gfn_t gfn)
/* Look for FL1 shadows in the hash table */
{
mfn_t smfn = shadow_hash_lookup(d, gfn_x(gfn), SH_type_fl1_shadow);
ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head);
return smfn;
}
static inline mfn_t
get_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type)
/* Look for shadows in the hash table */
{
mfn_t smfn = shadow_hash_lookup(d, mfn_x(gmfn), shadow_type);
ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head);
perfc_incr(shadow_get_shadow_status);
return smfn;
}
static inline void
set_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn)
/* Put an FL1 shadow into the hash table */
{
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
shadow_hash_insert(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
}
static inline void
set_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
/* Put a shadow into the hash table */
{
int res;
SHADOW_PRINTK("d%d gmfn=%lx, type=%08x, smfn=%lx\n",
d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
/* 32-bit PV guests don't own their l4 pages so can't get_page them */
if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
{
res = get_page(mfn_to_page(gmfn), d);
ASSERT(res == 1);
}
shadow_hash_insert(d, mfn_x(gmfn), shadow_type, smfn);
}
static inline void
delete_fl1_shadow_status(struct domain *d, gfn_t gfn, mfn_t smfn)
/* Remove a shadow from the hash table */
{
SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%"PRI_mfn"\n",
gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
shadow_hash_delete(d, gfn_x(gfn), SH_type_fl1_shadow, smfn);
}
static inline void
delete_shadow_status(struct domain *d, mfn_t gmfn, u32 shadow_type, mfn_t smfn)
/* Remove a shadow from the hash table */
{
SHADOW_PRINTK("d%d gmfn=%"PRI_mfn", type=%08x, smfn=%"PRI_mfn"\n",
d->domain_id, mfn_x(gmfn), shadow_type, mfn_x(smfn));
ASSERT(mfn_to_page(smfn)->u.sh.head);
shadow_hash_delete(d, mfn_x(gmfn), shadow_type, smfn);
/* 32-bit PV guests don't own their l4 pages; see set_shadow_status */
if ( !is_pv_32bit_domain(d) || shadow_type != SH_type_l4_64_shadow )
put_page(mfn_to_page(gmfn));
}
/**************************************************************************/
/* Functions for walking the guest page tables */
static inline bool
sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
uint32_t pfec)
{
return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
#if GUEST_PAGING_LEVELS == 3 /* PAE */
INVALID_MFN,
v->arch.paging.shadow.gl3e
#else /* 32 or 64 */
pagetable_get_mfn(v->arch.guest_table),
v->arch.paging.shadow.guest_vtable
#endif
);
}
/* This validation is called with the paging lock held, after write
 * permission has been removed. The check is therefore atomic, and no
 * further inconsistent content can be observed before the lock is
 * released.
 *
 * Returns 1 to indicate success (the walk is still consistent) and 0
 * for inconsistency.
 */
static inline uint32_t
shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version)
{
struct domain *d = v->domain;
guest_l1e_t *l1p;
guest_l2e_t *l2p;
#if GUEST_PAGING_LEVELS >= 4
guest_l3e_t *l3p;
guest_l4e_t *l4p;
#endif
int mismatch = 0;
ASSERT(paging_locked_by_me(d));
if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) )
return 1;
/* We could cache the guest page mappings from the last guest table
 * walk. However, since this check happens relatively infrequently,
 * the small cost of remapping the guest pages here is better than
 * caching a mapping on every guest table walk.
 *
 * Also, when an inconsistency is found, we simply return and let
 * another fault re-validate the new path, which keeps the logic
 * simple.
 */
perfc_incr(shadow_check_gwalk);
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable;
mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4);
l3p = map_domain_page(gw->l3mfn);
mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3);
unmap_domain_page(l3p);
#else
mismatch |= (gw->l3e.l3 !=
v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3);
#endif
l2p = map_domain_page(gw->l2mfn);
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
unmap_domain_page(l2p);
#else
l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable;
mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2);
#endif
if ( !(guest_can_use_l2_superpages(v) &&
(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
{
l1p = map_domain_page(gw->l1mfn);
mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1);
unmap_domain_page(l1p);
}
return !mismatch;
}
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
static int
shadow_check_gl1e(struct vcpu *v, walk_t *gw)
{
guest_l1e_t *l1p, nl1e;
if ( !mfn_valid(gw->l1mfn) )
return 0;
/* Can't just pull-through because mfn may have changed */
l1p = map_domain_page(gw->l1mfn);
nl1e.l1 = l1p[guest_l1_table_offset(gw->va)].l1;
unmap_domain_page(l1p);
return gw->l1e.l1 != nl1e.l1;
}
#endif
/* Remove write access permissions from the pages of a guest walk (walk_t)
 * in a batch, and return the OR of GW_RMWR_FLUSHTLB (a TLB flush is
 * needed) and GW_RMWR_REWALK (the guest pages must be re-walked).
 *
 * Syncing a page removes write access to that page; but it may also
 * give write access to other pages in the path. If we resync any
 * pages, re-walk from the beginning.
 */
#define GW_RMWR_FLUSHTLB 1
#define GW_RMWR_REWALK 2
static inline uint32_t
gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
{
struct domain *d = v->domain;
uint32_t rc = 0;
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_is_out_of_sync(gw->l3mfn) )
{
sh_resync(d, gw->l3mfn);
rc = GW_RMWR_REWALK;
}
else
#endif /* OOS */
if ( sh_remove_write_access(d, gw->l3mfn, 3, va) )
rc = GW_RMWR_FLUSHTLB;
#endif /* GUEST_PAGING_LEVELS >= 4 */
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
if ( mfn_is_out_of_sync(gw->l2mfn) )
{
sh_resync(d, gw->l2mfn);
rc |= GW_RMWR_REWALK;
}
else
#endif /* OOS */
if ( sh_remove_write_access(d, gw->l2mfn, 2, va) )
rc |= GW_RMWR_FLUSHTLB;
#endif /* GUEST_PAGING_LEVELS >= 3 */
if ( !(guest_can_use_l2_superpages(v) &&
(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
&& !mfn_is_out_of_sync(gw->l1mfn)
#endif /* OOS */
&& sh_remove_write_access(d, gw->l1mfn, 1, va) )
rc |= GW_RMWR_FLUSHTLB;
return rc;
}
/* Lightweight audit: pass all the shadows associated with this guest walk
* through the audit mechanisms */
static void sh_audit_gw(struct vcpu *v, const walk_t *gw)
{
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
struct domain *d = v->domain;
mfn_t smfn;
if ( !(SHADOW_AUDIT_ENABLE) )
return;
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
if ( mfn_valid(gw->l4mfn)
&& mfn_valid((smfn = get_shadow_status(d, gw->l4mfn,
SH_type_l4_shadow))) )
(void) sh_audit_l4_table(v, smfn, INVALID_MFN);
if ( mfn_valid(gw->l3mfn)
&& mfn_valid((smfn = get_shadow_status(d, gw->l3mfn,
SH_type_l3_shadow))) )
(void) sh_audit_l3_table(v, smfn, INVALID_MFN);
#endif /* PAE or 64... */
if ( mfn_valid(gw->l2mfn) )
{
if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn,
SH_type_l2_shadow))) )
(void) sh_audit_l2_table(v, smfn, INVALID_MFN);
#if GUEST_PAGING_LEVELS == 3
if ( mfn_valid((smfn = get_shadow_status(d, gw->l2mfn,
SH_type_l2h_shadow))) )
(void) sh_audit_l2_table(v, smfn, INVALID_MFN);
#endif
}
if ( mfn_valid(gw->l1mfn)
&& mfn_valid((smfn = get_shadow_status(d, gw->l1mfn,
SH_type_l1_shadow))) )
(void) sh_audit_l1_table(v, smfn, INVALID_MFN);
else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT)
&& (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)
&& mfn_valid(
(smfn = get_fl1_shadow_status(d, guest_l2e_get_gfn(gw->l2e)))) )
(void) sh_audit_fl1_table(v, smfn, INVALID_MFN);
#endif /* SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES */
}
/*
* Write a new value into the guest pagetable, and update the shadows
* appropriately. Returns 0 if we page-faulted, 1 for success.
*/
static bool_t
sh_write_guest_entry(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn)
{
#if CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
int failed;
paging_lock(v->domain);
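/* __copy_to_user() returns the number of bytes it could *not* copy, so
 * "failed" is 0 on a complete write and sizeof(new) if nothing was
 * written at all. Only validate the shadow if at least part of the
 * entry was actually updated; the caller only sees success when the
 * whole entry was written. */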
failed = __copy_to_user(p, &new, sizeof(new));
if ( failed != sizeof(new) )
sh_validate_guest_entry(v, gmfn, p, sizeof(new));
paging_unlock(v->domain);
return !failed;
#else
return 0;
#endif
}
/*
* Cmpxchg a new value into the guest pagetable, and update the shadows
* appropriately. Returns 0 if we page-faulted, 1 if not.
* N.B. caller should check the value of "old" to see if the cmpxchg itself
* was successful.
*/
static bool_t
sh_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old,
intpte_t new, mfn_t gmfn)
{
#if CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
int failed;
guest_intpte_t t = *old;
paging_lock(v->domain);
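/* cmpxchg_user() is expected to write the value it actually observed in
 * *p back into "t" (returning nonzero only on a fault), so comparing t
 * against the caller's *old tells us whether the compare-and-exchange
 * really took effect; only in that case does the shadow need
 * re-validating against the new guest entry. */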
failed = cmpxchg_user(p, t, new);
if ( t == *old )
sh_validate_guest_entry(v, gmfn, p, sizeof(new));
*old = t;
paging_unlock(v->domain);
return !failed;
#else
return 0;
#endif
}
/**************************************************************************/
/* Functions to compute the correct index into a shadow page, given an
* index into the guest page (as returned by guest_get_index()).
* This is trivial when the shadow and guest use the same sized PTEs, but
* gets more interesting when those sizes are mismatched (e.g. 32-bit guest,
* PAE- or 64-bit shadows).
*
* These functions also increment the shadow mfn, when necessary. When PTE
* sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1
* page. In this case, we allocate 2 contiguous pages for the shadow L1, and
* use simple pointer arithmetic on a pointer to the guest L1e to figure out
* which shadow page we really want. Similarly, when PTE sizes are
* mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest
* way to see this is: a 32-bit guest L2 page maps 4GB of virtual address
* space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address
* space.)
*/
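/* As a concrete illustration (assuming 1024 4-byte entries per 32-bit
 * guest table and 512 8-byte entries per shadow page): guest L2 index
 * 600 lands in the third of the four shadow L2 pages (600 / 256 == 2
 * pages to skip), at entry (600 % 256) * 2 == 176, since each 4MB guest
 * L2 slot is shadowed by a pair of adjacent 2MB shadow entries. */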
#if GUEST_PAGING_LEVELS == 2
/* From one page of a multi-page shadow, find the next one */
static inline mfn_t sh_next_page(mfn_t smfn)
{
struct page_info *pg = mfn_to_page(smfn), *next;
struct page_list_head h = PAGE_LIST_HEAD_INIT(h);
ASSERT(pg->u.sh.type == SH_type_l1_32_shadow
|| pg->u.sh.type == SH_type_fl1_32_shadow
|| pg->u.sh.type == SH_type_l2_32_shadow);
ASSERT(pg->u.sh.type == SH_type_l2_32_shadow || pg->u.sh.head);
next = page_list_next(pg, &h);
ASSERT(next);
ASSERT(next->u.sh.type == pg->u.sh.type);
ASSERT(!next->u.sh.head);
return page_to_mfn(next);
}
#endif
static inline u32
guest_index(void *ptr)
{
return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t);
}
static u32
shadow_l1_index(mfn_t *smfn, u32 guest_index)
{
#if (GUEST_PAGING_LEVELS == 2)
ASSERT(mfn_to_page(*smfn)->u.sh.head);
if ( guest_index >= SHADOW_L1_PAGETABLE_ENTRIES )
*smfn = sh_next_page(*smfn);
return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES);
#else
return guest_index;
#endif
}
static u32
shadow_l2_index(mfn_t *smfn, u32 guest_index)
{
#if (GUEST_PAGING_LEVELS == 2)
int i;
ASSERT(mfn_to_page(*smfn)->u.sh.head);
// Because we use 2 shadow l2 entries for each guest entry, the number of
// guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2
for ( i = 0; i < guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2); i++ )
*smfn = sh_next_page(*smfn);
// We multiply by two to get the index of the first of the two entries
// used to shadow the specified guest entry.
return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2;
#else
return guest_index;
#endif
}
#if GUEST_PAGING_LEVELS >= 4
static u32
shadow_l3_index(mfn_t *smfn, u32 guest_index)
{
return guest_index;
}
static u32
shadow_l4_index(mfn_t *smfn, u32 guest_index)
{
return guest_index;
}
#endif // GUEST_PAGING_LEVELS >= 4
/**************************************************************************/
/* Function which computes shadow entries from their corresponding guest
* entries. This is the "heart" of the shadow code. It operates using
* level-1 shadow types, but handles all levels of entry.
* Don't call it directly, but use the four wrappers below.
*/
static always_inline void
_sh_propagate(struct vcpu *v,
guest_intpte_t guest_intpte,
mfn_t target_mfn,
void *shadow_entry_ptr,
int level,
fetch_type_t ft,
p2m_type_t p2mt)
{
guest_l1e_t guest_entry = { guest_intpte };
shadow_l1e_t *sp = shadow_entry_ptr;
struct domain *d = v->domain;
struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
u32 pass_thru_flags;
u32 gflags, sflags;
bool_t mmio_mfn;
/* We don't shadow PAE l3s */
ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3);
/* Check there's something for the shadows to map to */
if ( (!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt))
|| !gfn_valid(d, target_gfn) )
{
*sp = shadow_l1e_empty();
goto done;
}
gflags = guest_l1e_get_flags(guest_entry);
if ( unlikely(!(gflags & _PAGE_PRESENT)) )
{
#if !(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* If a guest l1 entry is not present, shadow with the magic
* guest-not-present entry. */
if ( level == 1 )
*sp = sh_l1e_gnp();
else
#endif /* !OOS */
*sp = shadow_l1e_empty();
goto done;
}
if ( level == 1 && p2mt == p2m_mmio_dm )
{
/* Guest l1e maps emulated MMIO space */
*sp = sh_l1e_mmio(target_gfn, gflags);
if ( !d->arch.paging.shadow.has_fast_mmio_entries )
d->arch.paging.shadow.has_fast_mmio_entries = 1;
goto done;
}
// Must have a valid target_mfn unless this is a prefetch or an l1
// pointing at MMIO space. In the case of a prefetch, an invalid
// mfn means that we cannot usefully shadow anything, and so we
// return early.
//
mmio_mfn = !mfn_valid(target_mfn)
|| (level == 1
&& page_get_owner(mfn_to_page(target_mfn)) == dom_io);
if ( mmio_mfn
&& !(level == 1 && (!shadow_mode_refcounts(d)
|| p2mt == p2m_mmio_direct)) )
{
ASSERT((ft == ft_prefetch));
*sp = shadow_l1e_empty();
goto done;
}
// Propagate bits from the guest to the shadow.
// Some of these may be overwritten, below.
// Since we know the guest's PRESENT bit is set, we also set the shadow's
// SHADOW_PRESENT bit.
//
pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER |
_PAGE_RW | _PAGE_PRESENT);
if ( guest_nx_enabled(v) )
pass_thru_flags |= _PAGE_NX_BIT;
if ( level == 1 && !shadow_mode_refcounts(d) && mmio_mfn )
pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT;
sflags = gflags & pass_thru_flags;
/*
* For HVM domains with direct access to MMIO areas, set the correct
* caching attributes in the shadows to match what was asked for.
*/
if ( (level == 1) && is_hvm_domain(d) &&
!is_xen_heap_mfn(mfn_x(target_mfn)) )
{
int type;
ASSERT(!(sflags & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)));
/* Compute the PAT index for the shadow page entry when VT-d is enabled
 * and a device is assigned:
 * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT.
 * 2) if snoop control is enabled, compute the PAT index as WB.
 * 3) if snoop control is disabled, compute the PAT index with
 * gMTRR and gPAT.
 */
if ( !mmio_mfn &&
(type = hvm_get_mem_pinned_cacheattr(d, target_gfn, 0)) >= 0 )
sflags |= pat_type_2_pte_flags(type);
else if ( d->arch.hvm_domain.is_in_uc_mode )
sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
else
if ( iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) )
{
if ( p2mt == p2m_mmio_direct )
sflags |= get_pat_flags(v,
gflags,
gfn_to_paddr(target_gfn),
pfn_to_paddr(mfn_x(target_mfn)),
MTRR_TYPE_UNCACHABLE);
else if ( iommu_snoop )
sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK);
else
sflags |= get_pat_flags(v,
gflags,
gfn_to_paddr(target_gfn),
pfn_to_paddr(mfn_x(target_mfn)),
NO_HARDCODE_MEM_TYPE);
}
}
// Set the A&D bits for higher level shadows.
// Higher level entries do not, strictly speaking, have dirty bits, but
// since we use shadow linear tables, each of these entries may, at some
// point in time, also serve as a shadow L1 entry.
// By setting both the A&D bits in each of these, we eliminate the burden
// on the hardware to update these bits on initial accesses.
//
if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) )
sflags |= _PAGE_ACCESSED | _PAGE_DIRTY;
// If the A or D bit has not yet been set in the guest, then we must
// prevent the corresponding kind of access.
//
if ( unlikely(!(gflags & _PAGE_ACCESSED)) )
sflags &= ~_PAGE_PRESENT;
/* D bits exist in L1es and PSE L2es */
if ( unlikely(((level == 1) ||
((level == 2) &&
(gflags & _PAGE_PSE) &&
guest_can_use_l2_superpages(v)))
&& !(gflags & _PAGE_DIRTY)) )
sflags &= ~_PAGE_RW;
// shadow_mode_log_dirty support
//
// Only allow the guest write access to a page a) on a demand fault,
// or b) if the page is already marked as dirty.
//
// (We handle log-dirty entirely inside the shadow code, without using the
// p2m_ram_logdirty p2m type: only HAP uses that.)
if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) )
{
if ( mfn_valid(target_mfn) ) {
if ( ft & FETCH_TYPE_WRITE )
paging_mark_dirty(d, target_mfn);
else if ( !paging_mfn_is_dirty(d, target_mfn) )
sflags &= ~_PAGE_RW;
}
}
if ( unlikely((level == 1) && dirty_vram
&& dirty_vram->last_dirty == -1
&& gfn_x(target_gfn) >= dirty_vram->begin_pfn
&& gfn_x(target_gfn) < dirty_vram->end_pfn) )
{
if ( ft & FETCH_TYPE_WRITE )
dirty_vram->last_dirty = NOW();
else
sflags &= ~_PAGE_RW;
}
/* Read-only memory */
if ( p2m_is_readonly(p2mt) )
sflags &= ~_PAGE_RW;
else if ( p2mt == p2m_mmio_direct &&
rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn)) )
{
sflags &= ~(_PAGE_RW | _PAGE_PAT);
sflags |= _PAGE_PCD | _PAGE_PWT;
}
// protect guest page tables
//
if ( unlikely((level == 1)
&& sh_mfn_is_a_page_table(target_mfn)
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
/* Unless the page is out of sync and the guest is
writing to it. */
&& !(mfn_oos_may_write(target_mfn)
&& (ft == ft_demand_write))
#endif /* OOS */
) )
sflags &= ~_PAGE_RW;
// PV guests in 64-bit mode use two different page tables for user vs
// supervisor permissions, making the guest's _PAGE_USER bit irrelevant.
// It is always shadowed as present...
if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32bit_domain(d)
&& is_pv_domain(d) )
{
sflags |= _PAGE_USER;
}
*sp = shadow_l1e_from_mfn(target_mfn, sflags);
done:
SHADOW_DEBUG(PROPAGATE,
"%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n",
fetch_type_names[ft], level, guest_entry.l1, sp->l1);
}
/* These four wrappers give us a little bit of type-safety back around
* the use of void-* pointers and intpte types in _sh_propagate(), and
* allow the compiler to optimize out some level checks. */
#if GUEST_PAGING_LEVELS >= 4
static void
l4e_propagate_from_guest(struct vcpu *v,
guest_l4e_t gl4e,
mfn_t sl3mfn,
shadow_l4e_t *sl4e,
fetch_type_t ft)
{
if ( !mfn_eq(sl3mfn, INVALID_MFN) &&
(guest_l4e_get_flags(gl4e) & _PAGE_PRESENT) )
ASSERT(!guest_l4e_rsvd_bits(v, gl4e));
_sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw);
}
static void
l3e_propagate_from_guest(struct vcpu *v,
guest_l3e_t gl3e,
mfn_t sl2mfn,
shadow_l3e_t *sl3e,
fetch_type_t ft)
{
if ( !mfn_eq(sl2mfn, INVALID_MFN) &&
(guest_l3e_get_flags(gl3e) & _PAGE_PRESENT) )
ASSERT(!guest_l3e_rsvd_bits(v, gl3e));
_sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw);
}
#endif // GUEST_PAGING_LEVELS >= 4
static void
l2e_propagate_from_guest(struct vcpu *v,
guest_l2e_t gl2e,
mfn_t sl1mfn,
shadow_l2e_t *sl2e,
fetch_type_t ft)
{
if ( !mfn_eq(sl1mfn, INVALID_MFN) &&
(guest_l2e_get_flags(gl2e) & _PAGE_PRESENT) )
ASSERT(!guest_l2e_rsvd_bits(v, gl2e));
_sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw);
}
static void
l1e_propagate_from_guest(struct vcpu *v,
guest_l1e_t gl1e,
mfn_t gmfn,
shadow_l1e_t *sl1e,
fetch_type_t ft,
p2m_type_t p2mt)
{
if ( !mfn_eq(gmfn, INVALID_MFN) &&
(guest_l1e_get_flags(gl1e) & _PAGE_PRESENT) )
ASSERT(!guest_l1e_rsvd_bits(v, gl1e));
_sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt);
}
/**************************************************************************/
/* These functions update shadow entries (and do bookkeeping on the shadow
* tables they are in). It is intended that they are the only
* functions which ever write (non-zero) data onto a shadow page.
*/
static inline void safe_write_entry(void *dst, void *src)
/* Copy one PTE safely when processors might be running on the
* destination pagetable. This does *not* give safety against
* concurrent writes (that's what the paging lock is for), just
* stops the hardware picking up partially written entries. */
{
volatile unsigned long *d = dst;
unsigned long *s = src;
ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1)));
/* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word,
* which will be an atomic write, since the entry is aligned. */
BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long));
*d = *s;
}
static inline void
shadow_write_entries(void *d, void *s, int entries, mfn_t mfn)
/* This function does the actual writes to shadow pages.
* It must not be called directly, since it doesn't do the bookkeeping
* that shadow_set_l*e() functions do. */
{
shadow_l1e_t *dst = d;
shadow_l1e_t *src = s;
void *map = NULL;
int i;
/* Because we mirror access rights at all levels in the shadow, an
* l2 (or higher) entry with the RW bit cleared will leave us with
* no write access through the linear map.
* We detect that by writing to the shadow with copy_to_user() and
* using map_domain_page() to get a writeable mapping if we need to. */
if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 )
{
perfc_incr(shadow_linear_map_failed);
map = map_domain_page(mfn);
dst = map + ((unsigned long)dst & (PAGE_SIZE - 1));
}
for ( i = 0; i < entries; i++ )
safe_write_entry(dst++, src++);
if ( map != NULL ) unmap_domain_page(map);
}
/* type is only used to distinguish grant map pages from ordinary RAM
* i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw. */
static int inline
shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type)
{
int res;
mfn_t mfn;
struct domain *owner;
ASSERT(!sh_l1e_is_magic(sl1e));
if ( !shadow_mode_refcounts(d) )
return 1;
res = get_page_from_l1e(sl1e, d, d);
// If a privileged domain is attempting to install a map of a page it does
// not own, we let it succeed anyway.
//
if ( unlikely(res < 0) &&
!shadow_mode_translate(d) &&
mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
(owner = page_get_owner(mfn_to_page(mfn))) &&
(d != owner) )
{
res = xsm_priv_mapping(XSM_TARGET, d, owner);
if ( !res ) {
res = get_page_from_l1e(sl1e, d, owner);
SHADOW_PRINTK("privileged domain %d installs map of mfn %"PRI_mfn" "
"which is owned by d%d: %s\n",
d->domain_id, mfn_x(mfn), owner->domain_id,
res >= 0 ? "success" : "failed");
}
}
/* Okay, it might still be a grant mapping PTE. Try it. */
if ( unlikely(res < 0) &&
(type == p2m_grant_map_rw ||
(type == p2m_grant_map_ro &&
!(shadow_l1e_get_flags(sl1e) & _PAGE_RW))) )
{
/* It's a grant mapping. The grant table implementation will
already have checked that we're supposed to have access, so
we can just grab a reference directly. */
mfn = shadow_l1e_get_mfn(sl1e);
if ( mfn_valid(mfn) )
res = get_page_from_l1e(sl1e, d, page_get_owner(mfn_to_page(mfn)));
}
if ( unlikely(res < 0) )
{
perfc_incr(shadow_get_page_fail);
SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n");
}
return res;
}
static void inline
shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
{
if ( !shadow_mode_refcounts(d) )
return;
put_page_from_l1e(sl1e, d);
}
#if GUEST_PAGING_LEVELS >= 4
static int shadow_set_l4e(struct domain *d,
shadow_l4e_t *sl4e,
shadow_l4e_t new_sl4e,
mfn_t sl4mfn)
{
int flags = 0, ok;
shadow_l4e_t old_sl4e;
paddr_t paddr;
ASSERT(sl4e != NULL);
old_sl4e = *sl4e;
if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */
paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT)
| (((unsigned long)sl4e) & ~PAGE_MASK));
if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT )
{
/* About to install a new reference */
mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e);
ok = sh_get_ref(d, sl3mfn, paddr);
/* Are we pinning l3 shadows to handle weird Linux behaviour? */
if ( sh_type_is_pinnable(d, SH_type_l3_64_shadow) )
ok |= sh_pin(d, sl3mfn);
if ( !ok )
{
domain_crash(d);
return SHADOW_SET_ERROR;
}
}
/* Write the new entry */
shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
flags |= SHADOW_SET_CHANGED;
if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
{
/* We lost a reference to an old mfn. */
mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e);
if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e)))
|| !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e),
shadow_l4e_get_flags(new_sl4e)) )
{
flags |= SHADOW_SET_FLUSH;
}
sh_put_ref(d, osl3mfn, paddr);
}
return flags;
}
static int shadow_set_l3e(struct domain *d,
shadow_l3e_t *sl3e,
shadow_l3e_t new_sl3e,
mfn_t sl3mfn)
{
int flags = 0;
shadow_l3e_t old_sl3e;
paddr_t paddr;
ASSERT(sl3e != NULL);
old_sl3e = *sl3e;
if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */
paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT)
| (((unsigned long)sl3e) & ~PAGE_MASK));
if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
{
/* About to install a new reference */
if ( !sh_get_ref(d, shadow_l3e_get_mfn(new_sl3e), paddr) )
{
domain_crash(d);
return SHADOW_SET_ERROR;
}
}
/* Write the new entry */
shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
flags |= SHADOW_SET_CHANGED;
if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT )