diff --git a/Makefile b/Makefile index fb7ffe2..1d86490 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,12 @@ tools: tests: $(MAKE) -C tests +gpu: + $(MAKE) -C tools gpu + clean: $(MAKE) -C daxfs clean $(MAKE) -C tools clean $(MAKE) -C tests clean -.PHONY: all daxfs tools tests clean +.PHONY: all daxfs tools tests gpu clean diff --git a/bench_results/agents.tsv b/bench_results/agents.tsv new file mode 100644 index 0000000..27116d6 --- /dev/null +++ b/bench_results/agents.tsv @@ -0,0 +1,343 @@ +experiment parameter iteration operation latency_us ops_count delta_bytes notes +scale_agents 1 1 branch_create_total 4076 1 +scale_agents 1 1 branch_create_avg 4076 1 +scale_agents 1 1 parallel_workload 526182 1 fails=0 +scale_agents 1 1 commit 7195 1 +scale_agents 1 1 cleanup 14207 1 +scale_agents 1 2 branch_create_total 5431 1 +scale_agents 1 2 branch_create_avg 5431 1 +scale_agents 1 2 parallel_workload 528399 1 fails=0 +scale_agents 1 2 commit 6421 1 +scale_agents 1 2 cleanup 19778 1 +scale_agents 1 3 branch_create_total 5494 1 +scale_agents 1 3 branch_create_avg 5494 1 +scale_agents 1 3 parallel_workload 547399 1 fails=0 +scale_agents 1 3 commit 9607 1 +scale_agents 1 3 cleanup 16249 1 +scale_agents 2 1 branch_create_total 7572 2 +scale_agents 2 1 branch_create_avg 3786 2 +scale_agents 2 1 parallel_workload 582696 2 fails=0 +scale_agents 2 1 commit 6274 1 +scale_agents 2 1 estale_detect 8140 1 rc=1 +scale_agents 2 1 cleanup 22013 2 +scale_agents 2 2 branch_create_total 10103 2 +scale_agents 2 2 branch_create_avg 5051 2 +scale_agents 2 2 parallel_workload 543486 2 fails=0 +scale_agents 2 2 commit 9582 1 +scale_agents 2 2 estale_detect 4476 1 rc=1 +scale_agents 2 2 cleanup 16557 2 +scale_agents 2 3 branch_create_total 9241 2 +scale_agents 2 3 branch_create_avg 4620 2 +scale_agents 2 3 parallel_workload 548317 2 fails=0 +scale_agents 2 3 commit 5399 1 +scale_agents 2 3 estale_detect 5502 1 rc=1 +scale_agents 2 3 cleanup 16312 2 +scale_agents 4 1 branch_create_total 21351 4 +scale_agents 4 1 branch_create_avg 5337 4 +scale_agents 4 1 parallel_workload 602268 4 fails=0 +scale_agents 4 1 commit 8011 1 +scale_agents 4 1 estale_detect 6145 1 rc=1 +scale_agents 4 1 cleanup 22772 4 +scale_agents 4 2 branch_create_total 16742 4 +scale_agents 4 2 branch_create_avg 4185 4 +scale_agents 4 2 parallel_workload 599315 4 fails=0 +scale_agents 4 2 commit 4615 1 +scale_agents 4 2 estale_detect 4333 1 rc=1 +scale_agents 4 2 cleanup 24804 4 +scale_agents 4 3 branch_create_total 17419 4 +scale_agents 4 3 branch_create_avg 4354 4 +scale_agents 4 3 parallel_workload 619138 4 fails=0 +scale_agents 4 3 commit 4957 1 +scale_agents 4 3 estale_detect 3831 1 rc=1 +scale_agents 4 3 cleanup 26757 4 +scale_agents 8 1 branch_create_total 38580 8 +scale_agents 8 1 branch_create_avg 4822 8 +scale_agents 8 1 parallel_workload 669078 8 fails=0 +scale_agents 8 1 commit 5471 1 +scale_agents 8 1 estale_detect 4193 1 rc=1 +scale_agents 8 1 cleanup 38947 8 +scale_agents 8 2 branch_create_total 37897 8 +scale_agents 8 2 branch_create_avg 4737 8 +scale_agents 8 2 parallel_workload 628198 8 fails=0 +scale_agents 8 2 commit 6667 1 +scale_agents 8 2 estale_detect 4457 1 rc=1 +scale_agents 8 2 cleanup 31432 8 +scale_agents 8 3 branch_create_total 37498 8 +scale_agents 8 3 branch_create_avg 4687 8 +scale_agents 8 3 parallel_workload 639915 8 fails=0 +scale_agents 8 3 commit 7485 1 +scale_agents 8 3 estale_detect 4584 1 rc=1 +scale_agents 8 3 cleanup 28399 8 +scale_agents 16 1 branch_create_total 92533 16 +scale_agents 16 1 branch_create_avg 5783 16 +scale_agents 16 1 parallel_workload 763445 16 fails=0 +scale_agents 16 1 commit 10479 1 +scale_agents 16 1 estale_detect 4315 1 rc=1 +scale_agents 16 1 cleanup 45969 16 +scale_agents 16 2 branch_create_total 74579 16 +scale_agents 16 2 branch_create_avg 4661 16 +scale_agents 16 2 parallel_workload 793028 16 fails=0 +scale_agents 16 2 commit 7999 1 +scale_agents 16 2 estale_detect 5960 1 rc=1 +scale_agents 16 2 cleanup 49899 16 +scale_agents 16 3 branch_create_total 82432 16 +scale_agents 16 3 branch_create_avg 5152 16 +scale_agents 16 3 parallel_workload 787314 16 fails=0 +scale_agents 16 3 commit 8582 1 +scale_agents 16 3 estale_detect 4980 1 rc=1 +scale_agents 16 3 cleanup 50737 16 +scale_agents 32 1 branch_create_total 152822 32 +scale_agents 32 1 branch_create_avg 4775 32 +scale_agents 32 1 parallel_workload 859543 32 fails=0 +scale_agents 32 1 commit 5684 1 +scale_agents 32 1 estale_detect 4547 1 rc=1 +scale_agents 32 1 cleanup 104499 32 +scale_agents 32 2 branch_create_total 153409 32 +scale_agents 32 2 branch_create_avg 4794 32 +scale_agents 32 2 parallel_workload 846043 32 fails=0 +scale_agents 32 2 commit 11058 1 +scale_agents 32 2 estale_detect 4032 1 rc=1 +scale_agents 32 2 cleanup 106802 32 +scale_agents 32 3 branch_create_total 156743 32 +scale_agents 32 3 branch_create_avg 4898 32 +scale_agents 32 3 parallel_workload 907180 32 fails=0 +scale_agents 32 3 commit 6461 1 +scale_agents 32 3 estale_detect 5525 1 rc=1 +scale_agents 32 3 cleanup 98815 32 +scale_depth 1 1 chain_create 11518 1 +scale_depth 1 1 workload 551627 31 +scale_depth 1 1 commit 6019 1 +scale_depth 1 2 chain_create 9727 1 +scale_depth 1 2 workload 502185 31 +scale_depth 1 2 commit 6570 1 +scale_depth 1 3 chain_create 10794 1 +scale_depth 1 3 workload 515411 31 +scale_depth 1 3 commit 11072 1 +scale_depth 2 1 chain_create 10614 2 +scale_depth 2 1 workload 526138 31 +scale_depth 2 1 commit 10166 2 +scale_depth 2 2 chain_create 16656 2 +scale_depth 2 2 workload 523364 31 +scale_depth 2 2 commit 16197 2 +scale_depth 2 3 chain_create 16981 2 +scale_depth 2 3 workload 547954 31 +scale_depth 2 3 commit 5271 2 +scale_depth 4 1 chain_create 22894 4 +scale_depth 4 1 workload 538063 31 +scale_depth 4 1 commit 10491 4 +scale_depth 4 2 chain_create 22786 4 +scale_depth 4 2 workload 511997 31 +scale_depth 4 2 commit 6308 4 +scale_depth 4 3 chain_create 23795 4 +scale_depth 4 3 workload 530256 31 +scale_depth 4 3 commit 6508 4 +scale_depth 8 1 chain_create 36028 8 +scale_depth 8 1 workload 530269 31 +scale_depth 8 1 commit 10982 8 +scale_depth 8 2 chain_create 39830 8 +scale_depth 8 2 workload 500300 31 +scale_depth 8 2 commit 6466 8 +scale_depth 8 3 chain_create 36476 8 +scale_depth 8 3 workload 542417 31 +scale_depth 8 3 commit 14053 8 +commit_cost 10 1 workload 202319 10 +commit_cost 10 1 commit 14481 10 +commit_cost 10 2 workload 181876 10 +commit_cost 10 2 commit 5625 10 +commit_cost 10 3 workload 183334 10 +commit_cost 10 3 commit 7841 10 +commit_cost 50 1 workload 1466946 50 +commit_cost 50 1 commit 8523 50 +commit_cost 50 2 workload 1464664 50 +commit_cost 50 2 commit 10627 50 +commit_cost 50 3 workload 1425226 50 +commit_cost 50 3 commit 12598 50 +commit_cost 100 1 workload 3626982 100 +commit_cost 100 1 commit 11834 100 +commit_cost 100 2 workload 3530194 100 +commit_cost 100 2 commit 7431 100 +commit_cost 100 3 workload 3541508 100 +commit_cost 100 3 commit 11713 100 +baseline_tmpfs 1 1 branch_create_total 2443 1 +baseline_tmpfs 1 1 branch_create_avg 2443 1 +baseline_tmpfs 1 1 parallel_workload 28326 1 +baseline_tmpfs 1 1 branch_create_total 3515 1 +baseline_tmpfs 1 1 branch_create_avg 3515 1 +baseline_tmpfs 1 1 parallel_workload 32225 1 +baseline_tmpfs 1 1 commit 86604 1 +baseline_tmpfs 1 1 cleanup 4935 1 +baseline_tmpfs 1 2 branch_create_total 3464 1 +baseline_tmpfs 1 2 branch_create_avg 3464 1 +baseline_tmpfs 1 2 parallel_workload 31332 1 +baseline_tmpfs 1 2 commit 46055 1 +baseline_tmpfs 1 2 cleanup 4084 1 +baseline_tmpfs 1 3 branch_create_total 2905 1 +baseline_tmpfs 1 3 branch_create_avg 2905 1 +baseline_tmpfs 1 3 parallel_workload 31260 1 +baseline_tmpfs 1 3 commit 45946 1 +baseline_tmpfs 1 3 cleanup 3764 1 +baseline_tmpfs 2 1 branch_create_total 5445 2 +baseline_tmpfs 2 1 branch_create_avg 2722 2 +baseline_tmpfs 2 1 parallel_workload 31963 2 +baseline_tmpfs 2 1 commit 46219 1 +baseline_tmpfs 2 1 cleanup 5454 2 +baseline_tmpfs 2 2 branch_create_total 5455 2 +baseline_tmpfs 2 2 branch_create_avg 2727 2 +baseline_tmpfs 2 2 parallel_workload 32328 2 +baseline_tmpfs 2 2 commit 46005 1 +baseline_tmpfs 2 2 cleanup 4468 2 +baseline_tmpfs 2 3 branch_create_total 5913 2 +baseline_tmpfs 2 3 branch_create_avg 2956 2 +baseline_tmpfs 2 3 parallel_workload 32508 2 +baseline_tmpfs 2 3 commit 45440 1 +baseline_tmpfs 2 3 cleanup 4795 2 +baseline_tmpfs 4 1 branch_create_total 10858 4 +baseline_tmpfs 4 1 branch_create_avg 2714 4 +baseline_tmpfs 4 1 parallel_workload 34528 4 +baseline_tmpfs 4 1 commit 45547 1 +baseline_tmpfs 4 1 cleanup 5669 4 +baseline_tmpfs 4 2 branch_create_total 11170 4 +baseline_tmpfs 4 2 branch_create_avg 2792 4 +baseline_tmpfs 4 2 parallel_workload 33417 4 +baseline_tmpfs 4 2 commit 46485 1 +baseline_tmpfs 4 2 cleanup 5774 4 +baseline_tmpfs 4 3 branch_create_total 9925 4 +baseline_tmpfs 4 3 branch_create_avg 2481 4 +baseline_tmpfs 4 3 parallel_workload 33273 4 +baseline_tmpfs 4 3 commit 45777 1 +baseline_tmpfs 4 3 cleanup 5591 4 +baseline_tmpfs 8 1 branch_create_total 22590 8 +baseline_tmpfs 8 1 branch_create_avg 2823 8 +baseline_tmpfs 8 1 parallel_workload 37262 8 +baseline_tmpfs 8 1 commit 46174 1 +baseline_tmpfs 8 1 cleanup 7164 8 +baseline_tmpfs 8 2 branch_create_total 21631 8 +baseline_tmpfs 8 2 branch_create_avg 2703 8 +baseline_tmpfs 8 2 parallel_workload 35463 8 +baseline_tmpfs 8 2 commit 45251 1 +baseline_tmpfs 8 2 cleanup 8081 8 +baseline_tmpfs 8 3 branch_create_total 21555 8 +baseline_tmpfs 8 3 branch_create_avg 2694 8 +baseline_tmpfs 8 3 parallel_workload 36366 8 +baseline_tmpfs 8 3 commit 44625 1 +baseline_tmpfs 8 3 cleanup 8296 8 +baseline_tmpfs 16 1 branch_create_total 41263 16 +baseline_tmpfs 16 1 branch_create_avg 2578 16 +baseline_tmpfs 16 1 parallel_workload 38129 16 +baseline_tmpfs 16 1 commit 45669 1 +baseline_tmpfs 16 1 cleanup 12275 16 +baseline_tmpfs 16 2 branch_create_total 42322 16 +baseline_tmpfs 16 2 branch_create_avg 2645 16 +baseline_tmpfs 16 2 parallel_workload 38157 16 +baseline_tmpfs 16 2 commit 45554 1 +baseline_tmpfs 16 2 cleanup 11999 16 +baseline_tmpfs 16 3 branch_create_total 45596 16 +baseline_tmpfs 16 3 branch_create_avg 2849 16 +baseline_tmpfs 16 3 parallel_workload 39854 16 +baseline_tmpfs 16 3 commit 45796 1 +baseline_tmpfs 16 3 cleanup 13391 16 +baseline_tmpfs 32 1 branch_create_total 93096 32 +baseline_tmpfs 32 1 branch_create_avg 2909 32 +baseline_tmpfs 32 1 parallel_workload 47494 32 +baseline_tmpfs 32 1 commit 45508 1 +baseline_tmpfs 32 1 cleanup 22176 32 +baseline_tmpfs 32 2 branch_create_total 93741 32 +baseline_tmpfs 32 2 branch_create_avg 2929 32 +baseline_tmpfs 32 2 parallel_workload 47132 32 +baseline_tmpfs 32 2 commit 46098 1 +baseline_tmpfs 32 2 cleanup 23098 32 +baseline_tmpfs 32 3 branch_create_total 94132 32 +baseline_tmpfs 32 3 branch_create_avg 2941 32 +baseline_tmpfs 32 3 parallel_workload 47511 32 +baseline_tmpfs 32 3 commit 46239 1 +baseline_tmpfs 32 3 cleanup 22055 32 +baseline_overlayfs 1 1 branch_create_total 3291 1 +baseline_overlayfs 1 1 branch_create_avg 3291 1 +baseline_overlayfs 1 1 parallel_workload 33499 1 +baseline_overlayfs 1 1 commit 3278 1 +baseline_overlayfs 1 1 cleanup 4828 1 +baseline_overlayfs 1 2 branch_create_total 2831 1 +baseline_overlayfs 1 2 branch_create_avg 2831 1 +baseline_overlayfs 1 2 parallel_workload 32808 1 +baseline_overlayfs 1 2 commit 2925 1 +baseline_overlayfs 1 2 cleanup 5142 1 +baseline_overlayfs 1 3 branch_create_total 2909 1 +baseline_overlayfs 1 3 branch_create_avg 2909 1 +baseline_overlayfs 1 3 parallel_workload 32252 1 +baseline_overlayfs 1 3 commit 2652 1 +baseline_overlayfs 1 3 cleanup 5210 1 +baseline_overlayfs 2 1 branch_create_total 5302 2 +baseline_overlayfs 2 1 branch_create_avg 2651 2 +baseline_overlayfs 2 1 parallel_workload 33196 2 +baseline_overlayfs 2 1 commit 3125 1 +baseline_overlayfs 2 1 cleanup 7034 2 +baseline_overlayfs 2 2 branch_create_total 5865 2 +baseline_overlayfs 2 2 branch_create_avg 2932 2 +baseline_overlayfs 2 2 parallel_workload 34825 2 +baseline_overlayfs 2 2 commit 2536 1 +baseline_overlayfs 2 2 cleanup 6319 2 +baseline_overlayfs 2 3 branch_create_total 5533 2 +baseline_overlayfs 2 3 branch_create_avg 2766 2 +baseline_overlayfs 2 3 parallel_workload 34535 2 +baseline_overlayfs 2 3 commit 3048 1 +baseline_overlayfs 2 3 cleanup 6330 2 +baseline_overlayfs 4 1 branch_create_total 10830 4 +baseline_overlayfs 4 1 branch_create_avg 2707 4 +baseline_overlayfs 4 1 parallel_workload 35952 4 +baseline_overlayfs 4 1 commit 2599 1 +baseline_overlayfs 4 1 cleanup 9030 4 +baseline_overlayfs 4 2 branch_create_total 12304 4 +baseline_overlayfs 4 2 branch_create_avg 3076 4 +baseline_overlayfs 4 2 parallel_workload 36010 4 +baseline_overlayfs 4 2 commit 2603 1 +baseline_overlayfs 4 2 cleanup 8558 4 +baseline_overlayfs 4 3 branch_create_total 12863 4 +baseline_overlayfs 4 3 branch_create_avg 3215 4 +baseline_overlayfs 4 3 parallel_workload 35081 4 +baseline_overlayfs 4 3 commit 2977 1 +baseline_overlayfs 4 3 cleanup 8511 4 +baseline_overlayfs 8 1 branch_create_total 21960 8 +baseline_overlayfs 8 1 branch_create_avg 2745 8 +baseline_overlayfs 8 1 parallel_workload 38572 8 +baseline_overlayfs 8 1 commit 3419 1 +baseline_overlayfs 8 1 cleanup 14066 8 +baseline_overlayfs 8 2 branch_create_total 23409 8 +baseline_overlayfs 8 2 branch_create_avg 2926 8 +baseline_overlayfs 8 2 parallel_workload 38714 8 +baseline_overlayfs 8 2 commit 2969 1 +baseline_overlayfs 8 2 cleanup 14206 8 +baseline_overlayfs 8 3 branch_create_total 21952 8 +baseline_overlayfs 8 3 branch_create_avg 2744 8 +baseline_overlayfs 8 3 parallel_workload 38664 8 +baseline_overlayfs 8 3 commit 3138 1 +baseline_overlayfs 8 3 cleanup 14526 8 +baseline_overlayfs 16 1 branch_create_total 46128 16 +baseline_overlayfs 16 1 branch_create_avg 2883 16 +baseline_overlayfs 16 1 parallel_workload 39639 16 +baseline_overlayfs 16 1 commit 3359 1 +baseline_overlayfs 16 1 cleanup 25121 16 +baseline_overlayfs 16 2 branch_create_total 45612 16 +baseline_overlayfs 16 2 branch_create_avg 2850 16 +baseline_overlayfs 16 2 parallel_workload 38792 16 +baseline_overlayfs 16 2 commit 2510 1 +baseline_overlayfs 16 2 cleanup 23516 16 +baseline_overlayfs 16 3 branch_create_total 44094 16 +baseline_overlayfs 16 3 branch_create_avg 2755 16 +baseline_overlayfs 16 3 parallel_workload 37822 16 +baseline_overlayfs 16 3 commit 2596 1 +baseline_overlayfs 16 3 cleanup 25847 16 +baseline_overlayfs 32 1 branch_create_total 87711 32 +baseline_overlayfs 32 1 branch_create_avg 2740 32 +baseline_overlayfs 32 1 parallel_workload 44015 32 +baseline_overlayfs 32 1 commit 2854 1 +baseline_overlayfs 32 1 cleanup 46727 32 +baseline_overlayfs 32 2 branch_create_total 90229 32 +baseline_overlayfs 32 2 branch_create_avg 2819 32 +baseline_overlayfs 32 2 parallel_workload 45849 32 +baseline_overlayfs 32 2 commit 2800 1 +baseline_overlayfs 32 2 cleanup 43240 32 +baseline_overlayfs 32 3 branch_create_total 90360 32 +baseline_overlayfs 32 3 branch_create_avg 2823 32 +baseline_overlayfs 32 3 parallel_workload 48086 32 +baseline_overlayfs 32 3 commit 3001 1 +baseline_overlayfs 32 3 cleanup 47871 32 diff --git a/bench_results/daxfs_bench.png b/bench_results/daxfs_bench.png new file mode 100644 index 0000000..16b5ec2 Binary files /dev/null and b/bench_results/daxfs_bench.png differ diff --git a/bench_results/daxfs_bench_branch_avg.png b/bench_results/daxfs_bench_branch_avg.png new file mode 100644 index 0000000..e23e7de Binary files /dev/null and b/bench_results/daxfs_bench_branch_avg.png differ diff --git a/bench_results/daxfs_bench_branch_creation.png b/bench_results/daxfs_bench_branch_creation.png new file mode 100644 index 0000000..f951846 Binary files /dev/null and b/bench_results/daxfs_bench_branch_creation.png differ diff --git a/bench_results/daxfs_bench_commit_cost.png b/bench_results/daxfs_bench_commit_cost.png new file mode 100644 index 0000000..0baf123 Binary files /dev/null and b/bench_results/daxfs_bench_commit_cost.png differ diff --git a/bench_results/daxfs_bench_commit_time.png b/bench_results/daxfs_bench_commit_time.png new file mode 100644 index 0000000..966b15c Binary files /dev/null and b/bench_results/daxfs_bench_commit_time.png differ diff --git a/bench_results/daxfs_bench_nesting_depth.png b/bench_results/daxfs_bench_nesting_depth.png new file mode 100644 index 0000000..7ffb024 Binary files /dev/null and b/bench_results/daxfs_bench_nesting_depth.png differ diff --git a/bench_results/daxfs_bench_parallel_workload.png b/bench_results/daxfs_bench_parallel_workload.png new file mode 100644 index 0000000..6dbd430 Binary files /dev/null and b/bench_results/daxfs_bench_parallel_workload.png differ diff --git a/bench_results/daxfs_gpu_bench.png b/bench_results/daxfs_gpu_bench.png new file mode 100644 index 0000000..b89a565 Binary files /dev/null and b/bench_results/daxfs_gpu_bench.png differ diff --git a/bench_results/daxfs_gpu_bench_cas_throughput.png b/bench_results/daxfs_gpu_bench_cas_throughput.png new file mode 100644 index 0000000..7a99f51 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_cas_throughput.png differ diff --git a/bench_results/daxfs_gpu_bench_claim_throughput.png b/bench_results/daxfs_gpu_bench_claim_throughput.png new file mode 100644 index 0000000..3a900d5 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_claim_throughput.png differ diff --git a/bench_results/daxfs_gpu_bench_latency_bars.png b/bench_results/daxfs_gpu_bench_latency_bars.png new file mode 100644 index 0000000..ba53aa3 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_latency_bars.png differ diff --git a/bench_results/daxfs_gpu_bench_lock_contention.png b/bench_results/daxfs_gpu_bench_lock_contention.png new file mode 100644 index 0000000..e0e456f Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lock_contention.png differ diff --git a/bench_results/daxfs_gpu_bench_lookup_latency.png b/bench_results/daxfs_gpu_bench_lookup_latency.png new file mode 100644 index 0000000..605355c Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lookup_latency.png differ diff --git a/bench_results/daxfs_gpu_bench_lookup_throughput.png b/bench_results/daxfs_gpu_bench_lookup_throughput.png new file mode 100644 index 0000000..b458849 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lookup_throughput.png differ diff --git a/bench_results/daxfs_gpu_bench_mt_acl_overhead.png b/bench_results/daxfs_gpu_bench_mt_acl_overhead.png new file mode 100644 index 0000000..41db17f Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_acl_overhead.png differ diff --git a/bench_results/daxfs_gpu_bench_mt_isolation.png b/bench_results/daxfs_gpu_bench_mt_isolation.png new file mode 100644 index 0000000..92eb8bc Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_isolation.png differ diff --git a/bench_results/daxfs_gpu_bench_mt_read_tenants.png b/bench_results/daxfs_gpu_bench_mt_read_tenants.png new file mode 100644 index 0000000..e490354 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_read_tenants.png differ diff --git a/bench_results/daxfs_gpu_bench_mt_write_tenants.png b/bench_results/daxfs_gpu_bench_mt_write_tenants.png new file mode 100644 index 0000000..c6323a4 Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_write_tenants.png differ diff --git a/bench_results/daxfs_gpu_bench_p2p_bandwidth.png b/bench_results/daxfs_gpu_bench_p2p_bandwidth.png new file mode 100644 index 0000000..81843ec Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_bandwidth.png differ diff --git a/bench_results/daxfs_gpu_bench_p2p_latency.png b/bench_results/daxfs_gpu_bench_p2p_latency.png new file mode 100644 index 0000000..8b72f0b Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_latency.png differ diff --git a/bench_results/daxfs_gpu_bench_p2p_mt.png b/bench_results/daxfs_gpu_bench_p2p_mt.png new file mode 100644 index 0000000..bce5afb Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_mt.png differ diff --git a/bench_results/gpu.tsv b/bench_results/gpu.tsv new file mode 100644 index 0000000..ddec281 --- /dev/null +++ b/bench_results/gpu.tsv @@ -0,0 +1,143 @@ +experiment parameter iteration operation latency_ns ops_count thru_mops notes +gpu_coord_lock 1 1 lock_unlock_rt 2047.8 10000 0.569 single_thread +gpu_coord_lock 1 2 lock_unlock_rt 2048.9 10000 0.569 single_thread +gpu_coord_lock 1 3 lock_unlock_rt 2046.7 10000 0.569 single_thread +gpu_coord_lock 1 4 lock_unlock_rt 2050.9 10000 0.568 single_thread +gpu_coord_lock 1 5 lock_unlock_rt 2045.4 10000 0.570 single_thread +gpu_commit_seq 1 1 volatile_read 532.3 10000 2.189 single_thread +gpu_commit_seq 1 2 volatile_read 532.8 10000 2.187 single_thread +gpu_commit_seq 1 3 volatile_read 531.8 10000 2.191 single_thread +gpu_commit_seq 1 4 volatile_read 524.5 10000 2.219 single_thread +gpu_commit_seq 1 5 volatile_read 524.1 10000 2.221 single_thread +gpu_pending_ctr 1 1 cas_inc_dec 1786.7 20000 0.653 single_thread +gpu_pending_ctr 1 2 cas_inc_dec 1817.5 20000 0.641 single_thread +gpu_pending_ctr 1 3 cas_inc_dec 1817.8 20000 0.641 single_thread +gpu_pending_ctr 1 4 cas_inc_dec 1817.6 20000 0.641 single_thread +gpu_pending_ctr 1 5 cas_inc_dec 1817.6 20000 0.641 single_thread +gpu_pcache_lookup 1 1 lookup_throughput 918.8 10000 1.088 +gpu_pcache_lookup 1 2 lookup_throughput 914.4 10000 1.094 +gpu_pcache_lookup 1 3 lookup_throughput 908.5 10000 1.101 +gpu_pcache_lookup 1 4 lookup_throughput 908.4 10000 1.101 +gpu_pcache_lookup 1 5 lookup_throughput 908.7 10000 1.101 +gpu_pcache_lookup 32 1 lookup_throughput 30.4 320000 32.929 +gpu_pcache_lookup 32 2 lookup_throughput 30.4 320000 32.860 +gpu_pcache_lookup 32 3 lookup_throughput 30.4 320000 32.847 +gpu_pcache_lookup 32 4 lookup_throughput 30.4 320000 32.860 +gpu_pcache_lookup 32 5 lookup_throughput 30.5 320000 32.833 +gpu_pcache_lookup 64 1 lookup_throughput 20.3 640000 49.253 +gpu_pcache_lookup 64 2 lookup_throughput 20.3 640000 49.204 +gpu_pcache_lookup 64 3 lookup_throughput 20.3 640000 49.161 +gpu_pcache_lookup 64 4 lookup_throughput 20.3 640000 49.246 +gpu_pcache_lookup 64 5 lookup_throughput 20.3 640000 49.247 +gpu_pcache_lookup 128 1 lookup_throughput 15.2 1280000 65.662 +gpu_pcache_lookup 128 2 lookup_throughput 15.2 1280000 65.661 +gpu_pcache_lookup 128 3 lookup_throughput 15.2 1280000 65.683 +gpu_pcache_lookup 128 4 lookup_throughput 15.2 1280000 65.671 +gpu_pcache_lookup 128 5 lookup_throughput 15.2 1280000 65.663 +gpu_pcache_lookup 256 1 lookup_throughput 7.6 2560000 131.157 +gpu_pcache_lookup 256 2 lookup_throughput 7.6 2560000 131.091 +gpu_pcache_lookup 256 3 lookup_throughput 7.6 2560000 130.796 +gpu_pcache_lookup 256 4 lookup_throughput 7.6 2560000 130.978 +gpu_pcache_lookup 256 5 lookup_throughput 7.6 2560000 131.228 +gpu_pcache_lookup 512 1 lookup_throughput 3.7 5120000 268.549 +gpu_pcache_lookup 512 2 lookup_throughput 3.7 5120000 267.784 +gpu_pcache_lookup 512 3 lookup_throughput 3.7 5120000 268.770 +gpu_pcache_lookup 512 4 lookup_throughput 3.7 5120000 267.830 +gpu_pcache_lookup 512 5 lookup_throughput 3.7 5120000 268.546 +gpu_pcache_lookup 1024 1 lookup_throughput 1.9 10240000 539.197 +gpu_pcache_lookup 1024 2 lookup_throughput 1.9 10240000 538.267 +gpu_pcache_lookup 1024 3 lookup_throughput 1.9 10240000 537.556 +gpu_pcache_lookup 1024 4 lookup_throughput 1.9 10240000 537.570 +gpu_pcache_lookup 1024 5 lookup_throughput 1.9 10240000 538.479 +gpu_slot_cas 1 1 cas_throughput 1407.7 1000 0.710 independent +gpu_slot_cas 1 2 cas_throughput 1366.5 1000 0.732 independent +gpu_slot_cas 1 3 cas_throughput 1365.6 1000 0.732 independent +gpu_slot_cas 1 4 cas_throughput 1364.6 1000 0.733 independent +gpu_slot_cas 1 5 cas_throughput 1365.9 1000 0.732 independent +gpu_slot_cas 32 1 cas_throughput 137.0 32000 7.297 independent +gpu_slot_cas 32 2 cas_throughput 137.5 32000 7.275 independent +gpu_slot_cas 32 3 cas_throughput 137.1 32000 7.295 independent +gpu_slot_cas 32 4 cas_throughput 138.1 32000 7.242 independent +gpu_slot_cas 32 5 cas_throughput 137.2 32000 7.290 independent +gpu_slot_cas 64 1 cas_throughput 129.5 64000 7.723 independent +gpu_slot_cas 64 2 cas_throughput 129.4 64000 7.727 independent +gpu_slot_cas 64 3 cas_throughput 129.5 64000 7.720 independent +gpu_slot_cas 64 4 cas_throughput 129.4 64000 7.725 independent +gpu_slot_cas 64 5 cas_throughput 129.5 64000 7.722 independent +gpu_slot_cas 128 1 cas_throughput 137.4 128000 7.278 independent +gpu_slot_cas 128 2 cas_throughput 140.6 128000 7.114 independent +gpu_slot_cas 128 3 cas_throughput 141.1 128000 7.086 independent +gpu_slot_cas 128 4 cas_throughput 143.3 128000 6.981 independent +gpu_slot_cas 128 5 cas_throughput 140.9 128000 7.099 independent +gpu_slot_cas 256 1 cas_throughput 87.9 256000 11.383 independent +gpu_slot_cas 256 2 cas_throughput 91.7 256000 10.910 independent +gpu_slot_cas 256 3 cas_throughput 91.7 256000 10.908 independent +gpu_slot_cas 256 4 cas_throughput 89.1 256000 11.230 independent +gpu_slot_cas 256 5 cas_throughput 89.9 256000 11.122 independent +gpu_slot_cas 512 1 cas_throughput 86.4 512000 11.568 independent +gpu_slot_cas 512 2 cas_throughput 84.0 512000 11.912 independent +gpu_slot_cas 512 3 cas_throughput 85.7 512000 11.671 independent +gpu_slot_cas 512 4 cas_throughput 85.0 512000 11.759 independent +gpu_slot_cas 512 5 cas_throughput 85.7 512000 11.664 independent +gpu_slot_cas 1024 1 cas_throughput 85.5 1024000 11.703 independent +gpu_slot_cas 1024 2 cas_throughput 85.6 1024000 11.678 independent +gpu_slot_cas 1024 3 cas_throughput 86.0 1024000 11.629 independent +gpu_slot_cas 1024 4 cas_throughput 85.9 1024000 11.645 independent +gpu_slot_cas 1024 5 cas_throughput 85.6 1024000 11.681 independent +gpu_lock_contention 1 1 lock_acquisition 2664.0 100 0.375 counter=100 +gpu_lock_contention 1 2 lock_acquisition 2257.0 100 0.443 counter=100 +gpu_lock_contention 1 3 lock_acquisition 2277.1 100 0.439 counter=100 +gpu_lock_contention 1 4 lock_acquisition 2251.5 100 0.444 counter=100 +gpu_lock_contention 1 5 lock_acquisition 2260.2 100 0.442 counter=100 +gpu_lock_contention 2 1 lock_acquisition 1151.2 200 0.869 counter=101 +gpu_lock_contention 2 2 lock_acquisition 1140.6 200 0.877 counter=101 +gpu_lock_contention 2 3 lock_acquisition 1151.2 200 0.869 counter=101 +gpu_lock_contention 2 4 lock_acquisition 1149.6 200 0.870 counter=101 +gpu_lock_contention 2 5 lock_acquisition 1149.6 200 0.870 counter=101 +gpu_lock_contention 4 1 lock_acquisition 591.2 400 1.691 counter=103 +gpu_lock_contention 4 2 lock_acquisition 592.6 400 1.688 counter=103 +gpu_lock_contention 4 3 lock_acquisition 592.6 400 1.688 counter=103 +gpu_lock_contention 4 4 lock_acquisition 588.0 400 1.701 counter=103 +gpu_lock_contention 4 5 lock_acquisition 588.5 400 1.699 counter=103 +gpu_lock_contention 8 1 lock_acquisition 319.7 800 3.128 counter=107 +gpu_lock_contention 8 2 lock_acquisition 319.4 800 3.130 counter=107 +gpu_lock_contention 8 3 lock_acquisition 319.7 800 3.128 counter=107 +gpu_lock_contention 8 4 lock_acquisition 319.5 800 3.130 counter=107 +gpu_lock_contention 8 5 lock_acquisition 319.4 800 3.131 counter=107 +gpu_lock_contention 16 1 lock_acquisition 199.3 1600 5.017 counter=115 +gpu_lock_contention 16 2 lock_acquisition 198.8 1600 5.030 counter=115 +gpu_lock_contention 16 3 lock_acquisition 198.8 1600 5.030 counter=115 +gpu_lock_contention 16 4 lock_acquisition 199.4 1600 5.015 counter=115 +gpu_lock_contention 16 5 lock_acquisition 198.8 1600 5.030 counter=115 +gpu_lock_contention 32 1 lock_acquisition 168.1 3200 5.948 counter=131 +gpu_lock_contention 32 2 lock_acquisition 168.0 3200 5.953 counter=131 +gpu_lock_contention 32 3 lock_acquisition 168.2 3200 5.944 counter=131 +gpu_lock_contention 32 4 lock_acquisition 167.8 3200 5.959 counter=131 +gpu_lock_contention 32 5 lock_acquisition 167.8 3200 5.958 counter=131 +gpu_pcache_claim 1 1 claim_throughput 4397.6 1000 0.227 free_to_pending +gpu_pcache_claim 1 2 claim_throughput 4387.6 1000 0.228 free_to_pending +gpu_pcache_claim 1 3 claim_throughput 4388.1 1000 0.228 free_to_pending +gpu_pcache_claim 1 4 claim_throughput 4388.2 1000 0.228 free_to_pending +gpu_pcache_claim 1 5 claim_throughput 4388.0 1000 0.228 free_to_pending +gpu_pcache_claim 32 1 claim_throughput 20820.4 32000 0.048 free_to_pending +gpu_pcache_claim 32 2 claim_throughput 20827.9 32000 0.048 free_to_pending +gpu_pcache_claim 32 3 claim_throughput 19216.1 32000 0.052 free_to_pending +gpu_pcache_claim 32 4 claim_throughput 17914.2 32000 0.056 free_to_pending +gpu_pcache_claim 32 5 claim_throughput 20202.7 32000 0.049 free_to_pending +gpu_pcache_claim 64 1 claim_throughput 50021.0 64000 0.020 free_to_pending +gpu_pcache_claim 64 2 claim_throughput 49535.5 64000 0.020 free_to_pending +gpu_pcache_claim 64 3 claim_throughput 50754.8 64000 0.020 free_to_pending +gpu_pcache_claim 64 4 claim_throughput 49424.7 64000 0.020 free_to_pending +gpu_pcache_claim 64 5 claim_throughput 49654.4 64000 0.020 free_to_pending +gpu_pcache_claim 128 1 claim_throughput 48636.7 128000 0.021 free_to_pending +gpu_pcache_claim 128 2 claim_throughput 51128.5 128000 0.020 free_to_pending +gpu_pcache_claim 128 3 claim_throughput 50332.4 128000 0.020 free_to_pending +gpu_pcache_claim 128 4 claim_throughput 50294.6 128000 0.020 free_to_pending +gpu_pcache_claim 128 5 claim_throughput 50091.9 128000 0.020 free_to_pending +gpu_pcache_claim 256 1 claim_throughput 97352.1 256000 0.010 free_to_pending +gpu_pcache_claim 256 2 claim_throughput 98231.3 256000 0.010 free_to_pending +gpu_pcache_claim 256 3 claim_throughput 97677.1 256000 0.010 free_to_pending +gpu_pcache_claim 256 4 claim_throughput 98483.2 256000 0.010 free_to_pending +gpu_pcache_claim 256 5 claim_throughput 98240.5 256000 0.010 free_to_pending +gpu_pcache_claim 512 1 claim_throughput 196027.4 512000 0.005 free_to_pending +gpu_pcache_claim 512 2 claim_throughput 196364.6 512000 0.005 free_to_pending diff --git a/daxfs/file.c b/daxfs/file.c index f11e59b..b4b4e76 100644 --- a/daxfs/file.c +++ b/daxfs/file.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include "daxfs.h" @@ -761,6 +763,47 @@ long daxfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) dma_buf_put(info->dmabuf); return fd; } + case DAXFS_IOC_GET_GPU_INFO: { + struct daxfs_gpu_info gi; + u64 pcache_off; + + memset(&gi, 0, sizeof(gi)); + + gi.dax_phys_addr = daxfs_mem_phys(info, 0); + gi.dax_size = info->size; + + if (info->coord) { + gi.coord_offset = daxfs_mem_offset(info, info->coord); + gi.coord_lock_off = offsetof(struct daxfs_global_coord, + coord_lock); + gi.commit_seq_off = offsetof(struct daxfs_global_coord, + commit_sequence); + } + + pcache_off = le64_to_cpu(info->super->pcache_offset); + if (pcache_off && info->pcache) { + struct daxfs_pcache_header *hdr; + + hdr = info->pcache->header; + gi.pcache_offset = pcache_off; + gi.pcache_slots_offset = pcache_off + + le64_to_cpu(hdr->slot_meta_offset); + gi.pcache_data_offset = pcache_off + + le64_to_cpu(hdr->slot_data_offset); + gi.pcache_slot_count = info->pcache->slot_count; + gi.pcache_slot_stride = + sizeof(struct daxfs_pcache_slot); + gi.state_tag_off = offsetof(struct daxfs_pcache_slot, + state_tag); + gi.pending_count_off = + offsetof(struct daxfs_pcache_header, + pending_count); + } + + if (copy_to_user((void __user *)arg, &gi, sizeof(gi))) + return -EFAULT; + return 0; + } } return -ENOTTY; } diff --git a/include/daxfs_format.h b/include/daxfs_format.h index b752708..cc50222 100644 --- a/include/daxfs_format.h +++ b/include/daxfs_format.h @@ -14,6 +14,29 @@ /* ioctl commands */ #define DAXFS_IOC_GET_DMABUF _IO('D', 1) /* Get dma-buf fd for this mount */ +#define DAXFS_IOC_GET_GPU_INFO _IOR('D', 2, struct daxfs_gpu_info) + +/* + * GPU info for PCIe AtomicOps coordination. + * Exposes physical addresses and field offsets so a GPU can participate + * in the same cmpxchg-based protocols (coord lock, page cache state + * machine) via PCIe CAS TLPs. + */ +struct daxfs_gpu_info { + __u64 dax_phys_addr; /* Physical base of DAX region */ + __u64 dax_size; /* Total DAX region size */ + __u64 coord_offset; /* Offset of daxfs_global_coord from base */ + __u32 coord_lock_off; /* offsetof(coord_lock) within coord */ + __u32 commit_seq_off; /* offsetof(commit_sequence) within coord */ + __u64 pcache_offset; /* Offset of pcache region (0 = none) */ + __u64 pcache_slots_offset; /* Offset of slot metadata array */ + __u64 pcache_data_offset; /* Offset of slot data area */ + __u32 pcache_slot_count; /* Number of cache slots */ + __u32 pcache_slot_stride; /* sizeof(daxfs_pcache_slot) = 16 */ + __u32 state_tag_off; /* offsetof(state_tag) in slot = 0 */ + __u32 pending_count_off; /* offsetof(pending_count) in pcache_header */ + __u64 reserved[4]; +}; #define DAXFS_SUPER_MAGIC 0x64617835 /* "dax5" */ #define DAXFS_VERSION 8 diff --git a/include/daxfs_gpu.h b/include/daxfs_gpu.h new file mode 100644 index 0000000..aae24c3 --- /dev/null +++ b/include/daxfs_gpu.h @@ -0,0 +1,269 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * daxfs GPU-side coordination header (CUDA) + * + * Device-inline functions that mirror the kernel's cmpxchg-based + * coordination protocols for the coordination lock and shared page + * cache. Each function compiles to PCIe AtomicOp TLPs (CAS / Swap) + * that serialize at the memory controller alongside CPU LOCK CMPXCHG, + * giving mutual atomicity across CPU and GPU. + * + * All pointers below are CUdeviceptr-derived device pointers into the + * DAX region mapped via daxfs-gpu-map. The caller is responsible for + * computing field addresses from the base pointer and the offsets + * returned by DAXFS_IOC_GET_GPU_INFO. + * + * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved. + */ +#ifndef _DAXFS_GPU_H +#define _DAXFS_GPU_H + +#ifdef __CUDACC__ + +/* Re-export the state/tag helpers so GPU code matches kernel conventions. + * Guarded so this header can coexist with daxfs_format.h. */ +#ifndef PCACHE_STATE_FREE +#define PCACHE_STATE_FREE 0 +#define PCACHE_STATE_PENDING 1 +#define PCACHE_STATE_VALID 2 +#endif + +#ifndef PCACHE_STATE +#define PCACHE_STATE(v) ((v) & 3ULL) +#define PCACHE_TAG(v) ((v) >> 2) +#define PCACHE_MAKE(state, tag) (((unsigned long long)(tag) << 2) | (state)) +#endif + +/* + * ========================================================================= + * Coordination lock (mirrors branch.c daxfs_coord_lock / unlock) + * ========================================================================= + */ + +/* + * Acquire the global coordination lock. + * + * @lock: device pointer to coord_lock (__le32, 0 = free, 1 = held) + * + * Spins with atomicCAS (32-bit PCIe CAS TLP) until the lock is acquired. + * __threadfence_system() after acquisition ensures all subsequent GPU + * loads/stores are ordered after the lock is visible on the PCIe fabric. + */ +static __device__ __forceinline__ void +daxfs_gpu_coord_lock(unsigned int *lock) +{ + while (atomicCAS(lock, 0u, 1u) != 0u) + ; /* spin */ + __threadfence_system(); +} + +/* + * Release the global coordination lock. + * + * @lock: device pointer to coord_lock + * + * __threadfence_system() before release ensures all prior GPU stores are + * visible on the PCIe fabric before the lock word is cleared. + * atomicExch generates a PCIe Swap TLP (unconditional write with return). + */ +static __device__ __forceinline__ void +daxfs_gpu_coord_unlock(unsigned int *lock) +{ + __threadfence_system(); + atomicExch(lock, 0u); +} + +/* + * ========================================================================= + * Commit sequence (mirrors branch.c daxfs_commit_seq_changed) + * ========================================================================= + */ + +/* + * Read the current commit sequence number. + * + * @seq: device pointer to commit_sequence (__le64) + * + * Returns the current value. Caller compares against a cached copy to + * detect new commits. Uses a volatile load (no atomicCAS needed for + * read-only access on a naturally-aligned 64-bit word). + */ +static __device__ __forceinline__ unsigned long long +daxfs_gpu_read_commit_seq(const unsigned long long *seq) +{ + return *(volatile const unsigned long long *)seq; +} + +/* + * ========================================================================= + * Page cache slot state machine (mirrors pcache.c slot_cmpxchg) + * ========================================================================= + */ + +/* + * Atomic compare-and-swap on a pcache slot's state_tag field. + * + * @state_tag: device pointer to slot->state_tag (__le64) + * @expected: expected old value (packed state | tag) + * @desired: new value to write if *state_tag == expected + * + * Returns the value that was in *state_tag before the operation. + * Maps to a 64-bit PCIe CAS TLP. + */ +static __device__ __forceinline__ unsigned long long +daxfs_gpu_slot_cmpxchg(unsigned long long *state_tag, + unsigned long long expected, + unsigned long long desired) +{ + unsigned long long old = atomicCAS(state_tag, expected, desired); + + __threadfence_system(); + return old; +} + +/* + * ========================================================================= + * Pending counter (mirrors pcache.c pcache_inc/dec_pending) + * ========================================================================= + */ + +/* + * Atomically increment the pending slot counter. + * + * @pending_count: device pointer to pcache_header->pending_count (__le32) + * + * CAS loop identical to the kernel's pcache_inc_pending(). + */ +static __device__ __forceinline__ void +daxfs_gpu_pcache_inc_pending(unsigned int *pending_count) +{ + unsigned int old_val, new_val; + + do { + old_val = *(volatile unsigned int *)pending_count; + new_val = old_val + 1; + } while (atomicCAS(pending_count, old_val, new_val) != old_val); + __threadfence_system(); +} + +/* + * Atomically decrement the pending slot counter (saturates at 0). + * + * @pending_count: device pointer to pcache_header->pending_count (__le32) + */ +static __device__ __forceinline__ void +daxfs_gpu_pcache_dec_pending(unsigned int *pending_count) +{ + unsigned int old_val, new_val; + + do { + old_val = *(volatile unsigned int *)pending_count; + if (old_val == 0) + return; + new_val = old_val - 1; + } while (atomicCAS(pending_count, old_val, new_val) != old_val); + __threadfence_system(); +} + +/* + * ========================================================================= + * Page cache fast-path lookup (mirrors pcache.c daxfs_pcache_get_page) + * ========================================================================= + */ + +/* + * Fast-path cache lookup: load state_tag, check VALID + tag match. + * + * @state_tag: device pointer to slot->state_tag + * @desired_tag: expected tag value (backing_page_offset >> 12) + * + * Returns true if slot is VALID with matching tag (cache hit). + * Caller can then read directly from the slot data area. + */ +static __device__ __forceinline__ bool +daxfs_gpu_pcache_lookup(const unsigned long long *state_tag, + unsigned long long desired_tag) +{ + unsigned long long val; + + val = *(volatile const unsigned long long *)state_tag; + if (val == PCACHE_MAKE(PCACHE_STATE_VALID, desired_tag)) { + __threadfence_system(); /* order data read after state check */ + return true; + } + return false; +} + +/* + * ========================================================================= + * Page cache slot claim (mirrors pcache.c slow path FREE -> PENDING) + * ========================================================================= + */ + +/* + * Attempt to claim a FREE cache slot by transitioning to PENDING. + * + * @state_tag: device pointer to slot->state_tag + * @desired_tag: tag to install (backing_page_offset >> 12) + * + * Returns true if the slot was successfully claimed (FREE -> PENDING). + * On success the caller must also call daxfs_gpu_pcache_inc_pending(). + * On failure the slot was not FREE or was raced; caller should re-read + * and retry. + */ +static __device__ __forceinline__ bool +daxfs_gpu_pcache_claim(unsigned long long *state_tag, + unsigned long long desired_tag) +{ + unsigned long long free_val = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + unsigned long long pend_val = PCACHE_MAKE(PCACHE_STATE_PENDING, + desired_tag); + unsigned long long old; + + old = atomicCAS(state_tag, free_val, pend_val); + __threadfence_system(); + return old == free_val; +} + +/* + * ========================================================================= + * Wait for slot to become VALID (mirrors pcache.c wait_valid) + * ========================================================================= + */ + +/* + * Poll until a PENDING slot transitions to VALID with matching tag. + * + * @state_tag: device pointer to slot->state_tag + * @desired_tag: expected tag value + * @max_iters: maximum poll iterations before giving up + * + * Returns true if slot became VALID with matching tag within the + * iteration budget. Returns false on timeout or unexpected state + * (e.g., slot was evicted to FREE). + */ +static __device__ __forceinline__ bool +daxfs_gpu_pcache_wait_valid(const unsigned long long *state_tag, + unsigned long long desired_tag, + unsigned int max_iters) +{ + unsigned long long val; + unsigned long long expected = PCACHE_MAKE(PCACHE_STATE_VALID, + desired_tag); + unsigned int i; + + for (i = 0; i < max_iters; i++) { + val = *(volatile const unsigned long long *)state_tag; + if (val == expected) { + __threadfence_system(); + return true; + } + /* Slot evicted from under us */ + if (PCACHE_STATE(val) == PCACHE_STATE_FREE) + return false; + } + return false; +} + +#endif /* __CUDACC__ */ +#endif /* _DAXFS_GPU_H */ diff --git a/tests/bench_agent.sh b/tests/bench_agent.sh new file mode 100755 index 0000000..2a83f3e --- /dev/null +++ b/tests/bench_agent.sh @@ -0,0 +1,841 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# DAXFS AI Agent Speculative Branching Benchmark +# +# Measures branch creation, parallel workload execution, commit/merge, +# and sibling invalidation — the core operations for AI agent speculative +# execution where N agents fork from shared filesystem state. +# +# Usage: sudo ./tests/bench_agent.sh [options] +# +# Options: +# -n NUM Maximum agent count (powers of 2 up to this, default: 64) +# -d NUM Maximum nesting depth (default: 8) +# -w SIZE Workload: small, medium, large (default: medium) +# -r NUM Repetitions per measurement (default: 3) +# -o DIR Output directory (default: ./bench_results) +# -v Verbose output +# +# Requirements: +# - Root privileges +# - daxfs.ko module built +# - mkdaxfs, daxfs-branch, daxfs-inspect tools built +# - /dev/dma_heap/system available (or modify HEAP_DEV) + +set -e + +# ── Configuration ──────────────────────────────────────────────────── + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +MKDAXFS="$PROJECT_DIR/tools/mkdaxfs" +DAXFS_BRANCH="$PROJECT_DIR/tools/daxfs-branch" +DAXFS_INSPECT="$PROJECT_DIR/tools/daxfs-inspect" +MODULE="$PROJECT_DIR/daxfs/daxfs.ko" +HEAP_DEV="/dev/dma_heap/system" + +# Defaults +MAX_AGENTS=64 +MAX_DEPTH=8 +WORKLOAD="medium" +REPS=3 +OUTPUT_DIR="./bench_results" +VERBOSE=0 + +# Runtime state +TEST_DIR="" +MODULE_LOADED_BY_US=0 + +# ── Output helpers ─────────────────────────────────────────────────── + +log() { echo -e "$@"; } + +log_verbose() { + if [ "$VERBOSE" -eq 1 ]; then + echo -e " [v] $*" >&2 + fi +} + +die() { + echo "ERROR: $1" >&2 + exit 1 +} + +# ── Timing ─────────────────────────────────────────────────────────── + +# Returns nanosecond wall-clock timestamp +now_ns() { + date +%s%N +} + +# Converts nanosecond delta to microseconds +ns_to_us() { + echo $(( $1 / 1000 )) +} + +# ── TSV output ─────────────────────────────────────────────────────── + +TSV_FILE="" + +tsv_init() { + TSV_FILE="$OUTPUT_DIR/agents.tsv" + mkdir -p "$OUTPUT_DIR" + printf "experiment\tparameter\titeration\toperation\tlatency_us\tops_count\tdelta_bytes\tnotes\n" \ + > "$TSV_FILE" +} + +tsv_row() { + # args: experiment parameter iteration operation latency_us ops_count delta_bytes notes + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" \ + >> "$TSV_FILE" +} + +# ── Source tree generation ─────────────────────────────────────────── + +# Generates a C-like source file of approximately the given size (bytes) +generate_c_file() { + local path="$1" target_size="$2" + local written=0 + { + echo "/* Auto-generated source file */" + echo "#include " + echo "#include " + echo "" + local func_num=0 + while [ "$written" -lt "$target_size" ]; do + cat <> 2); + if (result > 1000000) result %= 997; + } + return result; +} +CFUNC + func_num=$((func_num + 1)) + written=$(( written + 180 )) + done + } > "$path" +} + +# Generates a header file of approximately the given size +generate_h_file() { + local path="$1" target_size="$2" + local guard + guard=$(basename "$path" | tr '[:lower:].' '[:upper:]_') + { + echo "#ifndef ${guard}" + echo "#define ${guard}" + echo "" + local decl_num=0 written=0 + while [ "$written" -lt "$target_size" ]; do + echo "int compute_${decl_num}(int x, int y);" + echo "struct data_${decl_num} { int field_a; long field_b; char name[64]; };" + echo "" + decl_num=$((decl_num + 1)) + written=$((written + 100)) + done + echo "#endif /* ${guard} */" + } > "$path" +} + +# Creates ~30-file realistic C project in the given directory +create_source_tree() { + local dir="$1" + mkdir -p "$dir/src" "$dir/include" "$dir/tests" "$dir/docs" "$dir/config" + + # src/module_{1..10}.c — 2-8KB each + for i in $(seq 1 10); do + local size=$(( 2048 + (i * 600) )) + generate_c_file "$dir/src/module_${i}.c" "$size" + done + + # include/module_{1..5}.h — 512-1536 bytes each + for i in $(seq 1 5); do + local size=$(( 512 + (i * 200) )) + generate_h_file "$dir/include/module_${i}.h" "$size" + done + + # tests/test_{1..3}.c — 1-2KB each + for i in $(seq 1 3); do + generate_c_file "$dir/tests/test_${i}.c" $(( 1024 + i * 300 )) + done + + # docs/ARCHITECTURE.md + { + echo "# Architecture" + echo "" + echo "This project consists of 10 modules that implement a data processing pipeline." + echo "Each module handles a specific stage of the computation." + echo "" + for i in $(seq 1 10); do + echo "## Module $i" + echo "" + echo "Handles stage $i of the pipeline. Dependencies: module_$((i > 1 ? i-1 : 1))." + echo "" + done + } > "$dir/docs/ARCHITECTURE.md" + + # config/settings.json + cat > "$dir/config/settings.json" <<'JSON' +{ + "version": "1.0.0", + "modules": 10, + "optimization_level": 2, + "debug": false, + "paths": { + "input": "/data/input", + "output": "/data/output", + "cache": "/tmp/cache" + } +} +JSON + + # Makefile + { + echo "CC = gcc" + echo "CFLAGS = -Wall -Wextra -O2 -Iinclude" + echo "SRCS = \$(wildcard src/*.c)" + echo "OBJS = \$(SRCS:.c=.o)" + echo "" + echo "all: libproject.a" + echo "" + echo "libproject.a: \$(OBJS)" + echo " ar rcs \$@ \$^" + echo "" + echo "clean:" + echo " rm -f src/*.o libproject.a" + } > "$dir/Makefile" + + # README.md + { + echo "# Project" + echo "" + echo "A data processing pipeline with 10 modules." + echo "" + echo "## Build" + echo "" + echo '```' + echo "make" + echo '```' + } > "$dir/README.md" +} + +# ── Workload ───────────────────────────────────────────────────────── + +# Runs the AI agent workload on a mounted branch. +# Simulates an AI agent editing a code project. +# +# Args: $1 = mount path, $2 = workload size (small|medium|large) +# Returns: number of operations performed (via global WORKLOAD_OPS) +WORKLOAD_OPS=0 + +run_agent_workload() { + local mnt="$1" size="$2" + local n_reads n_creates n_modifies n_mkdirs n_deletes n_renames + local ops=0 + + case "$size" in + small) n_reads=5; n_creates=3; n_modifies=1; n_mkdirs=1; n_deletes=0; n_renames=0 ;; + medium) n_reads=10; n_creates=10; n_modifies=5; n_mkdirs=3; n_deletes=2; n_renames=1 ;; + large) n_reads=20; n_creates=50; n_modifies=20; n_mkdirs=10; n_deletes=5; n_renames=3 ;; + *) die "Unknown workload size: $size" ;; + esac + + # Reads — cat existing files + for i in $(seq 1 "$n_reads"); do + local idx=$(( (i % 10) + 1 )) + cat "$mnt/src/module_${idx}.c" > /dev/null 2>&1 || true + ops=$((ops + 1)) + done + + # Mkdirs — create new directories + for i in $(seq 1 "$n_mkdirs"); do + mkdir -p "$mnt/agent_dir_${i}/sub" 2>/dev/null || true + ops=$((ops + 1)) + done + + # Creates — write new files (1-4KB of generated C content) + for i in $(seq 1 "$n_creates"); do + local fsize=$(( 1024 + (i * 73 % 3072) )) + generate_c_file "$mnt/agent_file_${i}.c" "$fsize" + ops=$((ops + 1)) + done + + # Modifies — overwrite existing source files + for i in $(seq 1 "$n_modifies"); do + local idx=$(( (i % 10) + 1 )) + local fsize=$(( 2048 + (i * 500) )) + generate_c_file "$mnt/src/module_${idx}.c" "$fsize" + ops=$((ops + 1)) + done + + # Deletes — remove some created files + for i in $(seq 1 "$n_deletes"); do + rm -f "$mnt/agent_file_${i}.c" 2>/dev/null || true + ops=$((ops + 1)) + done + + # Renames — rename some files + for i in $(seq 1 "$n_renames"); do + local src_idx=$(( n_creates - i + 1 )) + if [ -f "$mnt/agent_file_${src_idx}.c" ]; then + mv "$mnt/agent_file_${src_idx}.c" "$mnt/agent_file_${src_idx}_renamed.c" 2>/dev/null || true + fi + ops=$((ops + 1)) + done + + WORKLOAD_OPS=$ops +} + +# ── Environment setup / teardown ───────────────────────────────────── + +# Creates a fresh daxfs image and mounts it as main. +# Args: $1 = source dir, $2 = image size, $3 = delta size, $4 = main mount point +setup_fresh_image() { + local src="$1" img_size="$2" delta_size="$3" mnt_main="$4" + mkdir -p "$mnt_main" + + "$MKDAXFS" -d "$src" -H "$HEAP_DEV" -s "$img_size" -m "$mnt_main" -b -D "$delta_size" \ + || die "mkdaxfs failed (size=$img_size delta=$delta_size)" +} + +# Unmount all mounts under TEST_DIR, unload module if we loaded it +full_cleanup() { + log_verbose "Full cleanup..." + + # Unmount everything under TEST_DIR + if [ -n "$TEST_DIR" ] && [ -d "$TEST_DIR" ]; then + # Find and unmount in reverse order + local mnts + mnts=$(mount | grep "$TEST_DIR" | awk '{print $3}' | sort -r) || true + for m in $mnts; do + umount "$m" 2>/dev/null || umount -l "$m" 2>/dev/null || true + done + rm -rf "$TEST_DIR" + fi + + if [ "$MODULE_LOADED_BY_US" = "1" ]; then + rmmod daxfs 2>/dev/null || true + fi +} + +# Quick teardown: just unmount everything under TEST_DIR (between experiments) +teardown_mounts() { + if [ -n "$TEST_DIR" ] && [ -d "$TEST_DIR" ]; then + local mnts + mnts=$(mount | grep "$TEST_DIR" | awk '{print $3}' | sort -r) || true + for m in $mnts; do + umount "$m" 2>/dev/null || umount -l "$m" 2>/dev/null || true + done + fi +} + +check_prerequisites() { + if [ "$(id -u)" -ne 0 ]; then + die "Must run as root" + fi + for tool in "$MKDAXFS" "$DAXFS_BRANCH" "$DAXFS_INSPECT" "$MODULE"; do + if [ ! -f "$tool" ]; then + die "$(basename "$tool") not found at $tool — run 'make' first" + fi + done + if [ ! -e "$HEAP_DEV" ]; then + die "DMA heap not found at $HEAP_DEV" + fi +} + +load_module() { + MODULE_LOADED_BY_US=0 + if ! lsmod | grep -q "^daxfs"; then + log_verbose "Loading daxfs module..." + insmod "$MODULE" || die "Failed to load module" + MODULE_LOADED_BY_US=1 + fi +} + +# ── Experiment 1: Agent Scalability ────────────────────────────────── + +scale_agents() { + log "Experiment 1: Agent Scalability (scale_agents)" + + local n=1 + while [ "$n" -le "$MAX_AGENTS" ]; do + log " N=$n agents..." + local iter + for iter in $(seq 1 "$REPS"); do + log_verbose "Iteration $iter/$REPS for N=$n" + + # Fresh image: delta = (N+2)*1M + 64M headroom + local delta_mb=$(( (n + 2) + 64 )) + local img_mb=$(( delta_mb + 128 )) + local mnt_main="$TEST_DIR/main" + local source="$TEST_DIR/source" + + teardown_mounts + rm -rf "$TEST_DIR/main" "$TEST_DIR/branch_"* + mkdir -p "$mnt_main" + + setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main" + + # ── Branch creation (sequential, branch_lock serializes) ── + local t0 t1 branch_total_us=0 + local branch_mnts=() + for b in $(seq 1 "$n"); do + local bmnt="$TEST_DIR/branch_${b}" + mkdir -p "$bmnt" + branch_mnts+=("$bmnt") + + t0=$(now_ns) + "$DAXFS_BRANCH" create "agent_${b}" -m "$bmnt" -p main \ + || die "Failed to create branch agent_${b}" + t1=$(now_ns) + + local lat_us + lat_us=$(ns_to_us $((t1 - t0))) + branch_total_us=$((branch_total_us + lat_us)) + log_verbose " branch agent_${b} created in ${lat_us} us" + done + tsv_row "scale_agents" "$n" "$iter" "branch_create_total" "$branch_total_us" "$n" "" "" + tsv_row "scale_agents" "$n" "$iter" "branch_create_avg" "$((branch_total_us / n))" "$n" "" "" + + # ── Parallel workload on all branches ── + t0=$(now_ns) + local pids=() + for b in $(seq 1 "$n"); do + ( + run_agent_workload "${branch_mnts[$((b-1))]}" "$WORKLOAD" + ) & + pids+=($!) + done + # Wait for all + local workload_fail=0 + for pid in "${pids[@]}"; do + wait "$pid" 2>/dev/null || workload_fail=$((workload_fail + 1)) + done + t1=$(now_ns) + local workload_us + workload_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_agents" "$n" "$iter" "parallel_workload" "$workload_us" "$n" "" \ + "fails=$workload_fail" + + # ── Commit winner (branch 1) ── + t0=$(now_ns) + "$DAXFS_BRANCH" commit -m "${branch_mnts[0]}" \ + || die "Commit failed for agent_1" + t1=$(now_ns) + local commit_us + commit_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_agents" "$n" "$iter" "commit" "$commit_us" "1" "" "" + + # ── Detect ESTALE on sibling (branch 2, if it exists) ── + if [ "$n" -ge 2 ]; then + t0=$(now_ns) + # Any operation on an invalidated sibling should fail with ESTALE + local estale_rc=0 + cat "${branch_mnts[1]}/src/module_1.c" > /dev/null 2>&1 || estale_rc=$? + t1=$(now_ns) + local estale_us + estale_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_agents" "$n" "$iter" "estale_detect" "$estale_us" "1" "" \ + "rc=$estale_rc" + fi + + # ── End-to-end cleanup time ── + t0=$(now_ns) + teardown_mounts + t1=$(now_ns) + local cleanup_us + cleanup_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_agents" "$n" "$iter" "cleanup" "$cleanup_us" "$n" "" "" + + done + n=$((n * 2)) + done +} + +# ── Experiment 2: Nesting Depth ────────────────────────────────────── + +scale_depth() { + log "Experiment 2: Nesting Depth (scale_depth)" + + local d=1 + while [ "$d" -le "$MAX_DEPTH" ]; do + log " D=$d levels..." + local iter + for iter in $(seq 1 "$REPS"); do + log_verbose "Iteration $iter/$REPS for D=$d" + + # Fresh image: need d+2 branches worth of delta + headroom + local delta_mb=$(( (d + 4) + 64 )) + local img_mb=$(( delta_mb + 128 )) + local mnt_main="$TEST_DIR/main" + local source="$TEST_DIR/source" + + teardown_mounts + rm -rf "$TEST_DIR/main" "$TEST_DIR/level_"* + mkdir -p "$mnt_main" + + setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main" + + # ── Create chain: main → L1 → L2 → ... → LD ── + local t0 t1 + t0=$(now_ns) + local parent="main" + local level_mnts=() + for lvl in $(seq 1 "$d"); do + local lmnt="$TEST_DIR/level_${lvl}" + mkdir -p "$lmnt" + level_mnts+=("$lmnt") + "$DAXFS_BRANCH" create "level_${lvl}" -m "$lmnt" -p "$parent" \ + || die "Failed to create level_${lvl}" + parent="level_${lvl}" + done + t1=$(now_ns) + local chain_us + chain_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_depth" "$d" "$iter" "chain_create" "$chain_us" "$d" "" "" + + # ── Workload at deepest level ── + local deepest="${level_mnts[$((d-1))]}" + t0=$(now_ns) + run_agent_workload "$deepest" "$WORKLOAD" + t1=$(now_ns) + local work_us + work_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_depth" "$d" "$iter" "workload" "$work_us" "$WORKLOAD_OPS" "" "" + + # ── Get delta bytes before commit ── + local delta_bytes="" + delta_bytes=$("$DAXFS_INSPECT" info -m "$deepest" -b "level_${d}" 2>/dev/null \ + | grep -i "delta.*used" | head -1 | grep -oP '\d+' | tail -1) || true + log_verbose " delta_bytes at level_${d}: $delta_bytes" + + # ── Commit from deepest level (walks up D levels) ── + t0=$(now_ns) + "$DAXFS_BRANCH" commit -m "$deepest" \ + || die "Commit from depth $d failed" + t1=$(now_ns) + local commit_us + commit_us=$(ns_to_us $((t1 - t0))) + tsv_row "scale_depth" "$d" "$iter" "commit" "$commit_us" "$d" "$delta_bytes" "" + + teardown_mounts + done + d=$((d * 2)) + done +} + +# ── Experiment 3: Commit Cost vs Delta Size ────────────────────────── + +commit_cost() { + log "Experiment 3: Commit Cost vs Delta Size (commit_cost)" + + local ops_counts="10 50 100 500 1000 5000" + + for ops_target in $ops_counts; do + log " ops=$ops_target..." + local iter + for iter in $(seq 1 "$REPS"); do + log_verbose "Iteration $iter/$REPS for ops=$ops_target" + + # Fresh image with generous delta + local delta_mb=128 + local img_mb=256 + local mnt_main="$TEST_DIR/main" + local bmnt="$TEST_DIR/branch_cost" + local source="$TEST_DIR/source" + + teardown_mounts + rm -rf "$TEST_DIR/main" "$TEST_DIR/branch_cost" + mkdir -p "$mnt_main" "$bmnt" + + setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main" + + "$DAXFS_BRANCH" create "cost_test" -m "$bmnt" -p main \ + || die "Failed to create cost_test branch" + + # ── Run exactly ops_target write operations ── + local t0 t1 + t0=$(now_ns) + local op=0 + while [ "$op" -lt "$ops_target" ]; do + local fsize=$(( 1024 + (op * 37 % 3072) )) + generate_c_file "$bmnt/cost_file_${op}.c" "$fsize" + op=$((op + 1)) + done + t1=$(now_ns) + local workload_us + workload_us=$(ns_to_us $((t1 - t0))) + tsv_row "commit_cost" "$ops_target" "$iter" "workload" "$workload_us" "$ops_target" "" "" + + # ── Read delta_bytes via daxfs-inspect ── + local delta_bytes="" + delta_bytes=$("$DAXFS_INSPECT" info -m "$bmnt" -b "cost_test" 2>/dev/null \ + | grep -i "delta.*used" | head -1 | grep -oP '\d+' | tail -1) || true + log_verbose " delta_bytes: $delta_bytes" + + # ── Commit and time it ── + t0=$(now_ns) + "$DAXFS_BRANCH" commit -m "$bmnt" \ + || die "Commit failed for ops=$ops_target" + t1=$(now_ns) + local commit_us + commit_us=$(ns_to_us $((t1 - t0))) + tsv_row "commit_cost" "$ops_target" "$iter" "commit" "$commit_us" "$ops_target" \ + "$delta_bytes" "" + + teardown_mounts + done + done +} + +# ── Experiment 4: tmpfs Baseline ───────────────────────────────────── + +baseline_tmpfs() { + log "Experiment 4: tmpfs Baseline (baseline_tmpfs)" + + local source="$TEST_DIR/source" + local tmpfs_base="$TEST_DIR/tmpfs_base" + + local n=1 + while [ "$n" -le "$MAX_AGENTS" ]; do + log " N=$n agents (tmpfs)..." + local iter + for iter in $(seq 1 "$REPS"); do + log_verbose "Iteration $iter/$REPS for N=$n" + + # Fresh tmpfs base + rm -rf "$tmpfs_base" + mkdir -p "$tmpfs_base" + mount -t tmpfs -o size=512M tmpfs "$tmpfs_base" + cp -a "$source"/. "$tmpfs_base/" + + # ── "Branch" creation = cp -a (O(source_size)) ── + local t0 t1 branch_total_us=0 + local branch_dirs=() + for b in $(seq 1 "$n"); do + local bdir="$TEST_DIR/tmpfs_branch_${b}" + t0=$(now_ns) + cp -a "$tmpfs_base" "$bdir" + t1=$(now_ns) + branch_dirs+=("$bdir") + local lat_us + lat_us=$(ns_to_us $((t1 - t0))) + branch_total_us=$((branch_total_us + lat_us)) + done + tsv_row "baseline_tmpfs" "$n" "$iter" "branch_create_total" "$branch_total_us" "$n" "" "" + tsv_row "baseline_tmpfs" "$n" "$iter" "branch_create_avg" "$((branch_total_us / n))" "$n" "" "" + + # ── Parallel workload ── + t0=$(now_ns) + local pids=() + for b in $(seq 1 "$n"); do + ( + run_agent_workload "${branch_dirs[$((b-1))]}" "$WORKLOAD" + ) & + pids+=($!) + done + for pid in "${pids[@]}"; do + wait "$pid" 2>/dev/null || true + done + t1=$(now_ns) + local workload_us + workload_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_tmpfs" "$n" "$iter" "parallel_workload" "$workload_us" "$n" "" "" + + # ── "Commit" = cp -a winner back to base ── + t0=$(now_ns) + rm -rf "$tmpfs_base" + cp -a "${branch_dirs[0]}" "$tmpfs_base" + t1=$(now_ns) + local commit_us + commit_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_tmpfs" "$n" "$iter" "commit" "$commit_us" "1" "" "" + + # ── Cleanup branches ── + t0=$(now_ns) + for bdir in "${branch_dirs[@]}"; do + rm -rf "$bdir" + done + umount "$tmpfs_base" 2>/dev/null || true + rm -rf "$tmpfs_base" + t1=$(now_ns) + local cleanup_us + cleanup_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_tmpfs" "$n" "$iter" "cleanup" "$cleanup_us" "$n" "" "" + done + n=$((n * 2)) + done +} + +# ── Experiment 5: OverlayFS Baseline ───────────────────────────────── + +baseline_overlayfs() { + log "Experiment 5: OverlayFS Baseline (baseline_overlayfs)" + + local source="$TEST_DIR/source" + + local n=1 + while [ "$n" -le "$MAX_AGENTS" ]; do + log " N=$n agents (overlayfs)..." + local iter + for iter in $(seq 1 "$REPS"); do + log_verbose "Iteration $iter/$REPS for N=$n" + + # ── "Branch" creation = overlayfs mount with upper/work dirs ── + local t0 t1 branch_total_us=0 + local overlay_mnts=() + for b in $(seq 1 "$n"); do + local upper="$TEST_DIR/ovl_upper_${b}" + local work="$TEST_DIR/ovl_work_${b}" + local merged="$TEST_DIR/ovl_merged_${b}" + mkdir -p "$upper" "$work" "$merged" + + t0=$(now_ns) + mount -t overlay overlay \ + -o "lowerdir=${source},upperdir=${upper},workdir=${work}" \ + "$merged" 2>/dev/null \ + || { log_verbose "overlayfs mount failed for branch $b"; break; } + t1=$(now_ns) + overlay_mnts+=("$merged") + local lat_us + lat_us=$(ns_to_us $((t1 - t0))) + branch_total_us=$((branch_total_us + lat_us)) + done + local actual_n=${#overlay_mnts[@]} + if [ "$actual_n" -eq 0 ]; then + log_verbose "No overlayfs mounts succeeded, skipping" + continue + fi + tsv_row "baseline_overlayfs" "$n" "$iter" "branch_create_total" "$branch_total_us" \ + "$actual_n" "" "" + tsv_row "baseline_overlayfs" "$n" "$iter" "branch_create_avg" \ + "$((branch_total_us / actual_n))" "$actual_n" "" "" + + # ── Parallel workload ── + t0=$(now_ns) + local pids=() + for b in $(seq 1 "$actual_n"); do + ( + run_agent_workload "${overlay_mnts[$((b-1))]}" "$WORKLOAD" + ) & + pids+=($!) + done + for pid in "${pids[@]}"; do + wait "$pid" 2>/dev/null || true + done + t1=$(now_ns) + local workload_us + workload_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_overlayfs" "$n" "$iter" "parallel_workload" "$workload_us" \ + "$actual_n" "" "" + + # ── "Commit" = copy upper dir to source ── + t0=$(now_ns) + cp -a "$TEST_DIR/ovl_upper_1"/. "$source/" 2>/dev/null || true + t1=$(now_ns) + local commit_us + commit_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_overlayfs" "$n" "$iter" "commit" "$commit_us" "1" "" "" + + # ── Cleanup ── + t0=$(now_ns) + for b in $(seq 1 "$actual_n"); do + umount "$TEST_DIR/ovl_merged_${b}" 2>/dev/null || true + done + rm -rf "$TEST_DIR"/ovl_upper_* "$TEST_DIR"/ovl_work_* "$TEST_DIR"/ovl_merged_* + t1=$(now_ns) + local cleanup_us + cleanup_us=$(ns_to_us $((t1 - t0))) + tsv_row "baseline_overlayfs" "$n" "$iter" "cleanup" "$cleanup_us" "$actual_n" "" \ + "no_nesting" + done + n=$((n * 2)) + done +} + +# ── Argument parsing ───────────────────────────────────────────────── + +parse_args() { + while getopts "n:d:w:r:o:v" opt; do + case $opt in + n) MAX_AGENTS="$OPTARG" ;; + d) MAX_DEPTH="$OPTARG" ;; + w) WORKLOAD="$OPTARG" ;; + r) REPS="$OPTARG" ;; + o) OUTPUT_DIR="$OPTARG" ;; + v) VERBOSE=1 ;; + *) echo "Usage: $0 [-n agents] [-d depth] [-w small|medium|large] [-r reps] [-o dir] [-v]" + exit 1 ;; + esac + done + + # Validate workload + case "$WORKLOAD" in + small|medium|large) ;; + *) die "Invalid workload: $WORKLOAD (must be small, medium, or large)" ;; + esac + + # Validate max_agents is reasonable + if [ "$MAX_AGENTS" -lt 1 ] || [ "$MAX_AGENTS" -gt 256 ]; then + die "Agent count must be between 1 and 256" + fi +} + +# ── Main ───────────────────────────────────────────────────────────── + +main() { + parse_args "$@" + + log "DAXFS AI Agent Speculative Branching Benchmark" + log "===============================================" + log " max_agents=$MAX_AGENTS max_depth=$MAX_DEPTH workload=$WORKLOAD" + log " reps=$REPS output=$OUTPUT_DIR" + log "" + + check_prerequisites + load_module + tsv_init + + # Create persistent test directory and source tree + TEST_DIR=$(mktemp -d /tmp/daxfs_bench.XXXXXX) + trap full_cleanup EXIT + + local source="$TEST_DIR/source" + log "Generating source tree..." + create_source_tree "$source" + local src_files + src_files=$(find "$source" -type f | wc -l) + log " $src_files files in source tree" + log "" + + # ── Run experiments ── + + scale_agents + log "" + + scale_depth + log "" + + commit_cost + log "" + + baseline_tmpfs + log "" + + baseline_overlayfs + log "" + + # ── Summary ── + local rows + rows=$(tail -n +2 "$TSV_FILE" | wc -l) + log "===============================================" + log "Benchmark complete: $rows data points" + log "Results: $TSV_FILE" +} + +main "$@" diff --git a/tests/bench_gpu.cu b/tests/bench_gpu.cu new file mode 100644 index 0000000..bb14177 --- /dev/null +++ b/tests/bench_gpu.cu @@ -0,0 +1,1004 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAXFS GPU Coordination Benchmark + * + * Measures PCIe AtomicOp latency and throughput for the GPU-side + * coordination primitives (coord lock, page cache CAS, commit seq read). + * + * Uses cudaMallocHost (pinned host memory) to create a GPU-accessible + * region that mirrors the daxfs DAX layout. GPU kernels then exercise + * the same atomicCAS / volatile-read primitives from daxfs_gpu.h, + * measuring PCIe round-trip latency across the bus. + * + * Usage: sudo ./tests/bench_gpu [options] + */ + +#include +#include +#include +#include +#include +#include + +/* GPU-side primitives */ +#include "../include/daxfs_gpu.h" + +/* ── Configuration ────────────────────────────────────────────────── */ + +#define DEFAULT_ITERS 10000 +#define DEFAULT_REPS 5 +#define WARMUP_ITERS 200 +#define PCACHE_SLOTS 4096 + +/* ── P2P DMA and Multi-tenant constants ──────────────────────────── */ + +#define P2P_MAX_SIZE (1 << 20) /* 1 MB max transfer */ +#define P2P_REGION_SIZE (4 << 20) /* 4 MB P2P data region */ + +#define MT_MAX_TENANTS 32 +#define MT_PERM_NONE 0 +#define MT_PERM_READ 1 +#define MT_PERM_WRITE 2 +#define MT_PERM_RW 3 + +/* ── Simulated DAX layout in pinned host memory ───────────────────── */ + +struct dax_sim { + /* Coordination region (mirrors daxfs_global_coord) */ + unsigned int coord_lock; /* 0=free, 1=held */ + unsigned int coord_pad; + unsigned long long commit_sequence; + unsigned long long last_committed_id; + + /* Page cache header (mirrors daxfs_pcache_header) */ + unsigned int pcache_pending; + unsigned int pcache_pad[3]; + + /* Page cache slot metadata array */ + struct { + unsigned long long state_tag; /* bits[1:0]=state, bits[63:2]=tag */ + unsigned int ref_bit; + unsigned int reserved; + } slots[PCACHE_SLOTS]; + + /* Multi-tenant permission table (mirrors CXL shared memory ACL). + * perm[tenant][slot] = MT_PERM_{NONE,READ,WRITE,RW}. + * GPU checks this before every read/write to enforce isolation. */ + unsigned char perm[MT_MAX_TENANTS][PCACHE_SLOTS]; +}; + +/* Separate P2P DMA data region (pinned host, simulates CXL memory pages) */ +struct p2p_region { + char data[P2P_REGION_SIZE]; +}; + +/* ── TSV output ───────────────────────────────────────────────────── */ + +static FILE *tsv_fp; + +static void tsv_init(const char *path) +{ + tsv_fp = fopen(path, "w"); + if (!tsv_fp) { perror("fopen"); exit(1); } + fprintf(tsv_fp, "experiment\tparameter\titeration\toperation\t" + "latency_ns\tops_count\tthru_mops\tnotes\n"); +} + +static void tsv_row(const char *exp, int param, int iter, + const char *op, double lat_ns, int ops, + double mops, const char *notes) +{ + fprintf(tsv_fp, "%s\t%d\t%d\t%s\t%.1f\t%d\t%.3f\t%s\n", + exp, param, iter, op, lat_ns, ops, mops, notes ? notes : ""); + fflush(tsv_fp); +} + +/* ── GPU timer ────────────────────────────────────────────────────── */ + +static double event_ms_to_ns(cudaEvent_t a, cudaEvent_t b) +{ + float ms; + cudaEventElapsedTime(&ms, a, b); + return (double)ms * 1e6; +} + +/* ── Kernel 1: Coord Lock/Unlock Round-Trip (single thread) ──────── */ + +__global__ void kern_lock_rt(unsigned int *lock, int iters, + long long *out_cycles) +{ + long long t0 = clock64(); + for (int i = 0; i < iters; i++) { + daxfs_gpu_coord_lock(lock); + daxfs_gpu_coord_unlock(lock); + } + *out_cycles = clock64() - t0; +} + +/* ── Kernel 2: Commit Sequence Volatile Read ─────────────────────── */ + +__global__ void kern_seq_read(const unsigned long long *seq, int iters, + unsigned long long *sink, long long *out_cycles) +{ + unsigned long long s = 0; + long long t0 = clock64(); + for (int i = 0; i < iters; i++) + s += daxfs_gpu_read_commit_seq(seq); + *out_cycles = clock64() - t0; + *sink = s; +} + +/* ── Kernel 3: Pending Counter Inc/Dec ────────────────────────────── */ + +__global__ void kern_pending(unsigned int *cnt, int iters, + long long *out_cycles) +{ + long long t0 = clock64(); + for (int i = 0; i < iters; i++) { + daxfs_gpu_pcache_inc_pending(cnt); + daxfs_gpu_pcache_dec_pending(cnt); + } + *out_cycles = clock64() - t0; +} + +/* ── Kernel 4: Page Cache Lookup (scaling threads) ────────────────── */ + +__global__ void kern_lookup(const unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, int iters, + unsigned int *out_hits) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + const unsigned long long *st = + (const unsigned long long *)((const char *)slot_base + + (unsigned long long)idx * stride + st_off); + unsigned long long tag = (unsigned long long)(idx + 1); + unsigned int hits = 0; + + for (int i = 0; i < iters; i++) + if (daxfs_gpu_pcache_lookup(st, tag)) + hits++; + + atomicAdd(out_hits, hits); +} + +/* ── Kernel 5: CAS Throughput on Independent Slots ────────────────── */ + +__global__ void kern_cas_indep(unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, int iters) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + unsigned long long *st = + (unsigned long long *)((char *)slot_base + + (unsigned long long)idx * stride + st_off); + + unsigned long long va = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + unsigned long long vb = PCACHE_MAKE(PCACHE_STATE_PENDING, + (unsigned long long)(tid + 1)); + + for (int i = 0; i < iters; i++) { + unsigned long long old = daxfs_gpu_slot_cmpxchg(st, va, vb); + if (old == vb) + daxfs_gpu_slot_cmpxchg(st, vb, va); + } +} + +/* ── Kernel 6: Lock Contention (multiple threads, one lock) ──────── */ + +__global__ void kern_lock_contend(unsigned int *lock, int iters, + unsigned int *counter) +{ + for (int i = 0; i < iters; i++) { + daxfs_gpu_coord_lock(lock); + (*counter)++; + __threadfence_system(); + daxfs_gpu_coord_unlock(lock); + } +} + +/* ── Kernel 7: Page Cache Claim (FREE→PENDING transition) ─────────── */ + +__global__ void kern_claim(unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, int iters, + unsigned int *pending_count) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + unsigned long long *st = + (unsigned long long *)((char *)slot_base + + (unsigned long long)idx * stride + st_off); + unsigned long long tag = (unsigned long long)(idx + 1); + + for (int i = 0; i < iters; i++) { + if (daxfs_gpu_pcache_claim(st, tag)) { + daxfs_gpu_pcache_inc_pending(pending_count); + /* Reset to FREE for next iteration */ + *st = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + __threadfence_system(); + daxfs_gpu_pcache_dec_pending(pending_count); + } + } +} + +/* ═══════════════════════════════════════════════════════════════════ + * P2P DMA kernels — GPU directly reads/writes CXL/DAX host memory + * ═══════════════════════════════════════════════════════════════════ */ + +/* Kernel 8a: P2P DMA Read — GPU reads pages from host CXL memory */ +__global__ void kern_p2p_read(const char *host_src, char *dev_dst, + unsigned int xfer_size, int iters) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int offset = (tid * xfer_size) % P2P_REGION_SIZE; + const char *src = host_src + offset; + + for (int i = 0; i < iters; i++) { + /* Explicit GPU load from host (PCIe read TLPs) */ + for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) { + unsigned long long val = + *(volatile const unsigned long long *)(src + b); + /* Store to device mem so compiler doesn't elide the load */ + if (dev_dst) + *(unsigned long long *)(dev_dst + b) = val; + } + } +} + +/* Kernel 8b: P2P DMA Write — GPU writes pages to host CXL memory */ +__global__ void kern_p2p_write(char *host_dst, unsigned int xfer_size, + int iters) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int offset = (tid * xfer_size) % P2P_REGION_SIZE; + char *dst = host_dst + offset; + unsigned long long pattern = 0xDA5F5DA5ULL + tid; + + for (int i = 0; i < iters; i++) { + /* Explicit GPU store to host (PCIe write TLPs) */ + for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) { + *(volatile unsigned long long *)(dst + b) = pattern + b; + } + __threadfence_system(); + } +} + +/* ═══════════════════════════════════════════════════════════════════ + * Multi-tenant CXL memory permission-checked access kernels + * + * Simulates GPU agents from different tenants sharing a CXL memory + * pool. Each access performs an explicit permission check against + * a per-tenant ACL bitmap in the shared DAX region before issuing + * the actual read/write. This is the GPU-side enforcement path + * that mirrors what hardware CXL.mem permission checks would do. + * ═══════════════════════════════════════════════════════════════════ */ + +/* + * Permission check + read: GPU thread loads perm[tenant][slot], + * verifies MT_PERM_READ is set, then reads the slot data. + */ +__global__ void kern_mt_read(const unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, + const unsigned char *perm_base, + unsigned int tenant_id, int iters, + unsigned int *out_allowed, + unsigned int *out_denied) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + unsigned int allowed = 0, denied = 0; + + /* Permission table row for this tenant */ + const unsigned char *my_perm = perm_base + tenant_id * slot_count; + + const unsigned long long *st = + (const unsigned long long *)((const char *)slot_base + + (unsigned long long)idx * stride + st_off); + + for (int i = 0; i < iters; i++) { + /* Step 1: Explicit permission check (volatile read from host ACL) */ + unsigned char p = *(volatile const unsigned char *)(my_perm + idx); + + if (p & MT_PERM_READ) { + /* Step 2: Allowed — read the slot via PCIe */ + unsigned long long val = *(volatile const unsigned long long *)st; + (void)val; + allowed++; + } else { + denied++; + } + /* Rotate to next slot */ + idx = (idx + 7) % slot_count; + st = (const unsigned long long *)((const char *)slot_base + + (unsigned long long)idx * stride + st_off); + } + + atomicAdd(out_allowed, allowed); + atomicAdd(out_denied, denied); +} + +/* + * Permission check + write: GPU thread checks MT_PERM_WRITE, + * then performs CAS on the slot (simulating a page cache mutation). + */ +__global__ void kern_mt_write(unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, + const unsigned char *perm_base, + unsigned int tenant_id, int iters, + unsigned int *out_allowed, + unsigned int *out_denied) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + unsigned int allowed = 0, denied = 0; + + const unsigned char *my_perm = perm_base + tenant_id * slot_count; + + for (int i = 0; i < iters; i++) { + unsigned long long *st = + (unsigned long long *)((char *)slot_base + + (unsigned long long)idx * stride + st_off); + + /* Step 1: Explicit permission check */ + unsigned char p = *(volatile const unsigned char *)(my_perm + idx); + + if (p & MT_PERM_WRITE) { + /* Step 2: Allowed — CAS write to slot (PCIe CAS TLP) */ + unsigned long long old = *st; + unsigned long long new_val = old + 1; + daxfs_gpu_slot_cmpxchg(st, old, new_val); + allowed++; + } else { + denied++; + } + idx = (idx + 7) % slot_count; + } + + atomicAdd(out_allowed, allowed); + atomicAdd(out_denied, denied); +} + +/* + * Baseline: same read without permission check, to measure + * the overhead of the ACL lookup. + */ +__global__ void kern_mt_read_noacl(const unsigned long long *slot_base, + unsigned int stride, unsigned int st_off, + unsigned int slot_count, int iters, + unsigned int *out_count) +{ + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idx = tid % slot_count; + unsigned int count = 0; + + for (int i = 0; i < iters; i++) { + const unsigned long long *st = + (const unsigned long long *)((const char *)slot_base + + (unsigned long long)idx * stride + st_off); + unsigned long long val = *(volatile const unsigned long long *)st; + (void)val; + count++; + idx = (idx + 7) % slot_count; + } + + atomicAdd(out_count, count); +} + +/* ── Error checking ───────────────────────────────────────────────── */ + +#define CK(call) do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "CUDA %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(1); \ + } \ +} while(0) + +/* ── Main ─────────────────────────────────────────────────────────── */ + +int main(int argc, char **argv) +{ + const char *tsv_path = "./bench_results/gpu.tsv"; + int iters = DEFAULT_ITERS; + int reps = DEFAULT_REPS; + int opt; + + while ((opt = getopt(argc, argv, "i:r:o:h")) != -1) { + switch (opt) { + case 'i': iters = atoi(optarg); break; + case 'r': reps = atoi(optarg); break; + case 'o': tsv_path = optarg; break; + default: + fprintf(stderr, "Usage: %s [-i iters] [-r reps] [-o out.tsv]\n", + argv[0]); + return 1; + } + } + + /* GPU info */ + cudaDeviceProp prop; + CK(cudaGetDeviceProperties(&prop, 0)); + int clock_khz; + CK(cudaDeviceGetAttribute(&clock_khz, cudaDevAttrClockRate, 0)); + double ns_per_cycle = 1e6 / (double)clock_khz; + + printf("DAXFS GPU Coordination Benchmark\n"); + printf("================================\n"); + printf("GPU: %s (SM %d.%d, %d SMs, %.0f MHz)\n", + prop.name, prop.major, prop.minor, + prop.multiProcessorCount, prop.clockRate / 1000.0); + printf(" iters=%d reps=%d pcache_slots=%d\n", iters, reps, PCACHE_SLOTS); + printf(" ns/cycle=%.2f\n\n", ns_per_cycle); + + /* Allocate pinned host memory simulating DAX region */ + struct dax_sim *dax; + CK(cudaMallocHost(&dax, sizeof(struct dax_sim))); + memset(dax, 0, sizeof(struct dax_sim)); + + /* Initialize pcache slots as VALID with known tags for lookup test */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_VALID, + (unsigned long long)(i + 1)); + + /* Get device pointers for pinned memory */ + unsigned int *d_lock = &dax->coord_lock; + unsigned long long *d_seq = &dax->commit_sequence; + unsigned int *d_pend = &dax->pcache_pending; + unsigned long long *d_slot_base = &dax->slots[0].state_tag; + unsigned int slot_stride = sizeof(dax->slots[0]); /* 16 bytes */ + unsigned int st_off = 0; /* state_tag at offset 0 in slot */ + + /* Scratch device memory */ + long long *d_cycles; + unsigned long long *d_sink; + unsigned int *d_counter, *d_hits; + CK(cudaMalloc(&d_cycles, sizeof(long long))); + CK(cudaMalloc(&d_sink, sizeof(unsigned long long))); + CK(cudaMalloc(&d_counter, sizeof(unsigned int))); + CK(cudaMalloc(&d_hits, sizeof(unsigned int))); + + cudaEvent_t ev0, ev1; + CK(cudaEventCreate(&ev0)); + CK(cudaEventCreate(&ev1)); + + tsv_init(tsv_path); + + printf("DAX simulation at %p (pinned host memory, GPU-visible via PCIe)\n\n", + (void *)dax); + + /* ── Exp 1: Coord Lock Round-Trip ─────────────────────────────── */ + printf("Exp 1: Coordination Lock Round-Trip (single thread)\n"); + kern_lock_rt<<<1, 1>>>(d_lock, WARMUP_ITERS, d_cycles); + CK(cudaDeviceSynchronize()); + dax->coord_lock = 0; + + for (int r = 0; r < reps; r++) { + long long cyc; + CK(cudaEventRecord(ev0)); + kern_lock_rt<<<1, 1>>>(d_lock, iters, d_cycles); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost)); + double ns_op = (double)cyc * ns_per_cycle / iters; + double ev_ns = event_ms_to_ns(ev0, ev1); + printf(" rep %d: %.1f ns/op (cycles), %.1f ns/op (wall)\n", + r + 1, ns_op, ev_ns / iters); + tsv_row("gpu_coord_lock", 1, r + 1, "lock_unlock_rt", + ns_op, iters, iters / (ev_ns / 1e3), "single_thread"); + dax->coord_lock = 0; + } + + /* ── Exp 2: Commit Seq Volatile Read ──────────────────────────── */ + printf("\nExp 2: Commit Sequence Read (volatile PCIe read)\n"); + kern_seq_read<<<1, 1>>>(d_seq, WARMUP_ITERS, d_sink, d_cycles); + CK(cudaDeviceSynchronize()); + + for (int r = 0; r < reps; r++) { + long long cyc; + CK(cudaEventRecord(ev0)); + kern_seq_read<<<1, 1>>>(d_seq, iters, d_sink, d_cycles); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost)); + double ns_op = (double)cyc * ns_per_cycle / iters; + double ev_ns = event_ms_to_ns(ev0, ev1); + printf(" rep %d: %.1f ns/op\n", r + 1, ns_op); + tsv_row("gpu_commit_seq", 1, r + 1, "volatile_read", + ns_op, iters, iters / (ev_ns / 1e3), "single_thread"); + } + + /* ── Exp 3: Pending Counter Inc/Dec ───────────────────────────── */ + printf("\nExp 3: Pending Counter CAS Inc/Dec\n"); + kern_pending<<<1, 1>>>(d_pend, WARMUP_ITERS, d_cycles); + CK(cudaDeviceSynchronize()); + dax->pcache_pending = 0; + + for (int r = 0; r < reps; r++) { + long long cyc; + CK(cudaEventRecord(ev0)); + kern_pending<<<1, 1>>>(d_pend, iters, d_cycles); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost)); + double ns_op = (double)cyc * ns_per_cycle / (iters * 2); + double ev_ns = event_ms_to_ns(ev0, ev1); + printf(" rep %d: %.1f ns/op (CAS pair)\n", r + 1, ns_op); + tsv_row("gpu_pending_ctr", 1, r + 1, "cas_inc_dec", + ns_op, iters * 2, (iters * 2) / (ev_ns / 1e3), "single_thread"); + dax->pcache_pending = 0; + } + + /* ── Exp 4: Page Cache Lookup Throughput ──────────────────────── */ + printf("\nExp 4: Page Cache Lookup (scaling threads)\n"); + { + int tcounts[] = {1, 32, 64, 128, 256, 512, 1024}; + for (int c = 0; c < 7; c++) { + int nt = tcounts[c]; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_hits, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + kern_lookup<<>>(d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, iters, d_hits); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * iters; + double mops = ops / (ev_ns / 1e3); + printf(" threads=%4d: %8.3f Mops/s\n", nt, mops); + tsv_row("gpu_pcache_lookup", nt, r + 1, "lookup_throughput", + ev_ns / ops, (int)ops, mops, ""); + } + } + } + + /* ── Exp 5: Slot CAS Throughput (independent) ─────────────────── */ + printf("\nExp 5: Slot CAS Throughput (independent slots)\n"); + { + /* Reset slots to FREE for CAS test */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + + int tcounts[] = {1, 32, 64, 128, 256, 512, 1024}; + int cas_iters = iters / 10; + for (int c = 0; c < 7; c++) { + int nt = tcounts[c]; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + for (int r = 0; r < reps; r++) { + CK(cudaEventRecord(ev0)); + kern_cas_indep<<>>(d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, cas_iters); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * cas_iters; + double mops = ops / (ev_ns / 1e3); + printf(" threads=%4d: %8.3f Mops/s\n", nt, mops); + tsv_row("gpu_slot_cas", nt, r + 1, "cas_throughput", + ev_ns / ops, (int)ops, mops, "independent"); + } + } + } + + /* ── Exp 6: Lock Contention ───────────────────────────────────── */ + printf("\nExp 6: Lock Contention (scaling threads)\n"); + { + int tcounts[] = {1, 2, 4, 8, 16, 32}; + int lock_iters = iters / 100; + for (int c = 0; c < 6; c++) { + int nt = tcounts[c]; + for (int r = 0; r < reps; r++) { + dax->coord_lock = 0; + CK(cudaMemset(d_counter, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + kern_lock_contend<<<1, nt>>>(d_lock, lock_iters, d_counter); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + unsigned int ctr; + CK(cudaMemcpy(&ctr, d_counter, sizeof(ctr), + cudaMemcpyDeviceToHost)); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * lock_iters; + double ns_acq = ev_ns / ops; + printf(" threads=%2d: %8.1f ns/acq counter=%u/%lld\n", + nt, ns_acq, ctr, ops); + char note[64]; + snprintf(note, sizeof(note), "counter=%u", ctr); + tsv_row("gpu_lock_contention", nt, r + 1, "lock_acquisition", + ns_acq, (int)ops, ops / (ev_ns / 1e3), note); + } + } + } + + /* ── Exp 7: Page Cache Claim (FREE→PENDING) ───────────────────── */ + printf("\nExp 7: Page Cache Claim (FREE->PENDING transition)\n"); + { + int tcounts[] = {1, 32, 64, 128, 256, 512, 1024}; + int claim_iters = iters / 10; + for (int c = 0; c < 7; c++) { + int nt = tcounts[c]; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + /* Reset slots to FREE */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + dax->pcache_pending = 0; + + for (int r = 0; r < reps; r++) { + CK(cudaEventRecord(ev0)); + kern_claim<<>>(d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, claim_iters, d_pend); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * claim_iters; + double mops = ops / (ev_ns / 1e3); + printf(" threads=%4d: %8.3f Mops/s\n", nt, mops); + tsv_row("gpu_pcache_claim", nt, r + 1, "claim_throughput", + ev_ns / ops, (int)ops, mops, "free_to_pending"); + /* Reset for next rep */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0); + dax->pcache_pending = 0; + } + } + } + + /* ═══════════════════════════════════════════════════════════════ + * Exp 8: P2P DMA — GPU explicit read/write to CXL host memory + * ═══════════════════════════════════════════════════════════════ */ + printf("\n════════════════════════════════════════════\n"); + printf("Exp 8: P2P DMA Read/Write (GPU ↔ CXL host memory)\n"); + { + /* Allocate pinned host region simulating CXL memory pages */ + struct p2p_region *p2p; + CK(cudaMallocHost(&p2p, sizeof(struct p2p_region))); + memset(p2p->data, 0xAB, P2P_REGION_SIZE); + + /* Device-side scratch buffer for reads */ + char *d_scratch; + CK(cudaMalloc(&d_scratch, P2P_MAX_SIZE)); + + unsigned int sizes[] = {64, 256, 4096, 65536, 1 << 20}; + const char *labels[] = {"64B", "256B", "4KB", "64KB", "1MB"}; + + /* 8a: P2P Read (GPU loads from host CXL memory) */ + printf(" ── P2P Read ──\n"); + for (int s = 0; s < 5; s++) { + unsigned int xsz = sizes[s]; + int nt = (xsz <= 4096) ? 64 : 1; + int p2p_iters = (xsz <= 4096) ? iters : iters / 100; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + + for (int r = 0; r < reps; r++) { + CK(cudaEventRecord(ev0)); + kern_p2p_read<<>>(p2p->data, d_scratch, + xsz, p2p_iters); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long total_bytes = (long long)nt * p2p_iters * xsz; + double bw_gbps = total_bytes / (ev_ns / 1.0); /* B/ns = GB/s */ + double lat_ns = ev_ns / ((long long)nt * p2p_iters); + printf(" %5s: %8.2f GB/s %8.1f ns/op\n", + labels[s], bw_gbps, lat_ns); + char note[32]; + snprintf(note, sizeof(note), "size=%s", labels[s]); + tsv_row("gpu_p2p_read", xsz, r + 1, "read_bw", + lat_ns, (int)(total_bytes >> 10), bw_gbps * 1000, note); + } + } + + /* 8b: P2P Write (GPU stores to host CXL memory) */ + printf(" ── P2P Write ──\n"); + for (int s = 0; s < 5; s++) { + unsigned int xsz = sizes[s]; + int nt = (xsz <= 4096) ? 64 : 1; + int p2p_iters = (xsz <= 4096) ? iters : iters / 100; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + + for (int r = 0; r < reps; r++) { + CK(cudaEventRecord(ev0)); + kern_p2p_write<<>>(p2p->data, xsz, p2p_iters); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long total_bytes = (long long)nt * p2p_iters * xsz; + double bw_gbps = total_bytes / (ev_ns / 1.0); + double lat_ns = ev_ns / ((long long)nt * p2p_iters); + printf(" %5s: %8.2f GB/s %8.1f ns/op\n", + labels[s], bw_gbps, lat_ns); + char note[32]; + snprintf(note, sizeof(note), "size=%s", labels[s]); + tsv_row("gpu_p2p_write", xsz, r + 1, "write_bw", + lat_ns, (int)(total_bytes >> 10), bw_gbps * 1000, note); + } + } + + cudaFreeHost(p2p); + cudaFree(d_scratch); + } + + /* ═══════════════════════════════════════════════════════════════ + * Exp 9: Multi-tenant CXL memory permission-checked access + * + * Simulates multiple AI agent tenants sharing CXL memory. + * GPU explicitly checks per-tenant permission bitmap in shared + * DAX region before each read/write — the GPU-side enforcement + * of CXL.mem access control. + * ═══════════════════════════════════════════════════════════════ */ + printf("\n════════════════════════════════════════════\n"); + printf("Exp 9: Multi-tenant CXL Permission-Checked Access\n"); + { + /* Re-init slots to VALID for read tests */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_VALID, + (unsigned long long)(i + 1)); + + unsigned int *d_allowed, *d_denied; + CK(cudaMalloc(&d_allowed, sizeof(unsigned int))); + CK(cudaMalloc(&d_denied, sizeof(unsigned int))); + + unsigned char *perm_base = &dax->perm[0][0]; + + /* 9a: Permission check overhead — compare with vs without ACL */ + printf(" ── 9a: ACL Check Overhead (256 threads) ──\n"); + { + int nt = 256, blk = 1, tpb = 256; + int mt_iters = iters; + + /* Setup: tenant 0 has READ on all slots */ + memset(dax->perm[0], MT_PERM_RW, PCACHE_SLOTS); + + /* With ACL */ + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_allowed, 0, sizeof(unsigned int))); + CK(cudaMemset(d_denied, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + kern_mt_read<<>>(d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, perm_base, + 0, mt_iters, + d_allowed, d_denied); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * mt_iters; + double mops = ops / (ev_ns / 1e3); + printf(" WITH ACL: %8.3f Mops/s (%.1f ns/op)\n", + mops, ev_ns / ops); + tsv_row("gpu_mt_acl_overhead", nt, r + 1, "read_with_acl", + ev_ns / ops, (int)ops, mops, "all_allowed"); + } + + /* Without ACL (baseline) */ + CK(cudaMemset(d_hits, 0, sizeof(unsigned int))); + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_hits, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + kern_mt_read_noacl<<>>(d_slot_base, slot_stride, + st_off, PCACHE_SLOTS, + mt_iters, d_hits); + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * mt_iters; + double mops = ops / (ev_ns / 1e3); + printf(" NO ACL: %8.3f Mops/s (%.1f ns/op)\n", + mops, ev_ns / ops); + tsv_row("gpu_mt_acl_overhead", nt, r + 1, "read_no_acl", + ev_ns / ops, (int)ops, mops, "baseline"); + } + } + + /* 9b: Multi-tenant read scaling (each warp = different tenant) */ + printf(" ── 9b: Multi-tenant Read (scaling tenants) ──\n"); + { + int tcounts[] = {1, 2, 4, 8, 16, 32}; + int mt_iters = iters; + + for (int c = 0; c < 6; c++) { + int n_tenants = tcounts[c]; + int threads_per_tenant = 32; /* one warp per tenant */ + int nt = n_tenants * threads_per_tenant; + int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256; + + /* Setup: all tenants get READ on their own slot range, + * NONE on others' slots (isolation) */ + memset(dax->perm, MT_PERM_NONE, + MT_MAX_TENANTS * PCACHE_SLOTS); + int slots_per_tenant = PCACHE_SLOTS / n_tenants; + for (int t = 0; t < n_tenants; t++) { + int start = t * slots_per_tenant; + memset(&dax->perm[t][start], MT_PERM_READ, + slots_per_tenant); + } + + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_allowed, 0, sizeof(unsigned int))); + CK(cudaMemset(d_denied, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + + /* Launch one warp per tenant, each with its tenant_id. + * Use a per-tenant kernel launch to keep it simple. */ + for (int t = 0; t < n_tenants; t++) { + kern_mt_read<<<1, threads_per_tenant>>>( + d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, perm_base, + t, mt_iters, d_allowed, d_denied); + } + + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + + unsigned int h_allowed, h_denied; + CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * mt_iters; + double mops = ops / (ev_ns / 1e3); + + printf(" tenants=%2d: %8.3f Mops/s allowed=%u " + "denied=%u\n", + n_tenants, mops, h_allowed, h_denied); + + char note[64]; + snprintf(note, sizeof(note), + "allowed=%u,denied=%u", h_allowed, h_denied); + tsv_row("gpu_mt_read", n_tenants, r + 1, + "mt_read_throughput", + ev_ns / ops, (int)ops, mops, note); + } + } + } + + /* 9c: Multi-tenant write with permission check */ + printf(" ── 9c: Multi-tenant Write (scaling tenants) ──\n"); + { + int tcounts[] = {1, 2, 4, 8, 16, 32}; + int mt_iters = iters / 10; /* CAS writes are slower */ + + for (int c = 0; c < 6; c++) { + int n_tenants = tcounts[c]; + int threads_per_tenant = 32; + int nt = n_tenants * threads_per_tenant; + + /* Setup: each tenant gets WRITE on its slot range */ + memset(dax->perm, MT_PERM_NONE, + MT_MAX_TENANTS * PCACHE_SLOTS); + int slots_per_tenant = PCACHE_SLOTS / n_tenants; + for (int t = 0; t < n_tenants; t++) { + int start = t * slots_per_tenant; + memset(&dax->perm[t][start], MT_PERM_RW, + slots_per_tenant); + } + + /* Reset slots */ + for (int i = 0; i < PCACHE_SLOTS; i++) + dax->slots[i].state_tag = PCACHE_MAKE( + PCACHE_STATE_VALID, (unsigned long long)(i + 1)); + + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_allowed, 0, sizeof(unsigned int))); + CK(cudaMemset(d_denied, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + + for (int t = 0; t < n_tenants; t++) { + kern_mt_write<<<1, threads_per_tenant>>>( + d_slot_base, slot_stride, st_off, + PCACHE_SLOTS, perm_base, + t, mt_iters, d_allowed, d_denied); + } + + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + + unsigned int h_allowed, h_denied; + CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)nt * mt_iters; + double mops = ops / (ev_ns / 1e3); + + printf(" tenants=%2d: %8.3f Mops/s allowed=%u " + "denied=%u\n", + n_tenants, mops, h_allowed, h_denied); + + char note[64]; + snprintf(note, sizeof(note), + "allowed=%u,denied=%u", h_allowed, h_denied); + tsv_row("gpu_mt_write", n_tenants, r + 1, + "mt_write_throughput", + ev_ns / ops, (int)ops, mops, note); + } + } + } + + /* 9d: Cross-tenant isolation test — tenant tries to access + * another tenant's slots (should all be denied) */ + printf(" ── 9d: Cross-tenant Isolation (denied access) ──\n"); + { + int n_tenants = 4; + int threads_per_tenant = 32; + int mt_iters = iters; + + /* Setup: 4 tenants, each owns 1/4 of slots */ + memset(dax->perm, MT_PERM_NONE, + MT_MAX_TENANTS * PCACHE_SLOTS); + int slots_per_tenant = PCACHE_SLOTS / n_tenants; + for (int t = 0; t < n_tenants; t++) { + int start = t * slots_per_tenant; + memset(&dax->perm[t][start], MT_PERM_RW, + slots_per_tenant); + } + + for (int r = 0; r < reps; r++) { + CK(cudaMemset(d_allowed, 0, sizeof(unsigned int))); + CK(cudaMemset(d_denied, 0, sizeof(unsigned int))); + CK(cudaEventRecord(ev0)); + + /* Tenant 0 tries to read ALL slots (including tenant 1-3's). + * Perm check uses global slot index, so tenant 0 can only + * access its own 1/4 — the rest should be denied. */ + kern_mt_read<<<1, threads_per_tenant>>>( + d_slot_base, + slot_stride, st_off, PCACHE_SLOTS, + perm_base, 0 /* tenant 0 */, mt_iters, + d_allowed, d_denied); + + CK(cudaEventRecord(ev1)); + CK(cudaDeviceSynchronize()); + + unsigned int h_allowed, h_denied; + CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int), + cudaMemcpyDeviceToHost)); + + double ev_ns = event_ms_to_ns(ev0, ev1); + long long ops = (long long)threads_per_tenant * mt_iters; + double mops = ops / (ev_ns / 1e3); + + printf(" cross-tenant: %8.3f Mops/s allowed=%u " + "denied=%u (%s)\n", + mops, h_allowed, h_denied, + h_allowed == 0 ? "ISOLATED" : "VIOLATION!"); + + char note[80]; + snprintf(note, sizeof(note), + "allowed=%u,denied=%u,%s", + h_allowed, h_denied, + h_allowed == 0 ? "isolated" : "violation"); + tsv_row("gpu_mt_isolation", n_tenants, r + 1, + "cross_tenant_read", + ev_ns / ops, (int)ops, mops, note); + } + } + + cudaFree(d_allowed); + cudaFree(d_denied); + } + + /* ── Cleanup ──────────────────────────────────────────────────── */ + fclose(tsv_fp); + cudaEventDestroy(ev0); + cudaEventDestroy(ev1); + cudaFree(d_cycles); + cudaFree(d_sink); + cudaFree(d_counter); + cudaFree(d_hits); + cudaFreeHost(dax); + + printf("\nResults written to %s\n", tsv_path); + return 0; +} diff --git a/tests/cuda_compat/crt/math_functions.h b/tests/cuda_compat/crt/math_functions.h new file mode 100644 index 0000000..5bb5adc --- /dev/null +++ b/tests/cuda_compat/crt/math_functions.h @@ -0,0 +1,24 @@ +/* Local wrapper to fix glibc 2.41+ / CUDA rsqrt noexcept conflict */ +#ifndef _DAXFS_MATH_FUNCTIONS_COMPAT_H +#define _DAXFS_MATH_FUNCTIONS_COMPAT_H + +/* Include the real CUDA header first */ +#include_next + +/* Now suppress the glibc redeclarations by pre-declaring them + in a compatible way before glibc's math.h gets pulled in. */ +#ifdef __cplusplus +extern "C" { +#endif + +/* Override glibc's rsqrt/rsqrtf declarations to avoid noexcept mismatch. + We define them as weak aliases so they don't conflict. */ +#ifdef __GLIBC__ +#define __DAXFS_RSQRT_COMPAT +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/tests/plot_bench.py b/tests/plot_bench.py new file mode 100644 index 0000000..9ac8860 --- /dev/null +++ b/tests/plot_bench.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +""" +DAXFS Benchmark Plotter + +Reads agents.tsv and produces a multi-panel figure comparing DAXFS +against tmpfs and overlayfs baselines. +""" + +import sys +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np + +def load_data(tsv_path): + df = pd.read_csv(tsv_path, sep='\t') + df['latency_ms'] = df['latency_us'] / 1000.0 + return df + +def mean_latency(df, experiment, operation): + sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)] + return sub.groupby('parameter')['latency_ms'].agg(['mean', 'std']).reset_index() + +def plot_agent_scalability(ax, df): + """Panel 1: Branch creation time (total) vs N agents — DAXFS vs tmpfs vs overlayfs""" + for exp, label, color, marker in [ + ('scale_agents', 'DAXFS', '#2196F3', 'o'), + ('baseline_tmpfs', 'tmpfs (cp -a)', '#FF5722', 's'), + ('baseline_overlayfs', 'OverlayFS', '#4CAF50', '^'), + ]: + d = mean_latency(df, exp, 'branch_create_total') + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Number of Agents (N)') + ax.set_ylabel('Total Branch Creation (ms)') + ax.set_title('Branch Creation Cost vs Agent Count') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_parallel_workload(ax, df): + """Panel 2: Parallel workload time vs N agents""" + for exp, label, color, marker in [ + ('scale_agents', 'DAXFS', '#2196F3', 'o'), + ('baseline_tmpfs', 'tmpfs', '#FF5722', 's'), + ('baseline_overlayfs', 'OverlayFS', '#4CAF50', '^'), + ]: + d = mean_latency(df, exp, 'parallel_workload') + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Number of Agents (N)') + ax.set_ylabel('Parallel Workload Time (ms)') + ax.set_title('Parallel Agent Workload Scalability') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_commit_time(ax, df): + """Panel 3: Commit time vs N agents""" + for exp, label, color, marker in [ + ('scale_agents', 'DAXFS', '#2196F3', 'o'), + ('baseline_tmpfs', 'tmpfs (cp -a)', '#FF5722', 's'), + ('baseline_overlayfs', 'OverlayFS (cp upper)', '#4CAF50', '^'), + ]: + d = mean_latency(df, exp, 'commit') + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Number of Agents (N)') + ax.set_ylabel('Commit Time (ms)') + ax.set_title('Commit / Merge Cost') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_branch_create_avg(ax, df): + """Panel 4: Per-branch creation cost (avg) vs N — shows O(1) vs O(tree_size)""" + for exp, label, color, marker in [ + ('scale_agents', 'DAXFS (O(1) CoW)', '#2196F3', 'o'), + ('baseline_tmpfs', 'tmpfs (O(n) copy)', '#FF5722', 's'), + ('baseline_overlayfs', 'OverlayFS (mount)', '#4CAF50', '^'), + ]: + d = mean_latency(df, exp, 'branch_create_avg') + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Number of Agents (N)') + ax.set_ylabel('Per-Branch Creation (ms)') + ax.set_title('Per-Branch Creation Cost (Average)') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_nesting_depth(ax, df): + """Panel 5: Chain creation + commit time vs nesting depth""" + for op, label, color, marker in [ + ('chain_create', 'Chain Create', '#2196F3', 'o'), + ('commit', 'Commit (walks chain)', '#E91E63', 'D'), + ('workload', 'Workload at Deepest', '#9E9E9E', 'x'), + ]: + d = mean_latency(df, 'scale_depth', op) + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Nesting Depth (D)') + ax.set_ylabel('Latency (ms)') + ax.set_title('Speculation Nesting Depth') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_commit_cost(ax, df): + """Panel 6: Commit cost vs number of write operations""" + for op, label, color, marker in [ + ('commit', 'Commit Time', '#E91E63', 'D'), + ('workload', 'Workload Time', '#9E9E9E', 'x'), + ]: + d = mean_latency(df, 'commit_cost', op) + if len(d) == 0: + continue + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + label=label, color=color, marker=marker, + capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Number of Write Operations') + ax.set_ylabel('Latency (ms)') + ax.set_title('Commit Cost vs Delta Size') + ax.set_xscale('log') + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def main(): + tsv_path = sys.argv[1] if len(sys.argv) > 1 else './bench_results/agents.tsv' + out_path = sys.argv[2] if len(sys.argv) > 2 else './bench_results/daxfs_bench.png' + + df = load_data(tsv_path) + print(f"Loaded {len(df)} rows from {tsv_path}") + print(f"Experiments: {df['experiment'].unique()}") + + fig, axes = plt.subplots(2, 3, figsize=(18, 11)) + fig.suptitle('DAXFS AI Agent Speculative Branching Benchmark', fontsize=16, fontweight='bold') + + plot_agent_scalability(axes[0, 0], df) + plot_branch_create_avg(axes[0, 1], df) + plot_parallel_workload(axes[0, 2], df) + plot_commit_time(axes[1, 0], df) + plot_nesting_depth(axes[1, 1], df) + plot_commit_cost(axes[1, 2], df) + + plt.tight_layout(rect=[0, 0, 1, 0.95]) + fig.savefig(out_path, dpi=150, bbox_inches='tight') + print(f"Saved figure to {out_path}") + + # Also save individual plots for paper use + for name, plot_fn in [ + ('branch_creation', plot_agent_scalability), + ('branch_avg', plot_branch_create_avg), + ('parallel_workload', plot_parallel_workload), + ('commit_time', plot_commit_time), + ('nesting_depth', plot_nesting_depth), + ('commit_cost', plot_commit_cost), + ]: + fig2, ax2 = plt.subplots(figsize=(7, 5)) + plot_fn(ax2, df) + fig2.tight_layout() + p = out_path.replace('.png', f'_{name}.png') + fig2.savefig(p, dpi=150, bbox_inches='tight') + plt.close(fig2) + print(f" -> {p}") + + plt.close(fig) + +if __name__ == '__main__': + main() diff --git a/tests/plot_gpu.py b/tests/plot_gpu.py new file mode 100644 index 0000000..97e696e --- /dev/null +++ b/tests/plot_gpu.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +""" +DAXFS GPU Coordination Benchmark Plotter + +Reads gpu.tsv and produces multi-panel figures showing PCIe AtomicOp +latency/throughput, P2P DMA bandwidth, and multi-tenant CXL permission +check overhead. +""" + +import sys +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import numpy as np + +def load_data(path): + df = pd.read_csv(path, sep='\t') + return df + +def mean_by_param(df, experiment, operation, value_col='latency_ns'): + sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)] + return sub.groupby('parameter')[value_col].agg(['mean', 'std']).reset_index() + +def mean_by_param_mops(df, experiment, operation): + sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)] + return sub.groupby('parameter')['thru_mops'].agg(['mean', 'std']).reset_index() + +# ── Original 6 panels ──────────────────────────────────────────────── + +def plot_latency_bars(ax, df): + primitives = [ + ('gpu_commit_seq', 'volatile_read', 'PCIe Read\n(commit_seq)', '#2196F3'), + ('gpu_pending_ctr', 'cas_inc_dec', 'CAS Inc/Dec\n(pending_ctr)', '#FF9800'), + ('gpu_coord_lock', 'lock_unlock_rt', 'Lock+Unlock\n(coord_lock)', '#F44336'), + ] + names, means, stds, colors = [], [], [], [] + for exp, op, name, color in primitives: + sub = df[(df['experiment'] == exp) & (df['operation'] == op)] + if len(sub) == 0: continue + names.append(name); means.append(sub['latency_ns'].mean()) + stds.append(sub['latency_ns'].std()); colors.append(color) + bars = ax.bar(names, means, yerr=stds, color=colors, capsize=5, + edgecolor='black', linewidth=0.5) + ax.set_ylabel('Latency (ns)') + ax.set_title('PCIe Atomic Primitive Latency\n(single GPU thread)') + ax.grid(True, alpha=0.3, axis='y') + for bar, m in zip(bars, means): + ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 30, + f'{m:.0f} ns', ha='center', va='bottom', fontsize=9, fontweight='bold') + +def plot_lookup_throughput(ax, df): + d = mean_by_param_mops(df, 'gpu_pcache_lookup', 'lookup_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7) + base = d['mean'].iloc[0] + ax.plot(d['parameter'], d['parameter'] * base, '--', color='gray', + alpha=0.5, label='Ideal linear') + ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('Page Cache Lookup Throughput') + ax.set_xscale('log', base=2); ax.set_yscale('log', base=10) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9); ax.grid(True, alpha=0.3) + +def plot_cas_throughput(ax, df): + d = mean_by_param_mops(df, 'gpu_slot_cas', 'cas_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#E91E63', marker='D', capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('Slot CAS Throughput\n(independent slots, PCIe atomics)') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.grid(True, alpha=0.3) + if len(d) >= 4: + plateau = d['mean'].iloc[-3:].mean() + ax.axhline(y=plateau, color='gray', linestyle=':', alpha=0.5) + ax.text(d['parameter'].iloc[-1], plateau * 1.1, + f'PCIe limit: {plateau:.1f} Mops/s', + ha='right', fontsize=8, color='gray') + +def plot_lock_contention(ax, df): + d = mean_by_param(df, 'gpu_lock_contention', 'lock_acquisition') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#F44336', marker='s', capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('Contending GPU Threads') + ax.set_ylabel('Time per Acquisition (ns)') + ax.set_title('Coordination Lock Contention') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.grid(True, alpha=0.3) + +def plot_claim_throughput(ax, df): + d = mean_by_param_mops(df, 'gpu_pcache_claim', 'claim_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#4CAF50', marker='^', capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('Page Cache Claim\n(FREE->PENDING + pending counter)') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.grid(True, alpha=0.3) + +def plot_lookup_latency(ax, df): + d = mean_by_param(df, 'gpu_pcache_lookup', 'lookup_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7) + ax.set_xlabel('GPU Threads'); ax.set_ylabel('Per-Op Latency (ns)') + ax.set_title('Page Cache Lookup Latency') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.grid(True, alpha=0.3) + +# ── New panels: P2P DMA & Multi-tenant ──────────────────────────────── + +def plot_p2p_bandwidth(ax, df): + """P2P DMA read/write bandwidth vs transfer size""" + sizes = [64, 256, 4096, 65536, 1 << 20] + labels = ['64B', '256B', '4KB', '64KB', '1MB'] + + for exp, label, color, marker in [ + ('gpu_p2p_read', 'GPU Read from CXL', '#2196F3', 'o'), + ('gpu_p2p_write', 'GPU Write to CXL', '#FF5722', 's'), + ]: + sub = df[df['experiment'] == exp] + if len(sub) == 0: continue + # thru_mops is actually GB/s * 1000 in this context + means, stds_v = [], [] + for sz in sizes: + s = sub[sub['parameter'] == sz] + if len(s) == 0: continue + # Compute GB/s from latency and size + lat = s['latency_ns'].mean() + bw = sz / lat # bytes/ns = GB/s + means.append(bw) + stds_v.append(0) + if means: + ax.plot(range(len(means)), means, marker=marker, color=color, + linewidth=2, markersize=7, label=label) + + ax.set_xticks(range(len(labels))) + ax.set_xticklabels(labels) + ax.set_xlabel('Transfer Size') + ax.set_ylabel('Bandwidth (GB/s)') + ax.set_title('P2P DMA Bandwidth\n(GPU <-> CXL Host Memory)') + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_p2p_latency(ax, df): + """P2P DMA read/write latency vs transfer size""" + sizes = [64, 256, 4096, 65536, 1 << 20] + labels = ['64B', '256B', '4KB', '64KB', '1MB'] + + for exp, label, color, marker in [ + ('gpu_p2p_read', 'GPU Read', '#2196F3', 'o'), + ('gpu_p2p_write', 'GPU Write', '#FF5722', 's'), + ]: + sub = df[df['experiment'] == exp] + if len(sub) == 0: continue + means = [] + for sz in sizes: + s = sub[sub['parameter'] == sz] + if len(s) == 0: continue + means.append(s['latency_ns'].mean() / 1e3) # us + if means: + ax.plot(range(len(means)), means, marker=marker, color=color, + linewidth=2, markersize=7, label=label) + + ax.set_xticks(range(len(labels))) + ax.set_xticklabels(labels) + ax.set_xlabel('Transfer Size') + ax.set_ylabel('Latency (us)') + ax.set_yscale('log') + ax.set_title('P2P DMA Latency\n(GPU <-> CXL Host Memory)') + ax.legend(fontsize=9) + ax.grid(True, alpha=0.3) + +def plot_mt_acl_overhead(ax, df): + """Multi-tenant: ACL check overhead vs no-check baseline""" + sub_acl = df[(df['experiment'] == 'gpu_mt_acl_overhead') & + (df['operation'] == 'read_with_acl')] + sub_noacl = df[(df['experiment'] == 'gpu_mt_acl_overhead') & + (df['operation'] == 'read_no_acl')] + + names = ['With ACL\n(perm check + read)', 'Without ACL\n(read only)'] + means = [sub_acl['thru_mops'].mean() if len(sub_acl) else 0, + sub_noacl['thru_mops'].mean() if len(sub_noacl) else 0] + stds = [sub_acl['thru_mops'].std() if len(sub_acl) else 0, + sub_noacl['thru_mops'].std() if len(sub_noacl) else 0] + colors = ['#4CAF50', '#9E9E9E'] + + bars = ax.bar(names, means, yerr=stds, color=colors, capsize=5, + edgecolor='black', linewidth=0.5) + ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('CXL ACL Permission Check Overhead\n(256 threads, all-allowed)') + ax.grid(True, alpha=0.3, axis='y') + for bar, m in zip(bars, means): + ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, + f'{m:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold') + +def plot_mt_read_tenants(ax, df): + """Multi-tenant read throughput vs tenant count""" + d = mean_by_param_mops(df, 'gpu_mt_read', 'mt_read_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7, + label='Read (perm-checked)') + ax.set_xlabel('Number of Tenants') + ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('Multi-tenant Read Throughput\n(each tenant: 32 GPU threads)') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9); ax.grid(True, alpha=0.3) + +def plot_mt_write_tenants(ax, df): + """Multi-tenant write throughput vs tenant count""" + d = mean_by_param_mops(df, 'gpu_mt_write', 'mt_write_throughput') + if len(d) == 0: return + ax.errorbar(d['parameter'], d['mean'], yerr=d['std'], + color='#FF5722', marker='s', capsize=3, linewidth=2, markersize=7, + label='Write (perm-checked CAS)') + ax.set_xlabel('Number of Tenants') + ax.set_ylabel('Throughput (Mops/s)') + ax.set_title('Multi-tenant Write Throughput\n(each tenant: 32 GPU threads)') + ax.set_xscale('log', base=2) + ax.xaxis.set_major_formatter(ticker.ScalarFormatter()) + ax.legend(fontsize=9); ax.grid(True, alpha=0.3) + +def plot_mt_isolation(ax, df): + """Cross-tenant isolation: allowed vs denied pie/bar""" + sub = df[(df['experiment'] == 'gpu_mt_isolation')] + if len(sub) == 0: return + + # Parse allowed/denied from notes + allowed_total, denied_total = 0, 0 + for _, row in sub.iterrows(): + notes = str(row.get('notes', '')) + for part in notes.split(','): + if part.startswith('allowed='): + allowed_total += int(part.split('=')[1]) + elif part.startswith('denied='): + denied_total += int(part.split('=')[1]) + + if allowed_total + denied_total == 0: return + total = allowed_total + denied_total + pct_allowed = 100.0 * allowed_total / total + pct_denied = 100.0 * denied_total / total + + colors = ['#F44336', '#4CAF50'] + wedges, texts, autotexts = ax.pie( + [allowed_total, denied_total], + labels=[f'Own Slots\n(allowed)', f'Other Tenants\n(denied)'], + colors=colors, autopct='%1.1f%%', startangle=90, + textprops={'fontsize': 10}) + ax.set_title(f'Cross-Tenant Isolation (4 tenants)\n' + f'Tenant 0 accessing all {total//len(sub)} slots/rep') + +def main(): + tsv = sys.argv[1] if len(sys.argv) > 1 else './bench_results/gpu.tsv' + out = sys.argv[2] if len(sys.argv) > 2 else './bench_results/daxfs_gpu_bench.png' + + df = load_data(tsv) + print(f"Loaded {len(df)} rows from {tsv}") + print(f"Experiments: {df['experiment'].unique()}") + + # ── Figure 1: Core PCIe atomics (6 panels) ── + fig1, axes1 = plt.subplots(2, 3, figsize=(18, 11)) + fig1.suptitle('DAXFS GPU PCIe AtomicOp Coordination Benchmark\n' + '(RTX 5090, pinned host memory, PCIe 5.0)', + fontsize=15, fontweight='bold') + plot_latency_bars(axes1[0, 0], df) + plot_lookup_throughput(axes1[0, 1], df) + plot_cas_throughput(axes1[0, 2], df) + plot_lock_contention(axes1[1, 0], df) + plot_claim_throughput(axes1[1, 1], df) + plot_lookup_latency(axes1[1, 2], df) + plt.tight_layout(rect=[0, 0, 1, 0.93]) + fig1.savefig(out, dpi=150, bbox_inches='tight') + print(f"Saved: {out}") + plt.close(fig1) + + # ── Figure 2: P2P DMA + Multi-tenant (6 panels) ── + out2 = out.replace('.png', '_p2p_mt.png') + fig2, axes2 = plt.subplots(2, 3, figsize=(18, 11)) + fig2.suptitle('DAXFS GPU P2P DMA & Multi-tenant CXL Memory Benchmark\n' + '(RTX 5090, PCIe 5.0, shared CXL memory simulation)', + fontsize=15, fontweight='bold') + plot_p2p_bandwidth(axes2[0, 0], df) + plot_p2p_latency(axes2[0, 1], df) + plot_mt_acl_overhead(axes2[0, 2], df) + plot_mt_read_tenants(axes2[1, 0], df) + plot_mt_write_tenants(axes2[1, 1], df) + plot_mt_isolation(axes2[1, 2], df) + plt.tight_layout(rect=[0, 0, 1, 0.93]) + fig2.savefig(out2, dpi=150, bbox_inches='tight') + print(f"Saved: {out2}") + plt.close(fig2) + + # ── Individual plots ── + all_plots = [ + ('latency_bars', plot_latency_bars), + ('lookup_throughput', plot_lookup_throughput), + ('cas_throughput', plot_cas_throughput), + ('lock_contention', plot_lock_contention), + ('claim_throughput', plot_claim_throughput), + ('lookup_latency', plot_lookup_latency), + ('p2p_bandwidth', plot_p2p_bandwidth), + ('p2p_latency', plot_p2p_latency), + ('mt_acl_overhead', plot_mt_acl_overhead), + ('mt_read_tenants', plot_mt_read_tenants), + ('mt_write_tenants', plot_mt_write_tenants), + ('mt_isolation', plot_mt_isolation), + ] + for name, fn in all_plots: + f, a = plt.subplots(figsize=(7, 5)) + fn(a, df) + f.tight_layout() + p = out.replace('.png', f'_{name}.png') + f.savefig(p, dpi=150, bbox_inches='tight') + plt.close(f) + print(f" -> {p}") + +if __name__ == '__main__': + main() diff --git a/tools/Makefile b/tools/Makefile index f4335bb..79c5435 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -14,10 +14,23 @@ mkdaxfs: mkdaxfs.c daxfs-inspect: daxfs-inspect.c $(CC) $(CFLAGS) -o $@ $< +# Optional CUDA GPU mapping library +NVCC ?= nvcc +CUDA_PATH ?= /usr/local/cuda +CUDA_CFLAGS = -I../include -I$(CUDA_PATH)/include +CUDA_LDFLAGS = -L$(CUDA_PATH)/lib64 -lcuda + +GPU_TARGETS = daxfs-gpu-map.o + +gpu: $(GPU_TARGETS) + +daxfs-gpu-map.o: daxfs-gpu-map.c daxfs-gpu-map.h ../include/daxfs_format.h + $(CC) $(CFLAGS) $(CUDA_CFLAGS) -c -o $@ $< + clean: - rm -f $(TARGETS) + rm -f $(TARGETS) $(GPU_TARGETS) install: $(TARGETS) install -m 755 $(TARGETS) /usr/local/bin/ -.PHONY: all clean install +.PHONY: all clean install gpu diff --git a/tools/daxfs-gpu-map.c b/tools/daxfs-gpu-map.c new file mode 100644 index 0000000..bd1dda5 --- /dev/null +++ b/tools/daxfs-gpu-map.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * daxfs GPU mapping helper + * + * Maps a daxfs DAX region into GPU address space for PCIe AtomicOps. + * + * Two mapping paths: + * 1. dma-buf (preferred): DAXFS_IOC_GET_DMABUF -> cuImportExternalMemory + * 2. /dev/mem fallback: phys_addr from ioctl -> mmap + cuMemHostRegister + * + * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "daxfs-gpu-map.h" + +/* Compute convenience device pointers from base + ioctl offsets */ +static void compute_dev_ptrs(struct daxfs_gpu_map *gpu) +{ + CUdeviceptr b = gpu->base; + const struct daxfs_gpu_info *gi = &gpu->info; + + if (gi->coord_offset) { + gpu->coord_lock = b + gi->coord_offset + gi->coord_lock_off; + gpu->commit_seq = b + gi->coord_offset + gi->commit_seq_off; + } + + if (gi->pcache_offset) { + gpu->pcache_slots = b + gi->pcache_slots_offset; + gpu->pcache_data = b + gi->pcache_data_offset; + gpu->pending_count = b + gi->pcache_offset + + gi->pending_count_off; + } +} + +/* Primary path: import dma-buf as CUDA external memory */ +static int map_dmabuf(struct daxfs_gpu_map *gpu) +{ + int dmabuf_fd; + CUresult res; + CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc; + + dmabuf_fd = ioctl(gpu->mount_fd, DAXFS_IOC_GET_DMABUF); + if (dmabuf_fd < 0) + return -1; /* not a dma-buf mount */ + gpu->dmabuf_fd = dmabuf_fd; + + memset(&ext_desc, 0, sizeof(ext_desc)); + ext_desc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + ext_desc.handle.fd = dmabuf_fd; + ext_desc.size = gpu->info.dax_size; + + res = cuImportExternalMemory(&gpu->ext_mem, &ext_desc); + if (res != CUDA_SUCCESS) { + fprintf(stderr, "daxfs-gpu-map: cuImportExternalMemory: %d\n", + (int)res); + close(dmabuf_fd); + gpu->dmabuf_fd = -1; + return -1; + } + + /* + * cuImportExternalMemory with OPAQUE_FD takes ownership of the fd, + * so we must not close it ourselves. + */ + gpu->dmabuf_fd = -1; + + memset(&buf_desc, 0, sizeof(buf_desc)); + buf_desc.offset = 0; + buf_desc.size = gpu->info.dax_size; + + res = cuExternalMemoryGetMappedBuffer(&gpu->base, gpu->ext_mem, + &buf_desc); + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuExternalMemoryGetMappedBuffer: %d\n", + (int)res); + cuDestroyExternalMemory(gpu->ext_mem); + gpu->ext_mem = NULL; + return -1; + } + + gpu->size = gpu->info.dax_size; + return 0; +} + +/* Fallback path: /dev/mem mmap + cuMemHostRegister */ +static int map_devmem(struct daxfs_gpu_map *gpu) +{ + int fd; + void *addr; + CUresult res; + + if (!gpu->info.dax_phys_addr) { + fprintf(stderr, + "daxfs-gpu-map: no physical address available\n"); + errno = ENOTSUP; + return -1; + } + + fd = open("/dev/mem", O_RDWR | O_SYNC); + if (fd < 0) { + perror("daxfs-gpu-map: open /dev/mem"); + return -1; + } + + addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, (off_t)gpu->info.dax_phys_addr); + close(fd); + + if (addr == MAP_FAILED) { + perror("daxfs-gpu-map: mmap /dev/mem"); + return -1; + } + gpu->host_mmap = addr; + + res = cuMemHostRegister(addr, gpu->info.dax_size, + CU_MEMHOSTREGISTER_DEVICEMAP); + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuMemHostRegister: %d\n", (int)res); + munmap(addr, gpu->info.dax_size); + gpu->host_mmap = NULL; + return -1; + } + + res = cuMemHostGetDevicePointer(&gpu->base, addr, 0); + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuMemHostGetDevicePointer: %d\n", + (int)res); + cuMemHostUnregister(addr); + munmap(addr, gpu->info.dax_size); + gpu->host_mmap = NULL; + return -1; + } + + gpu->size = gpu->info.dax_size; + return 0; +} + +/* Fallback path 2: mmap dma-buf fd + cuMemHostRegister */ +static int map_dmabuf_mmap(struct daxfs_gpu_map *gpu) +{ + int dmabuf_fd; + void *addr; + CUresult res; + + dmabuf_fd = ioctl(gpu->mount_fd, DAXFS_IOC_GET_DMABUF); + if (dmabuf_fd < 0) + return -1; + + fprintf(stderr, "daxfs-gpu-map: dmabuf_mmap: fd=%d size=%llu\n", + dmabuf_fd, (unsigned long long)gpu->info.dax_size); + + addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE, + MAP_SHARED, dmabuf_fd, 0); + if (addr == MAP_FAILED) { + perror("daxfs-gpu-map: mmap dma-buf fd"); + close(dmabuf_fd); + return -1; + } + + fprintf(stderr, "daxfs-gpu-map: dmabuf mmap'd at %p\n", addr); + + gpu->dmabuf_fd = dmabuf_fd; + gpu->host_mmap = addr; + + res = cuMemHostRegister(addr, gpu->info.dax_size, + CU_MEMHOSTREGISTER_DEVICEMAP | + CU_MEMHOSTREGISTER_IOMEMORY); + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuMemHostRegister(dmabuf, IOMEMORY): " + "%d, trying without IOMEMORY...\n", (int)res); + res = cuMemHostRegister(addr, gpu->info.dax_size, + CU_MEMHOSTREGISTER_DEVICEMAP); + } + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuMemHostRegister(dmabuf): %d\n", + (int)res); + munmap(addr, gpu->info.dax_size); + close(dmabuf_fd); + gpu->host_mmap = NULL; + gpu->dmabuf_fd = -1; + return -1; + } + + res = cuMemHostGetDevicePointer(&gpu->base, addr, 0); + if (res != CUDA_SUCCESS) { + fprintf(stderr, + "daxfs-gpu-map: cuMemHostGetDevicePointer(dmabuf): " + "%d\n", (int)res); + cuMemHostUnregister(addr); + munmap(addr, gpu->info.dax_size); + close(dmabuf_fd); + gpu->host_mmap = NULL; + gpu->dmabuf_fd = -1; + return -1; + } + + gpu->size = gpu->info.dax_size; + return 0; +} + +int daxfs_gpu_map(int mount_fd, struct daxfs_gpu_map *gpu) +{ + memset(gpu, 0, sizeof(*gpu)); + gpu->mount_fd = mount_fd; + gpu->dmabuf_fd = -1; + + /* Step 1: get layout from kernel */ + if (ioctl(mount_fd, DAXFS_IOC_GET_GPU_INFO, &gpu->info) < 0) { + perror("daxfs-gpu-map: DAXFS_IOC_GET_GPU_INFO"); + return -1; + } + + if (gpu->info.dax_size == 0) { + fprintf(stderr, "daxfs-gpu-map: zero-length DAX region\n"); + errno = EINVAL; + return -1; + } + + /* Step 2: map into GPU - try dma-buf import, mmap+register, /dev/mem */ + if (map_dmabuf(gpu) < 0 && map_dmabuf_mmap(gpu) < 0 && + map_devmem(gpu) < 0) + return -1; + + /* Step 3: compute convenience pointers */ + compute_dev_ptrs(gpu); + + return 0; +} + +void daxfs_gpu_unmap(struct daxfs_gpu_map *gpu) +{ + if (!gpu) + return; + + if (gpu->ext_mem) { + /* dma-buf path */ + if (gpu->base) + cuMemFree(gpu->base); + cuDestroyExternalMemory(gpu->ext_mem); + gpu->ext_mem = NULL; + } else if (gpu->host_mmap) { + /* /dev/mem path */ + cuMemHostUnregister(gpu->host_mmap); + munmap(gpu->host_mmap, gpu->size); + gpu->host_mmap = NULL; + } + + if (gpu->dmabuf_fd >= 0) { + close(gpu->dmabuf_fd); + gpu->dmabuf_fd = -1; + } + + gpu->base = 0; + gpu->size = 0; +} diff --git a/tools/daxfs-gpu-map.h b/tools/daxfs-gpu-map.h new file mode 100644 index 0000000..9a9c7fc --- /dev/null +++ b/tools/daxfs-gpu-map.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * daxfs GPU mapping helper - public API + * + * Maps a daxfs DAX region into GPU address space so that CUDA kernels + * can issue PCIe AtomicOps (CAS TLPs) against the coordination lock + * and shared page cache. + * + * Primary path : dma-buf fd -> cuImportExternalMemory (OPAQUE_FD) + * Fallback path: /dev/mem -> cuMemHostRegister (phys= mounts) + * + * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved. + */ +#ifndef _DAXFS_GPU_MAP_H +#define _DAXFS_GPU_MAP_H + +#include +#include +#include "daxfs_format.h" + +struct daxfs_gpu_map { + /* GPU mapping */ + CUdeviceptr base; /* GPU pointer to DAX base */ + CUexternalMemory ext_mem; /* Handle (dma-buf path) */ + size_t size; /* Total mapped size */ + + /* Layout from ioctl */ + struct daxfs_gpu_info info; + + /* Convenience device pointers (computed from base + offsets) */ + CUdeviceptr coord_lock; /* -> coord_lock field */ + CUdeviceptr commit_seq; /* -> commit_sequence */ + CUdeviceptr pcache_slots; /* -> slot metadata */ + CUdeviceptr pcache_data; /* -> slot data area */ + CUdeviceptr pending_count; /* -> pending_count */ + + /* Internal state */ + int mount_fd; /* fd used for ioctl */ + int dmabuf_fd; /* dma-buf fd (-1 if none) */ + void *host_mmap; /* mmap addr (/dev/mem path) */ +}; + +/* + * Map the daxfs DAX region into GPU address space. + * + * @mount_fd: open fd on any file within the daxfs mount + * @gpu: output structure (zeroed by caller) + * + * Returns 0 on success, -1 on failure (errno set). + * On success the caller must eventually call daxfs_gpu_unmap(). + */ +int daxfs_gpu_map(int mount_fd, struct daxfs_gpu_map *gpu); + +/* + * Unmap and release all GPU and host resources. + */ +void daxfs_gpu_unmap(struct daxfs_gpu_map *gpu); + +/* + * Convenience: return a device pointer to a specific pcache slot's + * state_tag field. + */ +static inline CUdeviceptr +daxfs_gpu_slot_state_tag(const struct daxfs_gpu_map *gpu, uint32_t slot_idx) +{ + return gpu->pcache_slots + + (CUdeviceptr)slot_idx * gpu->info.pcache_slot_stride + + gpu->info.state_tag_off; +} + +/* + * Convenience: return a device pointer to a specific pcache slot's + * data page. + */ +static inline CUdeviceptr +daxfs_gpu_slot_data(const struct daxfs_gpu_map *gpu, uint32_t slot_idx) +{ + return gpu->pcache_data + (CUdeviceptr)slot_idx * 4096; +} + +#endif /* _DAXFS_GPU_MAP_H */