diff --git a/Makefile b/Makefile
index fb7ffe2..1d86490 100644
--- a/Makefile
+++ b/Makefile
@@ -13,9 +13,12 @@ tools:
 tests:
 	$(MAKE) -C tests
 
+gpu:
+	$(MAKE) -C tools gpu
+
 clean:
 	$(MAKE) -C daxfs clean
 	$(MAKE) -C tools clean
 	$(MAKE) -C tests clean
 
-.PHONY: all daxfs tools tests clean
+.PHONY: all daxfs tools tests gpu clean
diff --git a/bench_results/agents.tsv b/bench_results/agents.tsv
new file mode 100644
index 0000000..27116d6
--- /dev/null
+++ b/bench_results/agents.tsv
@@ -0,0 +1,343 @@
+experiment	parameter	iteration	operation	latency_us	ops_count	delta_bytes	notes
+scale_agents	1	1	branch_create_total	4076	1		
+scale_agents	1	1	branch_create_avg	4076	1		
+scale_agents	1	1	parallel_workload	526182	1		fails=0
+scale_agents	1	1	commit	7195	1		
+scale_agents	1	1	cleanup	14207	1		
+scale_agents	1	2	branch_create_total	5431	1		
+scale_agents	1	2	branch_create_avg	5431	1		
+scale_agents	1	2	parallel_workload	528399	1		fails=0
+scale_agents	1	2	commit	6421	1		
+scale_agents	1	2	cleanup	19778	1		
+scale_agents	1	3	branch_create_total	5494	1		
+scale_agents	1	3	branch_create_avg	5494	1		
+scale_agents	1	3	parallel_workload	547399	1		fails=0
+scale_agents	1	3	commit	9607	1		
+scale_agents	1	3	cleanup	16249	1		
+scale_agents	2	1	branch_create_total	7572	2		
+scale_agents	2	1	branch_create_avg	3786	2		
+scale_agents	2	1	parallel_workload	582696	2		fails=0
+scale_agents	2	1	commit	6274	1		
+scale_agents	2	1	estale_detect	8140	1		rc=1
+scale_agents	2	1	cleanup	22013	2		
+scale_agents	2	2	branch_create_total	10103	2		
+scale_agents	2	2	branch_create_avg	5051	2		
+scale_agents	2	2	parallel_workload	543486	2		fails=0
+scale_agents	2	2	commit	9582	1		
+scale_agents	2	2	estale_detect	4476	1		rc=1
+scale_agents	2	2	cleanup	16557	2		
+scale_agents	2	3	branch_create_total	9241	2		
+scale_agents	2	3	branch_create_avg	4620	2		
+scale_agents	2	3	parallel_workload	548317	2		fails=0
+scale_agents	2	3	commit	5399	1		
+scale_agents	2	3	estale_detect	5502	1		rc=1
+scale_agents	2	3	cleanup	16312	2		
+scale_agents	4	1	branch_create_total	21351	4		
+scale_agents	4	1	branch_create_avg	5337	4		
+scale_agents	4	1	parallel_workload	602268	4		fails=0
+scale_agents	4	1	commit	8011	1		
+scale_agents	4	1	estale_detect	6145	1		rc=1
+scale_agents	4	1	cleanup	22772	4		
+scale_agents	4	2	branch_create_total	16742	4		
+scale_agents	4	2	branch_create_avg	4185	4		
+scale_agents	4	2	parallel_workload	599315	4		fails=0
+scale_agents	4	2	commit	4615	1		
+scale_agents	4	2	estale_detect	4333	1		rc=1
+scale_agents	4	2	cleanup	24804	4		
+scale_agents	4	3	branch_create_total	17419	4		
+scale_agents	4	3	branch_create_avg	4354	4		
+scale_agents	4	3	parallel_workload	619138	4		fails=0
+scale_agents	4	3	commit	4957	1		
+scale_agents	4	3	estale_detect	3831	1		rc=1
+scale_agents	4	3	cleanup	26757	4		
+scale_agents	8	1	branch_create_total	38580	8		
+scale_agents	8	1	branch_create_avg	4822	8		
+scale_agents	8	1	parallel_workload	669078	8		fails=0
+scale_agents	8	1	commit	5471	1		
+scale_agents	8	1	estale_detect	4193	1		rc=1
+scale_agents	8	1	cleanup	38947	8		
+scale_agents	8	2	branch_create_total	37897	8		
+scale_agents	8	2	branch_create_avg	4737	8		
+scale_agents	8	2	parallel_workload	628198	8		fails=0
+scale_agents	8	2	commit	6667	1		
+scale_agents	8	2	estale_detect	4457	1		rc=1
+scale_agents	8	2	cleanup	31432	8		
+scale_agents	8	3	branch_create_total	37498	8		
+scale_agents	8	3	branch_create_avg	4687	8		
+scale_agents	8	3	parallel_workload	639915	8		fails=0
+scale_agents	8	3	commit	7485	1		
+scale_agents	8	3	estale_detect	4584	1		rc=1
+scale_agents	8	3	cleanup	28399	8		
+scale_agents	16	1	branch_create_total	92533	16		
+scale_agents	16	1	branch_create_avg	5783	16		
+scale_agents	16	1	parallel_workload	763445	16		fails=0
+scale_agents	16	1	commit	10479	1		
+scale_agents	16	1	estale_detect	4315	1		rc=1
+scale_agents	16	1	cleanup	45969	16		
+scale_agents	16	2	branch_create_total	74579	16		
+scale_agents	16	2	branch_create_avg	4661	16		
+scale_agents	16	2	parallel_workload	793028	16		fails=0
+scale_agents	16	2	commit	7999	1		
+scale_agents	16	2	estale_detect	5960	1		rc=1
+scale_agents	16	2	cleanup	49899	16		
+scale_agents	16	3	branch_create_total	82432	16		
+scale_agents	16	3	branch_create_avg	5152	16		
+scale_agents	16	3	parallel_workload	787314	16		fails=0
+scale_agents	16	3	commit	8582	1		
+scale_agents	16	3	estale_detect	4980	1		rc=1
+scale_agents	16	3	cleanup	50737	16		
+scale_agents	32	1	branch_create_total	152822	32		
+scale_agents	32	1	branch_create_avg	4775	32		
+scale_agents	32	1	parallel_workload	859543	32		fails=0
+scale_agents	32	1	commit	5684	1		
+scale_agents	32	1	estale_detect	4547	1		rc=1
+scale_agents	32	1	cleanup	104499	32		
+scale_agents	32	2	branch_create_total	153409	32		
+scale_agents	32	2	branch_create_avg	4794	32		
+scale_agents	32	2	parallel_workload	846043	32		fails=0
+scale_agents	32	2	commit	11058	1		
+scale_agents	32	2	estale_detect	4032	1		rc=1
+scale_agents	32	2	cleanup	106802	32		
+scale_agents	32	3	branch_create_total	156743	32		
+scale_agents	32	3	branch_create_avg	4898	32		
+scale_agents	32	3	parallel_workload	907180	32		fails=0
+scale_agents	32	3	commit	6461	1		
+scale_agents	32	3	estale_detect	5525	1		rc=1
+scale_agents	32	3	cleanup	98815	32		
+scale_depth	1	1	chain_create	11518	1		
+scale_depth	1	1	workload	551627	31		
+scale_depth	1	1	commit	6019	1		
+scale_depth	1	2	chain_create	9727	1		
+scale_depth	1	2	workload	502185	31		
+scale_depth	1	2	commit	6570	1		
+scale_depth	1	3	chain_create	10794	1		
+scale_depth	1	3	workload	515411	31		
+scale_depth	1	3	commit	11072	1		
+scale_depth	2	1	chain_create	10614	2		
+scale_depth	2	1	workload	526138	31		
+scale_depth	2	1	commit	10166	2		
+scale_depth	2	2	chain_create	16656	2		
+scale_depth	2	2	workload	523364	31		
+scale_depth	2	2	commit	16197	2		
+scale_depth	2	3	chain_create	16981	2		
+scale_depth	2	3	workload	547954	31		
+scale_depth	2	3	commit	5271	2		
+scale_depth	4	1	chain_create	22894	4		
+scale_depth	4	1	workload	538063	31		
+scale_depth	4	1	commit	10491	4		
+scale_depth	4	2	chain_create	22786	4		
+scale_depth	4	2	workload	511997	31		
+scale_depth	4	2	commit	6308	4		
+scale_depth	4	3	chain_create	23795	4		
+scale_depth	4	3	workload	530256	31		
+scale_depth	4	3	commit	6508	4		
+scale_depth	8	1	chain_create	36028	8		
+scale_depth	8	1	workload	530269	31		
+scale_depth	8	1	commit	10982	8		
+scale_depth	8	2	chain_create	39830	8		
+scale_depth	8	2	workload	500300	31		
+scale_depth	8	2	commit	6466	8		
+scale_depth	8	3	chain_create	36476	8		
+scale_depth	8	3	workload	542417	31		
+scale_depth	8	3	commit	14053	8		
+commit_cost	10	1	workload	202319	10		
+commit_cost	10	1	commit	14481	10		
+commit_cost	10	2	workload	181876	10		
+commit_cost	10	2	commit	5625	10		
+commit_cost	10	3	workload	183334	10		
+commit_cost	10	3	commit	7841	10		
+commit_cost	50	1	workload	1466946	50		
+commit_cost	50	1	commit	8523	50		
+commit_cost	50	2	workload	1464664	50		
+commit_cost	50	2	commit	10627	50		
+commit_cost	50	3	workload	1425226	50		
+commit_cost	50	3	commit	12598	50		
+commit_cost	100	1	workload	3626982	100		
+commit_cost	100	1	commit	11834	100		
+commit_cost	100	2	workload	3530194	100		
+commit_cost	100	2	commit	7431	100		
+commit_cost	100	3	workload	3541508	100		
+commit_cost	100	3	commit	11713	100		
+baseline_tmpfs	1	1	branch_create_total	2443	1		
+baseline_tmpfs	1	1	branch_create_avg	2443	1		
+baseline_tmpfs	1	1	parallel_workload	28326	1		
+baseline_tmpfs	1	1	branch_create_total	3515	1		
+baseline_tmpfs	1	1	branch_create_avg	3515	1		
+baseline_tmpfs	1	1	parallel_workload	32225	1		
+baseline_tmpfs	1	1	commit	86604	1		
+baseline_tmpfs	1	1	cleanup	4935	1		
+baseline_tmpfs	1	2	branch_create_total	3464	1		
+baseline_tmpfs	1	2	branch_create_avg	3464	1		
+baseline_tmpfs	1	2	parallel_workload	31332	1		
+baseline_tmpfs	1	2	commit	46055	1		
+baseline_tmpfs	1	2	cleanup	4084	1		
+baseline_tmpfs	1	3	branch_create_total	2905	1		
+baseline_tmpfs	1	3	branch_create_avg	2905	1		
+baseline_tmpfs	1	3	parallel_workload	31260	1		
+baseline_tmpfs	1	3	commit	45946	1		
+baseline_tmpfs	1	3	cleanup	3764	1		
+baseline_tmpfs	2	1	branch_create_total	5445	2		
+baseline_tmpfs	2	1	branch_create_avg	2722	2		
+baseline_tmpfs	2	1	parallel_workload	31963	2		
+baseline_tmpfs	2	1	commit	46219	1		
+baseline_tmpfs	2	1	cleanup	5454	2		
+baseline_tmpfs	2	2	branch_create_total	5455	2		
+baseline_tmpfs	2	2	branch_create_avg	2727	2		
+baseline_tmpfs	2	2	parallel_workload	32328	2		
+baseline_tmpfs	2	2	commit	46005	1		
+baseline_tmpfs	2	2	cleanup	4468	2		
+baseline_tmpfs	2	3	branch_create_total	5913	2		
+baseline_tmpfs	2	3	branch_create_avg	2956	2		
+baseline_tmpfs	2	3	parallel_workload	32508	2		
+baseline_tmpfs	2	3	commit	45440	1		
+baseline_tmpfs	2	3	cleanup	4795	2		
+baseline_tmpfs	4	1	branch_create_total	10858	4		
+baseline_tmpfs	4	1	branch_create_avg	2714	4		
+baseline_tmpfs	4	1	parallel_workload	34528	4		
+baseline_tmpfs	4	1	commit	45547	1		
+baseline_tmpfs	4	1	cleanup	5669	4		
+baseline_tmpfs	4	2	branch_create_total	11170	4		
+baseline_tmpfs	4	2	branch_create_avg	2792	4		
+baseline_tmpfs	4	2	parallel_workload	33417	4		
+baseline_tmpfs	4	2	commit	46485	1		
+baseline_tmpfs	4	2	cleanup	5774	4		
+baseline_tmpfs	4	3	branch_create_total	9925	4		
+baseline_tmpfs	4	3	branch_create_avg	2481	4		
+baseline_tmpfs	4	3	parallel_workload	33273	4		
+baseline_tmpfs	4	3	commit	45777	1		
+baseline_tmpfs	4	3	cleanup	5591	4		
+baseline_tmpfs	8	1	branch_create_total	22590	8		
+baseline_tmpfs	8	1	branch_create_avg	2823	8		
+baseline_tmpfs	8	1	parallel_workload	37262	8		
+baseline_tmpfs	8	1	commit	46174	1		
+baseline_tmpfs	8	1	cleanup	7164	8		
+baseline_tmpfs	8	2	branch_create_total	21631	8		
+baseline_tmpfs	8	2	branch_create_avg	2703	8		
+baseline_tmpfs	8	2	parallel_workload	35463	8		
+baseline_tmpfs	8	2	commit	45251	1		
+baseline_tmpfs	8	2	cleanup	8081	8		
+baseline_tmpfs	8	3	branch_create_total	21555	8		
+baseline_tmpfs	8	3	branch_create_avg	2694	8		
+baseline_tmpfs	8	3	parallel_workload	36366	8		
+baseline_tmpfs	8	3	commit	44625	1		
+baseline_tmpfs	8	3	cleanup	8296	8		
+baseline_tmpfs	16	1	branch_create_total	41263	16		
+baseline_tmpfs	16	1	branch_create_avg	2578	16		
+baseline_tmpfs	16	1	parallel_workload	38129	16		
+baseline_tmpfs	16	1	commit	45669	1		
+baseline_tmpfs	16	1	cleanup	12275	16		
+baseline_tmpfs	16	2	branch_create_total	42322	16		
+baseline_tmpfs	16	2	branch_create_avg	2645	16		
+baseline_tmpfs	16	2	parallel_workload	38157	16		
+baseline_tmpfs	16	2	commit	45554	1		
+baseline_tmpfs	16	2	cleanup	11999	16		
+baseline_tmpfs	16	3	branch_create_total	45596	16		
+baseline_tmpfs	16	3	branch_create_avg	2849	16		
+baseline_tmpfs	16	3	parallel_workload	39854	16		
+baseline_tmpfs	16	3	commit	45796	1		
+baseline_tmpfs	16	3	cleanup	13391	16		
+baseline_tmpfs	32	1	branch_create_total	93096	32		
+baseline_tmpfs	32	1	branch_create_avg	2909	32		
+baseline_tmpfs	32	1	parallel_workload	47494	32		
+baseline_tmpfs	32	1	commit	45508	1		
+baseline_tmpfs	32	1	cleanup	22176	32		
+baseline_tmpfs	32	2	branch_create_total	93741	32		
+baseline_tmpfs	32	2	branch_create_avg	2929	32		
+baseline_tmpfs	32	2	parallel_workload	47132	32		
+baseline_tmpfs	32	2	commit	46098	1		
+baseline_tmpfs	32	2	cleanup	23098	32		
+baseline_tmpfs	32	3	branch_create_total	94132	32		
+baseline_tmpfs	32	3	branch_create_avg	2941	32		
+baseline_tmpfs	32	3	parallel_workload	47511	32		
+baseline_tmpfs	32	3	commit	46239	1		
+baseline_tmpfs	32	3	cleanup	22055	32		
+baseline_overlayfs	1	1	branch_create_total	3291	1		
+baseline_overlayfs	1	1	branch_create_avg	3291	1		
+baseline_overlayfs	1	1	parallel_workload	33499	1		
+baseline_overlayfs	1	1	commit	3278	1		
+baseline_overlayfs	1	1	cleanup	4828	1		
+baseline_overlayfs	1	2	branch_create_total	2831	1		
+baseline_overlayfs	1	2	branch_create_avg	2831	1		
+baseline_overlayfs	1	2	parallel_workload	32808	1		
+baseline_overlayfs	1	2	commit	2925	1		
+baseline_overlayfs	1	2	cleanup	5142	1		
+baseline_overlayfs	1	3	branch_create_total	2909	1		
+baseline_overlayfs	1	3	branch_create_avg	2909	1		
+baseline_overlayfs	1	3	parallel_workload	32252	1		
+baseline_overlayfs	1	3	commit	2652	1		
+baseline_overlayfs	1	3	cleanup	5210	1		
+baseline_overlayfs	2	1	branch_create_total	5302	2		
+baseline_overlayfs	2	1	branch_create_avg	2651	2		
+baseline_overlayfs	2	1	parallel_workload	33196	2		
+baseline_overlayfs	2	1	commit	3125	1		
+baseline_overlayfs	2	1	cleanup	7034	2		
+baseline_overlayfs	2	2	branch_create_total	5865	2		
+baseline_overlayfs	2	2	branch_create_avg	2932	2		
+baseline_overlayfs	2	2	parallel_workload	34825	2		
+baseline_overlayfs	2	2	commit	2536	1		
+baseline_overlayfs	2	2	cleanup	6319	2		
+baseline_overlayfs	2	3	branch_create_total	5533	2		
+baseline_overlayfs	2	3	branch_create_avg	2766	2		
+baseline_overlayfs	2	3	parallel_workload	34535	2		
+baseline_overlayfs	2	3	commit	3048	1		
+baseline_overlayfs	2	3	cleanup	6330	2		
+baseline_overlayfs	4	1	branch_create_total	10830	4		
+baseline_overlayfs	4	1	branch_create_avg	2707	4		
+baseline_overlayfs	4	1	parallel_workload	35952	4		
+baseline_overlayfs	4	1	commit	2599	1		
+baseline_overlayfs	4	1	cleanup	9030	4		
+baseline_overlayfs	4	2	branch_create_total	12304	4		
+baseline_overlayfs	4	2	branch_create_avg	3076	4		
+baseline_overlayfs	4	2	parallel_workload	36010	4		
+baseline_overlayfs	4	2	commit	2603	1		
+baseline_overlayfs	4	2	cleanup	8558	4		
+baseline_overlayfs	4	3	branch_create_total	12863	4		
+baseline_overlayfs	4	3	branch_create_avg	3215	4		
+baseline_overlayfs	4	3	parallel_workload	35081	4		
+baseline_overlayfs	4	3	commit	2977	1		
+baseline_overlayfs	4	3	cleanup	8511	4		
+baseline_overlayfs	8	1	branch_create_total	21960	8		
+baseline_overlayfs	8	1	branch_create_avg	2745	8		
+baseline_overlayfs	8	1	parallel_workload	38572	8		
+baseline_overlayfs	8	1	commit	3419	1		
+baseline_overlayfs	8	1	cleanup	14066	8		
+baseline_overlayfs	8	2	branch_create_total	23409	8		
+baseline_overlayfs	8	2	branch_create_avg	2926	8		
+baseline_overlayfs	8	2	parallel_workload	38714	8		
+baseline_overlayfs	8	2	commit	2969	1		
+baseline_overlayfs	8	2	cleanup	14206	8		
+baseline_overlayfs	8	3	branch_create_total	21952	8		
+baseline_overlayfs	8	3	branch_create_avg	2744	8		
+baseline_overlayfs	8	3	parallel_workload	38664	8		
+baseline_overlayfs	8	3	commit	3138	1		
+baseline_overlayfs	8	3	cleanup	14526	8		
+baseline_overlayfs	16	1	branch_create_total	46128	16		
+baseline_overlayfs	16	1	branch_create_avg	2883	16		
+baseline_overlayfs	16	1	parallel_workload	39639	16		
+baseline_overlayfs	16	1	commit	3359	1		
+baseline_overlayfs	16	1	cleanup	25121	16		
+baseline_overlayfs	16	2	branch_create_total	45612	16		
+baseline_overlayfs	16	2	branch_create_avg	2850	16		
+baseline_overlayfs	16	2	parallel_workload	38792	16		
+baseline_overlayfs	16	2	commit	2510	1		
+baseline_overlayfs	16	2	cleanup	23516	16		
+baseline_overlayfs	16	3	branch_create_total	44094	16		
+baseline_overlayfs	16	3	branch_create_avg	2755	16		
+baseline_overlayfs	16	3	parallel_workload	37822	16		
+baseline_overlayfs	16	3	commit	2596	1		
+baseline_overlayfs	16	3	cleanup	25847	16		
+baseline_overlayfs	32	1	branch_create_total	87711	32		
+baseline_overlayfs	32	1	branch_create_avg	2740	32		
+baseline_overlayfs	32	1	parallel_workload	44015	32		
+baseline_overlayfs	32	1	commit	2854	1		
+baseline_overlayfs	32	1	cleanup	46727	32		
+baseline_overlayfs	32	2	branch_create_total	90229	32		
+baseline_overlayfs	32	2	branch_create_avg	2819	32		
+baseline_overlayfs	32	2	parallel_workload	45849	32		
+baseline_overlayfs	32	2	commit	2800	1		
+baseline_overlayfs	32	2	cleanup	43240	32		
+baseline_overlayfs	32	3	branch_create_total	90360	32		
+baseline_overlayfs	32	3	branch_create_avg	2823	32		
+baseline_overlayfs	32	3	parallel_workload	48086	32		
+baseline_overlayfs	32	3	commit	3001	1		
+baseline_overlayfs	32	3	cleanup	47871	32		
diff --git a/bench_results/daxfs_bench.png b/bench_results/daxfs_bench.png
new file mode 100644
index 0000000..16b5ec2
Binary files /dev/null and b/bench_results/daxfs_bench.png differ
diff --git a/bench_results/daxfs_bench_branch_avg.png b/bench_results/daxfs_bench_branch_avg.png
new file mode 100644
index 0000000..e23e7de
Binary files /dev/null and b/bench_results/daxfs_bench_branch_avg.png differ
diff --git a/bench_results/daxfs_bench_branch_creation.png b/bench_results/daxfs_bench_branch_creation.png
new file mode 100644
index 0000000..f951846
Binary files /dev/null and b/bench_results/daxfs_bench_branch_creation.png differ
diff --git a/bench_results/daxfs_bench_commit_cost.png b/bench_results/daxfs_bench_commit_cost.png
new file mode 100644
index 0000000..0baf123
Binary files /dev/null and b/bench_results/daxfs_bench_commit_cost.png differ
diff --git a/bench_results/daxfs_bench_commit_time.png b/bench_results/daxfs_bench_commit_time.png
new file mode 100644
index 0000000..966b15c
Binary files /dev/null and b/bench_results/daxfs_bench_commit_time.png differ
diff --git a/bench_results/daxfs_bench_nesting_depth.png b/bench_results/daxfs_bench_nesting_depth.png
new file mode 100644
index 0000000..7ffb024
Binary files /dev/null and b/bench_results/daxfs_bench_nesting_depth.png differ
diff --git a/bench_results/daxfs_bench_parallel_workload.png b/bench_results/daxfs_bench_parallel_workload.png
new file mode 100644
index 0000000..6dbd430
Binary files /dev/null and b/bench_results/daxfs_bench_parallel_workload.png differ
diff --git a/bench_results/daxfs_gpu_bench.png b/bench_results/daxfs_gpu_bench.png
new file mode 100644
index 0000000..b89a565
Binary files /dev/null and b/bench_results/daxfs_gpu_bench.png differ
diff --git a/bench_results/daxfs_gpu_bench_cas_throughput.png b/bench_results/daxfs_gpu_bench_cas_throughput.png
new file mode 100644
index 0000000..7a99f51
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_cas_throughput.png differ
diff --git a/bench_results/daxfs_gpu_bench_claim_throughput.png b/bench_results/daxfs_gpu_bench_claim_throughput.png
new file mode 100644
index 0000000..3a900d5
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_claim_throughput.png differ
diff --git a/bench_results/daxfs_gpu_bench_latency_bars.png b/bench_results/daxfs_gpu_bench_latency_bars.png
new file mode 100644
index 0000000..ba53aa3
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_latency_bars.png differ
diff --git a/bench_results/daxfs_gpu_bench_lock_contention.png b/bench_results/daxfs_gpu_bench_lock_contention.png
new file mode 100644
index 0000000..e0e456f
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lock_contention.png differ
diff --git a/bench_results/daxfs_gpu_bench_lookup_latency.png b/bench_results/daxfs_gpu_bench_lookup_latency.png
new file mode 100644
index 0000000..605355c
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lookup_latency.png differ
diff --git a/bench_results/daxfs_gpu_bench_lookup_throughput.png b/bench_results/daxfs_gpu_bench_lookup_throughput.png
new file mode 100644
index 0000000..b458849
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_lookup_throughput.png differ
diff --git a/bench_results/daxfs_gpu_bench_mt_acl_overhead.png b/bench_results/daxfs_gpu_bench_mt_acl_overhead.png
new file mode 100644
index 0000000..41db17f
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_acl_overhead.png differ
diff --git a/bench_results/daxfs_gpu_bench_mt_isolation.png b/bench_results/daxfs_gpu_bench_mt_isolation.png
new file mode 100644
index 0000000..92eb8bc
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_isolation.png differ
diff --git a/bench_results/daxfs_gpu_bench_mt_read_tenants.png b/bench_results/daxfs_gpu_bench_mt_read_tenants.png
new file mode 100644
index 0000000..e490354
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_read_tenants.png differ
diff --git a/bench_results/daxfs_gpu_bench_mt_write_tenants.png b/bench_results/daxfs_gpu_bench_mt_write_tenants.png
new file mode 100644
index 0000000..c6323a4
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_mt_write_tenants.png differ
diff --git a/bench_results/daxfs_gpu_bench_p2p_bandwidth.png b/bench_results/daxfs_gpu_bench_p2p_bandwidth.png
new file mode 100644
index 0000000..81843ec
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_bandwidth.png differ
diff --git a/bench_results/daxfs_gpu_bench_p2p_latency.png b/bench_results/daxfs_gpu_bench_p2p_latency.png
new file mode 100644
index 0000000..8b72f0b
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_latency.png differ
diff --git a/bench_results/daxfs_gpu_bench_p2p_mt.png b/bench_results/daxfs_gpu_bench_p2p_mt.png
new file mode 100644
index 0000000..bce5afb
Binary files /dev/null and b/bench_results/daxfs_gpu_bench_p2p_mt.png differ
diff --git a/bench_results/gpu.tsv b/bench_results/gpu.tsv
new file mode 100644
index 0000000..ddec281
--- /dev/null
+++ b/bench_results/gpu.tsv
@@ -0,0 +1,143 @@
+experiment	parameter	iteration	operation	latency_ns	ops_count	thru_mops	notes
+gpu_coord_lock	1	1	lock_unlock_rt	2047.8	10000	0.569	single_thread
+gpu_coord_lock	1	2	lock_unlock_rt	2048.9	10000	0.569	single_thread
+gpu_coord_lock	1	3	lock_unlock_rt	2046.7	10000	0.569	single_thread
+gpu_coord_lock	1	4	lock_unlock_rt	2050.9	10000	0.568	single_thread
+gpu_coord_lock	1	5	lock_unlock_rt	2045.4	10000	0.570	single_thread
+gpu_commit_seq	1	1	volatile_read	532.3	10000	2.189	single_thread
+gpu_commit_seq	1	2	volatile_read	532.8	10000	2.187	single_thread
+gpu_commit_seq	1	3	volatile_read	531.8	10000	2.191	single_thread
+gpu_commit_seq	1	4	volatile_read	524.5	10000	2.219	single_thread
+gpu_commit_seq	1	5	volatile_read	524.1	10000	2.221	single_thread
+gpu_pending_ctr	1	1	cas_inc_dec	1786.7	20000	0.653	single_thread
+gpu_pending_ctr	1	2	cas_inc_dec	1817.5	20000	0.641	single_thread
+gpu_pending_ctr	1	3	cas_inc_dec	1817.8	20000	0.641	single_thread
+gpu_pending_ctr	1	4	cas_inc_dec	1817.6	20000	0.641	single_thread
+gpu_pending_ctr	1	5	cas_inc_dec	1817.6	20000	0.641	single_thread
+gpu_pcache_lookup	1	1	lookup_throughput	918.8	10000	1.088	
+gpu_pcache_lookup	1	2	lookup_throughput	914.4	10000	1.094	
+gpu_pcache_lookup	1	3	lookup_throughput	908.5	10000	1.101	
+gpu_pcache_lookup	1	4	lookup_throughput	908.4	10000	1.101	
+gpu_pcache_lookup	1	5	lookup_throughput	908.7	10000	1.101	
+gpu_pcache_lookup	32	1	lookup_throughput	30.4	320000	32.929	
+gpu_pcache_lookup	32	2	lookup_throughput	30.4	320000	32.860	
+gpu_pcache_lookup	32	3	lookup_throughput	30.4	320000	32.847	
+gpu_pcache_lookup	32	4	lookup_throughput	30.4	320000	32.860	
+gpu_pcache_lookup	32	5	lookup_throughput	30.5	320000	32.833	
+gpu_pcache_lookup	64	1	lookup_throughput	20.3	640000	49.253	
+gpu_pcache_lookup	64	2	lookup_throughput	20.3	640000	49.204	
+gpu_pcache_lookup	64	3	lookup_throughput	20.3	640000	49.161	
+gpu_pcache_lookup	64	4	lookup_throughput	20.3	640000	49.246	
+gpu_pcache_lookup	64	5	lookup_throughput	20.3	640000	49.247	
+gpu_pcache_lookup	128	1	lookup_throughput	15.2	1280000	65.662	
+gpu_pcache_lookup	128	2	lookup_throughput	15.2	1280000	65.661	
+gpu_pcache_lookup	128	3	lookup_throughput	15.2	1280000	65.683	
+gpu_pcache_lookup	128	4	lookup_throughput	15.2	1280000	65.671	
+gpu_pcache_lookup	128	5	lookup_throughput	15.2	1280000	65.663	
+gpu_pcache_lookup	256	1	lookup_throughput	7.6	2560000	131.157	
+gpu_pcache_lookup	256	2	lookup_throughput	7.6	2560000	131.091	
+gpu_pcache_lookup	256	3	lookup_throughput	7.6	2560000	130.796	
+gpu_pcache_lookup	256	4	lookup_throughput	7.6	2560000	130.978	
+gpu_pcache_lookup	256	5	lookup_throughput	7.6	2560000	131.228	
+gpu_pcache_lookup	512	1	lookup_throughput	3.7	5120000	268.549	
+gpu_pcache_lookup	512	2	lookup_throughput	3.7	5120000	267.784	
+gpu_pcache_lookup	512	3	lookup_throughput	3.7	5120000	268.770	
+gpu_pcache_lookup	512	4	lookup_throughput	3.7	5120000	267.830	
+gpu_pcache_lookup	512	5	lookup_throughput	3.7	5120000	268.546	
+gpu_pcache_lookup	1024	1	lookup_throughput	1.9	10240000	539.197	
+gpu_pcache_lookup	1024	2	lookup_throughput	1.9	10240000	538.267	
+gpu_pcache_lookup	1024	3	lookup_throughput	1.9	10240000	537.556	
+gpu_pcache_lookup	1024	4	lookup_throughput	1.9	10240000	537.570	
+gpu_pcache_lookup	1024	5	lookup_throughput	1.9	10240000	538.479	
+gpu_slot_cas	1	1	cas_throughput	1407.7	1000	0.710	independent
+gpu_slot_cas	1	2	cas_throughput	1366.5	1000	0.732	independent
+gpu_slot_cas	1	3	cas_throughput	1365.6	1000	0.732	independent
+gpu_slot_cas	1	4	cas_throughput	1364.6	1000	0.733	independent
+gpu_slot_cas	1	5	cas_throughput	1365.9	1000	0.732	independent
+gpu_slot_cas	32	1	cas_throughput	137.0	32000	7.297	independent
+gpu_slot_cas	32	2	cas_throughput	137.5	32000	7.275	independent
+gpu_slot_cas	32	3	cas_throughput	137.1	32000	7.295	independent
+gpu_slot_cas	32	4	cas_throughput	138.1	32000	7.242	independent
+gpu_slot_cas	32	5	cas_throughput	137.2	32000	7.290	independent
+gpu_slot_cas	64	1	cas_throughput	129.5	64000	7.723	independent
+gpu_slot_cas	64	2	cas_throughput	129.4	64000	7.727	independent
+gpu_slot_cas	64	3	cas_throughput	129.5	64000	7.720	independent
+gpu_slot_cas	64	4	cas_throughput	129.4	64000	7.725	independent
+gpu_slot_cas	64	5	cas_throughput	129.5	64000	7.722	independent
+gpu_slot_cas	128	1	cas_throughput	137.4	128000	7.278	independent
+gpu_slot_cas	128	2	cas_throughput	140.6	128000	7.114	independent
+gpu_slot_cas	128	3	cas_throughput	141.1	128000	7.086	independent
+gpu_slot_cas	128	4	cas_throughput	143.3	128000	6.981	independent
+gpu_slot_cas	128	5	cas_throughput	140.9	128000	7.099	independent
+gpu_slot_cas	256	1	cas_throughput	87.9	256000	11.383	independent
+gpu_slot_cas	256	2	cas_throughput	91.7	256000	10.910	independent
+gpu_slot_cas	256	3	cas_throughput	91.7	256000	10.908	independent
+gpu_slot_cas	256	4	cas_throughput	89.1	256000	11.230	independent
+gpu_slot_cas	256	5	cas_throughput	89.9	256000	11.122	independent
+gpu_slot_cas	512	1	cas_throughput	86.4	512000	11.568	independent
+gpu_slot_cas	512	2	cas_throughput	84.0	512000	11.912	independent
+gpu_slot_cas	512	3	cas_throughput	85.7	512000	11.671	independent
+gpu_slot_cas	512	4	cas_throughput	85.0	512000	11.759	independent
+gpu_slot_cas	512	5	cas_throughput	85.7	512000	11.664	independent
+gpu_slot_cas	1024	1	cas_throughput	85.5	1024000	11.703	independent
+gpu_slot_cas	1024	2	cas_throughput	85.6	1024000	11.678	independent
+gpu_slot_cas	1024	3	cas_throughput	86.0	1024000	11.629	independent
+gpu_slot_cas	1024	4	cas_throughput	85.9	1024000	11.645	independent
+gpu_slot_cas	1024	5	cas_throughput	85.6	1024000	11.681	independent
+gpu_lock_contention	1	1	lock_acquisition	2664.0	100	0.375	counter=100
+gpu_lock_contention	1	2	lock_acquisition	2257.0	100	0.443	counter=100
+gpu_lock_contention	1	3	lock_acquisition	2277.1	100	0.439	counter=100
+gpu_lock_contention	1	4	lock_acquisition	2251.5	100	0.444	counter=100
+gpu_lock_contention	1	5	lock_acquisition	2260.2	100	0.442	counter=100
+gpu_lock_contention	2	1	lock_acquisition	1151.2	200	0.869	counter=101
+gpu_lock_contention	2	2	lock_acquisition	1140.6	200	0.877	counter=101
+gpu_lock_contention	2	3	lock_acquisition	1151.2	200	0.869	counter=101
+gpu_lock_contention	2	4	lock_acquisition	1149.6	200	0.870	counter=101
+gpu_lock_contention	2	5	lock_acquisition	1149.6	200	0.870	counter=101
+gpu_lock_contention	4	1	lock_acquisition	591.2	400	1.691	counter=103
+gpu_lock_contention	4	2	lock_acquisition	592.6	400	1.688	counter=103
+gpu_lock_contention	4	3	lock_acquisition	592.6	400	1.688	counter=103
+gpu_lock_contention	4	4	lock_acquisition	588.0	400	1.701	counter=103
+gpu_lock_contention	4	5	lock_acquisition	588.5	400	1.699	counter=103
+gpu_lock_contention	8	1	lock_acquisition	319.7	800	3.128	counter=107
+gpu_lock_contention	8	2	lock_acquisition	319.4	800	3.130	counter=107
+gpu_lock_contention	8	3	lock_acquisition	319.7	800	3.128	counter=107
+gpu_lock_contention	8	4	lock_acquisition	319.5	800	3.130	counter=107
+gpu_lock_contention	8	5	lock_acquisition	319.4	800	3.131	counter=107
+gpu_lock_contention	16	1	lock_acquisition	199.3	1600	5.017	counter=115
+gpu_lock_contention	16	2	lock_acquisition	198.8	1600	5.030	counter=115
+gpu_lock_contention	16	3	lock_acquisition	198.8	1600	5.030	counter=115
+gpu_lock_contention	16	4	lock_acquisition	199.4	1600	5.015	counter=115
+gpu_lock_contention	16	5	lock_acquisition	198.8	1600	5.030	counter=115
+gpu_lock_contention	32	1	lock_acquisition	168.1	3200	5.948	counter=131
+gpu_lock_contention	32	2	lock_acquisition	168.0	3200	5.953	counter=131
+gpu_lock_contention	32	3	lock_acquisition	168.2	3200	5.944	counter=131
+gpu_lock_contention	32	4	lock_acquisition	167.8	3200	5.959	counter=131
+gpu_lock_contention	32	5	lock_acquisition	167.8	3200	5.958	counter=131
+gpu_pcache_claim	1	1	claim_throughput	4397.6	1000	0.227	free_to_pending
+gpu_pcache_claim	1	2	claim_throughput	4387.6	1000	0.228	free_to_pending
+gpu_pcache_claim	1	3	claim_throughput	4388.1	1000	0.228	free_to_pending
+gpu_pcache_claim	1	4	claim_throughput	4388.2	1000	0.228	free_to_pending
+gpu_pcache_claim	1	5	claim_throughput	4388.0	1000	0.228	free_to_pending
+gpu_pcache_claim	32	1	claim_throughput	20820.4	32000	0.048	free_to_pending
+gpu_pcache_claim	32	2	claim_throughput	20827.9	32000	0.048	free_to_pending
+gpu_pcache_claim	32	3	claim_throughput	19216.1	32000	0.052	free_to_pending
+gpu_pcache_claim	32	4	claim_throughput	17914.2	32000	0.056	free_to_pending
+gpu_pcache_claim	32	5	claim_throughput	20202.7	32000	0.049	free_to_pending
+gpu_pcache_claim	64	1	claim_throughput	50021.0	64000	0.020	free_to_pending
+gpu_pcache_claim	64	2	claim_throughput	49535.5	64000	0.020	free_to_pending
+gpu_pcache_claim	64	3	claim_throughput	50754.8	64000	0.020	free_to_pending
+gpu_pcache_claim	64	4	claim_throughput	49424.7	64000	0.020	free_to_pending
+gpu_pcache_claim	64	5	claim_throughput	49654.4	64000	0.020	free_to_pending
+gpu_pcache_claim	128	1	claim_throughput	48636.7	128000	0.021	free_to_pending
+gpu_pcache_claim	128	2	claim_throughput	51128.5	128000	0.020	free_to_pending
+gpu_pcache_claim	128	3	claim_throughput	50332.4	128000	0.020	free_to_pending
+gpu_pcache_claim	128	4	claim_throughput	50294.6	128000	0.020	free_to_pending
+gpu_pcache_claim	128	5	claim_throughput	50091.9	128000	0.020	free_to_pending
+gpu_pcache_claim	256	1	claim_throughput	97352.1	256000	0.010	free_to_pending
+gpu_pcache_claim	256	2	claim_throughput	98231.3	256000	0.010	free_to_pending
+gpu_pcache_claim	256	3	claim_throughput	97677.1	256000	0.010	free_to_pending
+gpu_pcache_claim	256	4	claim_throughput	98483.2	256000	0.010	free_to_pending
+gpu_pcache_claim	256	5	claim_throughput	98240.5	256000	0.010	free_to_pending
+gpu_pcache_claim	512	1	claim_throughput	196027.4	512000	0.005	free_to_pending
+gpu_pcache_claim	512	2	claim_throughput	196364.6	512000	0.005	free_to_pending
diff --git a/daxfs/file.c b/daxfs/file.c
index f11e59b..b4b4e76 100644
--- a/daxfs/file.c
+++ b/daxfs/file.c
@@ -11,6 +11,8 @@
 #include <linux/pagemap.h>
 #include <linux/dma-buf.h>
 #include <linux/splice.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
 #include <linux/prefetch.h>
 #include "daxfs.h"
 
@@ -761,6 +763,47 @@ long daxfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			dma_buf_put(info->dmabuf);
 		return fd;
 	}
+	case DAXFS_IOC_GET_GPU_INFO: {
+		struct daxfs_gpu_info gi;
+		u64 pcache_off;
+
+		memset(&gi, 0, sizeof(gi));
+
+		gi.dax_phys_addr = daxfs_mem_phys(info, 0);
+		gi.dax_size = info->size;
+
+		if (info->coord) {
+			gi.coord_offset = daxfs_mem_offset(info, info->coord);
+			gi.coord_lock_off = offsetof(struct daxfs_global_coord,
+						     coord_lock);
+			gi.commit_seq_off = offsetof(struct daxfs_global_coord,
+						     commit_sequence);
+		}
+
+		pcache_off = le64_to_cpu(info->super->pcache_offset);
+		if (pcache_off && info->pcache) {
+			struct daxfs_pcache_header *hdr;
+
+			hdr = info->pcache->header;
+			gi.pcache_offset = pcache_off;
+			gi.pcache_slots_offset = pcache_off +
+				le64_to_cpu(hdr->slot_meta_offset);
+			gi.pcache_data_offset = pcache_off +
+				le64_to_cpu(hdr->slot_data_offset);
+			gi.pcache_slot_count = info->pcache->slot_count;
+			gi.pcache_slot_stride =
+				sizeof(struct daxfs_pcache_slot);
+			gi.state_tag_off = offsetof(struct daxfs_pcache_slot,
+						    state_tag);
+			gi.pending_count_off =
+				offsetof(struct daxfs_pcache_header,
+					 pending_count);
+		}
+
+		if (copy_to_user((void __user *)arg, &gi, sizeof(gi)))
+			return -EFAULT;
+		return 0;
+	}
 	}
 	return -ENOTTY;
 }
diff --git a/include/daxfs_format.h b/include/daxfs_format.h
index b752708..cc50222 100644
--- a/include/daxfs_format.h
+++ b/include/daxfs_format.h
@@ -14,6 +14,29 @@
 
 /* ioctl commands */
 #define DAXFS_IOC_GET_DMABUF	_IO('D', 1)	/* Get dma-buf fd for this mount */
+#define DAXFS_IOC_GET_GPU_INFO	_IOR('D', 2, struct daxfs_gpu_info)
+
+/*
+ * GPU info for PCIe AtomicOps coordination.
+ * Exposes physical addresses and field offsets so a GPU can participate
+ * in the same cmpxchg-based protocols (coord lock, page cache state
+ * machine) via PCIe CAS TLPs.
+ */
+struct daxfs_gpu_info {
+	__u64 dax_phys_addr;		/* Physical base of DAX region */
+	__u64 dax_size;			/* Total DAX region size */
+	__u64 coord_offset;		/* Offset of daxfs_global_coord from base */
+	__u32 coord_lock_off;		/* offsetof(coord_lock) within coord */
+	__u32 commit_seq_off;		/* offsetof(commit_sequence) within coord */
+	__u64 pcache_offset;		/* Offset of pcache region (0 = none) */
+	__u64 pcache_slots_offset;	/* Offset of slot metadata array */
+	__u64 pcache_data_offset;	/* Offset of slot data area */
+	__u32 pcache_slot_count;	/* Number of cache slots */
+	__u32 pcache_slot_stride;	/* sizeof(daxfs_pcache_slot) = 16 */
+	__u32 state_tag_off;		/* offsetof(state_tag) in slot = 0 */
+	__u32 pending_count_off;	/* offsetof(pending_count) in pcache_header */
+	__u64 reserved[4];
+};
 
 #define DAXFS_SUPER_MAGIC	0x64617835	/* "dax5" */
 #define DAXFS_VERSION		8
diff --git a/include/daxfs_gpu.h b/include/daxfs_gpu.h
new file mode 100644
index 0000000..aae24c3
--- /dev/null
+++ b/include/daxfs_gpu.h
@@ -0,0 +1,269 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * daxfs GPU-side coordination header (CUDA)
+ *
+ * Device-inline functions that mirror the kernel's cmpxchg-based
+ * coordination protocols for the coordination lock and shared page
+ * cache.  Each function compiles to PCIe AtomicOp TLPs (CAS / Swap)
+ * that serialize at the memory controller alongside CPU LOCK CMPXCHG,
+ * giving mutual atomicity across CPU and GPU.
+ *
+ * All pointers below are CUdeviceptr-derived device pointers into the
+ * DAX region mapped via daxfs-gpu-map.  The caller is responsible for
+ * computing field addresses from the base pointer and the offsets
+ * returned by DAXFS_IOC_GET_GPU_INFO.
+ *
+ * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved.
+ */
+#ifndef _DAXFS_GPU_H
+#define _DAXFS_GPU_H
+
+#ifdef __CUDACC__
+
+/* Re-export the state/tag helpers so GPU code matches kernel conventions.
+ * Guarded so this header can coexist with daxfs_format.h. */
+#ifndef PCACHE_STATE_FREE
+#define PCACHE_STATE_FREE    0
+#define PCACHE_STATE_PENDING 1
+#define PCACHE_STATE_VALID   2
+#endif
+
+#ifndef PCACHE_STATE
+#define PCACHE_STATE(v)        ((v) & 3ULL)
+#define PCACHE_TAG(v)          ((v) >> 2)
+#define PCACHE_MAKE(state, tag) (((unsigned long long)(tag) << 2) | (state))
+#endif
+
+/*
+ * =========================================================================
+ * Coordination lock (mirrors branch.c daxfs_coord_lock / unlock)
+ * =========================================================================
+ */
+
+/*
+ * Acquire the global coordination lock.
+ *
+ * @lock: device pointer to coord_lock (__le32, 0 = free, 1 = held)
+ *
+ * Spins with atomicCAS (32-bit PCIe CAS TLP) until the lock is acquired.
+ * __threadfence_system() after acquisition ensures all subsequent GPU
+ * loads/stores are ordered after the lock is visible on the PCIe fabric.
+ */
+static __device__ __forceinline__ void
+daxfs_gpu_coord_lock(unsigned int *lock)
+{
+	while (atomicCAS(lock, 0u, 1u) != 0u)
+		;  /* spin */
+	__threadfence_system();
+}
+
+/*
+ * Release the global coordination lock.
+ *
+ * @lock: device pointer to coord_lock
+ *
+ * __threadfence_system() before release ensures all prior GPU stores are
+ * visible on the PCIe fabric before the lock word is cleared.
+ * atomicExch generates a PCIe Swap TLP (unconditional write with return).
+ */
+static __device__ __forceinline__ void
+daxfs_gpu_coord_unlock(unsigned int *lock)
+{
+	__threadfence_system();
+	atomicExch(lock, 0u);
+}
+
+/*
+ * =========================================================================
+ * Commit sequence (mirrors branch.c daxfs_commit_seq_changed)
+ * =========================================================================
+ */
+
+/*
+ * Read the current commit sequence number.
+ *
+ * @seq: device pointer to commit_sequence (__le64)
+ *
+ * Returns the current value.  Caller compares against a cached copy to
+ * detect new commits.  Uses a volatile load (no atomicCAS needed for
+ * read-only access on a naturally-aligned 64-bit word).
+ */
+static __device__ __forceinline__ unsigned long long
+daxfs_gpu_read_commit_seq(const unsigned long long *seq)
+{
+	return *(volatile const unsigned long long *)seq;
+}
+
+/*
+ * =========================================================================
+ * Page cache slot state machine (mirrors pcache.c slot_cmpxchg)
+ * =========================================================================
+ */
+
+/*
+ * Atomic compare-and-swap on a pcache slot's state_tag field.
+ *
+ * @state_tag: device pointer to slot->state_tag (__le64)
+ * @expected:  expected old value (packed state | tag)
+ * @desired:   new value to write if *state_tag == expected
+ *
+ * Returns the value that was in *state_tag before the operation.
+ * Maps to a 64-bit PCIe CAS TLP.
+ */
+static __device__ __forceinline__ unsigned long long
+daxfs_gpu_slot_cmpxchg(unsigned long long *state_tag,
+		       unsigned long long expected,
+		       unsigned long long desired)
+{
+	unsigned long long old = atomicCAS(state_tag, expected, desired);
+
+	__threadfence_system();
+	return old;
+}
+
+/*
+ * =========================================================================
+ * Pending counter (mirrors pcache.c pcache_inc/dec_pending)
+ * =========================================================================
+ */
+
+/*
+ * Atomically increment the pending slot counter.
+ *
+ * @pending_count: device pointer to pcache_header->pending_count (__le32)
+ *
+ * CAS loop identical to the kernel's pcache_inc_pending().
+ */
+static __device__ __forceinline__ void
+daxfs_gpu_pcache_inc_pending(unsigned int *pending_count)
+{
+	unsigned int old_val, new_val;
+
+	do {
+		old_val = *(volatile unsigned int *)pending_count;
+		new_val = old_val + 1;
+	} while (atomicCAS(pending_count, old_val, new_val) != old_val);
+	__threadfence_system();
+}
+
+/*
+ * Atomically decrement the pending slot counter (saturates at 0).
+ *
+ * @pending_count: device pointer to pcache_header->pending_count (__le32)
+ */
+static __device__ __forceinline__ void
+daxfs_gpu_pcache_dec_pending(unsigned int *pending_count)
+{
+	unsigned int old_val, new_val;
+
+	do {
+		old_val = *(volatile unsigned int *)pending_count;
+		if (old_val == 0)
+			return;
+		new_val = old_val - 1;
+	} while (atomicCAS(pending_count, old_val, new_val) != old_val);
+	__threadfence_system();
+}
+
+/*
+ * =========================================================================
+ * Page cache fast-path lookup (mirrors pcache.c daxfs_pcache_get_page)
+ * =========================================================================
+ */
+
+/*
+ * Fast-path cache lookup: load state_tag, check VALID + tag match.
+ *
+ * @state_tag:   device pointer to slot->state_tag
+ * @desired_tag: expected tag value (backing_page_offset >> 12)
+ *
+ * Returns true if slot is VALID with matching tag (cache hit).
+ * Caller can then read directly from the slot data area.
+ */
+static __device__ __forceinline__ bool
+daxfs_gpu_pcache_lookup(const unsigned long long *state_tag,
+			unsigned long long desired_tag)
+{
+	unsigned long long val;
+
+	val = *(volatile const unsigned long long *)state_tag;
+	if (val == PCACHE_MAKE(PCACHE_STATE_VALID, desired_tag)) {
+		__threadfence_system();  /* order data read after state check */
+		return true;
+	}
+	return false;
+}
+
+/*
+ * =========================================================================
+ * Page cache slot claim (mirrors pcache.c slow path FREE -> PENDING)
+ * =========================================================================
+ */
+
+/*
+ * Attempt to claim a FREE cache slot by transitioning to PENDING.
+ *
+ * @state_tag:   device pointer to slot->state_tag
+ * @desired_tag: tag to install (backing_page_offset >> 12)
+ *
+ * Returns true if the slot was successfully claimed (FREE -> PENDING).
+ * On success the caller must also call daxfs_gpu_pcache_inc_pending().
+ * On failure the slot was not FREE or was raced; caller should re-read
+ * and retry.
+ */
+static __device__ __forceinline__ bool
+daxfs_gpu_pcache_claim(unsigned long long *state_tag,
+		       unsigned long long desired_tag)
+{
+	unsigned long long free_val = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+	unsigned long long pend_val = PCACHE_MAKE(PCACHE_STATE_PENDING,
+						  desired_tag);
+	unsigned long long old;
+
+	old = atomicCAS(state_tag, free_val, pend_val);
+	__threadfence_system();
+	return old == free_val;
+}
+
+/*
+ * =========================================================================
+ * Wait for slot to become VALID (mirrors pcache.c wait_valid)
+ * =========================================================================
+ */
+
+/*
+ * Poll until a PENDING slot transitions to VALID with matching tag.
+ *
+ * @state_tag:   device pointer to slot->state_tag
+ * @desired_tag: expected tag value
+ * @max_iters:   maximum poll iterations before giving up
+ *
+ * Returns true if slot became VALID with matching tag within the
+ * iteration budget.  Returns false on timeout or unexpected state
+ * (e.g., slot was evicted to FREE).
+ */
+static __device__ __forceinline__ bool
+daxfs_gpu_pcache_wait_valid(const unsigned long long *state_tag,
+			    unsigned long long desired_tag,
+			    unsigned int max_iters)
+{
+	unsigned long long val;
+	unsigned long long expected = PCACHE_MAKE(PCACHE_STATE_VALID,
+						  desired_tag);
+	unsigned int i;
+
+	for (i = 0; i < max_iters; i++) {
+		val = *(volatile const unsigned long long *)state_tag;
+		if (val == expected) {
+			__threadfence_system();
+			return true;
+		}
+		/* Slot evicted from under us */
+		if (PCACHE_STATE(val) == PCACHE_STATE_FREE)
+			return false;
+	}
+	return false;
+}
+
+#endif /* __CUDACC__ */
+#endif /* _DAXFS_GPU_H */
diff --git a/tests/bench_agent.sh b/tests/bench_agent.sh
new file mode 100755
index 0000000..2a83f3e
--- /dev/null
+++ b/tests/bench_agent.sh
@@ -0,0 +1,841 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# DAXFS AI Agent Speculative Branching Benchmark
+#
+# Measures branch creation, parallel workload execution, commit/merge,
+# and sibling invalidation — the core operations for AI agent speculative
+# execution where N agents fork from shared filesystem state.
+#
+# Usage: sudo ./tests/bench_agent.sh [options]
+#
+# Options:
+#   -n NUM    Maximum agent count (powers of 2 up to this, default: 64)
+#   -d NUM    Maximum nesting depth (default: 8)
+#   -w SIZE   Workload: small, medium, large (default: medium)
+#   -r NUM    Repetitions per measurement (default: 3)
+#   -o DIR    Output directory (default: ./bench_results)
+#   -v        Verbose output
+#
+# Requirements:
+#   - Root privileges
+#   - daxfs.ko module built
+#   - mkdaxfs, daxfs-branch, daxfs-inspect tools built
+#   - /dev/dma_heap/system available (or modify HEAP_DEV)
+
+set -e
+
+# ── Configuration ────────────────────────────────────────────────────
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+MKDAXFS="$PROJECT_DIR/tools/mkdaxfs"
+DAXFS_BRANCH="$PROJECT_DIR/tools/daxfs-branch"
+DAXFS_INSPECT="$PROJECT_DIR/tools/daxfs-inspect"
+MODULE="$PROJECT_DIR/daxfs/daxfs.ko"
+HEAP_DEV="/dev/dma_heap/system"
+
+# Defaults
+MAX_AGENTS=64
+MAX_DEPTH=8
+WORKLOAD="medium"
+REPS=3
+OUTPUT_DIR="./bench_results"
+VERBOSE=0
+
+# Runtime state
+TEST_DIR=""
+MODULE_LOADED_BY_US=0
+
+# ── Output helpers ───────────────────────────────────────────────────
+
+log() { echo -e "$@"; }
+
+log_verbose() {
+    if [ "$VERBOSE" -eq 1 ]; then
+        echo -e "  [v] $*" >&2
+    fi
+}
+
+die() {
+    echo "ERROR: $1" >&2
+    exit 1
+}
+
+# ── Timing ───────────────────────────────────────────────────────────
+
+# Returns nanosecond wall-clock timestamp
+now_ns() {
+    date +%s%N
+}
+
+# Converts nanosecond delta to microseconds
+ns_to_us() {
+    echo $(( $1 / 1000 ))
+}
+
+# ── TSV output ───────────────────────────────────────────────────────
+
+TSV_FILE=""
+
+tsv_init() {
+    TSV_FILE="$OUTPUT_DIR/agents.tsv"
+    mkdir -p "$OUTPUT_DIR"
+    printf "experiment\tparameter\titeration\toperation\tlatency_us\tops_count\tdelta_bytes\tnotes\n" \
+        > "$TSV_FILE"
+}
+
+tsv_row() {
+    # args: experiment parameter iteration operation latency_us ops_count delta_bytes notes
+    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$8" \
+        >> "$TSV_FILE"
+}
+
+# ── Source tree generation ───────────────────────────────────────────
+
+# Generates a C-like source file of approximately the given size (bytes)
+generate_c_file() {
+    local path="$1" target_size="$2"
+    local written=0
+    {
+        echo "/* Auto-generated source file */"
+        echo "#include <stdio.h>"
+        echo "#include <stdlib.h>"
+        echo ""
+        local func_num=0
+        while [ "$written" -lt "$target_size" ]; do
+            cat <<CFUNC
+
+static int compute_${func_num}(int x, int y) {
+    int result = 0;
+    for (int i = 0; i < x; i++) {
+        result += (i * y) ^ (i >> 2);
+        if (result > 1000000) result %= 997;
+    }
+    return result;
+}
+CFUNC
+            func_num=$((func_num + 1))
+            written=$(( written + 180 ))
+        done
+    } > "$path"
+}
+
+# Generates a header file of approximately the given size
+generate_h_file() {
+    local path="$1" target_size="$2"
+    local guard
+    guard=$(basename "$path" | tr '[:lower:].' '[:upper:]_')
+    {
+        echo "#ifndef ${guard}"
+        echo "#define ${guard}"
+        echo ""
+        local decl_num=0 written=0
+        while [ "$written" -lt "$target_size" ]; do
+            echo "int compute_${decl_num}(int x, int y);"
+            echo "struct data_${decl_num} { int field_a; long field_b; char name[64]; };"
+            echo ""
+            decl_num=$((decl_num + 1))
+            written=$((written + 100))
+        done
+        echo "#endif /* ${guard} */"
+    } > "$path"
+}
+
+# Creates ~30-file realistic C project in the given directory
+create_source_tree() {
+    local dir="$1"
+    mkdir -p "$dir/src" "$dir/include" "$dir/tests" "$dir/docs" "$dir/config"
+
+    # src/module_{1..10}.c  — 2-8KB each
+    for i in $(seq 1 10); do
+        local size=$(( 2048 + (i * 600) ))
+        generate_c_file "$dir/src/module_${i}.c" "$size"
+    done
+
+    # include/module_{1..5}.h  — 512-1536 bytes each
+    for i in $(seq 1 5); do
+        local size=$(( 512 + (i * 200) ))
+        generate_h_file "$dir/include/module_${i}.h" "$size"
+    done
+
+    # tests/test_{1..3}.c  — 1-2KB each
+    for i in $(seq 1 3); do
+        generate_c_file "$dir/tests/test_${i}.c" $(( 1024 + i * 300 ))
+    done
+
+    # docs/ARCHITECTURE.md
+    {
+        echo "# Architecture"
+        echo ""
+        echo "This project consists of 10 modules that implement a data processing pipeline."
+        echo "Each module handles a specific stage of the computation."
+        echo ""
+        for i in $(seq 1 10); do
+            echo "## Module $i"
+            echo ""
+            echo "Handles stage $i of the pipeline. Dependencies: module_$((i > 1 ? i-1 : 1))."
+            echo ""
+        done
+    } > "$dir/docs/ARCHITECTURE.md"
+
+    # config/settings.json
+    cat > "$dir/config/settings.json" <<'JSON'
+{
+    "version": "1.0.0",
+    "modules": 10,
+    "optimization_level": 2,
+    "debug": false,
+    "paths": {
+        "input": "/data/input",
+        "output": "/data/output",
+        "cache": "/tmp/cache"
+    }
+}
+JSON
+
+    # Makefile
+    {
+        echo "CC = gcc"
+        echo "CFLAGS = -Wall -Wextra -O2 -Iinclude"
+        echo "SRCS = \$(wildcard src/*.c)"
+        echo "OBJS = \$(SRCS:.c=.o)"
+        echo ""
+        echo "all: libproject.a"
+        echo ""
+        echo "libproject.a: \$(OBJS)"
+        echo "	ar rcs \$@ \$^"
+        echo ""
+        echo "clean:"
+        echo "	rm -f src/*.o libproject.a"
+    } > "$dir/Makefile"
+
+    # README.md
+    {
+        echo "# Project"
+        echo ""
+        echo "A data processing pipeline with 10 modules."
+        echo ""
+        echo "## Build"
+        echo ""
+        echo '```'
+        echo "make"
+        echo '```'
+    } > "$dir/README.md"
+}
+
+# ── Workload ─────────────────────────────────────────────────────────
+
+# Runs the AI agent workload on a mounted branch.
+# Simulates an AI agent editing a code project.
+#
+# Args: $1 = mount path, $2 = workload size (small|medium|large)
+# Returns: number of operations performed (via global WORKLOAD_OPS)
+WORKLOAD_OPS=0
+
+run_agent_workload() {
+    local mnt="$1" size="$2"
+    local n_reads n_creates n_modifies n_mkdirs n_deletes n_renames
+    local ops=0
+
+    case "$size" in
+        small)   n_reads=5;  n_creates=3;  n_modifies=1; n_mkdirs=1; n_deletes=0; n_renames=0 ;;
+        medium)  n_reads=10; n_creates=10; n_modifies=5; n_mkdirs=3; n_deletes=2; n_renames=1 ;;
+        large)   n_reads=20; n_creates=50; n_modifies=20; n_mkdirs=10; n_deletes=5; n_renames=3 ;;
+        *)       die "Unknown workload size: $size" ;;
+    esac
+
+    # Reads — cat existing files
+    for i in $(seq 1 "$n_reads"); do
+        local idx=$(( (i % 10) + 1 ))
+        cat "$mnt/src/module_${idx}.c" > /dev/null 2>&1 || true
+        ops=$((ops + 1))
+    done
+
+    # Mkdirs — create new directories
+    for i in $(seq 1 "$n_mkdirs"); do
+        mkdir -p "$mnt/agent_dir_${i}/sub" 2>/dev/null || true
+        ops=$((ops + 1))
+    done
+
+    # Creates — write new files (1-4KB of generated C content)
+    for i in $(seq 1 "$n_creates"); do
+        local fsize=$(( 1024 + (i * 73 % 3072) ))
+        generate_c_file "$mnt/agent_file_${i}.c" "$fsize"
+        ops=$((ops + 1))
+    done
+
+    # Modifies — overwrite existing source files
+    for i in $(seq 1 "$n_modifies"); do
+        local idx=$(( (i % 10) + 1 ))
+        local fsize=$(( 2048 + (i * 500) ))
+        generate_c_file "$mnt/src/module_${idx}.c" "$fsize"
+        ops=$((ops + 1))
+    done
+
+    # Deletes — remove some created files
+    for i in $(seq 1 "$n_deletes"); do
+        rm -f "$mnt/agent_file_${i}.c" 2>/dev/null || true
+        ops=$((ops + 1))
+    done
+
+    # Renames — rename some files
+    for i in $(seq 1 "$n_renames"); do
+        local src_idx=$(( n_creates - i + 1 ))
+        if [ -f "$mnt/agent_file_${src_idx}.c" ]; then
+            mv "$mnt/agent_file_${src_idx}.c" "$mnt/agent_file_${src_idx}_renamed.c" 2>/dev/null || true
+        fi
+        ops=$((ops + 1))
+    done
+
+    WORKLOAD_OPS=$ops
+}
+
+# ── Environment setup / teardown ─────────────────────────────────────
+
+# Creates a fresh daxfs image and mounts it as main.
+# Args: $1 = source dir, $2 = image size, $3 = delta size, $4 = main mount point
+setup_fresh_image() {
+    local src="$1" img_size="$2" delta_size="$3" mnt_main="$4"
+    mkdir -p "$mnt_main"
+
+    "$MKDAXFS" -d "$src" -H "$HEAP_DEV" -s "$img_size" -m "$mnt_main" -b -D "$delta_size" \
+        || die "mkdaxfs failed (size=$img_size delta=$delta_size)"
+}
+
+# Unmount all mounts under TEST_DIR, unload module if we loaded it
+full_cleanup() {
+    log_verbose "Full cleanup..."
+
+    # Unmount everything under TEST_DIR
+    if [ -n "$TEST_DIR" ] && [ -d "$TEST_DIR" ]; then
+        # Find and unmount in reverse order
+        local mnts
+        mnts=$(mount | grep "$TEST_DIR" | awk '{print $3}' | sort -r) || true
+        for m in $mnts; do
+            umount "$m" 2>/dev/null || umount -l "$m" 2>/dev/null || true
+        done
+        rm -rf "$TEST_DIR"
+    fi
+
+    if [ "$MODULE_LOADED_BY_US" = "1" ]; then
+        rmmod daxfs 2>/dev/null || true
+    fi
+}
+
+# Quick teardown: just unmount everything under TEST_DIR (between experiments)
+teardown_mounts() {
+    if [ -n "$TEST_DIR" ] && [ -d "$TEST_DIR" ]; then
+        local mnts
+        mnts=$(mount | grep "$TEST_DIR" | awk '{print $3}' | sort -r) || true
+        for m in $mnts; do
+            umount "$m" 2>/dev/null || umount -l "$m" 2>/dev/null || true
+        done
+    fi
+}
+
+check_prerequisites() {
+    if [ "$(id -u)" -ne 0 ]; then
+        die "Must run as root"
+    fi
+    for tool in "$MKDAXFS" "$DAXFS_BRANCH" "$DAXFS_INSPECT" "$MODULE"; do
+        if [ ! -f "$tool" ]; then
+            die "$(basename "$tool") not found at $tool — run 'make' first"
+        fi
+    done
+    if [ ! -e "$HEAP_DEV" ]; then
+        die "DMA heap not found at $HEAP_DEV"
+    fi
+}
+
+load_module() {
+    MODULE_LOADED_BY_US=0
+    if ! lsmod | grep -q "^daxfs"; then
+        log_verbose "Loading daxfs module..."
+        insmod "$MODULE" || die "Failed to load module"
+        MODULE_LOADED_BY_US=1
+    fi
+}
+
+# ── Experiment 1: Agent Scalability ──────────────────────────────────
+
+scale_agents() {
+    log "Experiment 1: Agent Scalability (scale_agents)"
+
+    local n=1
+    while [ "$n" -le "$MAX_AGENTS" ]; do
+        log "  N=$n agents..."
+        local iter
+        for iter in $(seq 1 "$REPS"); do
+            log_verbose "Iteration $iter/$REPS for N=$n"
+
+            # Fresh image: delta = (N+2)*1M + 64M headroom
+            local delta_mb=$(( (n + 2) + 64 ))
+            local img_mb=$(( delta_mb + 128 ))
+            local mnt_main="$TEST_DIR/main"
+            local source="$TEST_DIR/source"
+
+            teardown_mounts
+            rm -rf "$TEST_DIR/main" "$TEST_DIR/branch_"*
+            mkdir -p "$mnt_main"
+
+            setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main"
+
+            # ── Branch creation (sequential, branch_lock serializes) ──
+            local t0 t1 branch_total_us=0
+            local branch_mnts=()
+            for b in $(seq 1 "$n"); do
+                local bmnt="$TEST_DIR/branch_${b}"
+                mkdir -p "$bmnt"
+                branch_mnts+=("$bmnt")
+
+                t0=$(now_ns)
+                "$DAXFS_BRANCH" create "agent_${b}" -m "$bmnt" -p main \
+                    || die "Failed to create branch agent_${b}"
+                t1=$(now_ns)
+
+                local lat_us
+                lat_us=$(ns_to_us $((t1 - t0)))
+                branch_total_us=$((branch_total_us + lat_us))
+                log_verbose "  branch agent_${b} created in ${lat_us} us"
+            done
+            tsv_row "scale_agents" "$n" "$iter" "branch_create_total" "$branch_total_us" "$n" "" ""
+            tsv_row "scale_agents" "$n" "$iter" "branch_create_avg" "$((branch_total_us / n))" "$n" "" ""
+
+            # ── Parallel workload on all branches ──
+            t0=$(now_ns)
+            local pids=()
+            for b in $(seq 1 "$n"); do
+                (
+                    run_agent_workload "${branch_mnts[$((b-1))]}" "$WORKLOAD"
+                ) &
+                pids+=($!)
+            done
+            # Wait for all
+            local workload_fail=0
+            for pid in "${pids[@]}"; do
+                wait "$pid" 2>/dev/null || workload_fail=$((workload_fail + 1))
+            done
+            t1=$(now_ns)
+            local workload_us
+            workload_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_agents" "$n" "$iter" "parallel_workload" "$workload_us" "$n" "" \
+                "fails=$workload_fail"
+
+            # ── Commit winner (branch 1) ──
+            t0=$(now_ns)
+            "$DAXFS_BRANCH" commit -m "${branch_mnts[0]}" \
+                || die "Commit failed for agent_1"
+            t1=$(now_ns)
+            local commit_us
+            commit_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_agents" "$n" "$iter" "commit" "$commit_us" "1" "" ""
+
+            # ── Detect ESTALE on sibling (branch 2, if it exists) ──
+            if [ "$n" -ge 2 ]; then
+                t0=$(now_ns)
+                # Any operation on an invalidated sibling should fail with ESTALE
+                local estale_rc=0
+                cat "${branch_mnts[1]}/src/module_1.c" > /dev/null 2>&1 || estale_rc=$?
+                t1=$(now_ns)
+                local estale_us
+                estale_us=$(ns_to_us $((t1 - t0)))
+                tsv_row "scale_agents" "$n" "$iter" "estale_detect" "$estale_us" "1" "" \
+                    "rc=$estale_rc"
+            fi
+
+            # ── End-to-end cleanup time ──
+            t0=$(now_ns)
+            teardown_mounts
+            t1=$(now_ns)
+            local cleanup_us
+            cleanup_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_agents" "$n" "$iter" "cleanup" "$cleanup_us" "$n" "" ""
+
+        done
+        n=$((n * 2))
+    done
+}
+
+# ── Experiment 2: Nesting Depth ──────────────────────────────────────
+
+scale_depth() {
+    log "Experiment 2: Nesting Depth (scale_depth)"
+
+    local d=1
+    while [ "$d" -le "$MAX_DEPTH" ]; do
+        log "  D=$d levels..."
+        local iter
+        for iter in $(seq 1 "$REPS"); do
+            log_verbose "Iteration $iter/$REPS for D=$d"
+
+            # Fresh image: need d+2 branches worth of delta + headroom
+            local delta_mb=$(( (d + 4) + 64 ))
+            local img_mb=$(( delta_mb + 128 ))
+            local mnt_main="$TEST_DIR/main"
+            local source="$TEST_DIR/source"
+
+            teardown_mounts
+            rm -rf "$TEST_DIR/main" "$TEST_DIR/level_"*
+            mkdir -p "$mnt_main"
+
+            setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main"
+
+            # ── Create chain: main → L1 → L2 → ... → LD ──
+            local t0 t1
+            t0=$(now_ns)
+            local parent="main"
+            local level_mnts=()
+            for lvl in $(seq 1 "$d"); do
+                local lmnt="$TEST_DIR/level_${lvl}"
+                mkdir -p "$lmnt"
+                level_mnts+=("$lmnt")
+                "$DAXFS_BRANCH" create "level_${lvl}" -m "$lmnt" -p "$parent" \
+                    || die "Failed to create level_${lvl}"
+                parent="level_${lvl}"
+            done
+            t1=$(now_ns)
+            local chain_us
+            chain_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_depth" "$d" "$iter" "chain_create" "$chain_us" "$d" "" ""
+
+            # ── Workload at deepest level ──
+            local deepest="${level_mnts[$((d-1))]}"
+            t0=$(now_ns)
+            run_agent_workload "$deepest" "$WORKLOAD"
+            t1=$(now_ns)
+            local work_us
+            work_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_depth" "$d" "$iter" "workload" "$work_us" "$WORKLOAD_OPS" "" ""
+
+            # ── Get delta bytes before commit ──
+            local delta_bytes=""
+            delta_bytes=$("$DAXFS_INSPECT" info -m "$deepest" -b "level_${d}" 2>/dev/null \
+                | grep -i "delta.*used" | head -1 | grep -oP '\d+' | tail -1) || true
+            log_verbose "  delta_bytes at level_${d}: $delta_bytes"
+
+            # ── Commit from deepest level (walks up D levels) ──
+            t0=$(now_ns)
+            "$DAXFS_BRANCH" commit -m "$deepest" \
+                || die "Commit from depth $d failed"
+            t1=$(now_ns)
+            local commit_us
+            commit_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "scale_depth" "$d" "$iter" "commit" "$commit_us" "$d" "$delta_bytes" ""
+
+            teardown_mounts
+        done
+        d=$((d * 2))
+    done
+}
+
+# ── Experiment 3: Commit Cost vs Delta Size ──────────────────────────
+
+commit_cost() {
+    log "Experiment 3: Commit Cost vs Delta Size (commit_cost)"
+
+    local ops_counts="10 50 100 500 1000 5000"
+
+    for ops_target in $ops_counts; do
+        log "  ops=$ops_target..."
+        local iter
+        for iter in $(seq 1 "$REPS"); do
+            log_verbose "Iteration $iter/$REPS for ops=$ops_target"
+
+            # Fresh image with generous delta
+            local delta_mb=128
+            local img_mb=256
+            local mnt_main="$TEST_DIR/main"
+            local bmnt="$TEST_DIR/branch_cost"
+            local source="$TEST_DIR/source"
+
+            teardown_mounts
+            rm -rf "$TEST_DIR/main" "$TEST_DIR/branch_cost"
+            mkdir -p "$mnt_main" "$bmnt"
+
+            setup_fresh_image "$source" "${img_mb}M" "${delta_mb}M" "$mnt_main"
+
+            "$DAXFS_BRANCH" create "cost_test" -m "$bmnt" -p main \
+                || die "Failed to create cost_test branch"
+
+            # ── Run exactly ops_target write operations ──
+            local t0 t1
+            t0=$(now_ns)
+            local op=0
+            while [ "$op" -lt "$ops_target" ]; do
+                local fsize=$(( 1024 + (op * 37 % 3072) ))
+                generate_c_file "$bmnt/cost_file_${op}.c" "$fsize"
+                op=$((op + 1))
+            done
+            t1=$(now_ns)
+            local workload_us
+            workload_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "commit_cost" "$ops_target" "$iter" "workload" "$workload_us" "$ops_target" "" ""
+
+            # ── Read delta_bytes via daxfs-inspect ──
+            local delta_bytes=""
+            delta_bytes=$("$DAXFS_INSPECT" info -m "$bmnt" -b "cost_test" 2>/dev/null \
+                | grep -i "delta.*used" | head -1 | grep -oP '\d+' | tail -1) || true
+            log_verbose "  delta_bytes: $delta_bytes"
+
+            # ── Commit and time it ──
+            t0=$(now_ns)
+            "$DAXFS_BRANCH" commit -m "$bmnt" \
+                || die "Commit failed for ops=$ops_target"
+            t1=$(now_ns)
+            local commit_us
+            commit_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "commit_cost" "$ops_target" "$iter" "commit" "$commit_us" "$ops_target" \
+                "$delta_bytes" ""
+
+            teardown_mounts
+        done
+    done
+}
+
+# ── Experiment 4: tmpfs Baseline ─────────────────────────────────────
+
+baseline_tmpfs() {
+    log "Experiment 4: tmpfs Baseline (baseline_tmpfs)"
+
+    local source="$TEST_DIR/source"
+    local tmpfs_base="$TEST_DIR/tmpfs_base"
+
+    local n=1
+    while [ "$n" -le "$MAX_AGENTS" ]; do
+        log "  N=$n agents (tmpfs)..."
+        local iter
+        for iter in $(seq 1 "$REPS"); do
+            log_verbose "Iteration $iter/$REPS for N=$n"
+
+            # Fresh tmpfs base
+            rm -rf "$tmpfs_base"
+            mkdir -p "$tmpfs_base"
+            mount -t tmpfs -o size=512M tmpfs "$tmpfs_base"
+            cp -a "$source"/. "$tmpfs_base/"
+
+            # ── "Branch" creation = cp -a (O(source_size)) ──
+            local t0 t1 branch_total_us=0
+            local branch_dirs=()
+            for b in $(seq 1 "$n"); do
+                local bdir="$TEST_DIR/tmpfs_branch_${b}"
+                t0=$(now_ns)
+                cp -a "$tmpfs_base" "$bdir"
+                t1=$(now_ns)
+                branch_dirs+=("$bdir")
+                local lat_us
+                lat_us=$(ns_to_us $((t1 - t0)))
+                branch_total_us=$((branch_total_us + lat_us))
+            done
+            tsv_row "baseline_tmpfs" "$n" "$iter" "branch_create_total" "$branch_total_us" "$n" "" ""
+            tsv_row "baseline_tmpfs" "$n" "$iter" "branch_create_avg" "$((branch_total_us / n))" "$n" "" ""
+
+            # ── Parallel workload ──
+            t0=$(now_ns)
+            local pids=()
+            for b in $(seq 1 "$n"); do
+                (
+                    run_agent_workload "${branch_dirs[$((b-1))]}" "$WORKLOAD"
+                ) &
+                pids+=($!)
+            done
+            for pid in "${pids[@]}"; do
+                wait "$pid" 2>/dev/null || true
+            done
+            t1=$(now_ns)
+            local workload_us
+            workload_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_tmpfs" "$n" "$iter" "parallel_workload" "$workload_us" "$n" "" ""
+
+            # ── "Commit" = cp -a winner back to base ──
+            t0=$(now_ns)
+            rm -rf "$tmpfs_base"
+            cp -a "${branch_dirs[0]}" "$tmpfs_base"
+            t1=$(now_ns)
+            local commit_us
+            commit_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_tmpfs" "$n" "$iter" "commit" "$commit_us" "1" "" ""
+
+            # ── Cleanup branches ──
+            t0=$(now_ns)
+            for bdir in "${branch_dirs[@]}"; do
+                rm -rf "$bdir"
+            done
+            umount "$tmpfs_base" 2>/dev/null || true
+            rm -rf "$tmpfs_base"
+            t1=$(now_ns)
+            local cleanup_us
+            cleanup_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_tmpfs" "$n" "$iter" "cleanup" "$cleanup_us" "$n" "" ""
+        done
+        n=$((n * 2))
+    done
+}
+
+# ── Experiment 5: OverlayFS Baseline ─────────────────────────────────
+
+baseline_overlayfs() {
+    log "Experiment 5: OverlayFS Baseline (baseline_overlayfs)"
+
+    local source="$TEST_DIR/source"
+
+    local n=1
+    while [ "$n" -le "$MAX_AGENTS" ]; do
+        log "  N=$n agents (overlayfs)..."
+        local iter
+        for iter in $(seq 1 "$REPS"); do
+            log_verbose "Iteration $iter/$REPS for N=$n"
+
+            # ── "Branch" creation = overlayfs mount with upper/work dirs ──
+            local t0 t1 branch_total_us=0
+            local overlay_mnts=()
+            for b in $(seq 1 "$n"); do
+                local upper="$TEST_DIR/ovl_upper_${b}"
+                local work="$TEST_DIR/ovl_work_${b}"
+                local merged="$TEST_DIR/ovl_merged_${b}"
+                mkdir -p "$upper" "$work" "$merged"
+
+                t0=$(now_ns)
+                mount -t overlay overlay \
+                    -o "lowerdir=${source},upperdir=${upper},workdir=${work}" \
+                    "$merged" 2>/dev/null \
+                    || { log_verbose "overlayfs mount failed for branch $b"; break; }
+                t1=$(now_ns)
+                overlay_mnts+=("$merged")
+                local lat_us
+                lat_us=$(ns_to_us $((t1 - t0)))
+                branch_total_us=$((branch_total_us + lat_us))
+            done
+            local actual_n=${#overlay_mnts[@]}
+            if [ "$actual_n" -eq 0 ]; then
+                log_verbose "No overlayfs mounts succeeded, skipping"
+                continue
+            fi
+            tsv_row "baseline_overlayfs" "$n" "$iter" "branch_create_total" "$branch_total_us" \
+                "$actual_n" "" ""
+            tsv_row "baseline_overlayfs" "$n" "$iter" "branch_create_avg" \
+                "$((branch_total_us / actual_n))" "$actual_n" "" ""
+
+            # ── Parallel workload ──
+            t0=$(now_ns)
+            local pids=()
+            for b in $(seq 1 "$actual_n"); do
+                (
+                    run_agent_workload "${overlay_mnts[$((b-1))]}" "$WORKLOAD"
+                ) &
+                pids+=($!)
+            done
+            for pid in "${pids[@]}"; do
+                wait "$pid" 2>/dev/null || true
+            done
+            t1=$(now_ns)
+            local workload_us
+            workload_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_overlayfs" "$n" "$iter" "parallel_workload" "$workload_us" \
+                "$actual_n" "" ""
+
+            # ── "Commit" = copy upper dir to source ──
+            t0=$(now_ns)
+            cp -a "$TEST_DIR/ovl_upper_1"/. "$source/" 2>/dev/null || true
+            t1=$(now_ns)
+            local commit_us
+            commit_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_overlayfs" "$n" "$iter" "commit" "$commit_us" "1" "" ""
+
+            # ── Cleanup ──
+            t0=$(now_ns)
+            for b in $(seq 1 "$actual_n"); do
+                umount "$TEST_DIR/ovl_merged_${b}" 2>/dev/null || true
+            done
+            rm -rf "$TEST_DIR"/ovl_upper_* "$TEST_DIR"/ovl_work_* "$TEST_DIR"/ovl_merged_*
+            t1=$(now_ns)
+            local cleanup_us
+            cleanup_us=$(ns_to_us $((t1 - t0)))
+            tsv_row "baseline_overlayfs" "$n" "$iter" "cleanup" "$cleanup_us" "$actual_n" "" \
+                "no_nesting"
+        done
+        n=$((n * 2))
+    done
+}
+
+# ── Argument parsing ─────────────────────────────────────────────────
+
+parse_args() {
+    while getopts "n:d:w:r:o:v" opt; do
+        case $opt in
+            n) MAX_AGENTS="$OPTARG" ;;
+            d) MAX_DEPTH="$OPTARG" ;;
+            w) WORKLOAD="$OPTARG" ;;
+            r) REPS="$OPTARG" ;;
+            o) OUTPUT_DIR="$OPTARG" ;;
+            v) VERBOSE=1 ;;
+            *) echo "Usage: $0 [-n agents] [-d depth] [-w small|medium|large] [-r reps] [-o dir] [-v]"
+               exit 1 ;;
+        esac
+    done
+
+    # Validate workload
+    case "$WORKLOAD" in
+        small|medium|large) ;;
+        *) die "Invalid workload: $WORKLOAD (must be small, medium, or large)" ;;
+    esac
+
+    # Validate max_agents is reasonable
+    if [ "$MAX_AGENTS" -lt 1 ] || [ "$MAX_AGENTS" -gt 256 ]; then
+        die "Agent count must be between 1 and 256"
+    fi
+}
+
+# ── Main ─────────────────────────────────────────────────────────────
+
+main() {
+    parse_args "$@"
+
+    log "DAXFS AI Agent Speculative Branching Benchmark"
+    log "==============================================="
+    log "  max_agents=$MAX_AGENTS  max_depth=$MAX_DEPTH  workload=$WORKLOAD"
+    log "  reps=$REPS  output=$OUTPUT_DIR"
+    log ""
+
+    check_prerequisites
+    load_module
+    tsv_init
+
+    # Create persistent test directory and source tree
+    TEST_DIR=$(mktemp -d /tmp/daxfs_bench.XXXXXX)
+    trap full_cleanup EXIT
+
+    local source="$TEST_DIR/source"
+    log "Generating source tree..."
+    create_source_tree "$source"
+    local src_files
+    src_files=$(find "$source" -type f | wc -l)
+    log "  $src_files files in source tree"
+    log ""
+
+    # ── Run experiments ──
+
+    scale_agents
+    log ""
+
+    scale_depth
+    log ""
+
+    commit_cost
+    log ""
+
+    baseline_tmpfs
+    log ""
+
+    baseline_overlayfs
+    log ""
+
+    # ── Summary ──
+    local rows
+    rows=$(tail -n +2 "$TSV_FILE" | wc -l)
+    log "==============================================="
+    log "Benchmark complete: $rows data points"
+    log "Results: $TSV_FILE"
+}
+
+main "$@"
diff --git a/tests/bench_gpu.cu b/tests/bench_gpu.cu
new file mode 100644
index 0000000..bb14177
--- /dev/null
+++ b/tests/bench_gpu.cu
@@ -0,0 +1,1004 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAXFS GPU Coordination Benchmark
+ *
+ * Measures PCIe AtomicOp latency and throughput for the GPU-side
+ * coordination primitives (coord lock, page cache CAS, commit seq read).
+ *
+ * Uses cudaMallocHost (pinned host memory) to create a GPU-accessible
+ * region that mirrors the daxfs DAX layout.  GPU kernels then exercise
+ * the same atomicCAS / volatile-read primitives from daxfs_gpu.h,
+ * measuring PCIe round-trip latency across the bus.
+ *
+ * Usage: sudo ./tests/bench_gpu [options]
+ */
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* GPU-side primitives */
+#include "../include/daxfs_gpu.h"
+
+/* ── Configuration ────────────────────────────────────────────────── */
+
+#define DEFAULT_ITERS       10000
+#define DEFAULT_REPS        5
+#define WARMUP_ITERS        200
+#define PCACHE_SLOTS        4096
+
+/* ── P2P DMA and Multi-tenant constants ──────────────────────────── */
+
+#define P2P_MAX_SIZE        (1 << 20)   /* 1 MB max transfer */
+#define P2P_REGION_SIZE     (4 << 20)   /* 4 MB P2P data region */
+
+#define MT_MAX_TENANTS      32
+#define MT_PERM_NONE        0
+#define MT_PERM_READ        1
+#define MT_PERM_WRITE       2
+#define MT_PERM_RW          3
+
+/* ── Simulated DAX layout in pinned host memory ───────────────────── */
+
+struct dax_sim {
+    /* Coordination region (mirrors daxfs_global_coord) */
+    unsigned int       coord_lock;       /* 0=free, 1=held */
+    unsigned int       coord_pad;
+    unsigned long long commit_sequence;
+    unsigned long long last_committed_id;
+
+    /* Page cache header (mirrors daxfs_pcache_header) */
+    unsigned int       pcache_pending;
+    unsigned int       pcache_pad[3];
+
+    /* Page cache slot metadata array */
+    struct {
+        unsigned long long state_tag;    /* bits[1:0]=state, bits[63:2]=tag */
+        unsigned int       ref_bit;
+        unsigned int       reserved;
+    } slots[PCACHE_SLOTS];
+
+    /* Multi-tenant permission table (mirrors CXL shared memory ACL).
+     * perm[tenant][slot] = MT_PERM_{NONE,READ,WRITE,RW}.
+     * GPU checks this before every read/write to enforce isolation. */
+    unsigned char perm[MT_MAX_TENANTS][PCACHE_SLOTS];
+};
+
+/* Separate P2P DMA data region (pinned host, simulates CXL memory pages) */
+struct p2p_region {
+    char data[P2P_REGION_SIZE];
+};
+
+/* ── TSV output ───────────────────────────────────────────────────── */
+
+static FILE *tsv_fp;
+
+static void tsv_init(const char *path)
+{
+    tsv_fp = fopen(path, "w");
+    if (!tsv_fp) { perror("fopen"); exit(1); }
+    fprintf(tsv_fp, "experiment\tparameter\titeration\toperation\t"
+                    "latency_ns\tops_count\tthru_mops\tnotes\n");
+}
+
+static void tsv_row(const char *exp, int param, int iter,
+                    const char *op, double lat_ns, int ops,
+                    double mops, const char *notes)
+{
+    fprintf(tsv_fp, "%s\t%d\t%d\t%s\t%.1f\t%d\t%.3f\t%s\n",
+            exp, param, iter, op, lat_ns, ops, mops, notes ? notes : "");
+    fflush(tsv_fp);
+}
+
+/* ── GPU timer ────────────────────────────────────────────────────── */
+
+static double event_ms_to_ns(cudaEvent_t a, cudaEvent_t b)
+{
+    float ms;
+    cudaEventElapsedTime(&ms, a, b);
+    return (double)ms * 1e6;
+}
+
+/* ── Kernel 1: Coord Lock/Unlock Round-Trip (single thread) ──────── */
+
+__global__ void kern_lock_rt(unsigned int *lock, int iters,
+                              long long *out_cycles)
+{
+    long long t0 = clock64();
+    for (int i = 0; i < iters; i++) {
+        daxfs_gpu_coord_lock(lock);
+        daxfs_gpu_coord_unlock(lock);
+    }
+    *out_cycles = clock64() - t0;
+}
+
+/* ── Kernel 2: Commit Sequence Volatile Read ─────────────────────── */
+
+__global__ void kern_seq_read(const unsigned long long *seq, int iters,
+                               unsigned long long *sink, long long *out_cycles)
+{
+    unsigned long long s = 0;
+    long long t0 = clock64();
+    for (int i = 0; i < iters; i++)
+        s += daxfs_gpu_read_commit_seq(seq);
+    *out_cycles = clock64() - t0;
+    *sink = s;
+}
+
+/* ── Kernel 3: Pending Counter Inc/Dec ────────────────────────────── */
+
+__global__ void kern_pending(unsigned int *cnt, int iters,
+                              long long *out_cycles)
+{
+    long long t0 = clock64();
+    for (int i = 0; i < iters; i++) {
+        daxfs_gpu_pcache_inc_pending(cnt);
+        daxfs_gpu_pcache_dec_pending(cnt);
+    }
+    *out_cycles = clock64() - t0;
+}
+
+/* ── Kernel 4: Page Cache Lookup (scaling threads) ────────────────── */
+
+__global__ void kern_lookup(const unsigned long long *slot_base,
+                             unsigned int stride, unsigned int st_off,
+                             unsigned int slot_count, int iters,
+                             unsigned int *out_hits)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    const unsigned long long *st =
+        (const unsigned long long *)((const char *)slot_base +
+                                      (unsigned long long)idx * stride + st_off);
+    unsigned long long tag = (unsigned long long)(idx + 1);
+    unsigned int hits = 0;
+
+    for (int i = 0; i < iters; i++)
+        if (daxfs_gpu_pcache_lookup(st, tag))
+            hits++;
+
+    atomicAdd(out_hits, hits);
+}
+
+/* ── Kernel 5: CAS Throughput on Independent Slots ────────────────── */
+
+__global__ void kern_cas_indep(unsigned long long *slot_base,
+                                unsigned int stride, unsigned int st_off,
+                                unsigned int slot_count, int iters)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    unsigned long long *st =
+        (unsigned long long *)((char *)slot_base +
+                                (unsigned long long)idx * stride + st_off);
+
+    unsigned long long va = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+    unsigned long long vb = PCACHE_MAKE(PCACHE_STATE_PENDING,
+                                         (unsigned long long)(tid + 1));
+
+    for (int i = 0; i < iters; i++) {
+        unsigned long long old = daxfs_gpu_slot_cmpxchg(st, va, vb);
+        if (old == vb)
+            daxfs_gpu_slot_cmpxchg(st, vb, va);
+    }
+}
+
+/* ── Kernel 6: Lock Contention (multiple threads, one lock) ──────── */
+
+__global__ void kern_lock_contend(unsigned int *lock, int iters,
+                                   unsigned int *counter)
+{
+    for (int i = 0; i < iters; i++) {
+        daxfs_gpu_coord_lock(lock);
+        (*counter)++;
+        __threadfence_system();
+        daxfs_gpu_coord_unlock(lock);
+    }
+}
+
+/* ── Kernel 7: Page Cache Claim (FREE→PENDING transition) ─────────── */
+
+__global__ void kern_claim(unsigned long long *slot_base,
+                            unsigned int stride, unsigned int st_off,
+                            unsigned int slot_count, int iters,
+                            unsigned int *pending_count)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    unsigned long long *st =
+        (unsigned long long *)((char *)slot_base +
+                                (unsigned long long)idx * stride + st_off);
+    unsigned long long tag = (unsigned long long)(idx + 1);
+
+    for (int i = 0; i < iters; i++) {
+        if (daxfs_gpu_pcache_claim(st, tag)) {
+            daxfs_gpu_pcache_inc_pending(pending_count);
+            /* Reset to FREE for next iteration */
+            *st = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+            __threadfence_system();
+            daxfs_gpu_pcache_dec_pending(pending_count);
+        }
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * P2P DMA kernels — GPU directly reads/writes CXL/DAX host memory
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* Kernel 8a: P2P DMA Read — GPU reads pages from host CXL memory */
+__global__ void kern_p2p_read(const char *host_src, char *dev_dst,
+                               unsigned int xfer_size, int iters)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int offset = (tid * xfer_size) % P2P_REGION_SIZE;
+    const char *src = host_src + offset;
+
+    for (int i = 0; i < iters; i++) {
+        /* Explicit GPU load from host (PCIe read TLPs) */
+        for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) {
+            unsigned long long val =
+                *(volatile const unsigned long long *)(src + b);
+            /* Store to device mem so compiler doesn't elide the load */
+            if (dev_dst)
+                *(unsigned long long *)(dev_dst + b) = val;
+        }
+    }
+}
+
+/* Kernel 8b: P2P DMA Write — GPU writes pages to host CXL memory */
+__global__ void kern_p2p_write(char *host_dst, unsigned int xfer_size,
+                                int iters)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int offset = (tid * xfer_size) % P2P_REGION_SIZE;
+    char *dst = host_dst + offset;
+    unsigned long long pattern = 0xDA5F5DA5ULL + tid;
+
+    for (int i = 0; i < iters; i++) {
+        /* Explicit GPU store to host (PCIe write TLPs) */
+        for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) {
+            *(volatile unsigned long long *)(dst + b) = pattern + b;
+        }
+        __threadfence_system();
+    }
+}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Multi-tenant CXL memory permission-checked access kernels
+ *
+ * Simulates GPU agents from different tenants sharing a CXL memory
+ * pool.  Each access performs an explicit permission check against
+ * a per-tenant ACL bitmap in the shared DAX region before issuing
+ * the actual read/write.  This is the GPU-side enforcement path
+ * that mirrors what hardware CXL.mem permission checks would do.
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/*
+ * Permission check + read: GPU thread loads perm[tenant][slot],
+ * verifies MT_PERM_READ is set, then reads the slot data.
+ */
+__global__ void kern_mt_read(const unsigned long long *slot_base,
+                              unsigned int stride, unsigned int st_off,
+                              unsigned int slot_count,
+                              const unsigned char *perm_base,
+                              unsigned int tenant_id, int iters,
+                              unsigned int *out_allowed,
+                              unsigned int *out_denied)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    unsigned int allowed = 0, denied = 0;
+
+    /* Permission table row for this tenant */
+    const unsigned char *my_perm = perm_base + tenant_id * slot_count;
+
+    const unsigned long long *st =
+        (const unsigned long long *)((const char *)slot_base +
+                                      (unsigned long long)idx * stride + st_off);
+
+    for (int i = 0; i < iters; i++) {
+        /* Step 1: Explicit permission check (volatile read from host ACL) */
+        unsigned char p = *(volatile const unsigned char *)(my_perm + idx);
+
+        if (p & MT_PERM_READ) {
+            /* Step 2: Allowed — read the slot via PCIe */
+            unsigned long long val = *(volatile const unsigned long long *)st;
+            (void)val;
+            allowed++;
+        } else {
+            denied++;
+        }
+        /* Rotate to next slot */
+        idx = (idx + 7) % slot_count;
+        st = (const unsigned long long *)((const char *)slot_base +
+              (unsigned long long)idx * stride + st_off);
+    }
+
+    atomicAdd(out_allowed, allowed);
+    atomicAdd(out_denied, denied);
+}
+
+/*
+ * Permission check + write: GPU thread checks MT_PERM_WRITE,
+ * then performs CAS on the slot (simulating a page cache mutation).
+ */
+__global__ void kern_mt_write(unsigned long long *slot_base,
+                               unsigned int stride, unsigned int st_off,
+                               unsigned int slot_count,
+                               const unsigned char *perm_base,
+                               unsigned int tenant_id, int iters,
+                               unsigned int *out_allowed,
+                               unsigned int *out_denied)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    unsigned int allowed = 0, denied = 0;
+
+    const unsigned char *my_perm = perm_base + tenant_id * slot_count;
+
+    for (int i = 0; i < iters; i++) {
+        unsigned long long *st =
+            (unsigned long long *)((char *)slot_base +
+                                    (unsigned long long)idx * stride + st_off);
+
+        /* Step 1: Explicit permission check */
+        unsigned char p = *(volatile const unsigned char *)(my_perm + idx);
+
+        if (p & MT_PERM_WRITE) {
+            /* Step 2: Allowed — CAS write to slot (PCIe CAS TLP) */
+            unsigned long long old = *st;
+            unsigned long long new_val = old + 1;
+            daxfs_gpu_slot_cmpxchg(st, old, new_val);
+            allowed++;
+        } else {
+            denied++;
+        }
+        idx = (idx + 7) % slot_count;
+    }
+
+    atomicAdd(out_allowed, allowed);
+    atomicAdd(out_denied, denied);
+}
+
+/*
+ * Baseline: same read without permission check, to measure
+ * the overhead of the ACL lookup.
+ */
+__global__ void kern_mt_read_noacl(const unsigned long long *slot_base,
+                                    unsigned int stride, unsigned int st_off,
+                                    unsigned int slot_count, int iters,
+                                    unsigned int *out_count)
+{
+    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = tid % slot_count;
+    unsigned int count = 0;
+
+    for (int i = 0; i < iters; i++) {
+        const unsigned long long *st =
+            (const unsigned long long *)((const char *)slot_base +
+                                          (unsigned long long)idx * stride + st_off);
+        unsigned long long val = *(volatile const unsigned long long *)st;
+        (void)val;
+        count++;
+        idx = (idx + 7) % slot_count;
+    }
+
+    atomicAdd(out_count, count);
+}
+
+/* ── Error checking ───────────────────────────────────────────────── */
+
+#define CK(call) do {                                                   \
+    cudaError_t e = (call);                                             \
+    if (e != cudaSuccess) {                                             \
+        fprintf(stderr, "CUDA %s:%d: %s\n", __FILE__, __LINE__,        \
+                cudaGetErrorString(e));                                  \
+        exit(1);                                                        \
+    }                                                                   \
+} while(0)
+
+/* ── Main ─────────────────────────────────────────────────────────── */
+
+int main(int argc, char **argv)
+{
+    const char *tsv_path = "./bench_results/gpu.tsv";
+    int iters = DEFAULT_ITERS;
+    int reps = DEFAULT_REPS;
+    int opt;
+
+    while ((opt = getopt(argc, argv, "i:r:o:h")) != -1) {
+        switch (opt) {
+        case 'i': iters = atoi(optarg); break;
+        case 'r': reps = atoi(optarg); break;
+        case 'o': tsv_path = optarg; break;
+        default:
+            fprintf(stderr, "Usage: %s [-i iters] [-r reps] [-o out.tsv]\n",
+                    argv[0]);
+            return 1;
+        }
+    }
+
+    /* GPU info */
+    cudaDeviceProp prop;
+    CK(cudaGetDeviceProperties(&prop, 0));
+    int clock_khz;
+    CK(cudaDeviceGetAttribute(&clock_khz, cudaDevAttrClockRate, 0));
+    double ns_per_cycle = 1e6 / (double)clock_khz;
+
+    printf("DAXFS GPU Coordination Benchmark\n");
+    printf("================================\n");
+    printf("GPU: %s (SM %d.%d, %d SMs, %.0f MHz)\n",
+           prop.name, prop.major, prop.minor,
+           prop.multiProcessorCount, prop.clockRate / 1000.0);
+    printf("  iters=%d  reps=%d  pcache_slots=%d\n", iters, reps, PCACHE_SLOTS);
+    printf("  ns/cycle=%.2f\n\n", ns_per_cycle);
+
+    /* Allocate pinned host memory simulating DAX region */
+    struct dax_sim *dax;
+    CK(cudaMallocHost(&dax, sizeof(struct dax_sim)));
+    memset(dax, 0, sizeof(struct dax_sim));
+
+    /* Initialize pcache slots as VALID with known tags for lookup test */
+    for (int i = 0; i < PCACHE_SLOTS; i++)
+        dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_VALID,
+                                               (unsigned long long)(i + 1));
+
+    /* Get device pointers for pinned memory */
+    unsigned int       *d_lock = &dax->coord_lock;
+    unsigned long long *d_seq  = &dax->commit_sequence;
+    unsigned int       *d_pend = &dax->pcache_pending;
+    unsigned long long *d_slot_base = &dax->slots[0].state_tag;
+    unsigned int        slot_stride = sizeof(dax->slots[0]);  /* 16 bytes */
+    unsigned int        st_off = 0;  /* state_tag at offset 0 in slot */
+
+    /* Scratch device memory */
+    long long *d_cycles;
+    unsigned long long *d_sink;
+    unsigned int *d_counter, *d_hits;
+    CK(cudaMalloc(&d_cycles, sizeof(long long)));
+    CK(cudaMalloc(&d_sink, sizeof(unsigned long long)));
+    CK(cudaMalloc(&d_counter, sizeof(unsigned int)));
+    CK(cudaMalloc(&d_hits, sizeof(unsigned int)));
+
+    cudaEvent_t ev0, ev1;
+    CK(cudaEventCreate(&ev0));
+    CK(cudaEventCreate(&ev1));
+
+    tsv_init(tsv_path);
+
+    printf("DAX simulation at %p (pinned host memory, GPU-visible via PCIe)\n\n",
+           (void *)dax);
+
+    /* ── Exp 1: Coord Lock Round-Trip ─────────────────────────────── */
+    printf("Exp 1: Coordination Lock Round-Trip (single thread)\n");
+    kern_lock_rt<<<1, 1>>>(d_lock, WARMUP_ITERS, d_cycles);
+    CK(cudaDeviceSynchronize());
+    dax->coord_lock = 0;
+
+    for (int r = 0; r < reps; r++) {
+        long long cyc;
+        CK(cudaEventRecord(ev0));
+        kern_lock_rt<<<1, 1>>>(d_lock, iters, d_cycles);
+        CK(cudaEventRecord(ev1));
+        CK(cudaDeviceSynchronize());
+        CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost));
+        double ns_op = (double)cyc * ns_per_cycle / iters;
+        double ev_ns = event_ms_to_ns(ev0, ev1);
+        printf("  rep %d: %.1f ns/op (cycles), %.1f ns/op (wall)\n",
+               r + 1, ns_op, ev_ns / iters);
+        tsv_row("gpu_coord_lock", 1, r + 1, "lock_unlock_rt",
+                ns_op, iters, iters / (ev_ns / 1e3), "single_thread");
+        dax->coord_lock = 0;
+    }
+
+    /* ── Exp 2: Commit Seq Volatile Read ──────────────────────────── */
+    printf("\nExp 2: Commit Sequence Read (volatile PCIe read)\n");
+    kern_seq_read<<<1, 1>>>(d_seq, WARMUP_ITERS, d_sink, d_cycles);
+    CK(cudaDeviceSynchronize());
+
+    for (int r = 0; r < reps; r++) {
+        long long cyc;
+        CK(cudaEventRecord(ev0));
+        kern_seq_read<<<1, 1>>>(d_seq, iters, d_sink, d_cycles);
+        CK(cudaEventRecord(ev1));
+        CK(cudaDeviceSynchronize());
+        CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost));
+        double ns_op = (double)cyc * ns_per_cycle / iters;
+        double ev_ns = event_ms_to_ns(ev0, ev1);
+        printf("  rep %d: %.1f ns/op\n", r + 1, ns_op);
+        tsv_row("gpu_commit_seq", 1, r + 1, "volatile_read",
+                ns_op, iters, iters / (ev_ns / 1e3), "single_thread");
+    }
+
+    /* ── Exp 3: Pending Counter Inc/Dec ───────────────────────────── */
+    printf("\nExp 3: Pending Counter CAS Inc/Dec\n");
+    kern_pending<<<1, 1>>>(d_pend, WARMUP_ITERS, d_cycles);
+    CK(cudaDeviceSynchronize());
+    dax->pcache_pending = 0;
+
+    for (int r = 0; r < reps; r++) {
+        long long cyc;
+        CK(cudaEventRecord(ev0));
+        kern_pending<<<1, 1>>>(d_pend, iters, d_cycles);
+        CK(cudaEventRecord(ev1));
+        CK(cudaDeviceSynchronize());
+        CK(cudaMemcpy(&cyc, d_cycles, sizeof(cyc), cudaMemcpyDeviceToHost));
+        double ns_op = (double)cyc * ns_per_cycle / (iters * 2);
+        double ev_ns = event_ms_to_ns(ev0, ev1);
+        printf("  rep %d: %.1f ns/op (CAS pair)\n", r + 1, ns_op);
+        tsv_row("gpu_pending_ctr", 1, r + 1, "cas_inc_dec",
+                ns_op, iters * 2, (iters * 2) / (ev_ns / 1e3), "single_thread");
+        dax->pcache_pending = 0;
+    }
+
+    /* ── Exp 4: Page Cache Lookup Throughput ──────────────────────── */
+    printf("\nExp 4: Page Cache Lookup (scaling threads)\n");
+    {
+        int tcounts[] = {1, 32, 64, 128, 256, 512, 1024};
+        for (int c = 0; c < 7; c++) {
+            int nt = tcounts[c];
+            int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+            for (int r = 0; r < reps; r++) {
+                CK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
+                CK(cudaEventRecord(ev0));
+                kern_lookup<<<blk, tpb>>>(d_slot_base, slot_stride, st_off,
+                                           PCACHE_SLOTS, iters, d_hits);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * iters;
+                double mops = ops / (ev_ns / 1e3);
+                printf("  threads=%4d: %8.3f Mops/s\n", nt, mops);
+                tsv_row("gpu_pcache_lookup", nt, r + 1, "lookup_throughput",
+                        ev_ns / ops, (int)ops, mops, "");
+            }
+        }
+    }
+
+    /* ── Exp 5: Slot CAS Throughput (independent) ─────────────────── */
+    printf("\nExp 5: Slot CAS Throughput (independent slots)\n");
+    {
+        /* Reset slots to FREE for CAS test */
+        for (int i = 0; i < PCACHE_SLOTS; i++)
+            dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+
+        int tcounts[] = {1, 32, 64, 128, 256, 512, 1024};
+        int cas_iters = iters / 10;
+        for (int c = 0; c < 7; c++) {
+            int nt = tcounts[c];
+            int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+            for (int r = 0; r < reps; r++) {
+                CK(cudaEventRecord(ev0));
+                kern_cas_indep<<<blk, tpb>>>(d_slot_base, slot_stride, st_off,
+                                              PCACHE_SLOTS, cas_iters);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * cas_iters;
+                double mops = ops / (ev_ns / 1e3);
+                printf("  threads=%4d: %8.3f Mops/s\n", nt, mops);
+                tsv_row("gpu_slot_cas", nt, r + 1, "cas_throughput",
+                        ev_ns / ops, (int)ops, mops, "independent");
+            }
+        }
+    }
+
+    /* ── Exp 6: Lock Contention ───────────────────────────────────── */
+    printf("\nExp 6: Lock Contention (scaling threads)\n");
+    {
+        int tcounts[] = {1, 2, 4, 8, 16, 32};
+        int lock_iters = iters / 100;
+        for (int c = 0; c < 6; c++) {
+            int nt = tcounts[c];
+            for (int r = 0; r < reps; r++) {
+                dax->coord_lock = 0;
+                CK(cudaMemset(d_counter, 0, sizeof(unsigned int)));
+                CK(cudaEventRecord(ev0));
+                kern_lock_contend<<<1, nt>>>(d_lock, lock_iters, d_counter);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                unsigned int ctr;
+                CK(cudaMemcpy(&ctr, d_counter, sizeof(ctr),
+                              cudaMemcpyDeviceToHost));
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * lock_iters;
+                double ns_acq = ev_ns / ops;
+                printf("  threads=%2d: %8.1f ns/acq  counter=%u/%lld\n",
+                       nt, ns_acq, ctr, ops);
+                char note[64];
+                snprintf(note, sizeof(note), "counter=%u", ctr);
+                tsv_row("gpu_lock_contention", nt, r + 1, "lock_acquisition",
+                        ns_acq, (int)ops, ops / (ev_ns / 1e3), note);
+            }
+        }
+    }
+
+    /* ── Exp 7: Page Cache Claim (FREE→PENDING) ───────────────────── */
+    printf("\nExp 7: Page Cache Claim (FREE->PENDING transition)\n");
+    {
+        int tcounts[] = {1, 32, 64, 128, 256, 512, 1024};
+        int claim_iters = iters / 10;
+        for (int c = 0; c < 7; c++) {
+            int nt = tcounts[c];
+            int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+            /* Reset slots to FREE */
+            for (int i = 0; i < PCACHE_SLOTS; i++)
+                dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+            dax->pcache_pending = 0;
+
+            for (int r = 0; r < reps; r++) {
+                CK(cudaEventRecord(ev0));
+                kern_claim<<<blk, tpb>>>(d_slot_base, slot_stride, st_off,
+                                          PCACHE_SLOTS, claim_iters, d_pend);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * claim_iters;
+                double mops = ops / (ev_ns / 1e3);
+                printf("  threads=%4d: %8.3f Mops/s\n", nt, mops);
+                tsv_row("gpu_pcache_claim", nt, r + 1, "claim_throughput",
+                        ev_ns / ops, (int)ops, mops, "free_to_pending");
+                /* Reset for next rep */
+                for (int i = 0; i < PCACHE_SLOTS; i++)
+                    dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_FREE, 0);
+                dax->pcache_pending = 0;
+            }
+        }
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     * Exp 8: P2P DMA — GPU explicit read/write to CXL host memory
+     * ═══════════════════════════════════════════════════════════════ */
+    printf("\n════════════════════════════════════════════\n");
+    printf("Exp 8: P2P DMA Read/Write (GPU ↔ CXL host memory)\n");
+    {
+        /* Allocate pinned host region simulating CXL memory pages */
+        struct p2p_region *p2p;
+        CK(cudaMallocHost(&p2p, sizeof(struct p2p_region)));
+        memset(p2p->data, 0xAB, P2P_REGION_SIZE);
+
+        /* Device-side scratch buffer for reads */
+        char *d_scratch;
+        CK(cudaMalloc(&d_scratch, P2P_MAX_SIZE));
+
+        unsigned int sizes[] = {64, 256, 4096, 65536, 1 << 20};
+        const char *labels[] = {"64B", "256B", "4KB", "64KB", "1MB"};
+
+        /* 8a: P2P Read (GPU loads from host CXL memory) */
+        printf("  ── P2P Read ──\n");
+        for (int s = 0; s < 5; s++) {
+            unsigned int xsz = sizes[s];
+            int nt = (xsz <= 4096) ? 64 : 1;
+            int p2p_iters = (xsz <= 4096) ? iters : iters / 100;
+            int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+
+            for (int r = 0; r < reps; r++) {
+                CK(cudaEventRecord(ev0));
+                kern_p2p_read<<<blk, tpb>>>(p2p->data, d_scratch,
+                                              xsz, p2p_iters);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long total_bytes = (long long)nt * p2p_iters * xsz;
+                double bw_gbps = total_bytes / (ev_ns / 1.0); /* B/ns = GB/s */
+                double lat_ns = ev_ns / ((long long)nt * p2p_iters);
+                printf("    %5s: %8.2f GB/s  %8.1f ns/op\n",
+                       labels[s], bw_gbps, lat_ns);
+                char note[32];
+                snprintf(note, sizeof(note), "size=%s", labels[s]);
+                tsv_row("gpu_p2p_read", xsz, r + 1, "read_bw",
+                        lat_ns, (int)(total_bytes >> 10), bw_gbps * 1000, note);
+            }
+        }
+
+        /* 8b: P2P Write (GPU stores to host CXL memory) */
+        printf("  ── P2P Write ──\n");
+        for (int s = 0; s < 5; s++) {
+            unsigned int xsz = sizes[s];
+            int nt = (xsz <= 4096) ? 64 : 1;
+            int p2p_iters = (xsz <= 4096) ? iters : iters / 100;
+            int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+
+            for (int r = 0; r < reps; r++) {
+                CK(cudaEventRecord(ev0));
+                kern_p2p_write<<<blk, tpb>>>(p2p->data, xsz, p2p_iters);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long total_bytes = (long long)nt * p2p_iters * xsz;
+                double bw_gbps = total_bytes / (ev_ns / 1.0);
+                double lat_ns = ev_ns / ((long long)nt * p2p_iters);
+                printf("    %5s: %8.2f GB/s  %8.1f ns/op\n",
+                       labels[s], bw_gbps, lat_ns);
+                char note[32];
+                snprintf(note, sizeof(note), "size=%s", labels[s]);
+                tsv_row("gpu_p2p_write", xsz, r + 1, "write_bw",
+                        lat_ns, (int)(total_bytes >> 10), bw_gbps * 1000, note);
+            }
+        }
+
+        cudaFreeHost(p2p);
+        cudaFree(d_scratch);
+    }
+
+    /* ═══════════════════════════════════════════════════════════════
+     * Exp 9: Multi-tenant CXL memory permission-checked access
+     *
+     * Simulates multiple AI agent tenants sharing CXL memory.
+     * GPU explicitly checks per-tenant permission bitmap in shared
+     * DAX region before each read/write — the GPU-side enforcement
+     * of CXL.mem access control.
+     * ═══════════════════════════════════════════════════════════════ */
+    printf("\n════════════════════════════════════════════\n");
+    printf("Exp 9: Multi-tenant CXL Permission-Checked Access\n");
+    {
+        /* Re-init slots to VALID for read tests */
+        for (int i = 0; i < PCACHE_SLOTS; i++)
+            dax->slots[i].state_tag = PCACHE_MAKE(PCACHE_STATE_VALID,
+                                                   (unsigned long long)(i + 1));
+
+        unsigned int *d_allowed, *d_denied;
+        CK(cudaMalloc(&d_allowed, sizeof(unsigned int)));
+        CK(cudaMalloc(&d_denied, sizeof(unsigned int)));
+
+        unsigned char *perm_base = &dax->perm[0][0];
+
+        /* 9a: Permission check overhead — compare with vs without ACL */
+        printf("  ── 9a: ACL Check Overhead (256 threads) ──\n");
+        {
+            int nt = 256, blk = 1, tpb = 256;
+            int mt_iters = iters;
+
+            /* Setup: tenant 0 has READ on all slots */
+            memset(dax->perm[0], MT_PERM_RW, PCACHE_SLOTS);
+
+            /* With ACL */
+            for (int r = 0; r < reps; r++) {
+                CK(cudaMemset(d_allowed, 0, sizeof(unsigned int)));
+                CK(cudaMemset(d_denied, 0, sizeof(unsigned int)));
+                CK(cudaEventRecord(ev0));
+                kern_mt_read<<<blk, tpb>>>(d_slot_base, slot_stride, st_off,
+                                            PCACHE_SLOTS, perm_base,
+                                            0, mt_iters,
+                                            d_allowed, d_denied);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * mt_iters;
+                double mops = ops / (ev_ns / 1e3);
+                printf("    WITH ACL:  %8.3f Mops/s  (%.1f ns/op)\n",
+                       mops, ev_ns / ops);
+                tsv_row("gpu_mt_acl_overhead", nt, r + 1, "read_with_acl",
+                        ev_ns / ops, (int)ops, mops, "all_allowed");
+            }
+
+            /* Without ACL (baseline) */
+            CK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
+            for (int r = 0; r < reps; r++) {
+                CK(cudaMemset(d_hits, 0, sizeof(unsigned int)));
+                CK(cudaEventRecord(ev0));
+                kern_mt_read_noacl<<<blk, tpb>>>(d_slot_base, slot_stride,
+                                                  st_off, PCACHE_SLOTS,
+                                                  mt_iters, d_hits);
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)nt * mt_iters;
+                double mops = ops / (ev_ns / 1e3);
+                printf("    NO ACL:    %8.3f Mops/s  (%.1f ns/op)\n",
+                       mops, ev_ns / ops);
+                tsv_row("gpu_mt_acl_overhead", nt, r + 1, "read_no_acl",
+                        ev_ns / ops, (int)ops, mops, "baseline");
+            }
+        }
+
+        /* 9b: Multi-tenant read scaling (each warp = different tenant) */
+        printf("  ── 9b: Multi-tenant Read (scaling tenants) ──\n");
+        {
+            int tcounts[] = {1, 2, 4, 8, 16, 32};
+            int mt_iters = iters;
+
+            for (int c = 0; c < 6; c++) {
+                int n_tenants = tcounts[c];
+                int threads_per_tenant = 32; /* one warp per tenant */
+                int nt = n_tenants * threads_per_tenant;
+                int blk = (nt + 255) / 256, tpb = nt < 256 ? nt : 256;
+
+                /* Setup: all tenants get READ on their own slot range,
+                 * NONE on others' slots (isolation) */
+                memset(dax->perm, MT_PERM_NONE,
+                       MT_MAX_TENANTS * PCACHE_SLOTS);
+                int slots_per_tenant = PCACHE_SLOTS / n_tenants;
+                for (int t = 0; t < n_tenants; t++) {
+                    int start = t * slots_per_tenant;
+                    memset(&dax->perm[t][start], MT_PERM_READ,
+                           slots_per_tenant);
+                }
+
+                for (int r = 0; r < reps; r++) {
+                    CK(cudaMemset(d_allowed, 0, sizeof(unsigned int)));
+                    CK(cudaMemset(d_denied, 0, sizeof(unsigned int)));
+                    CK(cudaEventRecord(ev0));
+
+                    /* Launch one warp per tenant, each with its tenant_id.
+                     * Use a per-tenant kernel launch to keep it simple. */
+                    for (int t = 0; t < n_tenants; t++) {
+                        kern_mt_read<<<1, threads_per_tenant>>>(
+                            d_slot_base, slot_stride, st_off,
+                            PCACHE_SLOTS, perm_base,
+                            t, mt_iters, d_allowed, d_denied);
+                    }
+
+                    CK(cudaEventRecord(ev1));
+                    CK(cudaDeviceSynchronize());
+
+                    unsigned int h_allowed, h_denied;
+                    CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int),
+                                  cudaMemcpyDeviceToHost));
+                    CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int),
+                                  cudaMemcpyDeviceToHost));
+
+                    double ev_ns = event_ms_to_ns(ev0, ev1);
+                    long long ops = (long long)nt * mt_iters;
+                    double mops = ops / (ev_ns / 1e3);
+
+                    printf("    tenants=%2d: %8.3f Mops/s  allowed=%u "
+                           "denied=%u\n",
+                           n_tenants, mops, h_allowed, h_denied);
+
+                    char note[64];
+                    snprintf(note, sizeof(note),
+                             "allowed=%u,denied=%u", h_allowed, h_denied);
+                    tsv_row("gpu_mt_read", n_tenants, r + 1,
+                            "mt_read_throughput",
+                            ev_ns / ops, (int)ops, mops, note);
+                }
+            }
+        }
+
+        /* 9c: Multi-tenant write with permission check */
+        printf("  ── 9c: Multi-tenant Write (scaling tenants) ──\n");
+        {
+            int tcounts[] = {1, 2, 4, 8, 16, 32};
+            int mt_iters = iters / 10; /* CAS writes are slower */
+
+            for (int c = 0; c < 6; c++) {
+                int n_tenants = tcounts[c];
+                int threads_per_tenant = 32;
+                int nt = n_tenants * threads_per_tenant;
+
+                /* Setup: each tenant gets WRITE on its slot range */
+                memset(dax->perm, MT_PERM_NONE,
+                       MT_MAX_TENANTS * PCACHE_SLOTS);
+                int slots_per_tenant = PCACHE_SLOTS / n_tenants;
+                for (int t = 0; t < n_tenants; t++) {
+                    int start = t * slots_per_tenant;
+                    memset(&dax->perm[t][start], MT_PERM_RW,
+                           slots_per_tenant);
+                }
+
+                /* Reset slots */
+                for (int i = 0; i < PCACHE_SLOTS; i++)
+                    dax->slots[i].state_tag = PCACHE_MAKE(
+                        PCACHE_STATE_VALID, (unsigned long long)(i + 1));
+
+                for (int r = 0; r < reps; r++) {
+                    CK(cudaMemset(d_allowed, 0, sizeof(unsigned int)));
+                    CK(cudaMemset(d_denied, 0, sizeof(unsigned int)));
+                    CK(cudaEventRecord(ev0));
+
+                    for (int t = 0; t < n_tenants; t++) {
+                        kern_mt_write<<<1, threads_per_tenant>>>(
+                            d_slot_base, slot_stride, st_off,
+                            PCACHE_SLOTS, perm_base,
+                            t, mt_iters, d_allowed, d_denied);
+                    }
+
+                    CK(cudaEventRecord(ev1));
+                    CK(cudaDeviceSynchronize());
+
+                    unsigned int h_allowed, h_denied;
+                    CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int),
+                                  cudaMemcpyDeviceToHost));
+                    CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int),
+                                  cudaMemcpyDeviceToHost));
+
+                    double ev_ns = event_ms_to_ns(ev0, ev1);
+                    long long ops = (long long)nt * mt_iters;
+                    double mops = ops / (ev_ns / 1e3);
+
+                    printf("    tenants=%2d: %8.3f Mops/s  allowed=%u "
+                           "denied=%u\n",
+                           n_tenants, mops, h_allowed, h_denied);
+
+                    char note[64];
+                    snprintf(note, sizeof(note),
+                             "allowed=%u,denied=%u", h_allowed, h_denied);
+                    tsv_row("gpu_mt_write", n_tenants, r + 1,
+                            "mt_write_throughput",
+                            ev_ns / ops, (int)ops, mops, note);
+                }
+            }
+        }
+
+        /* 9d: Cross-tenant isolation test — tenant tries to access
+         * another tenant's slots (should all be denied) */
+        printf("  ── 9d: Cross-tenant Isolation (denied access) ──\n");
+        {
+            int n_tenants = 4;
+            int threads_per_tenant = 32;
+            int mt_iters = iters;
+
+            /* Setup: 4 tenants, each owns 1/4 of slots */
+            memset(dax->perm, MT_PERM_NONE,
+                   MT_MAX_TENANTS * PCACHE_SLOTS);
+            int slots_per_tenant = PCACHE_SLOTS / n_tenants;
+            for (int t = 0; t < n_tenants; t++) {
+                int start = t * slots_per_tenant;
+                memset(&dax->perm[t][start], MT_PERM_RW,
+                       slots_per_tenant);
+            }
+
+            for (int r = 0; r < reps; r++) {
+                CK(cudaMemset(d_allowed, 0, sizeof(unsigned int)));
+                CK(cudaMemset(d_denied, 0, sizeof(unsigned int)));
+                CK(cudaEventRecord(ev0));
+
+                /* Tenant 0 tries to read ALL slots (including tenant 1-3's).
+                 * Perm check uses global slot index, so tenant 0 can only
+                 * access its own 1/4 — the rest should be denied. */
+                kern_mt_read<<<1, threads_per_tenant>>>(
+                    d_slot_base,
+                    slot_stride, st_off, PCACHE_SLOTS,
+                    perm_base, 0 /* tenant 0 */, mt_iters,
+                    d_allowed, d_denied);
+
+                CK(cudaEventRecord(ev1));
+                CK(cudaDeviceSynchronize());
+
+                unsigned int h_allowed, h_denied;
+                CK(cudaMemcpy(&h_allowed, d_allowed, sizeof(unsigned int),
+                              cudaMemcpyDeviceToHost));
+                CK(cudaMemcpy(&h_denied, d_denied, sizeof(unsigned int),
+                              cudaMemcpyDeviceToHost));
+
+                double ev_ns = event_ms_to_ns(ev0, ev1);
+                long long ops = (long long)threads_per_tenant * mt_iters;
+                double mops = ops / (ev_ns / 1e3);
+
+                printf("    cross-tenant: %8.3f Mops/s  allowed=%u "
+                       "denied=%u (%s)\n",
+                       mops, h_allowed, h_denied,
+                       h_allowed == 0 ? "ISOLATED" : "VIOLATION!");
+
+                char note[80];
+                snprintf(note, sizeof(note),
+                         "allowed=%u,denied=%u,%s",
+                         h_allowed, h_denied,
+                         h_allowed == 0 ? "isolated" : "violation");
+                tsv_row("gpu_mt_isolation", n_tenants, r + 1,
+                        "cross_tenant_read",
+                        ev_ns / ops, (int)ops, mops, note);
+            }
+        }
+
+        cudaFree(d_allowed);
+        cudaFree(d_denied);
+    }
+
+    /* ── Cleanup ──────────────────────────────────────────────────── */
+    fclose(tsv_fp);
+    cudaEventDestroy(ev0);
+    cudaEventDestroy(ev1);
+    cudaFree(d_cycles);
+    cudaFree(d_sink);
+    cudaFree(d_counter);
+    cudaFree(d_hits);
+    cudaFreeHost(dax);
+
+    printf("\nResults written to %s\n", tsv_path);
+    return 0;
+}
diff --git a/tests/cuda_compat/crt/math_functions.h b/tests/cuda_compat/crt/math_functions.h
new file mode 100644
index 0000000..5bb5adc
--- /dev/null
+++ b/tests/cuda_compat/crt/math_functions.h
@@ -0,0 +1,24 @@
+/* Local wrapper to fix glibc 2.41+ / CUDA rsqrt noexcept conflict */
+#ifndef _DAXFS_MATH_FUNCTIONS_COMPAT_H
+#define _DAXFS_MATH_FUNCTIONS_COMPAT_H
+
+/* Include the real CUDA header first */
+#include_next <crt/math_functions.h>
+
+/* Now suppress the glibc redeclarations by pre-declaring them
+   in a compatible way before glibc's math.h gets pulled in. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Override glibc's rsqrt/rsqrtf declarations to avoid noexcept mismatch.
+   We define them as weak aliases so they don't conflict. */
+#ifdef __GLIBC__
+#define __DAXFS_RSQRT_COMPAT
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tests/plot_bench.py b/tests/plot_bench.py
new file mode 100644
index 0000000..9ac8860
--- /dev/null
+++ b/tests/plot_bench.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""
+DAXFS Benchmark Plotter
+
+Reads agents.tsv and produces a multi-panel figure comparing DAXFS
+against tmpfs and overlayfs baselines.
+"""
+
+import sys
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+
+def load_data(tsv_path):
+    df = pd.read_csv(tsv_path, sep='\t')
+    df['latency_ms'] = df['latency_us'] / 1000.0
+    return df
+
+def mean_latency(df, experiment, operation):
+    sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)]
+    return sub.groupby('parameter')['latency_ms'].agg(['mean', 'std']).reset_index()
+
+def plot_agent_scalability(ax, df):
+    """Panel 1: Branch creation time (total) vs N agents — DAXFS vs tmpfs vs overlayfs"""
+    for exp, label, color, marker in [
+        ('scale_agents', 'DAXFS', '#2196F3', 'o'),
+        ('baseline_tmpfs', 'tmpfs (cp -a)', '#FF5722', 's'),
+        ('baseline_overlayfs', 'OverlayFS', '#4CAF50', '^'),
+    ]:
+        d = mean_latency(df, exp, 'branch_create_total')
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Number of Agents (N)')
+    ax.set_ylabel('Total Branch Creation (ms)')
+    ax.set_title('Branch Creation Cost vs Agent Count')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_parallel_workload(ax, df):
+    """Panel 2: Parallel workload time vs N agents"""
+    for exp, label, color, marker in [
+        ('scale_agents', 'DAXFS', '#2196F3', 'o'),
+        ('baseline_tmpfs', 'tmpfs', '#FF5722', 's'),
+        ('baseline_overlayfs', 'OverlayFS', '#4CAF50', '^'),
+    ]:
+        d = mean_latency(df, exp, 'parallel_workload')
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Number of Agents (N)')
+    ax.set_ylabel('Parallel Workload Time (ms)')
+    ax.set_title('Parallel Agent Workload Scalability')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_commit_time(ax, df):
+    """Panel 3: Commit time vs N agents"""
+    for exp, label, color, marker in [
+        ('scale_agents', 'DAXFS', '#2196F3', 'o'),
+        ('baseline_tmpfs', 'tmpfs (cp -a)', '#FF5722', 's'),
+        ('baseline_overlayfs', 'OverlayFS (cp upper)', '#4CAF50', '^'),
+    ]:
+        d = mean_latency(df, exp, 'commit')
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Number of Agents (N)')
+    ax.set_ylabel('Commit Time (ms)')
+    ax.set_title('Commit / Merge Cost')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_branch_create_avg(ax, df):
+    """Panel 4: Per-branch creation cost (avg) vs N — shows O(1) vs O(tree_size)"""
+    for exp, label, color, marker in [
+        ('scale_agents', 'DAXFS (O(1) CoW)', '#2196F3', 'o'),
+        ('baseline_tmpfs', 'tmpfs (O(n) copy)', '#FF5722', 's'),
+        ('baseline_overlayfs', 'OverlayFS (mount)', '#4CAF50', '^'),
+    ]:
+        d = mean_latency(df, exp, 'branch_create_avg')
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Number of Agents (N)')
+    ax.set_ylabel('Per-Branch Creation (ms)')
+    ax.set_title('Per-Branch Creation Cost (Average)')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_nesting_depth(ax, df):
+    """Panel 5: Chain creation + commit time vs nesting depth"""
+    for op, label, color, marker in [
+        ('chain_create', 'Chain Create', '#2196F3', 'o'),
+        ('commit', 'Commit (walks chain)', '#E91E63', 'D'),
+        ('workload', 'Workload at Deepest', '#9E9E9E', 'x'),
+    ]:
+        d = mean_latency(df, 'scale_depth', op)
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Nesting Depth (D)')
+    ax.set_ylabel('Latency (ms)')
+    ax.set_title('Speculation Nesting Depth')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_commit_cost(ax, df):
+    """Panel 6: Commit cost vs number of write operations"""
+    for op, label, color, marker in [
+        ('commit', 'Commit Time', '#E91E63', 'D'),
+        ('workload', 'Workload Time', '#9E9E9E', 'x'),
+    ]:
+        d = mean_latency(df, 'commit_cost', op)
+        if len(d) == 0:
+            continue
+        ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                     label=label, color=color, marker=marker,
+                     capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Number of Write Operations')
+    ax.set_ylabel('Latency (ms)')
+    ax.set_title('Commit Cost vs Delta Size')
+    ax.set_xscale('log')
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def main():
+    tsv_path = sys.argv[1] if len(sys.argv) > 1 else './bench_results/agents.tsv'
+    out_path = sys.argv[2] if len(sys.argv) > 2 else './bench_results/daxfs_bench.png'
+
+    df = load_data(tsv_path)
+    print(f"Loaded {len(df)} rows from {tsv_path}")
+    print(f"Experiments: {df['experiment'].unique()}")
+
+    fig, axes = plt.subplots(2, 3, figsize=(18, 11))
+    fig.suptitle('DAXFS AI Agent Speculative Branching Benchmark', fontsize=16, fontweight='bold')
+
+    plot_agent_scalability(axes[0, 0], df)
+    plot_branch_create_avg(axes[0, 1], df)
+    plot_parallel_workload(axes[0, 2], df)
+    plot_commit_time(axes[1, 0], df)
+    plot_nesting_depth(axes[1, 1], df)
+    plot_commit_cost(axes[1, 2], df)
+
+    plt.tight_layout(rect=[0, 0, 1, 0.95])
+    fig.savefig(out_path, dpi=150, bbox_inches='tight')
+    print(f"Saved figure to {out_path}")
+
+    # Also save individual plots for paper use
+    for name, plot_fn in [
+        ('branch_creation', plot_agent_scalability),
+        ('branch_avg', plot_branch_create_avg),
+        ('parallel_workload', plot_parallel_workload),
+        ('commit_time', plot_commit_time),
+        ('nesting_depth', plot_nesting_depth),
+        ('commit_cost', plot_commit_cost),
+    ]:
+        fig2, ax2 = plt.subplots(figsize=(7, 5))
+        plot_fn(ax2, df)
+        fig2.tight_layout()
+        p = out_path.replace('.png', f'_{name}.png')
+        fig2.savefig(p, dpi=150, bbox_inches='tight')
+        plt.close(fig2)
+        print(f"  -> {p}")
+
+    plt.close(fig)
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/plot_gpu.py b/tests/plot_gpu.py
new file mode 100644
index 0000000..97e696e
--- /dev/null
+++ b/tests/plot_gpu.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""
+DAXFS GPU Coordination Benchmark Plotter
+
+Reads gpu.tsv and produces multi-panel figures showing PCIe AtomicOp
+latency/throughput, P2P DMA bandwidth, and multi-tenant CXL permission
+check overhead.
+"""
+
+import sys
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+
+def load_data(path):
+    df = pd.read_csv(path, sep='\t')
+    return df
+
+def mean_by_param(df, experiment, operation, value_col='latency_ns'):
+    sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)]
+    return sub.groupby('parameter')[value_col].agg(['mean', 'std']).reset_index()
+
+def mean_by_param_mops(df, experiment, operation):
+    sub = df[(df['experiment'] == experiment) & (df['operation'] == operation)]
+    return sub.groupby('parameter')['thru_mops'].agg(['mean', 'std']).reset_index()
+
+# ── Original 6 panels ────────────────────────────────────────────────
+
+def plot_latency_bars(ax, df):
+    primitives = [
+        ('gpu_commit_seq', 'volatile_read', 'PCIe Read\n(commit_seq)', '#2196F3'),
+        ('gpu_pending_ctr', 'cas_inc_dec', 'CAS Inc/Dec\n(pending_ctr)', '#FF9800'),
+        ('gpu_coord_lock', 'lock_unlock_rt', 'Lock+Unlock\n(coord_lock)', '#F44336'),
+    ]
+    names, means, stds, colors = [], [], [], []
+    for exp, op, name, color in primitives:
+        sub = df[(df['experiment'] == exp) & (df['operation'] == op)]
+        if len(sub) == 0: continue
+        names.append(name); means.append(sub['latency_ns'].mean())
+        stds.append(sub['latency_ns'].std()); colors.append(color)
+    bars = ax.bar(names, means, yerr=stds, color=colors, capsize=5,
+                  edgecolor='black', linewidth=0.5)
+    ax.set_ylabel('Latency (ns)')
+    ax.set_title('PCIe Atomic Primitive Latency\n(single GPU thread)')
+    ax.grid(True, alpha=0.3, axis='y')
+    for bar, m in zip(bars, means):
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 30,
+                f'{m:.0f} ns', ha='center', va='bottom', fontsize=9, fontweight='bold')
+
+def plot_lookup_throughput(ax, df):
+    d = mean_by_param_mops(df, 'gpu_pcache_lookup', 'lookup_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7)
+    base = d['mean'].iloc[0]
+    ax.plot(d['parameter'], d['parameter'] * base, '--', color='gray',
+            alpha=0.5, label='Ideal linear')
+    ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('Page Cache Lookup Throughput')
+    ax.set_xscale('log', base=2); ax.set_yscale('log', base=10)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9); ax.grid(True, alpha=0.3)
+
+def plot_cas_throughput(ax, df):
+    d = mean_by_param_mops(df, 'gpu_slot_cas', 'cas_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#E91E63', marker='D', capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('Slot CAS Throughput\n(independent slots, PCIe atomics)')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.grid(True, alpha=0.3)
+    if len(d) >= 4:
+        plateau = d['mean'].iloc[-3:].mean()
+        ax.axhline(y=plateau, color='gray', linestyle=':', alpha=0.5)
+        ax.text(d['parameter'].iloc[-1], plateau * 1.1,
+                f'PCIe limit: {plateau:.1f} Mops/s',
+                ha='right', fontsize=8, color='gray')
+
+def plot_lock_contention(ax, df):
+    d = mean_by_param(df, 'gpu_lock_contention', 'lock_acquisition')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#F44336', marker='s', capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('Contending GPU Threads')
+    ax.set_ylabel('Time per Acquisition (ns)')
+    ax.set_title('Coordination Lock Contention')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.grid(True, alpha=0.3)
+
+def plot_claim_throughput(ax, df):
+    d = mean_by_param_mops(df, 'gpu_pcache_claim', 'claim_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#4CAF50', marker='^', capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('GPU Threads'); ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('Page Cache Claim\n(FREE->PENDING + pending counter)')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.grid(True, alpha=0.3)
+
+def plot_lookup_latency(ax, df):
+    d = mean_by_param(df, 'gpu_pcache_lookup', 'lookup_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7)
+    ax.set_xlabel('GPU Threads'); ax.set_ylabel('Per-Op Latency (ns)')
+    ax.set_title('Page Cache Lookup Latency')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.grid(True, alpha=0.3)
+
+# ── New panels: P2P DMA & Multi-tenant ────────────────────────────────
+
+def plot_p2p_bandwidth(ax, df):
+    """P2P DMA read/write bandwidth vs transfer size"""
+    sizes = [64, 256, 4096, 65536, 1 << 20]
+    labels = ['64B', '256B', '4KB', '64KB', '1MB']
+
+    for exp, label, color, marker in [
+        ('gpu_p2p_read', 'GPU Read from CXL', '#2196F3', 'o'),
+        ('gpu_p2p_write', 'GPU Write to CXL', '#FF5722', 's'),
+    ]:
+        sub = df[df['experiment'] == exp]
+        if len(sub) == 0: continue
+        # thru_mops is actually GB/s * 1000 in this context
+        means, stds_v = [], []
+        for sz in sizes:
+            s = sub[sub['parameter'] == sz]
+            if len(s) == 0: continue
+            # Compute GB/s from latency and size
+            lat = s['latency_ns'].mean()
+            bw = sz / lat  # bytes/ns = GB/s
+            means.append(bw)
+            stds_v.append(0)
+        if means:
+            ax.plot(range(len(means)), means, marker=marker, color=color,
+                    linewidth=2, markersize=7, label=label)
+
+    ax.set_xticks(range(len(labels)))
+    ax.set_xticklabels(labels)
+    ax.set_xlabel('Transfer Size')
+    ax.set_ylabel('Bandwidth (GB/s)')
+    ax.set_title('P2P DMA Bandwidth\n(GPU <-> CXL Host Memory)')
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_p2p_latency(ax, df):
+    """P2P DMA read/write latency vs transfer size"""
+    sizes = [64, 256, 4096, 65536, 1 << 20]
+    labels = ['64B', '256B', '4KB', '64KB', '1MB']
+
+    for exp, label, color, marker in [
+        ('gpu_p2p_read', 'GPU Read', '#2196F3', 'o'),
+        ('gpu_p2p_write', 'GPU Write', '#FF5722', 's'),
+    ]:
+        sub = df[df['experiment'] == exp]
+        if len(sub) == 0: continue
+        means = []
+        for sz in sizes:
+            s = sub[sub['parameter'] == sz]
+            if len(s) == 0: continue
+            means.append(s['latency_ns'].mean() / 1e3)  # us
+        if means:
+            ax.plot(range(len(means)), means, marker=marker, color=color,
+                    linewidth=2, markersize=7, label=label)
+
+    ax.set_xticks(range(len(labels)))
+    ax.set_xticklabels(labels)
+    ax.set_xlabel('Transfer Size')
+    ax.set_ylabel('Latency (us)')
+    ax.set_yscale('log')
+    ax.set_title('P2P DMA Latency\n(GPU <-> CXL Host Memory)')
+    ax.legend(fontsize=9)
+    ax.grid(True, alpha=0.3)
+
+def plot_mt_acl_overhead(ax, df):
+    """Multi-tenant: ACL check overhead vs no-check baseline"""
+    sub_acl = df[(df['experiment'] == 'gpu_mt_acl_overhead') &
+                 (df['operation'] == 'read_with_acl')]
+    sub_noacl = df[(df['experiment'] == 'gpu_mt_acl_overhead') &
+                   (df['operation'] == 'read_no_acl')]
+
+    names = ['With ACL\n(perm check + read)', 'Without ACL\n(read only)']
+    means = [sub_acl['thru_mops'].mean() if len(sub_acl) else 0,
+             sub_noacl['thru_mops'].mean() if len(sub_noacl) else 0]
+    stds = [sub_acl['thru_mops'].std() if len(sub_acl) else 0,
+            sub_noacl['thru_mops'].std() if len(sub_noacl) else 0]
+    colors = ['#4CAF50', '#9E9E9E']
+
+    bars = ax.bar(names, means, yerr=stds, color=colors, capsize=5,
+                  edgecolor='black', linewidth=0.5)
+    ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('CXL ACL Permission Check Overhead\n(256 threads, all-allowed)')
+    ax.grid(True, alpha=0.3, axis='y')
+    for bar, m in zip(bars, means):
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
+                f'{m:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')
+
+def plot_mt_read_tenants(ax, df):
+    """Multi-tenant read throughput vs tenant count"""
+    d = mean_by_param_mops(df, 'gpu_mt_read', 'mt_read_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#2196F3', marker='o', capsize=3, linewidth=2, markersize=7,
+                label='Read (perm-checked)')
+    ax.set_xlabel('Number of Tenants')
+    ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('Multi-tenant Read Throughput\n(each tenant: 32 GPU threads)')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9); ax.grid(True, alpha=0.3)
+
+def plot_mt_write_tenants(ax, df):
+    """Multi-tenant write throughput vs tenant count"""
+    d = mean_by_param_mops(df, 'gpu_mt_write', 'mt_write_throughput')
+    if len(d) == 0: return
+    ax.errorbar(d['parameter'], d['mean'], yerr=d['std'],
+                color='#FF5722', marker='s', capsize=3, linewidth=2, markersize=7,
+                label='Write (perm-checked CAS)')
+    ax.set_xlabel('Number of Tenants')
+    ax.set_ylabel('Throughput (Mops/s)')
+    ax.set_title('Multi-tenant Write Throughput\n(each tenant: 32 GPU threads)')
+    ax.set_xscale('log', base=2)
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.legend(fontsize=9); ax.grid(True, alpha=0.3)
+
+def plot_mt_isolation(ax, df):
+    """Cross-tenant isolation: allowed vs denied pie/bar"""
+    sub = df[(df['experiment'] == 'gpu_mt_isolation')]
+    if len(sub) == 0: return
+
+    # Parse allowed/denied from notes
+    allowed_total, denied_total = 0, 0
+    for _, row in sub.iterrows():
+        notes = str(row.get('notes', ''))
+        for part in notes.split(','):
+            if part.startswith('allowed='):
+                allowed_total += int(part.split('=')[1])
+            elif part.startswith('denied='):
+                denied_total += int(part.split('=')[1])
+
+    if allowed_total + denied_total == 0: return
+    total = allowed_total + denied_total
+    pct_allowed = 100.0 * allowed_total / total
+    pct_denied = 100.0 * denied_total / total
+
+    colors = ['#F44336', '#4CAF50']
+    wedges, texts, autotexts = ax.pie(
+        [allowed_total, denied_total],
+        labels=[f'Own Slots\n(allowed)', f'Other Tenants\n(denied)'],
+        colors=colors, autopct='%1.1f%%', startangle=90,
+        textprops={'fontsize': 10})
+    ax.set_title(f'Cross-Tenant Isolation (4 tenants)\n'
+                 f'Tenant 0 accessing all {total//len(sub)} slots/rep')
+
+def main():
+    tsv = sys.argv[1] if len(sys.argv) > 1 else './bench_results/gpu.tsv'
+    out = sys.argv[2] if len(sys.argv) > 2 else './bench_results/daxfs_gpu_bench.png'
+
+    df = load_data(tsv)
+    print(f"Loaded {len(df)} rows from {tsv}")
+    print(f"Experiments: {df['experiment'].unique()}")
+
+    # ── Figure 1: Core PCIe atomics (6 panels) ──
+    fig1, axes1 = plt.subplots(2, 3, figsize=(18, 11))
+    fig1.suptitle('DAXFS GPU PCIe AtomicOp Coordination Benchmark\n'
+                  '(RTX 5090, pinned host memory, PCIe 5.0)',
+                  fontsize=15, fontweight='bold')
+    plot_latency_bars(axes1[0, 0], df)
+    plot_lookup_throughput(axes1[0, 1], df)
+    plot_cas_throughput(axes1[0, 2], df)
+    plot_lock_contention(axes1[1, 0], df)
+    plot_claim_throughput(axes1[1, 1], df)
+    plot_lookup_latency(axes1[1, 2], df)
+    plt.tight_layout(rect=[0, 0, 1, 0.93])
+    fig1.savefig(out, dpi=150, bbox_inches='tight')
+    print(f"Saved: {out}")
+    plt.close(fig1)
+
+    # ── Figure 2: P2P DMA + Multi-tenant (6 panels) ──
+    out2 = out.replace('.png', '_p2p_mt.png')
+    fig2, axes2 = plt.subplots(2, 3, figsize=(18, 11))
+    fig2.suptitle('DAXFS GPU P2P DMA & Multi-tenant CXL Memory Benchmark\n'
+                  '(RTX 5090, PCIe 5.0, shared CXL memory simulation)',
+                  fontsize=15, fontweight='bold')
+    plot_p2p_bandwidth(axes2[0, 0], df)
+    plot_p2p_latency(axes2[0, 1], df)
+    plot_mt_acl_overhead(axes2[0, 2], df)
+    plot_mt_read_tenants(axes2[1, 0], df)
+    plot_mt_write_tenants(axes2[1, 1], df)
+    plot_mt_isolation(axes2[1, 2], df)
+    plt.tight_layout(rect=[0, 0, 1, 0.93])
+    fig2.savefig(out2, dpi=150, bbox_inches='tight')
+    print(f"Saved: {out2}")
+    plt.close(fig2)
+
+    # ── Individual plots ──
+    all_plots = [
+        ('latency_bars', plot_latency_bars),
+        ('lookup_throughput', plot_lookup_throughput),
+        ('cas_throughput', plot_cas_throughput),
+        ('lock_contention', plot_lock_contention),
+        ('claim_throughput', plot_claim_throughput),
+        ('lookup_latency', plot_lookup_latency),
+        ('p2p_bandwidth', plot_p2p_bandwidth),
+        ('p2p_latency', plot_p2p_latency),
+        ('mt_acl_overhead', plot_mt_acl_overhead),
+        ('mt_read_tenants', plot_mt_read_tenants),
+        ('mt_write_tenants', plot_mt_write_tenants),
+        ('mt_isolation', plot_mt_isolation),
+    ]
+    for name, fn in all_plots:
+        f, a = plt.subplots(figsize=(7, 5))
+        fn(a, df)
+        f.tight_layout()
+        p = out.replace('.png', f'_{name}.png')
+        f.savefig(p, dpi=150, bbox_inches='tight')
+        plt.close(f)
+        print(f"  -> {p}")
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/Makefile b/tools/Makefile
index f4335bb..79c5435 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -14,10 +14,23 @@ mkdaxfs: mkdaxfs.c
 daxfs-inspect: daxfs-inspect.c
 	$(CC) $(CFLAGS) -o $@ $<
 
+# Optional CUDA GPU mapping library
+NVCC ?= nvcc
+CUDA_PATH ?= /usr/local/cuda
+CUDA_CFLAGS = -I../include -I$(CUDA_PATH)/include
+CUDA_LDFLAGS = -L$(CUDA_PATH)/lib64 -lcuda
+
+GPU_TARGETS = daxfs-gpu-map.o
+
+gpu: $(GPU_TARGETS)
+
+daxfs-gpu-map.o: daxfs-gpu-map.c daxfs-gpu-map.h ../include/daxfs_format.h
+	$(CC) $(CFLAGS) $(CUDA_CFLAGS) -c -o $@ $<
+
 clean:
-	rm -f $(TARGETS)
+	rm -f $(TARGETS) $(GPU_TARGETS)
 
 install: $(TARGETS)
 	install -m 755 $(TARGETS) /usr/local/bin/
 
-.PHONY: all clean install
+.PHONY: all clean install gpu
diff --git a/tools/daxfs-gpu-map.c b/tools/daxfs-gpu-map.c
new file mode 100644
index 0000000..bd1dda5
--- /dev/null
+++ b/tools/daxfs-gpu-map.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * daxfs GPU mapping helper
+ *
+ * Maps a daxfs DAX region into GPU address space for PCIe AtomicOps.
+ *
+ * Two mapping paths:
+ *   1. dma-buf (preferred): DAXFS_IOC_GET_DMABUF -> cuImportExternalMemory
+ *   2. /dev/mem fallback:   phys_addr from ioctl -> mmap + cuMemHostRegister
+ *
+ * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <cuda.h>
+
+#include "daxfs-gpu-map.h"
+
+/* Compute convenience device pointers from base + ioctl offsets */
+static void compute_dev_ptrs(struct daxfs_gpu_map *gpu)
+{
+	CUdeviceptr b = gpu->base;
+	const struct daxfs_gpu_info *gi = &gpu->info;
+
+	if (gi->coord_offset) {
+		gpu->coord_lock = b + gi->coord_offset + gi->coord_lock_off;
+		gpu->commit_seq = b + gi->coord_offset + gi->commit_seq_off;
+	}
+
+	if (gi->pcache_offset) {
+		gpu->pcache_slots = b + gi->pcache_slots_offset;
+		gpu->pcache_data  = b + gi->pcache_data_offset;
+		gpu->pending_count = b + gi->pcache_offset +
+				     gi->pending_count_off;
+	}
+}
+
+/* Primary path: import dma-buf as CUDA external memory */
+static int map_dmabuf(struct daxfs_gpu_map *gpu)
+{
+	int dmabuf_fd;
+	CUresult res;
+	CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc;
+	CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc;
+
+	dmabuf_fd = ioctl(gpu->mount_fd, DAXFS_IOC_GET_DMABUF);
+	if (dmabuf_fd < 0)
+		return -1;	/* not a dma-buf mount */
+	gpu->dmabuf_fd = dmabuf_fd;
+
+	memset(&ext_desc, 0, sizeof(ext_desc));
+	ext_desc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
+	ext_desc.handle.fd = dmabuf_fd;
+	ext_desc.size = gpu->info.dax_size;
+
+	res = cuImportExternalMemory(&gpu->ext_mem, &ext_desc);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr, "daxfs-gpu-map: cuImportExternalMemory: %d\n",
+			(int)res);
+		close(dmabuf_fd);
+		gpu->dmabuf_fd = -1;
+		return -1;
+	}
+
+	/*
+	 * cuImportExternalMemory with OPAQUE_FD takes ownership of the fd,
+	 * so we must not close it ourselves.
+	 */
+	gpu->dmabuf_fd = -1;
+
+	memset(&buf_desc, 0, sizeof(buf_desc));
+	buf_desc.offset = 0;
+	buf_desc.size = gpu->info.dax_size;
+
+	res = cuExternalMemoryGetMappedBuffer(&gpu->base, gpu->ext_mem,
+					      &buf_desc);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuExternalMemoryGetMappedBuffer: %d\n",
+			(int)res);
+		cuDestroyExternalMemory(gpu->ext_mem);
+		gpu->ext_mem = NULL;
+		return -1;
+	}
+
+	gpu->size = gpu->info.dax_size;
+	return 0;
+}
+
+/* Fallback path: /dev/mem mmap + cuMemHostRegister */
+static int map_devmem(struct daxfs_gpu_map *gpu)
+{
+	int fd;
+	void *addr;
+	CUresult res;
+
+	if (!gpu->info.dax_phys_addr) {
+		fprintf(stderr,
+			"daxfs-gpu-map: no physical address available\n");
+		errno = ENOTSUP;
+		return -1;
+	}
+
+	fd = open("/dev/mem", O_RDWR | O_SYNC);
+	if (fd < 0) {
+		perror("daxfs-gpu-map: open /dev/mem");
+		return -1;
+	}
+
+	addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, (off_t)gpu->info.dax_phys_addr);
+	close(fd);
+
+	if (addr == MAP_FAILED) {
+		perror("daxfs-gpu-map: mmap /dev/mem");
+		return -1;
+	}
+	gpu->host_mmap = addr;
+
+	res = cuMemHostRegister(addr, gpu->info.dax_size,
+				CU_MEMHOSTREGISTER_DEVICEMAP);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuMemHostRegister: %d\n", (int)res);
+		munmap(addr, gpu->info.dax_size);
+		gpu->host_mmap = NULL;
+		return -1;
+	}
+
+	res = cuMemHostGetDevicePointer(&gpu->base, addr, 0);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuMemHostGetDevicePointer: %d\n",
+			(int)res);
+		cuMemHostUnregister(addr);
+		munmap(addr, gpu->info.dax_size);
+		gpu->host_mmap = NULL;
+		return -1;
+	}
+
+	gpu->size = gpu->info.dax_size;
+	return 0;
+}
+
+/* Fallback path 2: mmap dma-buf fd + cuMemHostRegister */
+static int map_dmabuf_mmap(struct daxfs_gpu_map *gpu)
+{
+	int dmabuf_fd;
+	void *addr;
+	CUresult res;
+
+	dmabuf_fd = ioctl(gpu->mount_fd, DAXFS_IOC_GET_DMABUF);
+	if (dmabuf_fd < 0)
+		return -1;
+
+	fprintf(stderr, "daxfs-gpu-map: dmabuf_mmap: fd=%d size=%llu\n",
+		dmabuf_fd, (unsigned long long)gpu->info.dax_size);
+
+	addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, dmabuf_fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("daxfs-gpu-map: mmap dma-buf fd");
+		close(dmabuf_fd);
+		return -1;
+	}
+
+	fprintf(stderr, "daxfs-gpu-map: dmabuf mmap'd at %p\n", addr);
+
+	gpu->dmabuf_fd = dmabuf_fd;
+	gpu->host_mmap = addr;
+
+	res = cuMemHostRegister(addr, gpu->info.dax_size,
+				CU_MEMHOSTREGISTER_DEVICEMAP |
+				CU_MEMHOSTREGISTER_IOMEMORY);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuMemHostRegister(dmabuf, IOMEMORY): "
+			"%d, trying without IOMEMORY...\n", (int)res);
+		res = cuMemHostRegister(addr, gpu->info.dax_size,
+					CU_MEMHOSTREGISTER_DEVICEMAP);
+	}
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuMemHostRegister(dmabuf): %d\n",
+			(int)res);
+		munmap(addr, gpu->info.dax_size);
+		close(dmabuf_fd);
+		gpu->host_mmap = NULL;
+		gpu->dmabuf_fd = -1;
+		return -1;
+	}
+
+	res = cuMemHostGetDevicePointer(&gpu->base, addr, 0);
+	if (res != CUDA_SUCCESS) {
+		fprintf(stderr,
+			"daxfs-gpu-map: cuMemHostGetDevicePointer(dmabuf): "
+			"%d\n", (int)res);
+		cuMemHostUnregister(addr);
+		munmap(addr, gpu->info.dax_size);
+		close(dmabuf_fd);
+		gpu->host_mmap = NULL;
+		gpu->dmabuf_fd = -1;
+		return -1;
+	}
+
+	gpu->size = gpu->info.dax_size;
+	return 0;
+}
+
+int daxfs_gpu_map(int mount_fd, struct daxfs_gpu_map *gpu)
+{
+	memset(gpu, 0, sizeof(*gpu));
+	gpu->mount_fd = mount_fd;
+	gpu->dmabuf_fd = -1;
+
+	/* Step 1: get layout from kernel */
+	if (ioctl(mount_fd, DAXFS_IOC_GET_GPU_INFO, &gpu->info) < 0) {
+		perror("daxfs-gpu-map: DAXFS_IOC_GET_GPU_INFO");
+		return -1;
+	}
+
+	if (gpu->info.dax_size == 0) {
+		fprintf(stderr, "daxfs-gpu-map: zero-length DAX region\n");
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Step 2: map into GPU - try dma-buf import, mmap+register, /dev/mem */
+	if (map_dmabuf(gpu) < 0 && map_dmabuf_mmap(gpu) < 0 &&
+	    map_devmem(gpu) < 0)
+		return -1;
+
+	/* Step 3: compute convenience pointers */
+	compute_dev_ptrs(gpu);
+
+	return 0;
+}
+
+void daxfs_gpu_unmap(struct daxfs_gpu_map *gpu)
+{
+	if (!gpu)
+		return;
+
+	if (gpu->ext_mem) {
+		/* dma-buf path */
+		if (gpu->base)
+			cuMemFree(gpu->base);
+		cuDestroyExternalMemory(gpu->ext_mem);
+		gpu->ext_mem = NULL;
+	} else if (gpu->host_mmap) {
+		/* /dev/mem path */
+		cuMemHostUnregister(gpu->host_mmap);
+		munmap(gpu->host_mmap, gpu->size);
+		gpu->host_mmap = NULL;
+	}
+
+	if (gpu->dmabuf_fd >= 0) {
+		close(gpu->dmabuf_fd);
+		gpu->dmabuf_fd = -1;
+	}
+
+	gpu->base = 0;
+	gpu->size = 0;
+}
diff --git a/tools/daxfs-gpu-map.h b/tools/daxfs-gpu-map.h
new file mode 100644
index 0000000..9a9c7fc
--- /dev/null
+++ b/tools/daxfs-gpu-map.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * daxfs GPU mapping helper - public API
+ *
+ * Maps a daxfs DAX region into GPU address space so that CUDA kernels
+ * can issue PCIe AtomicOps (CAS TLPs) against the coordination lock
+ * and shared page cache.
+ *
+ * Primary path : dma-buf fd  -> cuImportExternalMemory (OPAQUE_FD)
+ * Fallback path: /dev/mem    -> cuMemHostRegister (phys= mounts)
+ *
+ * Copyright (C) 2026 Multikernel Technologies, Inc. All rights reserved.
+ */
+#ifndef _DAXFS_GPU_MAP_H
+#define _DAXFS_GPU_MAP_H
+
+#include <cuda.h>
+#include <stdint.h>
+#include "daxfs_format.h"
+
+struct daxfs_gpu_map {
+	/* GPU mapping */
+	CUdeviceptr			base;		/* GPU pointer to DAX base */
+	CUexternalMemory		ext_mem;	/* Handle (dma-buf path) */
+	size_t				size;		/* Total mapped size */
+
+	/* Layout from ioctl */
+	struct daxfs_gpu_info		info;
+
+	/* Convenience device pointers (computed from base + offsets) */
+	CUdeviceptr			coord_lock;	/* -> coord_lock field */
+	CUdeviceptr			commit_seq;	/* -> commit_sequence */
+	CUdeviceptr			pcache_slots;	/* -> slot metadata */
+	CUdeviceptr			pcache_data;	/* -> slot data area */
+	CUdeviceptr			pending_count;	/* -> pending_count */
+
+	/* Internal state */
+	int				mount_fd;	/* fd used for ioctl */
+	int				dmabuf_fd;	/* dma-buf fd (-1 if none) */
+	void				*host_mmap;	/* mmap addr (/dev/mem path) */
+};
+
+/*
+ * Map the daxfs DAX region into GPU address space.
+ *
+ * @mount_fd: open fd on any file within the daxfs mount
+ * @gpu:      output structure (zeroed by caller)
+ *
+ * Returns 0 on success, -1 on failure (errno set).
+ * On success the caller must eventually call daxfs_gpu_unmap().
+ */
+int daxfs_gpu_map(int mount_fd, struct daxfs_gpu_map *gpu);
+
+/*
+ * Unmap and release all GPU and host resources.
+ */
+void daxfs_gpu_unmap(struct daxfs_gpu_map *gpu);
+
+/*
+ * Convenience: return a device pointer to a specific pcache slot's
+ * state_tag field.
+ */
+static inline CUdeviceptr
+daxfs_gpu_slot_state_tag(const struct daxfs_gpu_map *gpu, uint32_t slot_idx)
+{
+	return gpu->pcache_slots +
+	       (CUdeviceptr)slot_idx * gpu->info.pcache_slot_stride +
+	       gpu->info.state_tag_off;
+}
+
+/*
+ * Convenience: return a device pointer to a specific pcache slot's
+ * data page.
+ */
+static inline CUdeviceptr
+daxfs_gpu_slot_data(const struct daxfs_gpu_map *gpu, uint32_t slot_idx)
+{
+	return gpu->pcache_data + (CUdeviceptr)slot_idx * 4096;
+}
+
+#endif /* _DAXFS_GPU_MAP_H */