From 240185dabc7e84e771447f0006eddded4197f0f7 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 9 Jan 2024 15:03:06 +0100
Subject: [PATCH 01/25] Add bench cache misses

---
 Cargo.lock                                    | 10 ++++
 polkadot/node/subsystem-bench/Cargo.toml      |  3 +
 .../subsystem-bench/src/subsystem-bench.rs    | 56 ++++++++++++++++++-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2e2e1dfdb0e2..090458c63080 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3269,6 +3269,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "crabgrind"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e174279a2a6598ba9a1fec494ebb4172916b95e69d7d2396598c6a7b850d048"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "cranelift-bforest"
 version = "0.95.1"
@@ -13557,6 +13566,7 @@ dependencies = [
  "clap-num",
  "color-eyre",
  "colored",
+ "crabgrind",
  "env_logger 0.9.3",
  "futures",
  "futures-timer",
diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
index 73405a3bc3be..86fcc29cc586 100644
--- a/polkadot/node/subsystem-bench/Cargo.toml
+++ b/polkadot/node/subsystem-bench/Cargo.toml
@@ -59,5 +59,8 @@ orchestra = { version = "0.3.3", default-features = false, features = ["futures_
 pyroscope = "0.5.7"
 pyroscope_pprofrs = "0.2.7"
 
+[target.'cfg(target_os = "linux")'.dependencies]
+crabgrind = "0.1.10"
+
 [features]
 default = []
diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 29b62b27855a..40450eb35dc6 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -16,6 +16,7 @@
 
 //! A tool for running subsystem benchmark tests designed for development and
 //! CI regression testing.
+
 use clap::Parser;
 use color_eyre::eyre;
 use pyroscope::PyroscopeAgent;
@@ -90,12 +91,21 @@ struct BenchCli {
 	/// Pyroscope Sample Rate
 	pub pyroscope_sample_rate: u32,
 
+	#[clap(long, default_value_t = false)]
+	/// Enable Cache Misses Profiling with Valgrind (must be installed)
+	pub cache_misses: bool,
+
 	#[command(subcommand)]
 	pub objective: cli::TestObjective,
 }
 
 impl BenchCli {
 	fn launch(self) -> eyre::Result<()> {
+		let is_valgrind = is_valgrind_mode();
+		if !is_valgrind && self.cache_misses {
+			return run_valgrind()
+		}
+
 		let agent_running = if self.profile {
 			let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench")
 				.backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate)))
@@ -185,10 +195,20 @@ impl BenchCli {
 
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
-		// test_config.write_to_disk();
+
+		// Start collecting cache misses data
+		if is_valgrind {
+			valgrind_toggle_collect();
+		}
+
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
+		// Stop collecting cache misses data
+		if is_valgrind {
+			valgrind_toggle_collect();
+		}
+
 		if let Some(agent_running) = agent_running {
 			let agent_ready = agent_running.stop()?;
 			agent_ready.shutdown();
@@ -198,6 +218,40 @@ impl BenchCli {
 	}
 }
 
+#[cfg(target_os = "linux")]
+fn is_valgrind_mode() -> bool {
+	!matches!(crabgrind::run_mode(), crabgrind::RunMode::Native)
+}
+
+#[cfg(not(target_os = "linux"))]
+fn is_valgrind_mode() -> bool {
+	false
+}
+
+#[cfg(target_os = "linux")]
+fn valgrind_toggle_collect() {
+	crabgrind::callgrind::toggle_collect()
+}
+
+#[cfg(not(target_os = "linux"))]
+fn valgrind_toggle_collect() {}
+
+#[cfg(target_os = "linux")]
+fn run_valgrind() -> eyre::Result<()> {
+	use std::os::unix::process::CommandExt;
+	std::process::Command::new("valgrind")
+		.arg("--tool=cachegrind")
+		.arg("--cache-sim=yes")
+		.arg("--collect-atstart=no")
+		.args(std::env::args())
+		.exec();
+}
+
+#[cfg(not(target_os = "linux"))]
+fn run_valgrind() -> eyre::Result<()> {
+	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
+}
+
 fn main() -> eyre::Result<()> {
 	color_eyre::install()?;
 	env_logger::builder()

From 81ed8ac76a9bda3c0588e428a7573dad0b0f7010 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 9 Jan 2024 15:08:26 +0100
Subject: [PATCH 02/25] Fix return type

---
 polkadot/node/subsystem-bench/src/subsystem-bench.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 40450eb35dc6..fd7ecbb71fe0 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -245,6 +245,8 @@ fn run_valgrind() -> eyre::Result<()> {
 		.arg("--collect-atstart=no")
 		.args(std::env::args())
 		.exec();
+
+	return Ok(())
 }
 
 #[cfg(not(target_os = "linux"))]

From 550420a0dd1b21021deeab9f1e2675ddaf269c01 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 9 Jan 2024 18:04:13 +0100
Subject: [PATCH 03/25] Fix valgrind command

---
 polkadot/node/subsystem-bench/src/subsystem-bench.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index fd7ecbb71fe0..77851a0b3cf7 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -240,8 +240,9 @@ fn valgrind_toggle_collect() {}
 fn run_valgrind() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	std::process::Command::new("valgrind")
-		.arg("--tool=cachegrind")
+		.arg("--tool=callgrind")
 		.arg("--cache-sim=yes")
+		.arg("--instr-atstart=no")
 		.arg("--collect-atstart=no")
 		.args(std::env::args())
 		.exec();

From cf2a144f590a12ea13924f33a082e97f30913502 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 9 Jan 2024 18:08:15 +0100
Subject: [PATCH 04/25] Fix measurements

---
 .../subsystem-bench/src/subsystem-bench.rs    | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 77851a0b3cf7..14af91a16e83 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -103,7 +103,7 @@ impl BenchCli {
 	fn launch(self) -> eyre::Result<()> {
 		let is_valgrind = is_valgrind_mode();
 		if !is_valgrind && self.cache_misses {
-			return run_valgrind()
+			return valgrind_init()
 		}
 
 		let agent_running = if self.profile {
@@ -196,17 +196,15 @@ impl BenchCli {
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
 
-		// Start collecting cache misses data
 		if is_valgrind {
-			valgrind_toggle_collect();
+			valgrind_start();
 		}
 
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
-		// Stop collecting cache misses data
 		if is_valgrind {
-			valgrind_toggle_collect();
+			valgrind_stop();
 		}
 
 		if let Some(agent_running) = agent_running {
@@ -228,16 +226,28 @@ fn is_valgrind_mode() -> bool {
 	false
 }
 
+/// Start collecting cache misses data
 #[cfg(target_os = "linux")]
-fn valgrind_toggle_collect() {
-	crabgrind::callgrind::toggle_collect()
+fn valgrind_start() {
+	crabgrind::callgrind::start_instrumentation();
+	crabgrind::callgrind::toggle_collect();
 }
 
 #[cfg(not(target_os = "linux"))]
-fn valgrind_toggle_collect() {}
+fn valgrind_start() {}
 
+/// Stop collecting cache misses data
 #[cfg(target_os = "linux")]
-fn run_valgrind() -> eyre::Result<()> {
+fn valgrind_stop() {
+	crabgrind::callgrind::toggle_collect();
+	crabgrind::callgrind::stop_instrumentation();
+}
+
+#[cfg(not(target_os = "linux"))]
+fn valgrind_stop() {}
+
+#[cfg(target_os = "linux")]
+fn valgrind_init() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	std::process::Command::new("valgrind")
 		.arg("--tool=callgrind")
@@ -251,7 +261,7 @@ fn run_valgrind() -> eyre::Result<()> {
 }
 
 #[cfg(not(target_os = "linux"))]
-fn run_valgrind() -> eyre::Result<()> {
+fn valgrind_init() -> eyre::Result<()> {
 	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
 }
 

From f11656a0f9cfaaf2339ffea30892d21a69397a78 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Wed, 10 Jan 2024 09:55:54 +0100
Subject: [PATCH 05/25] Switch to cachegrind

---
 polkadot/node/subsystem-bench/src/subsystem-bench.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 14af91a16e83..4568207e3bb7 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -229,8 +229,7 @@ fn is_valgrind_mode() -> bool {
 /// Start collecting cache misses data
 #[cfg(target_os = "linux")]
 fn valgrind_start() {
-	crabgrind::callgrind::start_instrumentation();
-	crabgrind::callgrind::toggle_collect();
+	crabgrind::cachegrind::start_instrumentation();
 }
 
 #[cfg(not(target_os = "linux"))]
@@ -239,8 +238,7 @@ fn valgrind_start() {}
 /// Stop collecting cache misses data
 #[cfg(target_os = "linux")]
 fn valgrind_stop() {
-	crabgrind::callgrind::toggle_collect();
-	crabgrind::callgrind::stop_instrumentation();
+	crabgrind::cachegrind::stop_instrumentation();
 }
 
 #[cfg(not(target_os = "linux"))]
@@ -250,10 +248,9 @@ fn valgrind_stop() {}
 fn valgrind_init() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	std::process::Command::new("valgrind")
-		.arg("--tool=callgrind")
+		.arg("--tool=cachegrind")
 		.arg("--cache-sim=yes")
 		.arg("--instr-atstart=no")
-		.arg("--collect-atstart=no")
 		.args(std::env::args())
 		.exec();
 

From ea265fd1d7c4a1b9159713b86a2095b6ca1dbd00 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Wed, 10 Jan 2024 10:37:56 +0100
Subject: [PATCH 06/25] Fix command

---
 polkadot/node/subsystem-bench/src/subsystem-bench.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 4568207e3bb7..be08b769c85d 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -250,7 +250,7 @@ fn valgrind_init() -> eyre::Result<()> {
 	std::process::Command::new("valgrind")
 		.arg("--tool=cachegrind")
 		.arg("--cache-sim=yes")
-		.arg("--instr-atstart=no")
+		.arg("--instr-at-start=no")
 		.args(std::env::args())
 		.exec();
 

From 24f098396b81f53aace9972191faabaa86b7123f Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Wed, 10 Jan 2024 12:10:40 +0100
Subject: [PATCH 07/25] Add readme instructions

---
 polkadot/node/subsystem-bench/README.md       | 72 ++++++++++++++-----
 .../subsystem-bench/src/subsystem-bench.rs    |  2 +-
 2 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index b1476db27548..138952e70605 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -117,23 +117,24 @@ used to run a suite of tests defined in a `yaml` file like in this [example](exa
 
 ```
 Options:
-      --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible values:
-                                                       ideal, healthy, degraded]
-      --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
-      --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
-      --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
-      --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
-  -n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
-  -p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
-  -b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
-      --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
-      --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
-      --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
-      --profile                                        Enable CPU Profiling with Pyroscope
-      --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
-      --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
-  -h, --help                                           Print help
-  -V, --version                                        Print version
+    --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible
+                                                     values: ideal, healthy, degraded]
+    --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
+    --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
+    --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
+    --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
+-n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
+-p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
+-b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
+    --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
+    --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
+    --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
+    --profile                                        Enable CPU Profiling with Pyroscope
+    --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
+    --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
+    --cache-misses                                   Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind
+                                                     must be in the PATH
+-h, --help                                           Print help
 ```
 
 These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
@@ -221,6 +222,43 @@ view the test progress in real time by accessing [this link](http://localhost:30
 Now run
 `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml`
 and view the metrics in real time and spot differences between different `n_validators` values.
+
+### Profiling cache misses
+
+Cache misses are profiled using Cachegrind, part of Valgrind. Cachegrind runs slowly, and its cache simulation is basic
+and unlikely to reflect the behavior of a modern machine. However, it still represents the general situation with cache
+usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
+a remote virtual installation.
+
+To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take benchmark results into account.
+
+Example run results:
+```
+==201761== I   refs:      95,411,656,238
+==201761== I1  misses:         9,032,639
+==201761== LLi misses:            42,409
+==201761== I1  miss rate:           0.01%
+==201761== LLi miss rate:           0.00%
+==201761==
+==201761== D   refs:      18,607,561,833  (10,227,094,854 rd + 8,380,466,979 wr)
+==201761== D1  misses:       335,359,148  (   161,537,270 rd +   173,821,878 wr)
+==201761== LLd misses:        16,428,680  (     6,413,757 rd +    10,014,923 wr)
+==201761== D1  miss rate:            1.8% (           1.6%   +           2.1%  )
+==201761== LLd miss rate:            0.1% (           0.1%   +           0.1%  )
+==201761==
+==201761== LL refs:          344,391,787  (   170,569,909 rd +   173,821,878 wr)
+==201761== LL misses:         16,471,089  (     6,456,166 rd +    10,014,923 wr)
+==201761== LL miss rate:             0.0% (           0.0%   +           0.1%  )
+```
+
+The results show that 1.8% of the L1 data cache missed, but the last level cache only missed 0.1% of the time.
+Instruction data of the L1 has 0.04% of the time and almost nothing was missed at the last level.
+
+Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
+This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the [cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html).
+
+For finer profiling of cache misses, better use `perf` on a bare-metal machine.
+
 ## Create new test objectives
 
 This tool is intended to make it easy to write new test objectives that focus individual subsystems,
diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index be08b769c85d..4d6d9527ec1f 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -92,7 +92,7 @@ struct BenchCli {
 	pub pyroscope_sample_rate: u32,
 
 	#[clap(long, default_value_t = false)]
-	/// Enable Cache Misses Profiling with Valgrind (must be installed)
+	/// Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH
 	pub cache_misses: bool,
 
 	#[command(subcommand)]

From 0ab01439f989d575ce9d5a3a641a0c71fb956a79 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Wed, 10 Jan 2024 12:32:16 +0100
Subject: [PATCH 08/25] Add more examples to readme

---
 polkadot/node/subsystem-bench/README.md | 38 +++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 138952e70605..5ee076f0696e 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -234,6 +234,44 @@ To profile cache misses use the `--cache-misses` flag. Since the execution will
 
 Example run results:
 ```
+$ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+==201761== Callgrind, a call-graph generating cache profiler
+==201761== Copyright (C) 2002-2017, and GNU GPL'd, by Josef Weidendorfer et al.
+==201761== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info
+==201761== Command: target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+==201761==
+--201761-- warning: L3 cache found, using its data for the LL simulation.
+--201761-- warning: specified LL cache: line_size 64  assoc 20  total_size 57,671,680
+--201761-- warning: simulated LL cache: line_size 64  assoc 28  total_size 58,720,256
+==201761== For interactive control, run 'callgrind_control -h'.
+[2024-01-10T08:00:32Z INFO  subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = N
+one
+[2024-01-10T08:00:32Z INFO  subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
+[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Created test environment.
+[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Pre-generating 10 candidates.
+[2024-01-10T08:07:18Z INFO  subsystem-bench::core] Initializing emulation for a 500 peer network.
+[2024-01-10T08:07:18Z INFO  subsystem-bench::core] connectivity 100%, error 0%
+[2024-01-10T08:07:19Z INFO  subsystem-bench::core] Network created, connected validator count 500
+[2024-01-10T08:07:19Z INFO  subsystem-bench::availability] Current block 1/1
+[2024-01-10T08:07:19Z INFO  substrate_prometheus_endpoint] 〽️ Prometheus exporter started at 127.0.0.1:9999
+[2024-01-10T08:07:20Z INFO  subsystem_bench::availability] 10 recoveries pending
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All work for block completed in 1462465ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All blocks processed in 1462533ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Throughput: 51200 KiB/block
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Block time: 1462538 ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability]
+
+    Total received from network: 200 MiB
+    Total sent to network: 762 KiB
+    Total subsystem CPU usage 2916.57s
+    CPU usage per block 2916.57s
+    Total test environment CPU usage 6.17s
+    CPU usage per block 6.17s
+
+==201761==
+==201761== Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
+==201761== Collected : 95411656238 10227094854 8380466979 9032639 161537270 173821878 42409 6413757 10014923
+==201761==
 ==201761== I   refs:      95,411,656,238
 ==201761== I1  misses:         9,032,639
 ==201761== LLi misses:            42,409

From d2a5dc5e2c38e47b7f9f319709a31a8aba6a9b20 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Wed, 10 Jan 2024 12:45:03 +0100
Subject: [PATCH 09/25] Fix markdown

---
 polkadot/node/subsystem-bench/README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 5ee076f0696e..6f738b0674c6 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -230,7 +230,8 @@ and unlikely to reflect the behavior of a modern machine. However, it still repr
 usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
 a remote virtual installation.
 
-To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take benchmark results into account.
+To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to
+run it together with other profiling and not to take benchmark results into account.
 
 Example run results:
 ```
@@ -293,7 +294,8 @@ The results show that 1.8% of the L1 data cache missed, but the last level cache
 Instruction data of the L1 has 0.04% of the time and almost nothing was missed at the last level.
 
 Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
-This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the [cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html).
+This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the
+[cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html).
 
 For finer profiling of cache misses, better use `perf` on a bare-metal machine.
 

From a2267b42ff314f4c366db63b4b4a42216919ea06 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 12:40:53 +0100
Subject: [PATCH 10/25] Move to a module

---
 .../subsystem-bench/src/subsystem-bench.rs    |  62 ++---------
 polkadot/node/subsystem-bench/src/valgrind.rs | 100 ++++++++++++++++++
 2 files changed, 109 insertions(+), 53 deletions(-)
 create mode 100644 polkadot/node/subsystem-bench/src/valgrind.rs

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 4d6d9527ec1f..9950660d1290 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -28,6 +28,7 @@ use std::{path::Path, time::Duration};
 pub(crate) mod availability;
 pub(crate) mod cli;
 pub(crate) mod core;
+mod valgrind;
 
 use availability::{prepare_test, NetworkEmulation, TestState};
 use cli::TestObjective;
@@ -101,9 +102,9 @@ struct BenchCli {
 
 impl BenchCli {
 	fn launch(self) -> eyre::Result<()> {
-		let is_valgrind = is_valgrind_mode();
-		if !is_valgrind && self.cache_misses {
-			return valgrind_init()
+		let is_valgrind_running = valgrind::is_valgrind_running();
+		if !is_valgrind_running && self.cache_misses {
+			return valgrind::relaunch_in_valgrind_mode()
 		}
 
 		let agent_running = if self.profile {
@@ -196,15 +197,16 @@ impl BenchCli {
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
 
-		if is_valgrind {
-			valgrind_start();
+		if is_valgrind_running {
+			valgrind::start_measuring();
 		}
 
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
-		if is_valgrind {
-			valgrind_stop();
+		if is_valgrind_running {
+			valgrind::stop_measuring();
+			valgrind::dispay_report()?;
 		}
 
 		if let Some(agent_running) = agent_running {
@@ -216,52 +218,6 @@ impl BenchCli {
 	}
 }
 
-#[cfg(target_os = "linux")]
-fn is_valgrind_mode() -> bool {
-	!matches!(crabgrind::run_mode(), crabgrind::RunMode::Native)
-}
-
-#[cfg(not(target_os = "linux"))]
-fn is_valgrind_mode() -> bool {
-	false
-}
-
-/// Start collecting cache misses data
-#[cfg(target_os = "linux")]
-fn valgrind_start() {
-	crabgrind::cachegrind::start_instrumentation();
-}
-
-#[cfg(not(target_os = "linux"))]
-fn valgrind_start() {}
-
-/// Stop collecting cache misses data
-#[cfg(target_os = "linux")]
-fn valgrind_stop() {
-	crabgrind::cachegrind::stop_instrumentation();
-}
-
-#[cfg(not(target_os = "linux"))]
-fn valgrind_stop() {}
-
-#[cfg(target_os = "linux")]
-fn valgrind_init() -> eyre::Result<()> {
-	use std::os::unix::process::CommandExt;
-	std::process::Command::new("valgrind")
-		.arg("--tool=cachegrind")
-		.arg("--cache-sim=yes")
-		.arg("--instr-at-start=no")
-		.args(std::env::args())
-		.exec();
-
-	return Ok(())
-}
-
-#[cfg(not(target_os = "linux"))]
-fn valgrind_init() -> eyre::Result<()> {
-	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
-}
-
 fn main() -> eyre::Result<()> {
 	color_eyre::install()?;
 	env_logger::builder()
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
new file mode 100644
index 000000000000..5456d4f3eeeb
--- /dev/null
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -0,0 +1,100 @@
+// Copyright (C) Parity Technologies (UK) Ltd.
+// This file is part of Polkadot.
+
+// Polkadot is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// Polkadot is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
+
+use color_eyre::eyre;
+
+#[cfg(target_os = "linux")]
+const LOG_FILE: &str = "cachegrind_logs";
+#[cfg(target_os = "linux")]
+const REPORT_START: &str = "I refs:";
+
+/// Show if the app is running under Valgrind
+#[cfg(target_os = "linux")]
+pub(crate) fn is_valgrind_running() -> bool {
+	!matches!(crabgrind::run_mode(), crabgrind::RunMode::Native)
+}
+
+#[cfg(not(target_os = "linux"))]
+pub(crate) fn is_valgrind_running() -> bool {
+	false
+}
+
+/// Start collecting cache misses data
+#[cfg(target_os = "linux")]
+pub(crate) fn start_measuring() {
+	crabgrind::cachegrind::start_instrumentation();
+}
+
+#[cfg(not(target_os = "linux"))]
+pub(crate) fn start_measuring() {}
+
+/// Stop collecting cache misses data
+#[cfg(target_os = "linux")]
+pub(crate) fn stop_measuring() {
+	crabgrind::cachegrind::stop_instrumentation();
+}
+
+#[cfg(not(target_os = "linux"))]
+pub(crate) fn stop_measuring() {}
+
+/// Stop execution and relaunch the app under valgrind
+#[cfg(target_os = "linux")]
+pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
+	use std::os::unix::process::CommandExt;
+	std::process::Command::new("valgrind")
+		.arg("--tool=cachegrind")
+		.arg("--cache-sim=yes")
+		.arg("--instr-at-start=no")
+		.arg(format!("--log-file={}", LOG_FILE))
+		.args(std::env::args())
+		.exec();
+
+	return Ok(())
+}
+
+#[cfg(not(target_os = "linux"))]
+pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
+	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
+}
+
+#[cfg(target_os = "linux")]
+fn prepare_report() -> eyre::Result<String> {
+	let log_file = std::fs::read_to_string(LOG_FILE)?;
+	let lines: Vec<&str> = log_file.lines().collect();
+	let start = lines
+		.iter()
+		.position(|line| line.contains(REPORT_START))
+		.ok_or(Err(eyre::eyre!("Log file {} does not contain cache misses report", LOG_FILE)))?;
+	let lines: Vec<&str> = lines
+		.iter()
+		.skip(start)
+		.map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
+		.collect();
+
+	Ok(format!("\nCache misses report:\n\n\t{}", lines.join("\n\t")))
+}
+
+#[cfg(target_os = "linux")]
+pub(crate) fn dispay_report() -> eyre::Result<()> {
+	gum::info!("{}", prepare_report()?);
+
+	Ok(())
+}
+
+#[cfg(not(target_os = "linux"))]
+pub(crate) fn dispay_report() -> eyre::Result<()> {
+	Ok(())
+}

From 84c4e2be1620b1c694d02e68a8120fecb6186c77 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 12:53:50 +0100
Subject: [PATCH 11/25] fix error

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 5456d4f3eeeb..38108f1440d3 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -77,7 +77,7 @@ fn prepare_report() -> eyre::Result<String> {
 	let start = lines
 		.iter()
 		.position(|line| line.contains(REPORT_START))
-		.ok_or(Err(eyre::eyre!("Log file {} does not contain cache misses report", LOG_FILE)))?;
+		.ok_or(eyre::eyre!("Log file {} does not contain cache misses report", LOG_FILE))?;
 	let lines: Vec<&str> = lines
 		.iter()
 		.skip(start)

From e7a2d4bc7de71fa19abbd98b31c48f4ef23ab5da Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 13:28:55 +0100
Subject: [PATCH 12/25] Fix report preparing

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 38108f1440d3..d1be5a22da41 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -19,7 +19,9 @@ use color_eyre::eyre;
 #[cfg(target_os = "linux")]
 const LOG_FILE: &str = "cachegrind_logs";
 #[cfg(target_os = "linux")]
-const REPORT_START: &str = "I refs:";
+const HEADER_SIZE: usize = 6;
+#[cfg(target_os = "linux")]
+const WARNING: &str = "warning";
 
 /// Show if the app is running under Valgrind
 #[cfg(target_os = "linux")]
@@ -73,18 +75,14 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 #[cfg(target_os = "linux")]
 fn prepare_report() -> eyre::Result<String> {
 	let log_file = std::fs::read_to_string(LOG_FILE)?;
-	let lines: Vec<&str> = log_file.lines().collect();
-	let start = lines
-		.iter()
-		.position(|line| line.contains(REPORT_START))
-		.ok_or(eyre::eyre!("Log file {} does not contain cache misses report", LOG_FILE))?;
-	let lines: Vec<&str> = lines
-		.iter()
-		.skip(start)
-		.map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
-		.collect();
-
-	Ok(format!("\nCache misses report:\n\n\t{}", lines.join("\n\t")))
+  let lines: Vec<&str> = contents
+    .lines()
+    .skip(HEADER_SIZE)
+    .map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
+    .filter(|line| !line.contains(WARNING))
+    .collect();
+
+	Ok(format!("\nCache misses report:\n\t{}", lines.join("\n\t")))
 }
 
 #[cfg(target_os = "linux")]

From b38a9905e3eb311fd16dea084a4aece4327b09f0 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 13:31:55 +0100
Subject: [PATCH 13/25] Fix report preparing

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index d1be5a22da41..89bcfe92784c 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -75,12 +75,12 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 #[cfg(target_os = "linux")]
 fn prepare_report() -> eyre::Result<String> {
 	let log_file = std::fs::read_to_string(LOG_FILE)?;
-  let lines: Vec<&str> = contents
-    .lines()
-    .skip(HEADER_SIZE)
-    .map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
-    .filter(|line| !line.contains(WARNING))
-    .collect();
+	let lines: Vec<&str> = log_file
+		.lines()
+		.skip(HEADER_SIZE)
+		.map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
+		.filter(|line| !line.contains(WARNING))
+		.collect();
 
 	Ok(format!("\nCache misses report:\n\t{}", lines.join("\n\t")))
 }

From a6caa233a864a44b68afae61de3f9a09788c4225 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 15:10:20 +0100
Subject: [PATCH 14/25] Save report to a file

---
 .../subsystem-bench/src/subsystem-bench.rs    |  1 -
 polkadot/node/subsystem-bench/src/valgrind.rs | 34 +------------------
 2 files changed, 1 insertion(+), 34 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 9950660d1290..771a1ec95907 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -206,7 +206,6 @@ impl BenchCli {
 
 		if is_valgrind_running {
 			valgrind::stop_measuring();
-			valgrind::dispay_report()?;
 		}
 
 		if let Some(agent_running) = agent_running {
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 89bcfe92784c..54e676d424ef 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -16,13 +16,6 @@
 
 use color_eyre::eyre;
 
-#[cfg(target_os = "linux")]
-const LOG_FILE: &str = "cachegrind_logs";
-#[cfg(target_os = "linux")]
-const HEADER_SIZE: usize = 6;
-#[cfg(target_os = "linux")]
-const WARNING: &str = "warning";
-
 /// Show if the app is running under Valgrind
 #[cfg(target_os = "linux")]
 pub(crate) fn is_valgrind_running() -> bool {
@@ -60,7 +53,7 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 		.arg("--tool=cachegrind")
 		.arg("--cache-sim=yes")
 		.arg("--instr-at-start=no")
-		.arg(format!("--log-file={}", LOG_FILE))
+		.arg("--log-file=cachegrind_report.txt")
 		.args(std::env::args())
 		.exec();
 
@@ -71,28 +64,3 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
 }
-
-#[cfg(target_os = "linux")]
-fn prepare_report() -> eyre::Result<String> {
-	let log_file = std::fs::read_to_string(LOG_FILE)?;
-	let lines: Vec<&str> = log_file
-		.lines()
-		.skip(HEADER_SIZE)
-		.map(|line| line.trim_start_matches(|c: char| !c.is_alphabetic()))
-		.filter(|line| !line.contains(WARNING))
-		.collect();
-
-	Ok(format!("\nCache misses report:\n\t{}", lines.join("\n\t")))
-}
-
-#[cfg(target_os = "linux")]
-pub(crate) fn dispay_report() -> eyre::Result<()> {
-	gum::info!("{}", prepare_report()?);
-
-	Ok(())
-}
-
-#[cfg(not(target_os = "linux"))]
-pub(crate) fn dispay_report() -> eyre::Result<()> {
-	Ok(())
-}

From 20ceaa7879db345af186017aeca00f2301f9926b Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 15:57:34 +0100
Subject: [PATCH 15/25] Update the instruction

---
 polkadot/node/subsystem-bench/README.md | 84 +++++++++----------------
 1 file changed, 30 insertions(+), 54 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 6f738b0674c6..6097ca544383 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -231,67 +231,43 @@ usage, and more importantly it doesn't require a bare-metal machine to run on, w
 a remote virtual installation.
 
 To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to
-run it together with other profiling and not to take benchmark results into account.
+run it together with other profiling and not to take benchmark results into account. A report is saved in a file
+`cachegrind_report.txt`.
 
 Example run results:
 ```
 $ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
-==201761== Callgrind, a call-graph generating cache profiler
-==201761== Copyright (C) 2002-2017, and GNU GPL'd, by Josef Weidendorfer et al.
-==201761== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info
-==201761== Command: target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
-==201761==
---201761-- warning: L3 cache found, using its data for the LL simulation.
---201761-- warning: specified LL cache: line_size 64  assoc 20  total_size 57,671,680
---201761-- warning: simulated LL cache: line_size 64  assoc 28  total_size 58,720,256
-==201761== For interactive control, run 'callgrind_control -h'.
-[2024-01-10T08:00:32Z INFO  subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = N
-one
-[2024-01-10T08:00:32Z INFO  subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
-[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Created test environment.
-[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Pre-generating 10 candidates.
-[2024-01-10T08:07:18Z INFO  subsystem-bench::core] Initializing emulation for a 500 peer network.
-[2024-01-10T08:07:18Z INFO  subsystem-bench::core] connectivity 100%, error 0%
-[2024-01-10T08:07:19Z INFO  subsystem-bench::core] Network created, connected validator count 500
-[2024-01-10T08:07:19Z INFO  subsystem-bench::availability] Current block 1/1
-[2024-01-10T08:07:19Z INFO  substrate_prometheus_endpoint] 〽️ Prometheus exporter started at 127.0.0.1:9999
-[2024-01-10T08:07:20Z INFO  subsystem_bench::availability] 10 recoveries pending
-[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All work for block completed in 1462465ms
-[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All blocks processed in 1462533ms
-[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Throughput: 51200 KiB/block
-[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Block time: 1462538 ms
-[2024-01-10T08:31:42Z INFO  subsystem_bench::availability]
-
-    Total received from network: 200 MiB
-    Total sent to network: 762 KiB
-    Total subsystem CPU usage 2916.57s
-    CPU usage per block 2916.57s
-    Total test environment CPU usage 6.17s
-    CPU usage per block 6.17s
-
-==201761==
-==201761== Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
-==201761== Collected : 95411656238 10227094854 8380466979 9032639 161537270 173821878 42409 6413757 10014923
-==201761==
-==201761== I   refs:      95,411,656,238
-==201761== I1  misses:         9,032,639
-==201761== LLi misses:            42,409
-==201761== I1  miss rate:           0.01%
-==201761== LLi miss rate:           0.00%
-==201761==
-==201761== D   refs:      18,607,561,833  (10,227,094,854 rd + 8,380,466,979 wr)
-==201761== D1  misses:       335,359,148  (   161,537,270 rd +   173,821,878 wr)
-==201761== LLd misses:        16,428,680  (     6,413,757 rd +    10,014,923 wr)
-==201761== D1  miss rate:            1.8% (           1.6%   +           2.1%  )
-==201761== LLd miss rate:            0.1% (           0.1%   +           0.1%  )
-==201761==
-==201761== LL refs:          344,391,787  (   170,569,909 rd +   173,821,878 wr)
-==201761== LL misses:         16,471,089  (     6,456,166 rd +    10,014,923 wr)
+$ cat cachegrind_report.txt
+==232852== Cachegrind, a high-precision tracing profiler
+==232852== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
+==232852== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info
+==232852== Command: target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+==232852== Parent PID: 135623
+==232852==
+--232852-- warning: L3 cache found, using its data for the LL simulation.
+--232852-- warning: specified LL cache: line_size 64  assoc 20  total_size 57,671,680
+--232852-- warning: simulated LL cache: line_size 64  assoc 28  total_size 58,720,256
+==232852==
+==232852== I refs:        64,792,806,728
+==232852== I1  misses:         3,720,238
+==232852== LLi misses:            34,545
+==232852== I1  miss rate:           0.01%
+==232852== LLi miss rate:           0.00%
+==232852==
+==232852== D refs:        12,431,397,584  (10,003,114,208 rd   + 2,428,283,376 wr)
+==232852== D1  misses:       259,602,002  (   123,007,075 rd   +   136,594,927 wr)
+==232852== LLd misses:        13,168,490  (     5,331,255 rd   +     7,837,235 wr)
+==232852== D1  miss rate:            2.1% (           1.2%     +           5.6%  )
+==232852== LLd miss rate:            0.1% (           0.1%     +           0.3%  )
+==232852==
+==232852== LL refs:          263,322,240  (   126,727,313 rd   +   136,594,927 wr)
+==232852== LL misses:         13,203,035  (     5,365,800 rd   +     7,837,235 wr)
+==232852== LL miss rate:             0.0% (           0.0%     +           0.3%  )
 ==201761== LL miss rate:             0.0% (           0.0%   +           0.1%  )
 ```
 
-The results show that 1.8% of the L1 data cache missed, but the last level cache only missed 0.1% of the time.
-Instruction data of the L1 has 0.04% of the time and almost nothing was missed at the last level.
+The results show that 2.1% of the L1 data cache missed, but the last level cache only missed 0.1% of the time.
+Instruction data of the L1 has 0.01% of the time and almost nothing was missed at the last level.
 
 Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
 This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the

From d15fd83f342605902b5af31180b98672a76b8cb4 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Thu, 11 Jan 2024 16:31:15 +0100
Subject: [PATCH 16/25] Show error if valgrind command failed

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 54e676d424ef..ff8f31bfe8c1 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -49,7 +49,7 @@ pub(crate) fn stop_measuring() {}
 #[cfg(target_os = "linux")]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
-	std::process::Command::new("valgrind")
+	let err = std::process::Command::new("valgrind")
 		.arg("--tool=cachegrind")
 		.arg("--cache-sim=yes")
 		.arg("--instr-at-start=no")
@@ -57,7 +57,10 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 		.args(std::env::args())
 		.exec();
 
-	return Ok(())
+	return Err(eyre::eyre!(
+		"Сannot run Valgrind, check that it is installed and available in the PATH\n{}",
+		err
+	))
 }
 
 #[cfg(not(target_os = "linux"))]

From 0cfb180f8b1e9a233fb7edcff026ea4a63c49316 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 14:25:34 +0100
Subject: [PATCH 17/25] Adjust to recommended CPU

---
 polkadot/node/subsystem-bench/README.md       | 47 +++++++------------
 polkadot/node/subsystem-bench/src/valgrind.rs |  8 ++++
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 6097ca544383..5debae15fb33 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -232,42 +232,31 @@ a remote virtual installation.
 
 To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to
 run it together with other profiling and not to take benchmark results into account. A report is saved in a file
-`cachegrind_report.txt`.
+`cachegrind_report.txt`. Cache simulation of current runs tuned for Intel Ice Lake CPU.
 
 Example run results:
 ```
 $ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
 $ cat cachegrind_report.txt
-==232852== Cachegrind, a high-precision tracing profiler
-==232852== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
-==232852== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info
-==232852== Command: target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
-==232852== Parent PID: 135623
-==232852==
---232852-- warning: L3 cache found, using its data for the LL simulation.
---232852-- warning: specified LL cache: line_size 64  assoc 20  total_size 57,671,680
---232852-- warning: simulated LL cache: line_size 64  assoc 28  total_size 58,720,256
-==232852==
-==232852== I refs:        64,792,806,728
-==232852== I1  misses:         3,720,238
-==232852== LLi misses:            34,545
-==232852== I1  miss rate:           0.01%
-==232852== LLi miss rate:           0.00%
-==232852==
-==232852== D refs:        12,431,397,584  (10,003,114,208 rd   + 2,428,283,376 wr)
-==232852== D1  misses:       259,602,002  (   123,007,075 rd   +   136,594,927 wr)
-==232852== LLd misses:        13,168,490  (     5,331,255 rd   +     7,837,235 wr)
-==232852== D1  miss rate:            2.1% (           1.2%     +           5.6%  )
-==232852== LLd miss rate:            0.1% (           0.1%     +           0.3%  )
-==232852==
-==232852== LL refs:          263,322,240  (   126,727,313 rd   +   136,594,927 wr)
-==232852== LL misses:         13,203,035  (     5,365,800 rd   +     7,837,235 wr)
-==232852== LL miss rate:             0.0% (           0.0%     +           0.3%  )
-==201761== LL miss rate:             0.0% (           0.0%   +           0.1%  )
+I refs:        64,622,081,485
+I1  misses:         3,018,168
+LLi misses:           437,654
+I1  miss rate:           0.00%
+LLi miss rate:           0.00%
+
+D refs:        12,161,833,115  (9,868,356,364 rd   + 2,293,476,751 wr)
+D1  misses:       167,940,701  (   71,060,073 rd   +    96,880,628 wr)
+LLd misses:        33,550,018  (   16,685,853 rd   +    16,864,165 wr)
+D1  miss rate:            1.4% (          0.7%     +           4.2%  )
+LLd miss rate:            0.3% (          0.2%     +           0.7%  )
+
+LL refs:          170,958,869  (   74,078,241 rd   +    96,880,628 wr)
+LL misses:         33,987,672  (   17,123,507 rd   +    16,864,165 wr)
+LL miss rate:             0.0% (          0.0%     +           0.7%  )
 ```
 
-The results show that 2.1% of the L1 data cache missed, but the last level cache only missed 0.1% of the time.
-Instruction data of the L1 has 0.01% of the time and almost nothing was missed at the last level.
+The results show that 1.4% of the L1 data cache missed, but the last level cache only missed 0.3% of the time.
+Instruction data of the L1 has 0.00%.
 
 Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
 This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index ff8f31bfe8c1..bcd9f8fcc99e 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -46,6 +46,10 @@ pub(crate) fn stop_measuring() {
 pub(crate) fn stop_measuring() {}
 
 /// Stop execution and relaunch the app under valgrind
+/// Cache configuration used to emulate Intel Ice Lake (size, associativity, line size):
+///     L1 instruction: 32,768 B, 8-way, 64 B lines
+///     L1 data: 49,152 B, 12-way, 64 B lines
+///     Last-level: 2,097,152 B, 16-way, 64 B lines
 #[cfg(target_os = "linux")]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
@@ -54,6 +58,10 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 		.arg("--cache-sim=yes")
 		.arg("--instr-at-start=no")
 		.arg("--log-file=cachegrind_report.txt")
+		.arg("--I1=32768,8,64")
+		.arg("--D1=49152,12,64")
+		.arg("--LL=2097152,16,64")
+		.arg("--verbose")
 		.args(std::env::args())
 		.exec();
 

From 03d6f27ea07e354ca5e4681bf84fbd82e808056f Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 15:01:50 +0100
Subject: [PATCH 18/25] Make crabgrind optional

---
 polkadot/node/subsystem-bench/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
index 5142d9273c65..8e1bef4c93da 100644
--- a/polkadot/node/subsystem-bench/Cargo.toml
+++ b/polkadot/node/subsystem-bench/Cargo.toml
@@ -60,7 +60,7 @@ pyroscope = "0.5.7"
 pyroscope_pprofrs = "0.2.7"
 
 [target.'cfg(target_os = "linux")'.dependencies]
-crabgrind = "0.1.10"
+crabgrind = { version = "0.1.10", optional = true }
 
 [features]
 default = []

From 503ccfbfa27b1e8fdef370030ba95ff77af91f31 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 16:53:10 +0100
Subject: [PATCH 19/25] Add valgrind feature

---
 polkadot/node/subsystem-bench/Cargo.toml      |  1 +
 polkadot/node/subsystem-bench/src/valgrind.rs | 42 +++++++++----------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
index 8e1bef4c93da..60c5558b55f9 100644
--- a/polkadot/node/subsystem-bench/Cargo.toml
+++ b/polkadot/node/subsystem-bench/Cargo.toml
@@ -64,3 +64,4 @@ crabgrind = { version = "0.1.10", optional = true }
 
 [features]
 default = []
+valgrind = ["crabgrind"]
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index bcd9f8fcc99e..c0dfb5eec5d9 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -15,42 +15,45 @@
 // along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
 
 use color_eyre::eyre;
+#[cfg(all(target_os = "linux", feature = "valgrind"))]
+use crabgrind as cg;
 
-/// Show if the app is running under Valgrind
-#[cfg(target_os = "linux")]
-pub(crate) fn is_valgrind_running() -> bool {
-	!matches!(crabgrind::run_mode(), crabgrind::RunMode::Native)
+#[cfg(not(all(target_os = "linux", feature = "valgrind")))]
+mod cg {
+	pub(super) enum RunMode {
+		Native,
+	}
+
+	pub(super) fn run_mode() -> RunMode {
+		RunMode::Native
+	}
+
+	pub(super) mod cachegrind {
+		pub fn start_instrumentation() {}
+		pub fn stop_instrumentation() {}
+	}
 }
 
-#[cfg(not(target_os = "linux"))]
+/// Show if the app is running under Valgrind
 pub(crate) fn is_valgrind_running() -> bool {
-	false
+	!matches!(cg::run_mode(), cg::RunMode::Native)
 }
 
 /// Start collecting cache misses data
-#[cfg(target_os = "linux")]
 pub(crate) fn start_measuring() {
-	crabgrind::cachegrind::start_instrumentation();
+	cg::cachegrind::start_instrumentation();
 }
 
-#[cfg(not(target_os = "linux"))]
-pub(crate) fn start_measuring() {}
-
 /// Stop collecting cache misses data
-#[cfg(target_os = "linux")]
 pub(crate) fn stop_measuring() {
-	crabgrind::cachegrind::stop_instrumentation();
+	cg::cachegrind::stop_instrumentation();
 }
 
-#[cfg(not(target_os = "linux"))]
-pub(crate) fn stop_measuring() {}
-
 /// Stop execution and relaunch the app under valgrind
 /// Cache configuration used to emulate Intel Ice Lake (size, associativity, line size):
 ///     L1 instruction: 32,768 B, 8-way, 64 B lines
 ///     L1 data: 49,152 B, 12-way, 64 B lines
 ///     Last-level: 2,097,152 B, 16-way, 64 B lines
-#[cfg(target_os = "linux")]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	let err = std::process::Command::new("valgrind")
@@ -70,8 +73,3 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 		err
 	))
 }
-
-#[cfg(not(target_os = "linux"))]
-pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
-	return Err(eyre::eyre!("Valgrind can be executed only on linux"));
-}

From 5a3b421871fb8114917201410a141f5be104c241 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 17:08:33 +0100
Subject: [PATCH 20/25] Fix feature detecting

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index c0dfb5eec5d9..9a3000903190 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -49,11 +49,17 @@ pub(crate) fn stop_measuring() {
 	cg::cachegrind::stop_instrumentation();
 }
 
+#[cfg(not(feature = "valgrind"))]
+pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
+	return Err(eyre::eyre!("Feature `valgrind` must be enabled"));
+}
+
 /// Stop execution and relaunch the app under valgrind
 /// Cache configuration used to emulate Intel Ice Lake (size, associativity, line size):
 ///     L1 instruction: 32,768 B, 8-way, 64 B lines
 ///     L1 data: 49,152 B, 12-way, 64 B lines
 ///     Last-level: 2,097,152 B, 16-way, 64 B lines
+#[cfg(feature = "valgrind")]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	let err = std::process::Command::new("valgrind")

From e0da4c4ab331062e0a81ba3509d263e4b5f29cab Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 17:12:07 +0100
Subject: [PATCH 21/25] Fix explanation

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 9a3000903190..3d794350eff1 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -51,7 +51,7 @@ pub(crate) fn stop_measuring() {
 
 #[cfg(not(feature = "valgrind"))]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
-	return Err(eyre::eyre!("Feature `valgrind` must be enabled"));
+	return Err(eyre::eyre!("Feature `valgrind` must be enabled to measure cache misses"));
 }
 
 /// Stop execution and relaunch the app under valgrind

From d9cde0588b49612e549556ae1a25b1533b9942a6 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 17:29:50 +0100
Subject: [PATCH 22/25] Update readme

---
 polkadot/node/subsystem-bench/README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index 5debae15fb33..f89fea292eb6 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -230,9 +230,12 @@ and unlikely to reflect the behavior of a modern machine. However, it still repr
 usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
 a remote virtual installation.
 
-To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to
-run it together with other profiling and not to take benchmark results into account. A report is saved in a file
-`cachegrind_report.txt`. Cache simulation of current runs tuned for Intel Ice Lake CPU.
+The use of Cachegrind is specific. To avoid overcomplicating the rest of the functionality, the measurement of cache
+misses is placed in a separate feature `valgrind`.
+
+To profile cache misses use the `--cache-misses` flag. Cache simulation of current runs tuned for Intel Ice Lake CPU.
+Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take
+benchmark results into account. A report is saved in a file `cachegrind_report.txt`.
 
 Example run results:
 ```

From 18b71615f1c9e601f6013c2acdeb61029d4ffb5e Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Mon, 15 Jan 2024 17:40:09 +0100
Subject: [PATCH 23/25] Address clippy errors

---
 polkadot/node/subsystem-bench/src/valgrind.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index 3d794350eff1..c66e9d419ee5 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -51,7 +51,7 @@ pub(crate) fn stop_measuring() {
 
 #[cfg(not(feature = "valgrind"))]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
-	return Err(eyre::eyre!("Feature `valgrind` must be enabled to measure cache misses"));
+	Err(eyre::eyre!("Feature `valgrind` must be enabled to measure cache misses"))
 }
 
 /// Stop execution and relaunch the app under valgrind
@@ -74,7 +74,7 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 		.args(std::env::args())
 		.exec();
 
-	return Err(eyre::eyre!(
+	Err(eyre::eyre!(
 		"Сannot run Valgrind, check that it is installed and available in the PATH\n{}",
 		err
 	))

From 5da8447266fe21dd2c94034995a2072d93668e7d Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 16 Jan 2024 14:08:14 +0100
Subject: [PATCH 24/25] Remove crabgrind

---
 Cargo.lock                                    | 10 -----
 polkadot/node/subsystem-bench/Cargo.toml      |  4 --
 .../subsystem-bench/src/subsystem-bench.rs    |  8 ----
 polkadot/node/subsystem-bench/src/valgrind.rs | 40 ++-----------------
 4 files changed, 4 insertions(+), 58 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 60462e8e1b29..b20adebf963a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3267,15 +3267,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "crabgrind"
-version = "0.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e174279a2a6598ba9a1fec494ebb4172916b95e69d7d2396598c6a7b850d048"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "cranelift-bforest"
 version = "0.95.1"
@@ -13566,7 +13557,6 @@ dependencies = [
  "clap-num",
  "color-eyre",
  "colored",
- "crabgrind",
  "env_logger 0.9.3",
  "futures",
  "futures-timer",
diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
index 60c5558b55f9..dc1179b77621 100644
--- a/polkadot/node/subsystem-bench/Cargo.toml
+++ b/polkadot/node/subsystem-bench/Cargo.toml
@@ -59,9 +59,5 @@ orchestra = { version = "0.3.3", default-features = false, features = ["futures_
 pyroscope = "0.5.7"
 pyroscope_pprofrs = "0.2.7"
 
-[target.'cfg(target_os = "linux")'.dependencies]
-crabgrind = { version = "0.1.10", optional = true }
-
 [features]
 default = []
-valgrind = ["crabgrind"]
diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
index 771a1ec95907..8669ee4e8b1d 100644
--- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs
+++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -197,17 +197,9 @@ impl BenchCli {
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
 
-		if is_valgrind_running {
-			valgrind::start_measuring();
-		}
-
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
-		if is_valgrind_running {
-			valgrind::stop_measuring();
-		}
-
 		if let Some(agent_running) = agent_running {
 			let agent_ready = agent_running.stop()?;
 			agent_ready.shutdown();
diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs
index c66e9d419ee5..3d0c488355b9 100644
--- a/polkadot/node/subsystem-bench/src/valgrind.rs
+++ b/polkadot/node/subsystem-bench/src/valgrind.rs
@@ -15,43 +15,13 @@
 // along with Polkadot.  If not, see <http://www.gnu.org/licenses/>.
 
 use color_eyre::eyre;
-#[cfg(all(target_os = "linux", feature = "valgrind"))]
-use crabgrind as cg;
-
-#[cfg(not(all(target_os = "linux", feature = "valgrind")))]
-mod cg {
-	pub(super) enum RunMode {
-		Native,
-	}
-
-	pub(super) fn run_mode() -> RunMode {
-		RunMode::Native
-	}
-
-	pub(super) mod cachegrind {
-		pub fn start_instrumentation() {}
-		pub fn stop_instrumentation() {}
-	}
-}
 
 /// Show if the app is running under Valgrind
 pub(crate) fn is_valgrind_running() -> bool {
-	!matches!(cg::run_mode(), cg::RunMode::Native)
-}
-
-/// Start collecting cache misses data
-pub(crate) fn start_measuring() {
-	cg::cachegrind::start_instrumentation();
-}
-
-/// Stop collecting cache misses data
-pub(crate) fn stop_measuring() {
-	cg::cachegrind::stop_instrumentation();
-}
-
-#[cfg(not(feature = "valgrind"))]
-pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
-	Err(eyre::eyre!("Feature `valgrind` must be enabled to measure cache misses"))
+	match std::env::var("LD_PRELOAD") {
+		Ok(v) => v.contains("valgrind"),
+		Err(_) => false,
+	}
 }
 
 /// Stop execution and relaunch the app under valgrind
@@ -59,13 +29,11 @@ pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 ///     L1 instruction: 32,768 B, 8-way, 64 B lines
 ///     L1 data: 49,152 B, 12-way, 64 B lines
 ///     Last-level: 2,097,152 B, 16-way, 64 B lines
-#[cfg(feature = "valgrind")]
 pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> {
 	use std::os::unix::process::CommandExt;
 	let err = std::process::Command::new("valgrind")
 		.arg("--tool=cachegrind")
 		.arg("--cache-sim=yes")
-		.arg("--instr-at-start=no")
 		.arg("--log-file=cachegrind_report.txt")
 		.arg("--I1=32768,8,64")
 		.arg("--D1=49152,12,64")

From 219a05ceb25e81aca3eea7d9802dec007fe066c7 Mon Sep 17 00:00:00 2001
From: Andrei Eres <eresav@me.com>
Date: Tue, 16 Jan 2024 14:29:34 +0100
Subject: [PATCH 25/25] Update readme

---
 polkadot/node/subsystem-bench/README.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
index f89fea292eb6..1ff5b129e1e4 100644
--- a/polkadot/node/subsystem-bench/README.md
+++ b/polkadot/node/subsystem-bench/README.md
@@ -230,9 +230,6 @@ and unlikely to reflect the behavior of a modern machine. However, it still repr
 usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
 a remote virtual installation.
 
-The use of Cachegrind is specific. To avoid overcomplicating the rest of the functionality, the measurement of cache
-misses is placed in a separate feature `valgrind`.
-
 To profile cache misses use the `--cache-misses` flag. Cache simulation of current runs tuned for Intel Ice Lake CPU.
 Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take
 benchmark results into account. A report is saved in a file `cachegrind_report.txt`.