diff --git a/src/iris_common.h b/src/iris_common.h
index a8d8622..466a869 100644
--- a/src/iris_common.h
+++ b/src/iris_common.h
@@ -54,6 +54,22 @@ SOFTWARE.
 #define IRIS_DEFAULT_PAGE_SIZE (IRIS_DEFAULT_BLOCK_SIZE * 64)
 #endif
 
+#ifndef IRIS_PROFILE_THREAD // overridable hook; used in this change as IRIS_PROFILE_THREAD("iris_async_worker", i) right before thread_loop(i) in the worker thread lambda
+#define IRIS_PROFILE_THREAD(name, i) // default when not already defined: expands to nothing
+#endif // IRIS_PROFILE_THREAD
+
+#ifndef IRIS_PROFILE_SCOPE // overridable hook; used in this change as the first statement of instrumented functions, as IRIS_PROFILE_SCOPE(__FUNCTION__)
+#define IRIS_PROFILE_SCOPE(name) // default when not already defined: expands to nothing, leaving an empty statement at the call site
+#endif // IRIS_PROFILE_SCOPE
+
+#ifndef IRIS_PROFILE_PUSH // NOTE(review): no use visible in this change; presumably opens a named profiling region closed by IRIS_PROFILE_POP — confirm
+#define IRIS_PROFILE_PUSH(name) // default when not already defined: expands to nothing
+#endif // IRIS_PROFILE_PUSH
+
+#ifndef IRIS_PROFILE_POP // NOTE(review): no use visible in this change; presumably closes the region opened by IRIS_PROFILE_PUSH — confirm
+#define IRIS_PROFILE_POP() // default when not already defined: expands to nothing
+#endif // IRIS_PROFILE_POP
+
 namespace iris {
 	static constexpr size_t default_block_size = IRIS_DEFAULT_BLOCK_SIZE;
 	static constexpr size_t default_page_size = IRIS_DEFAULT_PAGE_SIZE;
diff --git a/src/iris_dispatcher.h b/src/iris_dispatcher.h
index 2d01c59..a03e3a9 100644
--- a/src/iris_dispatcher.h
+++ b/src/iris_dispatcher.h
@@ -417,6 +417,8 @@ namespace iris {
 		// cleanup the dispatcher, pass true to 'execute_remaining' to make sure all tasks are executed finally.
 		template <bool execute_remaining = true, bool finalize = false, typename iterator_t = iris_warp_t*>
 		static bool join(iterator_t begin, iterator_t end) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
+
 			if /* constexpr */ (!finalize) {
 				// suspend all warps so we can take over tasks
 				for (iterator_t p = begin; p != end; ++p) {
@@ -484,6 +486,7 @@ namespace iris {
 		template <bool s, bool force>
 		typename std::enable_if<s>::type execute_internal() noexcept(
 			noexcept(std::declval<iris_warp_t>().flush()) && noexcept(std::declval<function_t>()())) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			// mark for queueing, avoiding flush me more than once.
 			queueing.store(queue_state_executing, std::memory_order_release);
 			iris_warp_t** warp_ptr = &get_current_warp_internal();
@@ -510,6 +513,7 @@ namespace iris {
 		template <bool s, bool force>
 		typename std::enable_if<!s>::type execute_internal() noexcept(
 			noexcept(std::declval<iris_warp_t>().flush()) && noexcept(std::declval<function_t>()())) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			// mark for queueing, avoiding flush me more than once.
 			queueing.store(queue_state_executing, std::memory_order_release);
 			iris_warp_t** warp_ptr = &get_current_warp_internal();
@@ -538,6 +542,7 @@ namespace iris {
 
 		template <bool s, bool force>
 		void execute() noexcept(noexcept(std::declval<iris_warp_t>().template execute_internal<s, force>())) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			if (suspend_count.load(std::memory_order_acquire) == 0) {
 				// try to acquire execution, if it fails, there must be another thread doing the same thing
 				// and it's ok to return immediately.
@@ -740,6 +745,7 @@ namespace iris {
 		}
 
 		bool cleanup() noexcept {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			routine_t* p = resurrect_routines.exchange(nullptr, std::memory_order_acquire);
 			if (p != nullptr) {
 				while (p != nullptr) {
@@ -757,6 +763,7 @@ namespace iris {
 		}
 
 		bool resurrect() {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			routine_t* p = resurrect_routines.exchange(nullptr, std::memory_order_acquire);
 			if (p != nullptr) {
 				while (p != nullptr) {
@@ -805,6 +812,7 @@ namespace iris {
 		};
 
 		void complete(bool success) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			// all pending routines finished?
 			if (pending_count.fetch_sub(1, std::memory_order_release) == 1) {
 				// if completion throws exception, we still do not care about pending_count anyway
@@ -815,6 +823,7 @@ namespace iris {
 		}
 
 		void execute(routine_t* routine) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			assert(routine->lock_count.load(std::memory_order_relaxed) == 0);
 			do {
 				routine_guard_t guard(*this, routine, &resurrect_routines);
@@ -926,6 +935,7 @@ namespace iris {
 
 			for (size_t i = 0; i < internal_thread_count; i++) {
 				threads[i] = thread_t([this, i]() {
+					IRIS_PROFILE_THREAD("iris_async_worker", i);
 					thread_loop(i);
 				});
 			}
@@ -1129,6 +1139,7 @@ namespace iris {
 
 		// wait for all threads in worker to be finished.
 		void join() {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			if (!task_heads.empty()) {
 				for (size_t i = 0; i < threads.size(); i++) {
 					if (threads[i].joinable()) {
@@ -1186,6 +1197,7 @@ namespace iris {
 	protected:
 		// cleanup all pending tasks
 		bool cleanup() {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			bool empty = true;
 
 			for (size_t i = 0; i < task_heads.size(); i++) {
@@ -1222,6 +1234,7 @@ namespace iris {
 
 		// poll with given priority
 		bool poll_internal(size_t priority_size) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			std::pair<size_t, size_t> slot = fetch(priority_size);
 			size_t index = slot.first;
 
diff --git a/src/iris_lua.h b/src/iris_lua.h
index d89867a..e423663 100644
--- a/src/iris_lua.h
+++ b/src/iris_lua.h
@@ -84,6 +84,7 @@ namespace iris {
 		// run a piece of code
 		template <typename return_t = void>
 		return_t run(std::string_view code) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			auto guard = write_fence();
 			lua_State* L = state;
 			stack_guard_t stack_guard(L);
@@ -415,6 +416,7 @@ namespace iris {
 		// call function in protect mode
 		template <typename return_t, typename callable_t, typename... args_t>
 		return_t call(callable_t&& reference, args_t&&... args) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			auto guard = write_fence();
 
 			lua_State* L = state;
diff --git a/src/iris_system.h b/src/iris_system.h
index 292ad98..458b1d6 100644
--- a/src/iris_system.h
+++ b/src/iris_system.h
@@ -198,6 +198,7 @@ namespace iris {
 		// iterate components
 		template <typename component_t, typename operation_t>
 		void for_each(operation_t&& op) noexcept(noexcept(std::declval<iris_queue_list_t<component_t, allocator_t>>().for_each(op))) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__); // NOTE(review): the noexcept condition above only inspects the inner for_each(op); confirm any IRIS_PROFILE_SCOPE override cannot throw
 			auto guard = read_fence();
 			std::get<fetch_index<component_t>::value>(components).for_each(op);
 		}
@@ -205,6 +206,7 @@ namespace iris {
 		// n is the expected group size
 		template <typename component_t, typename warp_t, typename operand_t, typename queue_list_t = iris_queue_list_t<component_t, allocator_t>>
 		void for_each_parallel(operand_t&& op, size_t n = queue_list_t::element_count) {
+			IRIS_PROFILE_SCOPE(__FUNCTION__);
 			auto guard = read_fence();
 
 			auto& target_components = std::get<fetch_index<component_t>::value>(components);
@@ -481,11 +483,13 @@ namespace iris {
 
 			template <typename operation_t>
 			void for_each(operation_t&& op) {
+				IRIS_PROFILE_SCOPE(__FUNCTION__); // profiling hook labelled via __FUNCTION__; NOTE(review): on GCC/Clang this is typically the unqualified name, so the instrumented for_each functions may share one label — confirm
 				for_each_impl(std::forward<operation_t>(op), std::integral_constant<bool, sizeof...(components_t) == 1>());
 			}
 
 			template <typename operation_t>
 			void for_each_system(operation_t&& op) {
+				IRIS_PROFILE_SCOPE(__FUNCTION__); // profiling hook labelled via __FUNCTION__; NOTE(review): on GCC/Clang this is typically the unqualified name, so both instrumented for_each_system functions may share one label — confirm
 				for_each_system_impl(std::forward<operation_t>(op), gen_seq<sizeof...(components_t)>());
 			}
 
@@ -634,11 +638,13 @@ namespace iris {
 
 			template <typename operation_t>
 			void for_each(operation_t&& op) {
+				IRIS_PROFILE_SCOPE(__FUNCTION__); // profiling hook labelled via __FUNCTION__; NOTE(review): on GCC/Clang this is typically the unqualified name, so the instrumented for_each functions may share one label — confirm
 				for_each_impl(std::forward<operation_t>(op), std::integral_constant<bool, sizeof...(components_t) == 1>());
 			}
 
 			template <typename operation_t>
 			void for_each_system(operation_t&& op) {
+				IRIS_PROFILE_SCOPE(__FUNCTION__); // profiling hook labelled via __FUNCTION__; NOTE(review): on GCC/Clang this is typically the unqualified name, so both instrumented for_each_system functions may share one label — confirm
 				for_each_system_impl(std::forward<operation_t>(op), gen_seq<sizeof...(components_t)>());
 			}