Merge pull request #175 from naivisoftware/gpu_compute_apple_fix

GPU workgroup size compute fix macOS
napframework · Apr 7, 2022 · 2c4f4dd · 2c4f4dd
2 parents f4d9034 + 946559e
commit 2c4f4dd
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 12 deletions.
diff --git a/demos/computeflocking/data/shaders/flock.comp b/demos/computeflocking/data/shaders/flock.comp
@@ -27,9 +27,13 @@
 // Workgroup size as specialization constant as per: 
 // https://www.khronos.org/registry/OpenGL/specs/gl/GLSLangSpec.4.60.html#specialization-constant-qualifier
 
-// When a workgroup size specialization constant is detected, NAP automatically overwrites
-// it with the maximum group size of the device on pipeline creation.
+// NAP overwrites the workgroup size specialization constant, when detected and not 0, with the maximum group 
+// size supported by the device on pipeline creation.
 layout(local_size_x_id = 0) in;
+
+// When declaring the workgroup size specialization constant to be higher than 0, NAP will not 
+// override the workgroup size when creating the compute pipeline. Uncommenting this bit and commenting out 
+// the above will let the shader define the workgroup size.
 //layout(local_size_x = 512) in;
 
 struct Boid

diff --git a/modules/naprender/data/shaders/constant.vert b/modules/naprender/data/shaders/constant.vert
@@ -12,9 +12,6 @@ uniform nap
 } mvp;
 
 in vec3	in_Position;
-in vec3 in_UV0;
-
-out vec3 passUVs;
 
 void main(void)
 {

diff --git a/modules/naprender/src/renderservice.cpp b/modules/naprender/src/renderservice.cpp
@@ -1052,9 +1052,16 @@ namespace nap
 				entry.constantID = static_cast<uint>(const_ids[i]);
 				entry.offset = static_cast<uint>(spec_entries.size() * sizeof(uint));
 				entry.size = sizeof(uint);
-
 				spec_entries.emplace_back(std::move(entry));
-				spec_data.emplace_back(computeShader.getWorkGroupSize()[i]);
+				uint32 work_group_size = computeShader.getWorkGroupSize()[i];
+#ifdef __APPLE__
+				// Clamp work group size for Apple to 512, based on maxTotalThreadsPerThreadgroup,
+				// which doesn't necessarily match physical device limits, especially on older devices.
+				// See: https://developer.apple.com/documentation/metal/compute_passes/calculating_threadgroup_and_grid_sizes
+				// And: https://github.com/KhronosGroup/SPIRV-Cross/issues/837
+				work_group_size = math::min<uint32>(work_group_size, 512);
+#endif // __APPLE__
+				spec_data.emplace_back(work_group_size);
 			}
 		}
 

diff --git a/modules/naprender/src/shader.cpp b/modules/naprender/src/shader.cpp
@@ -866,10 +866,6 @@ namespace nap
 		if (!parseShaderVariables(comp_shader_compiler, VK_SHADER_STAGE_COMPUTE_BIT, mUBODeclarations, mSSBODeclarations, mSamplerDeclarations, errorState))
 			return false;
 
-		// Query useful compute info
-		std::array<uint, 3> max_workgroup_size;
-		std::memcpy(max_workgroup_size.data(), &mRenderService->getPhysicalDeviceProperties().limits.maxComputeWorkGroupSize[0], sizeof(max_workgroup_size));
-
 		// Cache workgroup size specialization constants
 		std::array<spirv_cross::SpecializationConstant, 3> spec_constants;
 		comp_shader_compiler.get_work_group_size_specialization_constants(spec_constants[0], spec_constants[1], spec_constants[2]);
@@ -884,7 +880,7 @@ namespace nap
 			{
 				// Overwrite workgroup size with queried maximum supported workgroup size
 				mWorkGroupSizeConstantIds[i] = spec_constants[i].constant_id;
-				mWorkGroupSize[i] = max_workgroup_size[i];
+				mWorkGroupSize[i] = mRenderService->getPhysicalDeviceProperties().limits.maxComputeWorkGroupSize[i];
 			}
 			else
 			{