diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2842b63197ff1..e14c92eae0afe 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -194,6 +194,12 @@ Removed Compiler Flags
 
 Attribute Changes in Clang
 --------------------------
+- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
+  ``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
+  attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
+  ``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
+  and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and
+  ``z`` are optional with default value of 1.
 
 Improvements to Clang's diagnostics
 -----------------------------------
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 080340669b60a..63efd85dcd4e5 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2054,6 +2054,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [ExprArgument<"MaxNumWorkGroupsX">, ExprArgument<"MaxNumWorkGroupsY", 1>, ExprArgument<"MaxNumWorkGroupsZ", 1>];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 2c07cd09b0d5b..d61f96ade557d 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2741,6 +2741,33 @@ An error will be given if:
   }];
 }
 
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+This attribute specifies the max number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` or
+``[[clang::amdgpu_max_num_work_groups(<x>, <y>, <z>)]]`` attribute for the
+AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
+definitions and is an optimization hint.
+
+The ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+Each of the three values must be greater than 0 when provided. The ``<x>`` parameter
+is required, while ``<y>`` and ``<z>`` are optional with default value of 1.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 267c79cc057cb..b226851f03038 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3911,6 +3911,16 @@ class Sema final {
   void addAMDGPUWavesPerEUAttr(Decl *D, const AttributeCommonInfo &CI,
                                Expr *Min, Expr *Max);
 
+  /// Create an AMDGPUMaxNumWorkGroupsAttr attribute.
+  AMDGPUMaxNumWorkGroupsAttr *
+  CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI, Expr *XExpr,
+                                   Expr *YExpr, Expr *ZExpr);
+
+  /// addAMDGPUMaxNumWorkGroupsAttr - Adds an amdgpu_max_num_work_groups
+  /// attribute to a particular declaration.
+  void addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                     Expr *XExpr, Expr *YExpr, Expr *ZExpr);
+
   DLLImportAttr *mergeDLLImportAttr(Decl *D, const AttributeCommonInfo &CI);
   DLLExportAttr *mergeDLLExportAttr(Decl *D, const AttributeCommonInfo &CI);
   MSInheritanceAttr *mergeMSInheritanceAttr(Decl *D,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 03ac6b78598fc..44e86c0b40f68 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    // Y and Z dimensions default to 1 if not specified
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ? Attr->getMaxNumWorkGroupsY()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ? Attr->getMaxNumWorkGroupsZ()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+
+    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index c00120b59d396..e3da3e606435f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8079,6 +8079,65 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static bool
+checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr,
+                                     Expr *ZExpr,
+                                     const AMDGPUMaxNumWorkGroupsAttr &Attr) {
+  if (S.DiagnoseUnexpandedParameterPack(XExpr) ||
+      (YExpr && S.DiagnoseUnexpandedParameterPack(YExpr)) ||
+      (ZExpr && S.DiagnoseUnexpandedParameterPack(ZExpr)))
+    return true;
+
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (XExpr->isValueDependent() || (YExpr && YExpr->isValueDependent()) ||
+      (ZExpr && ZExpr->isValueDependent()))
+    return false;
+
+  uint32_t NumWG = 0;
+  Expr *Exprs[3] = {XExpr, YExpr, ZExpr};
+  for (int i = 0; i < 3; i++) {
+    if (Exprs[i]) {
+      if (!checkUInt32Argument(S, Attr, Exprs[i], NumWG, i,
+                               /*StrictlyUnsigned=*/true))
+        return true;
+      if (NumWG == 0) {
+        S.Diag(Attr.getLoc(), diag::err_attribute_argument_is_zero)
+            << &Attr << Exprs[i]->getSourceRange();
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+AMDGPUMaxNumWorkGroupsAttr *
+Sema::CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI,
+                                       Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
+  AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
+
+  if (checkAMDGPUMaxNumWorkGroupsArguments(*this, XExpr, YExpr, ZExpr, TmpAttr))
+    return nullptr;
+
+  return ::new (Context)
+      AMDGPUMaxNumWorkGroupsAttr(Context, CI, XExpr, YExpr, ZExpr);
+}
+
+void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                         Expr *XExpr, Expr *YExpr,
+                                         Expr *ZExpr) {
+  if (auto *Attr = CreateAMDGPUMaxNumWorkGroupsAttr(CI, XExpr, YExpr, ZExpr))
+    D->addAttr(Attr);
+}
+
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  Expr *YExpr = (AL.getNumArgs() > 1) ? AL.getArgAsExpr(1) : nullptr;
+  Expr *ZExpr = (AL.getNumArgs() > 2) ? AL.getArgAsExpr(2) : nullptr;
+  S.addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9183,6 +9242,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 20c2c93ac9c7b..8ef8bfdf2a7b5 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -607,6 +607,29 @@ static void instantiateDependentAMDGPUWavesPerEUAttr(
   S.addAMDGPUWavesPerEUAttr(New, Attr, MinExpr, MaxExpr);
 }
 
+static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const AMDGPUMaxNumWorkGroupsAttr &Attr, Decl *New) {
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
+  if (!ResultX.isUsable())
+    return;
+  ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
+  if (!ResultY.isUsable())
+    return;
+  ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
+  if (!ResultZ.isUsable())
+    return;
+
+  Expr *XExpr = ResultX.getAs<Expr>();
+  Expr *YExpr = ResultY.getAs<Expr>();
+  Expr *ZExpr = ResultZ.getAs<Expr>();
+
+  S.addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
 // This doesn't take any template parameters, but we have a custom action that
 // needs to happen when the kernel itself is instantiated. We need to run the
 // ItaniumMangler to mark the names required to name this kernel.
@@ -792,6 +815,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
                                                *AMDGPUFlatWorkGroupSize, New);
     }
 
+    if (const auto *AMDGPUMaxNumWorkGroups =
+            dyn_cast<AMDGPUMaxNumWorkGroupsAttr>(TmplAttr)) {
+      instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+          *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
+    }
+
     if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
       instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
                                                 New);
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
index a1642421af2c8..11a133fd1351d 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -40,12 +40,45 @@ __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
 __global__ void num_vgpr_64() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]
 }
+__attribute__((amdgpu_max_num_work_groups(32, 4, 2))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32_4_2() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z26max_num_work_groups_32_4_2v() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+}
+__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z22max_num_work_groups_32v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z24max_num_work_groups_32_1v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 4, 2)))
+__global__ void template_a_4_2_max_num_work_groups() {}
+template __global__ void template_a_4_2_max_num_work_groups<32>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z34template_a_4_2_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 2)))
+__global__ void template_32_a_2_max_num_work_groups() {}
+template __global__ void template_32_a_2_max_num_work_groups<4>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_a_2_max_num_work_groupsILj4EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 4, a)))
+__global__ void template_32_4_a_max_num_work_groups() {}
+template __global__ void template_32_4_a_max_num_work_groups<2>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
 
 // Make sure this is silently accepted on other targets.
 // NAMD-NOT: "amdgpu-flat-work-group-size"
 // NAMD-NOT: "amdgpu-waves-per-eu"
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
+// NAMD-NOT: "amdgpu-max-num-work-groups"
 
 // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
@@ -53,5 +86,7 @@ __global__ void num_vgpr_64() {
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
 // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_4_2]] = {{.*}}"amdgpu-max-num-workgroups"="32,4,2"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}}"amdgpu-max-num-workgroups"="32,1,1"
 
 // NOUB-NOT: "uniform-work-group-size"="true"
diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
index b0dfc97b53b2c..5648bc13458e1 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -139,6 +139,46 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
 // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
 }
 
+__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_8_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_4_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
+kernel void max_num_work_groups_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
 void a_function() {
 // CHECK: define{{.*}} void @a_function() [[A_FUNCTION:#[0-9]+]]
 }
@@ -189,5 +229,12 @@ kernel void default_kernel() {
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
 
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_8_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,8,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,8,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_4_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="4,8,32"
+
 // CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}}
 // CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index ec84ebdc6abe7..318bfb2df2a7a 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,6 +4,7 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
diff --git a/clang/test/SemaCUDA/amdgpu-attrs.cu b/clang/test/SemaCUDA/amdgpu-attrs.cu
index 4811ef796c66b..e04b32d121bc8 100644
--- a/clang/test/SemaCUDA/amdgpu-attrs.cu
+++ b/clang/test/SemaCUDA/amdgpu-attrs.cu
@@ -63,6 +63,16 @@ __global__ void flat_work_group_size_32_64_waves_per_eu_2_num_sgpr_32_num_vgpr_6
 __attribute__((amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
 __global__ void flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
 
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
+
+
 // expected-error@+2{{attribute 'reqd_work_group_size' can only be applied to an OpenCL kernel function}}
 __attribute__((reqd_work_group_size(32, 64, 64)))
 __global__ void reqd_work_group_size_32_64_64() {}
@@ -194,3 +204,125 @@ __global__ void non_cexpr_waves_per_eu_2() {}
 // expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
 __attribute__((amdgpu_waves_per_eu(2, ipow2(2))))
 __global__ void non_cexpr_waves_per_eu_2_4() {}
+
+__attribute__((amdgpu_max_num_work_groups(32)))
+__global__ void max_num_work_groups_32() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1)))
+__global__ void max_num_work_groups_32_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes no more than 3 arguments}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes at least 1 argument}}
+__attribute__((amdgpu_max_num_work_groups()))
+__global__ void max_num_work_groups_no_arg() {}
+
+// expected-error@+1{{expected expression}}
+__attribute__((amdgpu_max_num_work_groups(,1,1)))
+__global__ void max_num_work_groups_empty_1_1() {}
+
+// expected-error@+1{{expected expression}}
+__attribute__((amdgpu_max_num_work_groups(32,,1)))
+__global__ void max_num_work_groups_32_empty_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, "1", 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(-32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, -1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, -1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(0, 1, 1)))
+__global__ void max_num_work_groups_0_1_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 0, 1)))
+__global__ void max_num_work_groups_32_0_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 0)))
+__global__ void max_num_work_groups_32_1_0() {}
+
+__attribute__((amdgpu_max_num_work_groups(4294967295)))
+__global__ void max_num_work_groups_max_unsigned_int() {}
+
+// expected-error@+1{{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
+__attribute__((amdgpu_max_num_work_groups(4294967296)))
+__global__ void max_num_work_groups_max_unsigned_int_plus1() {}
+
+// expected-error@+1{{integer constant expression evaluates to value 10000000000 that cannot be represented in a 32-bit unsigned integer type}}
+__attribute__((amdgpu_max_num_work_groups(10000000000)))
+__global__ void max_num_work_groups_too_large() {}
+
+int num_wg_x = 32;
+int num_wg_y = 1;
+int num_wg_z = 1;
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg2() {}
+
+const int c_num_wg_x = 32;
+__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_const_arg0() {}
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 1, 1)))
+__global__ void template_a_1_1_max_num_work_groups() {}
+template __global__ void template_a_1_1_max_num_work_groups<32>();
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 1)))
+__global__ void template_32_a_1_max_num_work_groups() {}
+template __global__ void template_32_a_1_max_num_work_groups<1>();
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 1, a)))
+__global__ void template_32_1_a_max_num_work_groups() {}
+template __global__ void template_32_1_a_max_num_work_groups<1>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(b, 1, 1)))
+__global__ void template_b_1_1_max_num_work_groups() {}
+template __global__ void template_b_1_1_max_num_work_groups<0>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, b, 1)))
+__global__ void template_32_b_1_max_num_work_groups() {}
+template __global__ void template_32_b_1_max_num_work_groups<0>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, 1, b)))
+__global__ void template_32_1_b_max_num_work_groups() {}
+template __global__ void template_32_1_b_max_num_work_groups<0>();
+
+
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index f5f37d9e8a3b0..99d7a482710f5 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1442,6 +1442,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                              the frame. This is an internal detail of how LDS variables are lowered,
                                              language front ends should not set this attribute.
 
+     "amdgpu-max-num-workgroups"="x,y,z"     Specify the maximum number of work groups for the kernel dispatch in the
+                                             X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
+                                             CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
+                                             the three numbers are >= 1.
+
      ======================================= ==========================================================
 
 Calling Conventions
@@ -3917,6 +3922,11 @@ same *vendor-name*.
 
                                                                   If omitted, "normal" is
                                                                   assumed.
+     ".max_num_work_groups_{x,y,z}"      integer                  The max number of
+                                                                  launched work-groups
+                                                                  in the X, Y, and Z
+                                                                  dimensions. Each number
+                                                                  must be >=1.
      =================================== ============== ========= ================================
 
 ..
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index c20fdd51607a5..9e288ab50e170 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index bcc7dedf32296..fa77b94fc22de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -432,7 +432,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
 
   // If minimum/maximum flat work group sizes were explicitly requested using
-  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
+  // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
   // number of waves per execution unit to values implied by requested
   // minimum/maximum flat work group sizes.
   unsigned MinImpliedByFlatWorkGroupSize =
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+SmallVector<unsigned>
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a..e2d8b5d1ce979 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 52d6fe6c7ba51..2569f40fec0e4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0336ec4985ea7..7d0c1ba8448e6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
+
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index edb0e50da2896..aa47dccf2dd28 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -1298,6 +1299,42 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size) {
+  assert(Size > 2);
+  SmallVector<unsigned> Default(Size, 0);
+
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  SmallVector<unsigned> Vals(Size, 0);
+
+  LLVMContext &Ctx = F.getContext();
+
+  StringRef S = A.getValueAsString();
+  unsigned i = 0;
+  for (; !S.empty() && i < Size; i++) {
+    std::pair<StringRef, StringRef> Strs = S.split(',');
+    unsigned IntVal;
+    if (Strs.first.trim().getAsInteger(0, IntVal)) {
+      Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
+                    Name);
+      return Default;
+    }
+    Vals[i] = IntVal;
+    S = Strs.second;
+  }
+
+  if (!S.empty() || i < Size) {
+    Ctx.emitError("attribute " + Name +
+                  " has incorrect number of integers; expected " +
+                  llvm::utostr(Size));
+    return Default;
+  }
+  return Vals;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d7ea2a3eff4b7..f8521cba077c6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -863,6 +863,14 @@ bool isReadOnlySegment(const GlobalValue *GV);
 /// target triple \p TT, false otherwise.
 bool shouldEmitConstantsToTextSection(const Triple &TT);
 
+/// \returns Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
@@ -877,6 +885,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
                         std::pair<unsigned, unsigned> Default,
                         bool OnlyFirstRequired = false);
 
+/// \returns Generate a vector of integer values requested using \p F's \p Name
+/// attribute.
+///
+/// \returns true if exactly Size (>2) number of integers are found in the
+/// attribute.
+///
+/// \returns false if any error occurs.
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size);
+
 /// Represents the counter values to wait for in an s_waitcnt instruction.
 ///
 /// Large values (including the maximum possible integer) can be used to
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll
new file mode 100644
index 0000000000000..bc58222076ac0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_x0:
+define amdgpu_kernel void @empty_max_num_workgroups_x0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-max-num-workgroups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_y0:
+define amdgpu_kernel void @empty_max_num_workgroups_y0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-max-num-workgroups"="1,0,3"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_z0:
+define amdgpu_kernel void @empty_max_num_workgroups_z0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-max-num-workgroups"="1,2,0"}
+
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_1_2_3:
+define amdgpu_kernel void @empty_max_num_workgroups_1_2_3() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-max-num-workgroups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_workgroups_1024_1024_1024() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"}
+
+
+; CHECK: .amdgpu_metadata
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_workgroups_x: 1
+; CHECK-NEXT:   .max_num_workgroups_y: 2
+; CHECK-NEXT:   .max_num_workgroups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_1_2_3
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_workgroups_x: 1024
+; CHECK-NEXT:   .max_num_workgroups_y: 1024
+; CHECK-NEXT:   .max_num_workgroups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_1024_1024_1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups_error_check.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups_error_check.ll
new file mode 100644
index 0000000000000..6d86d2d7c1a34
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups_error_check.ll
@@ -0,0 +1,71 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck --check-prefix=ERROR %s
+
+; ERROR: error: can't parse integer attribute -1 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num1() #21 {
+entry:
+  ret void
+}
+attributes #21 = {"amdgpu-max-num-workgroups"="-1,2,3"}
+
+; ERROR: error: can't parse integer attribute -2 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num2() #22 {
+entry:
+  ret void
+}
+attributes #22 = {"amdgpu-max-num-workgroups"="1,-2,3"}
+
+; ERROR: error: can't parse integer attribute -3 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num3() #23 {
+entry:
+  ret void
+}
+attributes #23 = {"amdgpu-max-num-workgroups"="1,2,-3"}
+
+; ERROR: error: can't parse integer attribute 1.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int1() #31 {
+entry:
+  ret void
+}
+attributes #31 = {"amdgpu-max-num-workgroups"="1.0,2,3"}
+
+; ERROR: error: can't parse integer attribute 2.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int2() #32 {
+entry:
+  ret void
+}
+attributes #32 = {"amdgpu-max-num-workgroups"="1,2.0,3"}
+
+; ERROR: error: can't parse integer attribute 3.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int3() #33 {
+entry:
+  ret void
+}
+attributes #33 = {"amdgpu-max-num-workgroups"="1,2,3.0"}
+
+; ERROR: error: can't parse integer attribute 10000000000 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_too_large() #41 {
+entry:
+  ret void
+}
+attributes #41 = {"amdgpu-max-num-workgroups"="10000000000,2,3"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_1_arg() #51 {
+entry:
+  ret void
+}
+attributes #51 = {"amdgpu-max-num-workgroups"="1"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_2_args() #52 {
+entry:
+  ret void
+}
+attributes #52 = {"amdgpu-max-num-workgroups"="1,2"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_4_args() #53 {
+entry:
+  ret void
+}
+attributes #53 = {"amdgpu-max-num-workgroups"="1,2,3,4"}