diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index ae7d566920656..027bd3b79a1df 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -60,6 +60,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genThisCluster(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index f67129dfa6730..c560c53033780 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -457,6 +457,10 @@ static constexpr IntrinsicHandler cudaHandlers[]{
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp),
      {},
      /*isElemental=*/false},
+    {"this_cluster",
+     static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisCluster),
+     {},
+     /*isElemental=*/false},
     {"this_grid",
      static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid),
      {},
@@ -1122,6 +1126,44 @@ void CUDAIntrinsicLibrary::genSyncWarp(
   mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
 }
 
+// THIS_CLUSTER
+mlir::Value
+CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType,
+                                     llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 0);
+  auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expepected");
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+  mlir::Type i32Ty = builder.getI32Type();
+
+  // SIZE
+  mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty);
+  auto sizeFieldName = recTy.getTypeList()[1].first;
+  mlir::Type sizeFieldTy = recTy.getTypeList()[1].second;
+  mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext());
+  mlir::Value sizeFieldIndex = fir::FieldIndexOp::create(
+      builder, loc, fieldIndexType, sizeFieldName, recTy,
+      /*typeParams=*/mlir::ValueRange{});
+  mlir::Value sizeCoord = fir::CoordinateOp::create(
+      builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex);
+  fir::StoreOp::create(builder, loc, size, sizeCoord);
+
+  // RANK
+  mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty);
+  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+  rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
+  auto rankFieldName = recTy.getTypeList()[2].first;
+  mlir::Type rankFieldTy = recTy.getTypeList()[2].second;
+  mlir::Value rankFieldIndex = fir::FieldIndexOp::create(
+      builder, loc, fieldIndexType, rankFieldName, recTy,
+      /*typeParams=*/mlir::ValueRange{});
+  mlir::Value rankCoord = fir::CoordinateOp::create(
+      builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex);
+  fir::StoreOp::create(builder, loc, rank, rankCoord);
+
+  return res;
+}
+
 // THIS_GRID
 mlir::Value
 CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType,
diff --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90
index b8875f72f8079..1c89866f9c84a 100644
--- a/flang/module/cooperative_groups.f90
+++ b/flang/module/cooperative_groups.f90
@@ -14,6 +14,12 @@ module cooperative_groups
 
 implicit none
 
+type :: cluster_group
+  type(c_devptr), private :: handle
+  integer(4) :: size
+  integer(4) :: rank
+end type cluster_group
+
 type :: grid_group
   type(c_devptr), private :: handle
   integer(4) :: size
@@ -32,6 +38,13 @@ module cooperative_groups
   integer(4) :: rank
 end type thread_group
 
+interface
+  attributes(device) function this_cluster()
+    import
+    type(cluster_group) :: this_cluster
+  end function
+end interface
+
 interface
   attributes(device) function this_grid()
     import