diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 3a1752f5e3c09..502ec5a6b80b5 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -88,7 +88,6 @@ else()
     kmp_dispatch.cpp
     kmp_lock.cpp
     kmp_sched.cpp
-    kmp_collapse.cpp
   )
   if(WIN32)
     # Windows specific files
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index f740f29346ae2..1926683645045 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -401,9 +401,6 @@ kmpc_set_disp_num_buffers                   267
         __kmpc_sections_init                289
         __kmpc_next_section                 290
         __kmpc_end_sections                 291
-        __kmpc_process_loop_nest_rectang    293
-        __kmpc_calc_original_ivs_rectang    295
-        __kmpc_for_collapsed_init           296
 %endif
 
 # User API entry points that have both lower- and upper- case versions for Fortran.
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
deleted file mode 100644
index cce30c36a84d3..0000000000000
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ /dev/null
@@ -1,1466 +0,0 @@
-/*
- * kmp_collapse.cpp -- loop collapse feature
- */
-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "kmp.h"
-#include "kmp_error.h"
-#include "kmp_i18n.h"
-#include "kmp_itt.h"
-#include "kmp_stats.h"
-#include "kmp_str.h"
-#include "kmp_collapse.h"
-
-#if OMPT_SUPPORT
-#include "ompt-specific.h"
-#endif
-
-// OMPTODO: different style of comments (see kmp_sched)
-// OMPTODO: OMPT/OMPD
-
-//----------------------------------------------------------------------------
-// Common functions for working with rectangular and non-rectangular loops
-//----------------------------------------------------------------------------
-
-template <typename T> int sign(T val) { return (T(0) < val) - (val < T(0)); }
-
-//----------Loop canonicalization---------------------------------------------
-
-// For loop nest (any shape):
-// convert != to < or >;
-// switch from using < or > to <= or >=.
-// "bounds" array has to be allocated per thread.
-// All other internal functions will work only with canonicalized loops.
-template <typename T>
-void kmp_canonicalize_one_loop_XX(ident_t *loc,
-                              /*in/out*/ bounds_infoXX_template<T> *bounds) {
-
-  if (__kmp_env_consistency_check) {
-    if (bounds->step == 0) {
-      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
-                            loc);
-    }
-  }
-
-  if (bounds->comparison == comparison_t::comp_not_eq) {
-    // We can convert this to < or >, depends on the sign of the step:
-    if (bounds->step > 0) {
-      bounds->comparison = comparison_t::comp_less;
-    } else {
-      bounds->comparison = comparison_t::comp_greater;
-    }
-  }
-
-  if (bounds->comparison == comparison_t::comp_less) {
-    // Note: ub0 can be unsigned. Should be Ok to hit overflow here,
-    // because ub0 + ub1*j should be still positive (otherwise loop was not
-    // well formed)
-    bounds->ub0 -= 1;
-    bounds->comparison = comparison_t::comp_less_or_eq;
-  } else if (bounds->comparison == comparison_t::comp_greater) {
-    bounds->ub0 += 1;
-    bounds->comparison = comparison_t::comp_greater_or_eq;
-  }
-}
-
-// Canonicalize loop nest. original_bounds_nest is an array of length n.
-void kmp_canonicalize_loop_nest(ident_t *loc,
-                            /*in/out*/ bounds_info_t *original_bounds_nest,
-                            kmp_index_t n) {
-
-  for (kmp_index_t ind = 0; ind < n; ++ind) {
-    auto bounds = &(original_bounds_nest[ind]);
-
-    switch (bounds->loop_type) {
-    case loop_type_t::loop_type_int32:
-      kmp_canonicalize_one_loop_XX<kmp_int32>(
-          loc,
-          /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
-      break;
-    case loop_type_t::loop_type_uint32:
-      kmp_canonicalize_one_loop_XX<kmp_uint32>(
-          loc,
-          /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
-      break;
-    case loop_type_t::loop_type_int64:
-      kmp_canonicalize_one_loop_XX<kmp_int64>(
-          loc,
-          /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
-      break;
-    case loop_type_t::loop_type_uint64:
-      kmp_canonicalize_one_loop_XX<kmp_uint64>(
-          loc,
-          /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
-      break;
-    default:
-      KMP_ASSERT(false);
-    }
-  }
-}
-
-//----------Calculating trip count on one level-------------------------------
-
-// Calculate trip count on this loop level.
-// We do this either for a rectangular loop nest,
-// or after an adjustment bringing the loops to a parallelepiped shape.
-// This number should not depend on the value of outer IV
-// even if the formular has lb1 and ub1.
-// Note: for non-rectangular loops don't use span for this, it's too big.
-
-template <typename T>
-kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
-    /*in/out*/ bounds_infoXX_template<T> *bounds) {
-
-  if (bounds->comparison == comparison_t::comp_less_or_eq) {
-    if (bounds->ub0 < bounds->lb0) {
-      // Note: after this we don't need to calculate inner loops,
-      // but that should be an edge case:
-      bounds->trip_count = 0;
-    } else {
-      // ub - lb may exceed signed type range; we need to cast to
-      // kmp_loop_nest_iv_t anyway
-      bounds->trip_count =
-          static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
-              std::abs(bounds->step) +
-          1;
-    }
-  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
-    if (bounds->lb0 < bounds->ub0) {
-      // Note: after this we don't need to calculate inner loops,
-      // but that should be an edge case:
-      bounds->trip_count = 0;
-    } else {
-      // lb - ub may exceed signed type range; we need to cast to
-      // kmp_loop_nest_iv_t anyway
-      bounds->trip_count =
-          static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
-              std::abs(bounds->step) +
-          1;
-    }
-  } else {
-    KMP_ASSERT(false);
-  }
-  return bounds->trip_count;
-}
-
-// Calculate trip count on this loop level.
-kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
-
-  kmp_loop_nest_iv_t trip_count = 0;
-
-  switch (bounds->loop_type) {
-  case loop_type_t::loop_type_int32:
-    trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
-        /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
-    break;
-  case loop_type_t::loop_type_uint32:
-    trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
-        /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
-    break;
-  case loop_type_t::loop_type_int64:
-    trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
-        /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
-    break;
-  case loop_type_t::loop_type_uint64:
-    trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
-        /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
-    break;
-  default:
-    KMP_ASSERT(false);
-  }
-
-  return trip_count;
-}
-
-//----------Trim original iv according to its type----------------------------
-
-// Trim original iv according to its type.
-// Return kmp_uint64 value which can be easily used in all internal calculations
-// And can be statically cast back to original type in user code.
-kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
-  kmp_uint64 res = 0;
-
-  switch (loop_iv_type) {
-  case loop_type_t::loop_type_int8:
-    res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
-    break;
-  case loop_type_t::loop_type_uint8:
-    res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
-    break;
-  case loop_type_t::loop_type_int16:
-    res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
-    break;
-  case loop_type_t::loop_type_uint16:
-    res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
-    break;
-  case loop_type_t::loop_type_int32:
-    res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
-    break;
-  case loop_type_t::loop_type_uint32:
-    res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
-    break;
-  case loop_type_t::loop_type_int64:
-    res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
-    break;
-  case loop_type_t::loop_type_uint64:
-    res = static_cast<kmp_uint64>(original_iv);
-    break;
-  default:
-    KMP_ASSERT(false);
-  }
-
-  return res;
-}
-
-//----------Compare two IVs (remember they have a type)-----------------------
-
-bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
-                kmp_uint64 original_iv2) {
-  bool res = false;
-
-  switch (loop_iv_type) {
-  case loop_type_t::loop_type_int8:
-    res = static_cast<kmp_int8>(original_iv1) ==
-          static_cast<kmp_int8>(original_iv2);
-    break;
-  case loop_type_t::loop_type_uint8:
-    res = static_cast<kmp_uint8>(original_iv1) ==
-          static_cast<kmp_uint8>(original_iv2);
-    break;
-  case loop_type_t::loop_type_int16:
-    res = static_cast<kmp_int16>(original_iv1) ==
-          static_cast<kmp_int16>(original_iv2);
-    break;
-  case loop_type_t::loop_type_uint16:
-    res = static_cast<kmp_uint16>(original_iv1) ==
-          static_cast<kmp_uint16>(original_iv2);
-    break;
-  case loop_type_t::loop_type_int32:
-    res = static_cast<kmp_int32>(original_iv1) ==
-          static_cast<kmp_int32>(original_iv2);
-    break;
-  case loop_type_t::loop_type_uint32:
-    res = static_cast<kmp_uint32>(original_iv1) ==
-          static_cast<kmp_uint32>(original_iv2);
-    break;
-  case loop_type_t::loop_type_int64:
-    res = static_cast<kmp_int64>(original_iv1) ==
-          static_cast<kmp_int64>(original_iv2);
-    break;
-  case loop_type_t::loop_type_uint64:
-    res = static_cast<kmp_uint64>(original_iv1) ==
-          static_cast<kmp_uint64>(original_iv2);
-    break;
-  default:
-    KMP_ASSERT(false);
-  }
-
-  return res;
-}
-
-//----------Calculate original iv on one level--------------------------------
-
-// Return true if the point fits into upper bounds on this level,
-// false otherwise
-template <typename T>
-bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
-                                 const kmp_point_t &original_ivs,
-                                 kmp_index_t ind) {
-
-  T iv = static_cast<T>(original_ivs[ind]);
-  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
-
-  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
-       (iv >
-        (bounds->ub0 +
-         bounds->ub1 * outer_iv))) ||
-      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
-       (iv <
-        (bounds->ub0 + bounds->ub1 * outer_iv)))) {
-    // The calculated point is outside of loop upper boundary:
-    return false;
-  }
-
-  return true;
-}
-
-// Calculate one iv corresponding to iteration on the level ind.
-// Return true if it fits into lower-upper bounds on this level
-// (if not, we need to re-calculate)
-template <typename T>
-bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
-                        /*in/out*/ kmp_point_t &original_ivs,
-                        const kmp_iterations_t &iterations, kmp_index_t ind,
-                        bool start_with_lower_bound, bool checkBounds) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  kmp_uint64 temp = 0;
-  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
-
-  if (start_with_lower_bound) {
-    // we moved to the next iteration on one of outer loops, should start
-    // with the lower bound here:
-    temp = bounds->lb0 + bounds->lb1 * outer_iv;
-  } else {
-    auto iteration = iterations[ind];
-    temp = bounds->lb0 + bounds->lb1 * outer_iv +
-           iteration * bounds->step;
-  }
-
-  // Now trim original iv according to its type:
-  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
-
-  if (checkBounds) {
-    return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
-  } else {
-    return true;
-  }
-}
-
-bool kmp_calc_one_iv(const bounds_info_t *bounds,
-                     /*in/out*/ kmp_point_t &original_ivs,
-                     const kmp_iterations_t &iterations, kmp_index_t ind,
-                     bool start_with_lower_bound, bool checkBounds) {
-
-  switch (bounds->loop_type) {
-  case loop_type_t::loop_type_int32:
-    return kmp_calc_one_iv_XX<kmp_int32>(
-        (bounds_infoXX_template<kmp_int32> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
-        checkBounds);
-    break;
-  case loop_type_t::loop_type_uint32:
-    return kmp_calc_one_iv_XX<kmp_uint32>(
-        (bounds_infoXX_template<kmp_uint32> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
-        checkBounds);
-    break;
-  case loop_type_t::loop_type_int64:
-    return kmp_calc_one_iv_XX<kmp_int64>(
-        (bounds_infoXX_template<kmp_int64> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
-        checkBounds);
-    break;
-  case loop_type_t::loop_type_uint64:
-    return kmp_calc_one_iv_XX<kmp_uint64>(
-        (bounds_infoXX_template<kmp_uint64> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
-        checkBounds);
-    break;
-  default:
-    KMP_ASSERT(false);
-    return false;
-  }
-}
-
-//----------Calculate original iv on one level for rectangular loop nest------
-// Main difference with the common case: original_ivs is an array supplied by
-// user
-
-// Calculate one iv corresponding to iteration on the level ind.
-// Return true if it fits into lower-upper bounds on this level
-// (if not, we need to re-calculate)
-template <typename T>
-void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
-                                /*in/out*/ kmp_uint64 *original_ivs,
-                                const kmp_iterations_t &iterations,
-                                kmp_index_t ind) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  auto iteration = iterations[ind];
-
-  kmp_uint64 temp =
-      bounds->lb0 +
-      bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
-      iteration * bounds->step;
-
-  // Now trim original iv according to its type:
-  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
-}
-
-void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
-                             /*in/out*/ kmp_uint64 *original_ivs,
-                             const kmp_iterations_t &iterations,
-                             kmp_index_t ind) {
-
-  switch (bounds->loop_type) {
-  case loop_type_t::loop_type_int32:
-    kmp_calc_one_iv_rectang_XX<kmp_int32>(
-        (bounds_infoXX_template<kmp_int32> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind);
-    break;
-  case loop_type_t::loop_type_uint32:
-    kmp_calc_one_iv_rectang_XX<kmp_uint32>(
-        (bounds_infoXX_template<kmp_uint32> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind);
-    break;
-  case loop_type_t::loop_type_int64:
-    kmp_calc_one_iv_rectang_XX<kmp_int64>(
-        (bounds_infoXX_template<kmp_int64> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind);
-    break;
-  case loop_type_t::loop_type_uint64:
-    kmp_calc_one_iv_rectang_XX<kmp_uint64>(
-        (bounds_infoXX_template<kmp_uint64> *)(bounds),
-        /*in/out*/ original_ivs, iterations, ind);
-    break;
-  default:
-    KMP_ASSERT(false);
-  }
-}
-
-//----------------------------------------------------------------------------
-// Rectangular loop nest
-//----------------------------------------------------------------------------
-
-//----------Canonicalize loop nest and calculate trip count-------------------
-
-// Canonicalize loop nest and calculate overall trip count.
-// "bounds_nest" has to be allocated per thread.
-// API will modify original bounds_nest array to bring it to a canonical form
-// (only <= and >=, no !=, <, >). If the original loop nest was already in a
-// canonical form there will be no changes to bounds in bounds_nest array
-// (only trip counts will be calculated).
-// Returns trip count of overall space.
-extern "C" kmp_loop_nest_iv_t
-__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
-                                 /*in/out*/ bounds_info_t *original_bounds_nest,
-                                 kmp_index_t n) {
-
-  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
-
-  kmp_loop_nest_iv_t total = 1;
-
-  for (kmp_index_t ind = 0; ind < n; ++ind) {
-    auto bounds = &(original_bounds_nest[ind]);
-
-    kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
-    total *= trip_count;
-  }
-
-  return total;
-}
-
-//----------Calculate old induction variables---------------------------------
-
-// Calculate old induction variables corresponding to overall new_iv.
-// Note: original IV will be returned as if it had kmp_uint64 type,
-// will have to be converted to original type in user code.
-// Note: trip counts should be already calculated by
-// __kmpc_process_loop_nest_rectang.
-// OMPTODO: special case 2, 3 nested loops: either do different
-// interface without array or possibly template this over n
-extern "C" void
-__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
-                                 const bounds_info_t *original_bounds_nest,
-                                 /*out*/ kmp_uint64 *original_ivs,
-                                 kmp_index_t n) {
-
-  kmp_iterations_t iterations(n);
-
-  // First, calc corresponding iteration in every original loop:
-  for (kmp_index_t ind = n; ind > 0;) {
-    --ind;
-    auto bounds = &(original_bounds_nest[ind]);
-
-    // should be optimized to OPDIVREM:
-    auto temp = new_iv / bounds->trip_count;
-    auto iteration = new_iv % bounds->trip_count;
-    new_iv = temp;
-
-    iterations[ind] = iteration;
-  }
-  KMP_ASSERT(new_iv == 0);
-
-  for (kmp_index_t ind = 0; ind < n; ++ind) {
-    auto bounds = &(original_bounds_nest[ind]);
-
-    kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
-  }
-}
-
-//----------------------------------------------------------------------------
-// Non-rectangular loop nest
-//----------------------------------------------------------------------------
-
-//----------Calculate maximum possible span of iv values on one level---------
-
-// Calculate span for IV on this loop level for "<=" case.
-// Note: it's for <= on this loop nest level, so lower bound should be smallest
-// value, upper bound should be the biggest value. If the loop won't execute,
-// 'smallest' may be bigger than 'biggest', but we'd better not switch them
-// around.
-template <typename T>
-void kmp_calc_span_lessoreq_XX(
-    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
-    /* in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  // typedef typename big_span_t span_t;
-  typedef T span_t;
-
-  auto &bbounds = bounds->b;
-
-  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
-    // This dimention depends on one of previous ones; can't be the outermost
-    // one.
-    bounds_info_internalXX_template<T> *previous =
-        reinterpret_cast<bounds_info_internalXX_template<T> *>(
-            &(bounds_nest[bbounds.outer_iv]));
-
-    // OMPTODO: assert that T is compatible with loop variable type on
-    // 'previous' loop
-
-    {
-      span_t bound_candidate1 =
-          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
-      span_t bound_candidate2 =
-          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
-      if (bound_candidate1 < bound_candidate2) {
-        bounds->span_smallest = bound_candidate1;
-      } else {
-        bounds->span_smallest = bound_candidate2;
-      }
-    }
-
-    {
-      // We can't adjust the upper bound with respect to step, because
-      // lower bound might be off after adjustments
-
-      span_t bound_candidate1 =
-          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
-      span_t bound_candidate2 =
-          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
-      if (bound_candidate1 < bound_candidate2) {
-        bounds->span_biggest = bound_candidate2;
-      } else {
-        bounds->span_biggest = bound_candidate1;
-      }
-    }
-  } else {
-    // Rectangular:
-    bounds->span_smallest = bbounds.lb0;
-    bounds->span_biggest = bbounds.ub0;
-  }
-  if (!bounds->loop_bounds_adjusted) {
-    // Here it's safe to reduce the space to the multiply of step.
-    // OMPTODO: check if the formular is correct.
-    // Also check if it would be safe to do this if we didn't adjust left side.
-    bounds->span_biggest -=
-        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
-  }
-}
-
-// Calculate span for IV on this loop level for ">=" case.
-template <typename T>
-void kmp_calc_span_greateroreq_XX(
-    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
-    /* in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  // typedef typename big_span_t span_t;
-  typedef T span_t;
-
-  auto &bbounds = bounds->b;
-
-  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
-    // This dimention depends on one of previous ones; can't be the outermost
-    // one.
-    bounds_info_internalXX_template<T> *previous =
-        reinterpret_cast<bounds_info_internalXX_template<T> *>(
-            &(bounds_nest[bbounds.outer_iv]));
-
-    // OMPTODO: assert that T is compatible with loop variable type on
-    // 'previous' loop
-
-    {
-      span_t bound_candidate1 =
-          bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
-      span_t bound_candidate2 =
-          bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
-      if (bound_candidate1 >= bound_candidate2) {
-        bounds->span_smallest = bound_candidate1;
-      } else {
-        bounds->span_smallest = bound_candidate2;
-      }
-    }
-
-    {
-      // We can't adjust the upper bound with respect to step, because
-      // lower bound might be off after adjustments
-
-      span_t bound_candidate1 =
-          bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
-      span_t bound_candidate2 =
-          bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
-      if (bound_candidate1 >= bound_candidate2) {
-        bounds->span_biggest = bound_candidate2;
-      } else {
-        bounds->span_biggest = bound_candidate1;
-      }
-    }
-
-  } else {
-    // Rectangular:
-    bounds->span_biggest = bbounds.lb0;
-    bounds->span_smallest = bbounds.ub0;
-  }
-  if (!bounds->loop_bounds_adjusted) {
-    // Here it's safe to reduce the space to the multiply of step.
-    // OMPTODO: check if the formular is correct.
-    // Also check if it would be safe to do this if we didn't adjust left side.
-    bounds->span_biggest -=
-        (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
-  }
-}
-
-// Calculate maximum possible span for IV on this loop level.
-template <typename T>
-void kmp_calc_span_XX(
-    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
-    /* in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
-    kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
-  } else {
-    KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
-    kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
-  }
-}
-
-//----------All initial processing of the loop nest---------------------------
-
-// Calculate new bounds for this loop level.
-// To be able to work with the nest we need to get it to a parallelepiped shape.
-// We need to stay in the original range of values, so that there will be no
-// overflow, for that we'll adjust both upper and lower bounds as needed.
-template <typename T>
-void kmp_calc_new_bounds_XX(
-    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
-    /* in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  auto &bbounds = bounds->b;
-
-  if (bbounds.lb1 == bbounds.ub1) {
-    // Already parallel, no need to adjust:
-    bounds->loop_bounds_adjusted = false;
-  } else {
-    bounds->loop_bounds_adjusted = true;
-
-    T old_lb1 = bbounds.lb1;
-    T old_ub1 = bbounds.ub1;
-
-    if (sign(old_lb1) != sign(old_ub1)) {
-      // With this shape we can adjust to a rectangle:
-      bbounds.lb1 = 0;
-      bbounds.ub1 = 0;
-    } else {
-      // get upper and lower bounds to be parallel
-      // with values in the old range.
-      // Note: std::abs didn't work here.
-      if (((sign(old_lb1) == -1) && (old_lb1 < old_ub1)) ||
-          ((sign(old_lb1) == 1) && (old_lb1 > old_ub1))) {
-        bbounds.lb1 = old_ub1;
-      } else {
-        bbounds.ub1 = old_lb1;
-      }
-    }
-
-    // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
-    // The idea here that for this IV we are now getting the same span
-    // irrespective of the previous IV value.
-    bounds_info_internalXX_template<T> *previous =
-        reinterpret_cast<bounds_info_internalXX_template<T> *>(
-            &bounds_nest[bbounds.outer_iv]);
-
-    if (bbounds.comparison == comparison_t::comp_less_or_eq) {
-      if (old_lb1 < bbounds.lb1) {
-        KMP_ASSERT(old_lb1 < 0);
-        // The length is good on outer_iv biggest number,
-        // can use it to find where to move the lower bound:
-
-        T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
-        bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
-                            // e.g. it was 0?? (same below)
-      } else if (old_lb1 > bbounds.lb1) {
-        // still need to move lower bound:
-        T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
-        bbounds.lb0 += add;
-      }
-
-      if (old_ub1 > bbounds.ub1) {
-        KMP_ASSERT(old_ub1 > 0);
-        // The length is good on outer_iv biggest number,
-        // can use it to find where to move upper bound:
-
-        T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
-        bbounds.ub0 += add;
-      } else if (old_ub1 < bbounds.ub1) {
-        // still need to move upper bound:
-        T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
-        bbounds.ub0 -= sub;
-      }
-    } else {
-      KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
-      if (old_lb1 < bbounds.lb1) {
-        KMP_ASSERT(old_lb1 < 0);
-        T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
-        bbounds.lb0 -= sub;
-      } else if (old_lb1 > bbounds.lb1) {
-        T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
-        bbounds.lb0 += add;
-      }
-
-      if (old_ub1 > bbounds.ub1) {
-        KMP_ASSERT(old_ub1 > 0);
-        T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
-        bbounds.ub0 += add;
-      } else if (old_ub1 < bbounds.ub1) {
-        T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
-        bbounds.ub0 -= sub;
-      }
-    }
-  }
-}
-
-// Do all processing for one canonicalized loop in the nest
-// (assuming that outer loops already were processed):
-template <typename T>
-kmp_loop_nest_iv_t kmp_process_one_loop_XX(
-    /* in/out*/ bounds_info_internalXX_template<T> *bounds,
-    /*in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
-  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
-  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
-}
-
-// Non-rectangular loop nest, canonicalized to use <= or >=.
-// Process loop nest to have a parallelepiped shape,
-// calculate biggest spans for IV's on all levels and calculate overall trip
-// count. "bounds_nest" has to be allocated per thread.
-// Returns overall trip count (for adjusted space).
-kmp_loop_nest_iv_t kmp_process_loop_nest(
-    /*in/out*/ std::vector<bounds_info_internal_t> &bounds_nest) {
-
-  kmp_loop_nest_iv_t total = 1;
-
-  for (kmp_index_t ind = 0; ind < bounds_nest.size(); ++ind) {
-    auto bounds = &(bounds_nest[ind]);
-    kmp_loop_nest_iv_t trip_count = 0;
-
-    switch (bounds->b.loop_type) {
-    case loop_type_t::loop_type_int32:
-      trip_count = kmp_process_one_loop_XX<kmp_int32>(
-          /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
-          /*in/out*/ bounds_nest);
-      break;
-    case loop_type_t::loop_type_uint32:
-      trip_count = kmp_process_one_loop_XX<kmp_uint32>(
-          /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
-          /*in/out*/ bounds_nest);
-      break;
-    case loop_type_t::loop_type_int64:
-      trip_count = kmp_process_one_loop_XX<kmp_int64>(
-          /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
-          /*in/out*/ bounds_nest);
-      break;
-    case loop_type_t::loop_type_uint64:
-      trip_count = kmp_process_one_loop_XX<kmp_uint64>(
-          /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
-          /*in/out*/ bounds_nest);
-      break;
-    default:
-      KMP_ASSERT(false);
-    }
-    total *= trip_count;
-  }
-
-  return total;
-}
-
-//----------Calculate iterations (in the original or updated space)-----------
-
-// Calculate number of iterations in original or updated space resulting in
-// original_ivs[ind] (only on this level, non-negative)
-// (not counting initial iteration)
-template <typename T>
-kmp_loop_nest_iv_t
-kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
-                                 const kmp_point_t &original_ivs,
-                                 kmp_index_t ind) {
-
-  kmp_loop_nest_iv_t iterations = 0;
-
-  if (bounds->comparison == comparison_t::comp_less_or_eq) {
-    iterations =
-        (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
-         bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
-        std::abs(bounds->step);
-  } else {
-    KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
-    iterations = (bounds->lb0 +
-                  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
-                  static_cast<T>(original_ivs[ind])) /
-                 std::abs(bounds->step);
-  }
-
-  return iterations;
-}
-
-// Calculate number of iterations in the original or updated space resulting in
-// original_ivs[ind] (only on this level, non-negative)
-kmp_loop_nest_iv_t
-kmp_calc_number_of_iterations(const bounds_info_t *bounds,
-                              const kmp_point_t &original_ivs,
-                              kmp_index_t ind) {
-
-  switch (bounds->loop_type) {
-  case loop_type_t::loop_type_int32:
-    return kmp_calc_number_of_iterations_XX<kmp_int32>(
-        (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_uint32:
-    return kmp_calc_number_of_iterations_XX<kmp_uint32>(
-        (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_int64:
-    return kmp_calc_number_of_iterations_XX<kmp_int64>(
-        (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_uint64:
-    return kmp_calc_number_of_iterations_XX<kmp_uint64>(
-        (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
-    break;
-  default:
-    KMP_ASSERT(false);
-    return 0;
-  }
-}
-
-//----------Calculate new iv corresponding to original ivs--------------------
-
-// We got a point in the original loop nest.
-// Take updated bounds and calculate what new_iv will correspond to this point.
-// When we are getting original IVs from new_iv, we have to adjust to fit into
-// original loops bounds. Getting new_iv for the adjusted original IVs will help
-// with making more chunks non-empty.
-kmp_loop_nest_iv_t kmp_calc_new_iv_from_original_ivs(
-    const std::vector<bounds_info_internal_t> &bounds_nest,
-    const kmp_point_t &original_ivs) {
-
-  kmp_loop_nest_iv_t new_iv = 0;
-
-  for (kmp_index_t ind = 0; ind < bounds_nest.size(); ++ind) {
-    auto bounds = &(bounds_nest[ind].b);
-
-    new_iv = new_iv * bounds->trip_count +
-             kmp_calc_number_of_iterations(bounds, original_ivs, ind);
-  }
-
-  return new_iv;
-}
-
-//----------Calculate original ivs for provided iterations--------------------
-
-// Calculate original IVs for provided iterations, assuming iterations are
-// calculated in the original space.
-// Loop nest is in canonical form (with <= / >=).
-bool kmp_calc_original_ivs_from_iterations(
-    const bounds_info_t *original_bounds_nest, kmp_index_t n,
-    /*in/out*/ kmp_point_t &original_ivs,
-    /*in/out*/ kmp_iterations_t &iterations, kmp_index_t ind) {
-
-  kmp_index_t lengthened_ind = n;
-
-  for (; ind < n;) {
-    auto bounds = &(original_bounds_nest[ind]);
-    bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
-                                ind, (lengthened_ind < ind), true);
-
-    if (!good) {
-      // The calculated iv value is too big (or too small for >=):
-      if (ind == 0) {
-        // Space is empty:
-        return false;
-      } else {
-        // Go to next iteration on the outer loop:
-        --ind;
-        ++iterations[ind];
-        lengthened_ind = ind;
-        for (kmp_index_t i = ind + 1; i < n; ++i) {
-          iterations[i] = 0;
-        }
-        continue;
-      }
-    }
-    ++ind;
-  }
-
-  return true;
-}
-
-//----------Calculate original ivs for the beginning of the loop nest---------
-
-// Calculate IVs for the beginning of the loop nest.
-// Note: lower bounds of all loops may not work -
-// if on some of the iterations of the outer loops inner loops are empty.
-// Loop nest is in canonical form (with <= / >=).
-bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
-                                     kmp_index_t n,
-                                     /*out*/ kmp_point_t &original_ivs) {
-
-  // Iterations in the original space, multiplied by step:
-  kmp_iterations_t iterations(n);
-
-  for (kmp_index_t ind = n; ind > 0;) {
-    --ind;
-    iterations[ind] = 0;
-  }
-
-  // Now calculate the point:
-  return kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
-                                               /*in/out*/ original_ivs,
-                                               /*in/out*/ iterations, 0);
-}
-
-//----------Calculate next point in the original loop space-------------------
-
-// From current set of original IVs calculate next point.
-// Return false if there is no next point in the loop bounds.
-bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
-                                kmp_index_t n, const kmp_point_t &original_ivs,
-                                /*out*/ kmp_point_t &next_original_ivs) {
-  // Iterations in the original space, multiplied by step (so can be negative):
-  kmp_iterations_t iterations(n);
-
-  // First, calc corresponding iteration in every original loop:
-  for (kmp_index_t ind = 0; ind < n; ++ind) {
-    auto bounds = &(original_bounds_nest[ind]);
-    iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
-  }
-
-  next_original_ivs = original_ivs;
-
-  // Next add one step to the iterations on the inner-most level, and see if we
-  // need to move up the nest:
-  kmp_index_t ind = n - 1;
-  ++iterations[ind];
-
-  return kmp_calc_original_ivs_from_iterations(
-      original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
-}
-
-//----------Calculate chunk end in the original loop space--------------------
-
-// For one level calculate old induction variable corresponding to overall
-// new_iv for the chunk end.
-// Return true if it fits into upper bound on this level
-// (if not, we need to re-calculate)
-template <typename T>
-bool kmp_calc_one_iv_for_chunk_end_XX(
-    const bounds_infoXX_template<T> *bounds,
-    const bounds_infoXX_template<T> *updated_bounds,
-    /*in/out*/ kmp_point_t &original_ivs, const kmp_iterations_t &iterations,
-    kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
-    const kmp_point_t &original_ivs_start) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-    typedef
-      typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
-          big_span_t;
-
-  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
-  T temp = 0;
-
-  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
-
-  if (start_with_lower_bound) {
-    // we moved to the next iteration on one of outer loops, may as well use
-    // the lower bound here:
-    temp = bounds->lb0 + bounds->lb1 * outer_iv;
-  } else {
-    // Start in expanded space, but:
-    // - we need to hit original space lower bound, so need to account for
-    // that
-    // - we have to go into original space, even if that means adding more
-    // iterations than was planned
-    // - we have to go past (or equal to) previous point (which is the chunk
-    // starting point)
-
-    auto iteration = iterations[ind];
-
-    auto step = bounds->step;
-
-    // In case of >= it's negative:
-    auto accountForStep =
-        ((bounds->lb0 + bounds->lb1 * outer_iv) -
-         (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
-        step;
-
-    temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
-           accountForStep + iteration * step;
-
-    if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
-         (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
-        ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
-         (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
-      // Too small (or too big), didn't reach the original lower bound. Use
-      // heuristic:
-      temp = bounds->lb0 + bounds->lb1 * outer_iv +
-             iteration / 2 * step;
-    }
-
-    if (compare_with_start) {
-
-      T start = static_cast<T>(original_ivs_start[ind]);
-
-      temp = kmp_fix_iv(bounds->loop_iv_type, temp);
-
-      // On all previous levels start of the chunk is same as the end, need to
-      // be really careful here:
-      if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
-           (temp < start)) ||
-          ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
-           (temp > start))) {
-        // End of the chunk can't be smaller (for >= bigger) than it's start.
-        // Use heuristic:
-        temp = start + iteration / 4 * step;
-      }
-    }
-  }
-
-  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
-
-  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
-       (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
-      ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
-       (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
-    // Too big (or too small for >=).
-    return false;
-  }
-
-  return true;
-}
-
-// For one level calculate old induction variable corresponding to overall
-// new_iv for the chunk end.
-bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
-                                   const bounds_info_t *updated_bounds,
-                                   /*in/out*/ kmp_point_t &original_ivs,
-                                   const kmp_iterations_t &iterations,
-                                   kmp_index_t ind, bool start_with_lower_bound,
-                                   bool compare_with_start,
-                                   const kmp_point_t &original_ivs_start) {
-
-  switch (bounds->loop_type) {
-  case loop_type_t::loop_type_int32:
-    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
-        (bounds_infoXX_template<kmp_int32> *)(bounds),
-        (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
-        /*in/out*/
-        original_ivs, iterations, ind, start_with_lower_bound,
-        compare_with_start, original_ivs_start);
-    break;
-  case loop_type_t::loop_type_uint32:
-    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
-        (bounds_infoXX_template<kmp_uint32> *)(bounds),
-        (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
-        /*in/out*/
-        original_ivs, iterations, ind, start_with_lower_bound,
-        compare_with_start, original_ivs_start);
-    break;
-  case loop_type_t::loop_type_int64:
-    return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
-        (bounds_infoXX_template<kmp_int64> *)(bounds),
-        (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
-        /*in/out*/
-        original_ivs, iterations, ind, start_with_lower_bound,
-        compare_with_start, original_ivs_start);
-    break;
-  case loop_type_t::loop_type_uint64:
-    return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
-        (bounds_infoXX_template<kmp_uint64> *)(bounds),
-        (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
-        /*in/out*/
-        original_ivs, iterations, ind, start_with_lower_bound,
-        compare_with_start, original_ivs_start);
-    break;
-  default:
-    KMP_ASSERT(false);
-    return false;
-  }
-}
-
-// Calculate old induction variables corresponding to overall new_iv for the
-// chunk end. If due to space extension we are getting old IVs outside of the
-// boundaries, bring them into the boundaries. Need to do this in the runtime,
-// esp. on the lower bounds side. When getting result need to make sure that the
-// new chunk starts at next position to old chunk, not overlaps with it (this is
-// done elsewhere), and need to make sure end of the chunk is further than the
-// beginning of the chunk. We don't need an exact ending point here, just
-// something more-or-less close to the desired chunk length, bigger is fine
-// (smaller would be fine, but we risk going into infinite loop, so do smaller
-// only at the very end of the space). result: false if could not find the
-// ending point in the original loop space. In this case the caller can use
-// original upper bounds as the end of the chunk. Chunk won't be empty, because
-// it'll have at least the starting point, which is by construction in the
-// original space.
-bool kmp_calc_original_ivs_for_chunk_end(
-    const bounds_info_t *original_bounds_nest, kmp_index_t n,
-    const std::vector<bounds_info_internal_t> &updated_bounds_nest,
-    const kmp_point_t &original_ivs_start, kmp_loop_nest_iv_t new_iv,
-    /*out*/ kmp_point_t &original_ivs) {
-
-  // Iterations in the expanded space:
-  kmp_iterations_t iterations(n);
-
-  KMP_DEBUG_ASSERT(updated_bounds_nest.size() == n);
-
-#if defined(KMP_DEBUG)
-  auto new_iv_saved = new_iv;
-#endif
-
-  // First, calc corresponding iteration in every modified loop:
-  for (kmp_index_t ind = n; ind > 0;) {
-    --ind;
-    auto &updated_bounds = updated_bounds_nest[ind];
-
-    // should be optimized to OPDIVREM:
-    auto new_ind = new_iv / updated_bounds.b.trip_count;
-    auto iteration = new_iv % updated_bounds.b.trip_count;
-
-    new_iv = new_ind;
-    iterations[ind] = iteration;
-  }
-  KMP_DEBUG_ASSERT(new_iv == 0);
-
-  kmp_index_t lengthened_ind = n;
-  kmp_index_t equal_ind = -1;
-
-  // Next calculate the point, but in original loop nest.
-  for (kmp_index_t ind = 0; ind < n;) {
-    auto bounds = &(original_bounds_nest[ind]);
-    auto updated_bounds = &(updated_bounds_nest[ind].b);
-
-    bool good = kmp_calc_one_iv_for_chunk_end(
-        bounds, updated_bounds,
-        /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
-        (equal_ind >= ind - 1), original_ivs_start);
-
-    if (!good) {
-      // Too big (or too small for >=).
-      if (ind == 0) {
-        // Need to reduce to the end.
-        return false;
-      } else {
-        // Go to next iteration on outer loop:
-        --ind;
-        ++(iterations[ind]);
-        lengthened_ind = ind;
-        if (equal_ind >= lengthened_ind) {
-          // We've changed the number of iterations here,
-          // can't be same anymore:
-          equal_ind = lengthened_ind - 1;
-        }
-        for (kmp_index_t i = ind + 1; i < n; ++i) {
-          iterations[i] = 0;
-        }
-        continue;
-      }
-    }
-
-    if ((equal_ind == ind - 1) &&
-        (kmp_ivs_eq(bounds->loop_iv_type, 
-        original_ivs[ind], original_ivs_start[ind]))) {
-      equal_ind = ind;
-    } else if ((equal_ind > ind - 1) &&
-               !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
-                            original_ivs_start[ind]))) {
-      equal_ind = ind - 1;
-    }
-    ++ind;
-  }
-
-  return true;
-}
-
-//----------Calculate upper bounds for the last chunk-------------------------
-
-// Calculate one upper bound for the end.
-template <typename T>
-void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
-                            /*in/out*/ kmp_point_t &original_ivs,
-                            kmp_index_t ind) {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  T temp = bounds->ub0 +
-           bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
-
-  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
-}
-
-void kmp_calc_one_iv_end(const bounds_info_t *bounds,
-                         /*in/out*/ kmp_point_t &original_ivs,
-                         kmp_index_t ind) {
-
-  switch (bounds->loop_type) {
-  default:
-    KMP_ASSERT(false);
-    break;
-  case loop_type_t::loop_type_int32:
-    kmp_calc_one_iv_end_XX<kmp_int32>(
-        (bounds_infoXX_template<kmp_int32> *)(bounds),
-        /*in/out*/ original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_uint32:
-    kmp_calc_one_iv_end_XX<kmp_uint32>(
-        (bounds_infoXX_template<kmp_uint32> *)(bounds),
-        /*in/out*/ original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_int64:
-    kmp_calc_one_iv_end_XX<kmp_int64>(
-        (bounds_infoXX_template<kmp_int64> *)(bounds),
-        /*in/out*/ original_ivs, ind);
-    break;
-  case loop_type_t::loop_type_uint64:
-    kmp_calc_one_iv_end_XX<kmp_uint64>(
-        (bounds_infoXX_template<kmp_uint64> *)(bounds),
-        /*in/out*/ original_ivs, ind);
-    break;
-  }
-}
-
-// Calculate upper bounds for the last loop iteration. Just use original upper
-// bounds (adjusted when canonicalized to use <= / >=). No need to check that this
-// point is in the original space (it's likely not)
-void kmp_calc_original_ivs_for_end(
-    const bounds_info_t *const original_bounds_nest, kmp_index_t n,
-    /*out*/ kmp_point_t &original_ivs) {
-  for (kmp_index_t ind = 0; ind < n; ++ind) {
-    auto bounds = &(original_bounds_nest[ind]);
-    kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
-  }
-}
-
-//----------Init API for non-rectangular loops--------------------------------
-
-// Init API for collapsed loops (static, no chunks defined).
-// "bounds_nest" has to be allocated per thread.
-// API will modify original bounds_nest array to bring it to a canonical form
-// (only <= and >=, no !=, <, >). If the original loop nest was already in a
-// canonical form there will be no changes to bounds in bounds_nest array
-// (only trip counts will be calculated). Internally API will expand the space
-// to parallelogram/parallelepiped, calculate total, calculate bounds for the
-// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
-// important on the left side, to hit the lower bounds and not step over), and
-// pick the correct chunk for this thread (so it will calculate chunks up to the
-// needed one). It could be optimized to calculate just this chunk, potentially
-// a bit less well distributed among threads. It is designed to make sure that
-// threads will receive predictable chunks, deterministically (so that next nest
-// of loops with similar characteristics will get exactly same chunks on same
-// threads).
-// Current contract: chunk_bounds_nest has only lb0 and ub0,
-// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
-extern "C" kmp_int32
-__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
-                          /*in/out*/ bounds_info_t *original_bounds_nest,
-                          /*out*/ bounds_info_t *chunk_bounds_nest,
-                          kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
-
-  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
-  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
-
-  if (__kmp_env_consistency_check) {
-    __kmp_push_workshare(gtid, ct_pdo, loc);
-  }
-
-  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
-
-  std::vector<bounds_info_internal_t> updated_bounds_nest(n);
-
-  for (kmp_index_t i = 0; i < n; ++i) {
-    updated_bounds_nest[i].b = original_bounds_nest[i];
-  }
-
-  kmp_loop_nest_iv_t total =
-      kmp_process_loop_nest(/*in/out*/ updated_bounds_nest);
-
-  if (plastiter != NULL) {
-    *plastiter = FALSE;
-  }
-
-  if (total == 0) {
-    // Loop won't execute:
-    return FALSE;
-  }
-
-  // OMPTODO: DISTRIBUTE is not supported yet
-  __kmp_assert_valid_gtid(gtid);
-  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
-
-  kmp_info_t *th = __kmp_threads[gtid];
-  kmp_team_t *team = th->th.th_team;
-  kmp_uint32 nth = team->t.t_nproc; // Number of threads
-
-  KMP_DEBUG_ASSERT(tid < nth);
-
-  kmp_point_t original_ivs_start(n);
-
-  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
-                                       /*out*/ original_ivs_start)) {
-    // Loop won't execute:
-    return FALSE;
-  }
-
-  // Not doing this optimization for one thread:
-  // (1) more to test
-  // (2) without it current contract that chunk_bounds_nest has only lb0 and ub0,
-  // lb1 and ub1 are set to 0 and can be ignored. 
-  //if (nth == 1) {
-  //  // One thread:
-  //  // Copy all info from original_bounds_nest, it'll be good enough.
-
-  //  for (kmp_index_t i = 0; i < n; ++i) {
-  //    chunk_bounds_nest[i] = original_bounds_nest[i];
-  //  }
-
-  //  if (plastiter != NULL) {
-  //    *plastiter = TRUE;
-  //  }
-  //  return TRUE;
-  //}
-
-  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
-      updated_bounds_nest, original_ivs_start);
-
-  bool last_iter = false;
-
-  for (; nth > 0;) {
-    // We could calculate chunk size once, but this is to compensate that the
-    // original space is not parallelepiped and some threads can be left
-    // without work:
-    KMP_DEBUG_ASSERT(total >= new_iv);
-
-    kmp_loop_nest_iv_t total_left = total - new_iv;
-    kmp_loop_nest_iv_t chunk_size = total_left / nth;
-    kmp_loop_nest_iv_t remainder = total_left % nth;
-
-    kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
-
-    if (remainder > 0) {
-      ++curr_chunk_size;
-      --remainder;
-    }
-
-#if defined(KMP_DEBUG)
-    kmp_loop_nest_iv_t new_iv_for_start = new_iv;
-#endif
-
-    if (curr_chunk_size > 1) {
-      new_iv += curr_chunk_size - 1;
-    }
-
-    kmp_point_t original_ivs_end(n);
-    if ((nth == 1) || (new_iv >= total - 1)) {
-      // Do this one till the end - just in case we miscalculated
-      // and either too much is left to process or new_iv is a bit too big:
-      kmp_calc_original_ivs_for_end(original_bounds_nest, n,
-                                    /*out*/ original_ivs_end);
-
-      last_iter = true;
-    } else {
-      // Note: here we make sure it's past (or equal to) the previous point.
-      if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
-                                               updated_bounds_nest,
-                                               original_ivs_start, new_iv,
-                                               /*out*/ original_ivs_end)) {
-        // We could not find the ending point, use the original upper bounds:
-        kmp_calc_original_ivs_for_end(original_bounds_nest, n,
-                                      /*out*/ original_ivs_end);
-
-        last_iter = true;
-      }
-    }
-
-#if defined(KMP_DEBUG)
-    auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
-                                                            original_ivs_end);
-    KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
-#endif
-
-    if (last_iter && (tid != 0)) {
-      // We are done, this was last chunk, but no chunk for current thread was
-      // found:
-      return FALSE;
-    }
-
-    if (tid == 0) {
-      // We found the chunk for this thread, now we need to check if it's the
-      // last chunk or not:
-
-      kmp_point_t original_ivs_next_start(n);
-
-      if (last_iter ||
-          !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
-                                      /*out*/ original_ivs_next_start)) {
-        // no more loop iterations left to process,
-        // this means that currently found chunk is the last chunk:
-        if (plastiter != NULL) {
-          *plastiter = TRUE;
-        }
-      }
-
-      // Fill in chunk bounds:
-      for (kmp_index_t i = 0; i < n; ++i) {
-        chunk_bounds_nest[i] =
-            original_bounds_nest[i]; // To fill in types, etc. - optional
-        chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
-        chunk_bounds_nest[i].lb1_u64 = 0;
-
-        chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
-        chunk_bounds_nest[i].ub1_u64 = 0;
-      }
-
-      return TRUE;
-    }
-
-    --tid;
-    --nth;
-
-    bool next_chunk = kmp_calc_next_original_ivs(
-        original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
-    if (!next_chunk) {
-      // no more loop iterations to process,
-      // the prevoius chunk was the last chunk
-      break;
-    }
-
-    // original_ivs_start is next to previous chunk original_ivs_end,
-    // we need to start new chunk here, so chunks will be one after another
-    // without any gap or overlap:
-    new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
-                                               original_ivs_start);
-  }
-
-  return FALSE;
-}
diff --git a/openmp/runtime/src/kmp_collapse.h b/openmp/runtime/src/kmp_collapse.h
deleted file mode 100644
index abfd88f06d799..0000000000000
--- a/openmp/runtime/src/kmp_collapse.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * kmp_collapse.h -- header for loop collapse feature
- */
-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef KMP_COLLAPSE_H
-#define KMP_COLLAPSE_H
-
-#include <vector>
-#include <type_traits>
-
-// Type of the index into the loop nest structures
-// (with values from 0 to less than n from collapse(n))
-typedef kmp_int32 kmp_index_t;
-
-// Type for combined loop nest space IV:
-typedef kmp_uint64 kmp_loop_nest_iv_t;
-
-// Loop has <, <=, etc. as a comparison:
-enum comparison_t : kmp_int32 {
-  comp_less_or_eq = 0,
-  comp_greater_or_eq = 1,
-  comp_not_eq = 2,
-  comp_less = 3,
-  comp_greater = 4
-};
-
-// Type of loop IV.
-// Type of bounds and step, after usual promotions
-// are a subset of these types (32 & 64 only):
-enum loop_type_t : kmp_int32 {
-  loop_type_uint8 = 0,
-  loop_type_int8 = 1,
-  loop_type_uint16 = 2,
-  loop_type_int16 = 3,
-  loop_type_uint32 = 4,
-  loop_type_int32 = 5,
-  loop_type_uint64 = 6,
-  loop_type_int64 = 7
-};
-
-/*!
- @ingroup WORK_SHARING
- * Describes the structure for rectangular nested loops.
- */
-template <typename T> struct bounds_infoXX_template {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  loop_type_t loop_type; // The differentiator
-  loop_type_t loop_iv_type;
-  comparison_t comparison;
-  // outer_iv should be 0 (or any other less then number of dimentions)
-  // if loop doesn't depend on it (lb1 and ub1 will be 0).
-  // This way we can do multiplication without a check.
-  kmp_index_t outer_iv;
-
-  // unions to keep the size constant:
-  union {
-    T lb0;
-    kmp_uint64 lb0_u64; // real type can be signed
-  };
-
-  union {
-    T lb1;
-    kmp_uint64 lb1_u64; // real type can be signed
-  };
-
-  union {
-    T ub0;
-    kmp_uint64 ub0_u64; // real type can be signed
-  };
-
-  union {
-    T ub1;
-    kmp_uint64 ub1_u64; // real type can be signed
-  };
-
-  union {
-    ST step; // signed even if bounds type is unsigned
-    kmp_int64 step_64; // signed
-  };
-
-  kmp_loop_nest_iv_t trip_count;
-};
-
-/*!
- @ingroup WORK_SHARING
- * Interface struct for rectangular nested loops.
- * Same size as bounds_infoXX_template.
- */
-struct bounds_info_t {
-
-  loop_type_t loop_type; // The differentiator
-  loop_type_t loop_iv_type;
-  comparison_t comparison;
-  // outer_iv should be 0  (or any other less then number of dimentions)
-  // if loop doesn't depend on it (lb1 and ub1 will be 0).
-  // This way we can do multiplication without a check.
-  kmp_index_t outer_iv;
-
-  kmp_uint64 lb0_u64; // real type can be signed
-  kmp_uint64 lb1_u64; // real type can be signed
-  kmp_uint64 ub0_u64; // real type can be signed
-  kmp_uint64 ub1_u64; // real type can be signed
-  kmp_int64 step_64; // signed
-
-  // This is internal, but it's the only internal thing we need
-  // in rectangular case, so let's expose it here:
-  kmp_loop_nest_iv_t trip_count;
-};
-
-//-------------------------------------------------------------------------
-// Additional types for internal representation:
-
-// A point in the loop space, in the original space.
-// It's represented in kmp_uint64, but each dimention is calculated in
-// that loop IV type. Also dimentions have to be converted to those types
-// when used in generated code.
-typedef std::vector<kmp_uint64> kmp_point_t;
-
-// Number of loop iterations on each nesting level to achieve some point,
-// in expanded space or in original space.
-// OMPTODO: move from using iterations to using offsets (iterations multiplied
-// by steps). For those we need to be careful with the types, as step can be
-// negative, but it'll remove multiplications and divisions in several places.
-typedef std::vector<kmp_loop_nest_iv_t> kmp_iterations_t;
-
-// Internal struct with additional info:
-template <typename T> struct bounds_info_internalXX_template {
-
-  typedef typename traits_t<T>::unsigned_t UT;
-  typedef typename traits_t<T>::signed_t ST;
-
-  // OMPTODO: should span have type T or should it better be
-  // kmp_uint64/kmp_int64 depending on T sign? (if kmp_uint64/kmp_int64 than
-  // updated bounds should probably also be kmp_uint64/kmp_int64). I'd like to
-  // use big_span_t, if it can be resolved at compile time.
-  typedef
-      typename std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
-          big_span_t;
-
-  // typedef typename big_span_t span_t;
-  typedef typename T span_t;
-
-  bounds_infoXX_template<T> b; // possibly adjusted bounds
-
-  // Leaving this as a union in case we'll switch to span_t with different sizes
-  // (depending on T)
-  union {
-    // Smallest possible value of iv (may be smaller than actually possible)
-    span_t span_smallest;
-    kmp_uint64 span_smallest_u64;
-  };
-
-  // Leaving this as a union in case we'll switch to span_t with different sizes
-  // (depending on T)
-  union {
-    // Biggest possible value of iv (may be bigger than actually possible)
-    span_t span_biggest;
-    kmp_uint64 span_biggest_u64;
-  };
-
-  // Did we adjust loop bounds (not counting canonicalization)?
-  bool loop_bounds_adjusted;
-};
-
-// Internal struct with additional info:
-struct bounds_info_internal_t {
-
-  bounds_info_t b; // possibly adjusted bounds
-
-  // Smallest possible value of iv (may be smaller than actually possible)
-  kmp_uint64 span_smallest_u64;
-
-  // Biggest possible value of iv (may be bigger than actually possible)
-  kmp_uint64 span_biggest_u64;
-
-  // Did we adjust loop bounds (not counting canonicalization)?
-  bool loop_bounds_adjusted;
-};
-
-//----------APIs for rectangular loop nests--------------------------------
-
-// Canonicalize loop nest and calculate overall trip count.
-// "bounds_nest" has to be allocated per thread.
-// API will modify original bounds_nest array to bring it to a canonical form
-// (only <= and >=, no !=, <, >). If the original loop nest was already in a
-// canonical form there will be no changes to bounds in bounds_nest array
-// (only trip counts will be calculated).
-// Returns trip count of overall space.
-extern "C" kmp_loop_nest_iv_t
-__kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
-                                 /*in/out*/ bounds_info_t *original_bounds_nest,
-                                 kmp_index_t n);
-
-// Calculate old induction variables corresponding to overall new_iv.
-// Note: original IV will be returned as if it had kmp_uint64 type,
-// will have to be converted to original type in user code.
-// Note: trip counts should be already calculated by
-// __kmpc_process_loop_nest_rectang.
-// OMPTODO: special case 2, 3 nested loops - if it'll be possible to inline
-// that into user code.
-extern "C" void
-__kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
-                                 const bounds_info_t *original_bounds_nest,
-                                 /*out*/ kmp_uint64 *original_ivs,
-                                 kmp_index_t n);
-
-//----------Init API for non-rectangular loops--------------------------------
-
-// Init API for collapsed loops (static, no chunks defined).
-// "bounds_nest" has to be allocated per thread.
-// API will modify original bounds_nest array to bring it to a canonical form
-// (only <= and >=, no !=, <, >). If the original loop nest was already in a
-// canonical form there will be no changes to bounds in bounds_nest array
-// (only trip counts will be calculated). Internally API will expand the space
-// to parallelogram/parallelepiped, calculate total, calculate bounds for the
-// chunks in terms of the new IV, re-calc them in terms of old IVs (especially
-// important on the left side, to hit the lower bounds and not step over), and
-// pick the correct chunk for this thread (so it will calculate chunks up to the
-// needed one). It could be optimized to calculate just this chunk, potentially
-// a bit less well distributed among threads. It is designed to make sure that
-// threads will receive predictable chunks, deterministically (so that next nest
-// of loops with similar characteristics will get exactly same chunks on same
-// threads).
-// Current contract: chunk_bounds_nest has only lb0 and ub0,
-// lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
-extern "C" kmp_int32
-__kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
-                          /*in/out*/ bounds_info_t *original_bounds_nest,
-                          /*out*/ bounds_info_t *chunk_bounds_nest,
-                          kmp_index_t n,
-                          /*out*/ kmp_int32 *plastiter);
-
-#endif // KMP_COLLAPSE_H