diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 6ca4d19196874..0b826cc6a781a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -272,7 +272,8 @@ jobs:
           python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
           python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true"
           python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+          python -m pip install --no-cache-dir --no-build-isolation -e . \
+            --config-settings=setup-args="--werror" --config-settings compile-args="--verbose"
           python -m pip list --no-cache-dir
           export PANDAS_CI=1
           python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -310,7 +311,8 @@ jobs:
           . ~/virtualenvs/pandas-dev/bin/activate
           python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1
           python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1
-          python -m pip install --no-cache-dir --no-build-isolation -e . --config-settings=setup-args="--werror"
+          python -m pip install --no-cache-dir --no-build-isolation -e . \
+            --config-settings=setup-args="--werror" --config-settings compile-args="--verbose"
           python -m pip list --no-cache-dir
 
       - name: Run Tests
@@ -383,7 +385,8 @@ jobs:
           python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
           python -m pip install versioneer[toml]
           python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov
-          python -m pip install -ve . --no-build-isolation --no-index --no-deps --config-settings=setup-args="--werror"
+          python -m pip install -ve . --no-build-isolation --no-index --no-deps \
+            --config-settings=setup-args="--werror" --config-settings compile-args="--verbose"
           python -m pip list
 
       - name: Run Tests
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index ccac3d0b50d45..8732d3e075537 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -61,10 +61,8 @@ include "hashtable_func_helper.pxi"
 # map derived hash-map types onto basic hash-map types:
 if np.dtype(np.intp) == np.dtype(np.int64):
     IntpHashTable = Int64HashTable
-    unique_label_indices = _unique_label_indices_int64
 elif np.dtype(np.intp) == np.dtype(np.int32):
     IntpHashTable = Int32HashTable
-    unique_label_indices = _unique_label_indices_int32
 else:
     raise ValueError(np.dtype(np.intp))
 
diff --git a/pandas/_libs/hashtable_cpp.pyx b/pandas/_libs/hashtable_cpp.pyx
new file mode 100644
index 0000000000000..c1d3412d23f99
--- /dev/null
+++ b/pandas/_libs/hashtable_cpp.pyx
@@ -0,0 +1,101 @@
+import cython
+import numpy as np
+
+cimport numpy as cnp
+from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from libcpp.vector cimport vector
+
+
+cdef extern from "<functional>" namespace "std" nogil:
+    cdef cppclass hash[T]:
+        hash()
+        size_t operator()(T)
+
+# TODO: duplicated with khash.pxd
+cdef extern from "pandas/vendored/klib/khash_python.h":
+    ctypedef uint32_t khuint_t
+    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
+
+
+cdef extern from "pandas/vendored/klib/cpp/khash.hpp" namespace "klib" nogil:
+    cdef cppclass KHash[T, Hash, Eq=*, khint_t=*]:
+        T *keys
+        KHash()
+        # The destructor is not declared here; ``del ptr`` on a pointer
+        # obtained from ``new`` still emits a C++ ``delete`` expression.
+        bint exist(khint_t x)
+        T &at(khint_t x)
+        khint_t get(const T &)
+        # TODO: make this khint_t
+        # int resize(khint_t)
+        int resize(uint32_t)
+        khint_t put(const T &, int *)
+        # void del(khint_t x)
+
+
+# TODO: de-duplicate from hashtable.pyx
+cdef khuint_t SIZE_HINT_LIMIT = (1 << 20) + 7
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def unique_label_indices(const cnp.npy_intp[:] labels) -> cnp.ndarray:
+    """
+    Indices of the first occurrences of the unique labels
+    *excluding* -1. equivalent to:
+        np.unique(labels, return_index=True)[1]
+
+    Parameters
+    ----------
+    labels : 1-D memoryview of intp
+
+    Returns
+    -------
+    np.ndarray[intp]
+        First-occurrence positions, ordered by ascending label value.
+
+    Raises
+    ------
+    MemoryError
+        If the hash table fails to allocate its buckets.
+    """
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(labels)
+        KHash[cnp.npy_intp, hash[cnp.npy_intp]] *table = (
+            new KHash[cnp.npy_intp, hash[cnp.npy_intp]]()
+        )
+        cnp.ndarray[cnp.npy_intp, ndim=1] arr
+        vector[cnp.npy_intp] idx = vector[cnp.npy_intp]()
+
+    try:
+        # resize() and put() report allocation failure as a negative status
+        ret = table.resize(min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
+
+        with nogil:
+            for i in range(n):
+                if ret < 0:
+                    break
+                table.put(labels[i], &ret)
+                if ret > 0:
+                    # TODO: pandas has a custom resize operation but we
+                    # rely on C++ stdlib here - how different are they?
+                    idx.push_back(i)
+    finally:
+        # the table was created with ``new`` and must be released on every
+        # path, otherwise each call leaks the whole hash table
+        del table
+
+    if ret < 0:
+        raise MemoryError()
+
+    # TODO: must be a cleaner way to do this?
+    # even arr.data = move(idx.data()) would be better but arr.data is readonly
+    arr = np.empty(idx.size(), dtype=np.intp)
+    if idx.size() > 0:
+        # nothing to copy for an empty result
+        memcpy(arr.data, idx.const_data(), idx.size() * sizeof(cnp.npy_intp))
+    arr = arr[np.asarray(labels)[arr].argsort()]
+
+    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 336af306d410f..963dedbe7ec3b 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -445,51 +445,3 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
         res_mask = np.zeros(j+1, dtype=np.bool_)
         res_mask[j] = True
     return modes[:j + 1], res_mask
-
-
-{{py:
-
-# name, dtype, ttype, c_type
-dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
-          ('Int32', 'int32', 'int32', 'int32_t'), ]
-
-}}
-
-{{for name, dtype, ttype, c_type in dtypes}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
-    """
-    Indices of the first occurrences of the unique labels
-    *excluding* -1. equivalent to:
-        np.unique(labels, return_index=True)[1]
-    """
-    cdef:
-        int ret = 0
-        Py_ssize_t i, n = len(labels)
-        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
-        {{name}}Vector idx = {{name}}Vector()
-        ndarray[{{c_type}}, ndim=1] arr
-        {{name}}VectorData *ud = idx.data
-
-    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
-    with nogil:
-        for i in range(n):
-            kh_put_{{ttype}}(table, labels[i], &ret)
-            if ret != 0:
-                if needs_resize(ud):
-                    with gil:
-                        idx.resize()
-                append_data_{{ttype}}(ud, i)
-
-    kh_destroy_{{ttype}}(table)
-
-    arr = idx.to_array()
-    arr = arr[np.asarray(labels)[arr].argsort()]
-
-    return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
-
-{{endfor}}
diff --git a/pandas/_libs/include/pandas/vendored/klib/cpp/khash.hpp b/pandas/_libs/include/pandas/vendored/klib/cpp/khash.hpp
new file mode 100644
index 0000000000000..465bdbba9bd99
--- /dev/null
+++ b/pandas/_libs/include/pandas/vendored/klib/cpp/khash.hpp
@@ -0,0 +1,237 @@
+#ifndef KHASH_HPP
+#define KHASH_HPP
+
+#include <cstdlib> // for malloc() etc
+#include <cstring> // for memset()
+#include <functional>
+#include <memory>
+
+#include <stdint.h> // for uint32_t
+
+namespace klib {
+
+#ifndef kroundup32 // FIXME: doesn't work for 64-bit integers
+#define kroundup32(x)                                                          \
+  (--(x), (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, (x) |= (x) >> 8,  \
+   (x) |= (x) >> 16, ++(x))
+#endif
+
+// Per-bucket state is packed into `flags`: two bits per bucket, sixteen
+// buckets per uint32_t word. Bit value 2 means "empty", bit value 1 "deleted".
+// NOTE(review): these names look like the ones the C khash.h uses; if both
+// headers reach one translation unit the definitions must agree - confirm.
+#define __ac_isempty(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 2)
+#define __ac_isdel(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 1)
+#define __ac_iseither(flag, i) ((flag[i >> 4] >> ((i & 0xfU) << 1)) & 3)
+#define __ac_set_isdel_false(flag, i)                                          \
+  (flag[i >> 4] &= ~(1ul << ((i & 0xfU) << 1)))
+#define __ac_set_isempty_false(flag, i)                                        \
+  (flag[i >> 4] &= ~(2ul << ((i & 0xfU) << 1)))
+#define __ac_set_isboth_false(flag, i)                                         \
+  (flag[i >> 4] &= ~(3ul << ((i & 0xfU) << 1)))
+#define __ac_set_isdel_true(flag, i) (flag[i >> 4] |= 1ul << ((i & 0xfU) << 1))
+
+// Number of uint32_t flag words needed to cover m buckets.
+#define __ac_fsize(m) ((m) < 16 ? 1 : (m) >> 4)
+
+// Open-addressing hash set of keys of type T.
+//
+// Buffers come from malloc/realloc and keys are moved by plain assignment, so
+// T is presumably meant to be trivially copyable. A bucket index equal to
+// end() is the "not found" / failure sentinel returned by get() and put().
+template <class T, class Hash, class Eq = std::equal_to<T>,
+          typename khint_t = uint32_t>
+class KHash {
+  khint_t n_buckets, count, n_occupied, upper_bound;
+  uint32_t *flags;
+  T *keys;
+
+  // The raw buffers are freed in the destructor, so a member-wise copy would
+  // be freed twice. Copying is disabled (declared private, never defined).
+  KHash(const KHash &);
+  KHash &operator=(const KHash &);
+
+public:
+  KHash()
+      : n_buckets(0), count(0), n_occupied(0), upper_bound(0), flags(NULL),
+        keys(NULL){};
+  ~KHash() {
+    std::free(flags);
+    std::free(keys);
+  };
+  khint_t capacity(void) const { return n_buckets; };
+  khint_t size(void) const { return count; };
+  khint_t begin(void) const { return 0; };
+  khint_t end(void) const { return n_buckets; };
+
+  // True when bucket x holds a live key (neither empty nor deleted).
+  bool exist(khint_t x) const { return !__ac_iseither(flags, x); };
+  T &at(khint_t x) { return keys[x]; };
+
+  // Returns the bucket holding key, or end() when key is absent.
+  khint_t get(const T &key) const {
+    if (n_buckets) {
+      khint_t k, i, last, mask, step = 0;
+      mask = n_buckets - 1;
+      k = Hash()(key);
+      i = k & mask;
+      last = i;
+      while (!__ac_isempty(flags, i) &&
+             (__ac_isdel(flags, i) || !Eq()(keys[i], key))) {
+        i = (i + (++step)) & mask;
+        if (i == last)
+          return n_buckets;
+      }
+      return __ac_iseither(flags, i) ? n_buckets : i;
+    } else
+      return 0; /* n_buckets is 0 here, so this equals end() */
+  };
+
+  // Rehashes into new_n_buckets buckets (rounded up to a power of two, at
+  // least 4). Returns 0 on success, or when the request is too small for the
+  // current count and nothing is done; returns -1 if allocation fails.
+  int resize(khint_t new_n_buckets) {
+    uint32_t *new_flags = 0;
+    khint_t j = 1;
+    {
+      kroundup32(new_n_buckets);
+      if (new_n_buckets < 4)
+        new_n_buckets = 4;
+      if (count >= (new_n_buckets >> 1) + (new_n_buckets >> 2))
+        j = 0; /* requested count is too small */
+      else {   /* hash table count to be changed (shrink or expand); rehash */
+        new_flags = (uint32_t *)std::malloc(__ac_fsize(new_n_buckets) *
+                                            sizeof(uint32_t));
+        if (!new_flags)
+          return -1;
+        std::memset(new_flags, 0xaa,
+                    __ac_fsize(new_n_buckets) * sizeof(uint32_t));
+        if (n_buckets < new_n_buckets) { /* expand */
+          T *new_keys =
+              (T *)std::realloc((void *)keys, new_n_buckets * sizeof(T));
+          if (!new_keys) {
+            std::free(new_flags);
+            return -1;
+          }
+          keys = new_keys;
+        } /* otherwise shrink */
+      }
+    }
+    if (j) { /* rehashing is needed */
+      for (j = 0; j != n_buckets; ++j) {
+        if (__ac_iseither(flags, j) == 0) {
+          T key = keys[j];
+          khint_t new_mask;
+          new_mask = new_n_buckets - 1;
+          __ac_set_isdel_true(flags, j);
+          while (1) { /* kick-out process; sort of like in Cuckoo hashing */
+            khint_t k, i, step = 0;
+            k = Hash()(key);
+            i = k & new_mask;
+            while (!__ac_isempty(new_flags, i))
+              i = (i + (++step)) & new_mask;
+            __ac_set_isempty_false(new_flags, i);
+            if (i < n_buckets && __ac_iseither(flags, i) ==
+                                     0) { /* kick out the existing element */
+              {
+                T tmp = keys[i];
+                keys[i] = key;
+                key = tmp;
+              }
+              __ac_set_isdel_true(
+                  flags, i); /* mark it as deleted in the old hash table */
+            } else {         /* write the element and jump out of the loop */
+              keys[i] = key;
+              break;
+            }
+          }
+        }
+      }
+      if (n_buckets > new_n_buckets) { /* shrink the hash table */
+        T *shrunk =
+            (T *)std::realloc((void *)keys, new_n_buckets * sizeof(T));
+        if (shrunk) /* on failure the old, larger buffer stays valid */
+          keys = shrunk;
+      }
+      std::free(flags); /* free the working space */
+      flags = new_flags;
+      n_buckets = new_n_buckets;
+      n_occupied = count;
+      upper_bound = (n_buckets >> 1) + (n_buckets >> 2);
+    }
+    return 0;
+  };
+
+  // Inserts key unless an equal key is already stored. *ret receives 0 if the
+  // key was present, 1 if it went into an empty bucket, 2 if it reused a
+  // deleted bucket, and -1 if growing the table failed. Returns the bucket
+  // index, or end() on failure.
+  khint_t put(const T &key, int *ret) {
+    khint_t x;
+    if (n_occupied >= upper_bound) { /* update the hash table */
+      if (n_buckets > (count << 1)) {
+        if (resize(n_buckets - 1) < 0) { /* clear "deleted" elements */
+          *ret = -1;
+          return n_buckets;
+        }
+      } else if (resize(n_buckets + 1) < 0) { /* expand the hash table */
+        *ret = -1;
+        return n_buckets;
+      }
+    } /* TODO: to implement automatically shrinking; resize() already support
+         shrinking */
+    {
+      khint_t k, i, site, last, mask = n_buckets - 1, step = 0;
+      x = site = n_buckets;
+      k = Hash()(key);
+      i = k & mask;
+      if (__ac_isempty(flags, i))
+        x = i; /* for speed up */
+      else {
+        last = i;
+        while (!__ac_isempty(flags, i) &&
+               (__ac_isdel(flags, i) || !Eq()(keys[i], key))) {
+          if (__ac_isdel(flags, i))
+            site = i;
+          i = (i + (++step)) & mask;
+          if (i == last) {
+            x = site;
+            break;
+          }
+        }
+        if (x == n_buckets) {
+          if (__ac_isempty(flags, i) && site != n_buckets)
+            x = site;
+          else
+            x = i;
+        }
+      }
+    }
+    if (__ac_isempty(flags, x)) { /* not present at all */
+      keys[x] = key;
+      __ac_set_isboth_false(flags, x);
+      ++count;
+      ++n_occupied;
+      *ret = 1;
+    } else if (__ac_isdel(flags, x)) { /* deleted */
+      keys[x] = key;
+      __ac_set_isboth_false(flags, x);
+      ++count;
+      *ret = 2;
+    } else
+      *ret = 0; /* Don't touch keys[x] if present and not deleted */
+    return x;
+  };
+
+  // Marks bucket x as deleted if it currently holds a live key.
+  void del(khint_t x) {
+    if (x != n_buckets && !__ac_iseither(flags, x)) {
+      __ac_set_isdel_true(flags, x);
+      --count;
+    }
+  };
+};
+
+} // end of namespace klib
+
+#endif
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index c27386743c6e9..ed98e81b2a7bd 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -122,6 +122,24 @@ foreach ext_name, ext_dict : libs_sources
     )
 endforeach
 
+# hashtable_cpp is an exception because it requires cpp compiler.
+# -ffunction-sections / --gc-sections are GNU-toolchain flags; probe for them
+# so that linkers without --gc-sections (e.g. Apple ld, MSVC link) do not fail.
+cpp = meson.get_compiler('cpp')
+hashtable_cpp_args = cpp.get_supported_arguments('-ffunction-sections')
+hashtable_cpp_link_args = cpp.get_supported_link_arguments('-Wl,--gc-sections')
+py.extension_module(
+  'hashtable_cpp',
+  ['hashtable_cpp.pyx'],
+  cpp_args: hashtable_cpp_args,
+  link_args: hashtable_cpp_link_args,
+  cython_args: cython_args,
+  include_directories: [inc_np, inc_pd],
+  subdir: 'pandas/_libs',
+  override_options : ['cython_language=cpp'],
+  install: true
+)
+
 # Basically just __init__.py and the .pyi files
 sources_to_install = [
     '__init__.py',
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index a431842218b3b..5ce6845991da4 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -16,7 +16,7 @@
     hashtable,
     lib,
 )
-from pandas._libs.hashtable import unique_label_indices
+from pandas._libs.hashtable_cpp import unique_label_indices
 
 from pandas.core.dtypes.common import (
     ensure_int64,
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index e54764f9ac4a6..928deed89f057 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -7,7 +7,10 @@
 import numpy as np
 import pytest
 
-from pandas._libs import hashtable as ht
+from pandas._libs import (
+    hashtable as ht,
+    hashtable_cpp as ht_cpp,
+)
 
 import pandas as pd
 import pandas._testing as tm
@@ -665,7 +668,7 @@ def test_modes_with_nans():
 def test_unique_label_indices_intp(writable):
     keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
     keys.flags.writeable = writable
-    result = ht.unique_label_indices(keys)
+    result = ht_cpp.unique_label_indices(keys)
     expected = np.array([0, 1, 5], dtype=np.intp)
     tm.assert_numpy_array_equal(result, expected)
 
@@ -673,13 +676,13 @@ def test_unique_label_indices_intp(writable):
 def test_unique_label_indices():
     a = np.random.default_rng(2).integers(1, 1 << 10, 1 << 15).astype(np.intp)
 
-    left = ht.unique_label_indices(a)
+    left = ht_cpp.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1]
 
     tm.assert_numpy_array_equal(left, right, check_dtype=False)
 
     a[np.random.default_rng(2).choice(len(a), 10)] = -1
-    left = ht.unique_label_indices(a)
+    left = ht_cpp.unique_label_indices(a)
     right = np.unique(a, return_index=True)[1][1:]
     tm.assert_numpy_array_equal(left, right, check_dtype=False)