diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc
index 5426dc405429c..65e59d1a2eb14 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -718,8 +718,9 @@ const DictionaryEncodeOptions* GetDefaultDictionaryEncodeOptions() {
 
 const FunctionDoc dictionary_encode_doc(
     "Dictionary-encode array",
-    ("Return a dictionary-encoded version of the input array."), {"array"},
-    "DictionaryEncodeOptions");
+    ("Return a dictionary-encoded version of the input array.\n"
+     "This function does nothing if the input is already a dictionary array."),
+    {"array"}, "DictionaryEncodeOptions");
 
 // ----------------------------------------------------------------------
 // This function does not use any hashing utilities
@@ -803,9 +804,12 @@ void RegisterVectorHash(FunctionRegistry* registry) {
                                                       GetDefaultDictionaryEncodeOptions());
   AddHashKernels(dict_encode.get(), base, DictEncodeOutput);
 
-  // Calling dictionary_encode on dictionary input not supported, but if it
-  // ends up being needed (or convenience), a kernel could be added to make it
-  // a no-op
+  // Dictionary input is already encoded: forward the input array's data as the output.
+  auto no_op = [](KernelContext*, const ExecSpan& span, ExecResult* out) {
+    out->value = span[0].array.ToArrayData();
+    return Status::OK();
+  };
+  DCHECK_OK(dict_encode->AddKernel({Type::DICTIONARY}, OutputType(FirstType), no_op));
 
   DCHECK_OK(registry->AddFunction(std::move(dict_encode)));
 }
diff --git a/cpp/src/arrow/compute/kernels/vector_hash_test.cc b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
index 7b713362f6feb..c4ec74fbaabca 100644
--- a/cpp/src/arrow/compute/kernels/vector_hash_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_hash_test.cc
@@ -687,6 +687,15 @@ TEST_F(TestHashKernel, DictEncodeIntervalMonth) {
       {0, 0, 1, 0, 2});
 }
 
+TEST_F(TestHashKernel, DictEncodeDictInput) {
+  // Dictionary-encoding a dictionary array is a no-op
+  auto dict_ty = dictionary(int32(), utf8());
+  auto dict = ArrayFromJSON(utf8(), R"(["a", "b", "c"])");
+  auto indices = ArrayFromJSON(int32(), "[0, 1, 2, 0, 1, 2, 0, 1, 2]");
+  auto input = std::make_shared<DictionaryArray>(dict_ty, indices, dict);
+  CheckDictEncode(input, dict, indices);
+}
+
 TEST_F(TestHashKernel, DictionaryUniqueAndValueCounts) {
   auto dict_json = "[10, 20, 30, 40]";
   auto dict = ArrayFromJSON(int64(), dict_json);
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 47af9764150e5..17d003b261dca 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1675,7 +1675,8 @@ Associative transforms
 |                   |       | Temporal, Binary- and String-like |             |       |
 +-------------------+-------+-----------------------------------+-------------+-------+
 
-* \(1) Output is ``Dictionary(Int32, input type)``.
+* \(1) Output is ``Dictionary(Int32, input type)``. It is a no-op if input is
+  already a Dictionary array.
 
 * \(2) Duplicates are removed from the output while the original order is
   maintained.
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 067d96a82113f..7c5a134d330ac 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1781,6 +1781,7 @@ def test_dictionary_decode():
 
     assert array == dictionary_array_decode
     assert array == pc.dictionary_decode(array)
+    assert pc.dictionary_encode(dictionary_array) == dictionary_array
 
 
 def test_cast():