diff --git a/CHANGELOG.md b/CHANGELOG.md
old mode 100644
new mode 100755
index e156cf8..226f6e7
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.3] - 2025-07-05
+
+### Fixed
+- Fixed issue where wrapper text following JSON blocks was not recognized (#1)
+  - Added dedicated `remove_trailing_wrapper_text/1` function in Layer 1
+  - Now properly removes trailing text after valid JSON structures
+  - Example: `[{"id": 1}]\n1 Volume(s) created` → `[{"id": 1}]`
+
 ## [0.1.2] - 2025-06-08
 
 ### Added
diff --git a/lib/json_remedy/layer1/content_cleaning.ex b/lib/json_remedy/layer1/content_cleaning.ex
old mode 100644
new mode 100755
index b652250..4464e21
--- a/lib/json_remedy/layer1/content_cleaning.ex
+++ b/lib/json_remedy/layer1/content_cleaning.ex
@@ -87,7 +87,10 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
     # Then try to extract from prose/text
     {result, prose_repairs} = extract_from_prose(result)
 
-    all_repairs = existing_repairs ++ html_repairs ++ prose_repairs
+    # Finally, remove any trailing wrapper text after JSON
+    {result, trailing_repairs} = remove_trailing_wrapper_text(result)
+
+    all_repairs = existing_repairs ++ html_repairs ++ prose_repairs ++ trailing_repairs
 
     {result, all_repairs}
   end
@@ -682,6 +685,64 @@ defmodule JsonRemedy.Layer1.ContentCleaning do
     find_balanced_end(rest, open, close, pos + 1, balance, in_string)
   end
 
+  # Strips wrapper text that follows a complete top-level JSON object or array.
+  #
+  # Only acts when the input, ignoring leading whitespace, starts with "{" or "[";
+  # anything else is returned untouched. Returns `{text, repairs}`.
+  defp remove_trailing_wrapper_text(input) do
+    case String.trim_leading(input) do
+      "{" <> _ -> check_and_remove_trailing_text(input, "{", "}")
+      "[" <> _ -> check_and_remove_trailing_text(input, "[", "]")
+      _ -> {input, []}
+    end
+  end
+
+  # Cuts `input` just after the structure opened by the first `open_char`, but
+  # only when something other than whitespace follows that structure.
+  #
+  # Returns `{input, []}` when `find_balanced_end/3` yields `nil` or when only
+  # whitespace trails the structure; otherwise returns the truncated text with
+  # a single repair entry describing the cut.
+  #
+  # NOTE(review): the offset from `find_balanced_end/3` is applied as a grapheme
+  # index via `String.split_at/2` - confirm it is not a byte or codepoint count.
+  defp check_and_remove_trailing_text(input, open_char, close_char) do
+    # One split yields both the structure's offset (prefix length) and the text
+    # to scan (the opener glued back onto the remainder).
+    {json_start, from_json} =
+      case String.split(input, open_char, parts: 2) do
+        [prefix, rest] -> {String.length(prefix), open_char <> rest}
+        _ -> {0, input}
+      end
+
+    case find_balanced_end(from_json, open_char, close_char) do
+      nil ->
+        # No balanced end was found; hand the input back unchanged.
+        {input, []}
+
+      end_pos ->
+        # Shift the offset from `from_json` back into `input`; the `+ 1` steps
+        # past what is presumably the closing delimiter's own index.
+        json_end = json_start + end_pos + 1
+        {json_content, after_json} = String.split_at(input, json_end)
+
+        if String.trim(after_json) == "" do
+          # Only whitespace trails the structure, so there is nothing to strip.
+          {input, []}
+        else
+          repair = %{
+            layer: :content_cleaning,
+            action: "removed trailing wrapper text",
+            position: json_end,
+            original: input,
+            replacement: json_content
+          }
+
+          {json_content, [repair]}
+        end
+    end
+  end
+
   # Helper functions for string detection using direct methods
 
   # Fast check for long text that likely contains JSON content
diff --git a/mix.exs b/mix.exs
old mode 100644
new mode 100755
index e313907..bbb19e6
--- a/mix.exs
+++ b/mix.exs
@@ -1,7 +1,7 @@
 defmodule JsonRemedy.MixProject do
   use Mix.Project
 
-  @version "0.1.1"
+  @version "0.1.3"
   @source_url "https://github.com/nshkrdotcom/json_remedy"
 
   def project do
diff --git a/test/unit/layer1_content_cleaning_test.exs b/test/unit/layer1_content_cleaning_test.exs
old mode 100644
new mode 100755
index 360b4a0..9dc044a
--- a/test/unit/layer1_content_cleaning_test.exs
+++ b/test/unit/layer1_content_cleaning_test.exs
@@ -174,6 +174,37 @@ defmodule JsonRemedy.Layer1.ContentCleaningTest do
         assert length(context.repairs) > 0
       end
     end
+
+    test "extracts json with trailing wrapper text (GitHub issue #1)" do
+      # Reproduces the issue where JSON followed by plain text was not cleaned
+      input = """
+      [
+        {
+          "volumeID": "f3a6ffd2-0111-4235-980c-a5ceec215e93",
+          "name": "km-tst-20",
+          "cloudID": "75b10103873d4a1ba0d52b43159a2842",
+          "size": 1,
+          "storageType": "ssd",
+          "state": "creating",
+          "shareable": false,
+          "bootable": false,
+          "volumePool": "General-Flash-002"
+        }
+      ]
+      1 Volume(s) created
+      """
+
+      {:ok, result, context} = ContentCleaning.process(input, %{repairs: [], options: []})
+
+      # Should extract only the JSON array, removing the trailing text
+      trimmed_result = String.trim(result)
+      assert String.starts_with?(trimmed_result, "[")
+      assert String.ends_with?(trimmed_result, "]")
+      refute String.contains?(result, "1 Volume(s) created")
+
+      # Other cleaning steps may log repairs too, so look for ours anywhere in the list
+      assert Enum.any?(context.repairs, &(&1.action =~ "removed trailing wrapper text"))
+    end
   end
 
   describe "encoding normalization" do