From d0ea7d0529adcb3a35417814c8bf3965af032c47 Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:05:53 +0200 Subject: [PATCH 1/8] Remove EXAMPLES.md Delete the EXAMPLES.md file which contained extensive integration and usage examples for the `str` library. --- EXAMPLES.md | 492 ---------------------------------------------------- 1 file changed, 492 deletions(-) delete mode 100644 EXAMPLES.md diff --git a/EXAMPLES.md b/EXAMPLES.md deleted file mode 100644 index 626b8ff..0000000 --- a/EXAMPLES.md +++ /dev/null @@ -1,492 +0,0 @@ -# Examples โ€” Integration snippets for `str` - -This file collects short, copy-pasteable examples showing how to integrate -and extend the `str` library from an application. Keep in mind that the -`str` package itself intentionally does not depend on Erlang/OTP; any OTP -interop should live in the *integrating application* (not in `src/str/*`). - -## Core Function Examples - -### Grapheme-Aware Indexing and Search - -```gleam -import str - -pub fn search_examples() { - // Find first occurrence (grapheme-aware!) - let idx = str.index_of("Hello ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World", "World") - // Ok(8) - the emoji is ONE grapheme cluster! - - // Find last occurrence - let last = str.last_index_of("hello hello hello", "hello") - // Ok(12) - - // Check for multiple needles - let has_any = str.contains_any("hello world", ["foo", "world"]) - // True - - let has_all = str.contains_all("hello world", ["hello", "world"]) - // True -} -``` - -### Experimental Search Strategies & Caching - -```gleam -import str -import str/advanced - -pub fn search_strategy_examples() { - // 1) Use the automatic heuristic (experimental) - // The heuristic chooses between a sliding matcher and KMP based on - // pattern/text characteristics. It is opt-in and may choose a - // non-optimal strategy in some cases. 
- let auto = str.index_of_auto("some long text...", "pat") - - // 2) Force a specific strategy: use this when performance is critical - // and you know which algorithm is better for your input shape. - let forced_kmp = str.index_of_strategy("long text...", "pattern", str.Kmp) - let forced_sliding = str.index_of_strategy("short text", "pat", str.Sliding) - - // 3) Caching KMP maps: precompute pattern maps once and reuse them - // across multiple searches to avoid rebuilding prefix tables. - let pattern = "abababab..." - let maps = advanced.build_kmp_maps(pattern) - let pmap = maps.0 - let pimap = maps.1 - - // Reuse maps across many texts - let idx1 = advanced.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) - let occurrences = advanced.kmp_search_all_with_maps("another text...", pmap, pimap) - - // Guidance: prefer explicit strategy or caching in hot loops; use - // `index_of_auto` for convenience and exploratory testing. -} -``` - -> Note: `index_of_auto` is experimental and its behavior depends on tunable -> thresholds in `src/str/config.gleam`. For production-critical paths, -> prefer `index_of_strategy` or precomputing maps via `build_kmp_maps`. 
- -### Grapheme-Aware Length and String Checks - -```gleam -import str - -pub fn length_examples() { - // Grapheme-aware length - // Unlike standard string length, counts grapheme clusters correctly - let len = str.length("Hello") - // 5 - - // Family emoji is a SINGLE grapheme cluster - let emoji_len = str.length("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") - // 1 - - // Flag is also a single grapheme - let flag_len = str.length("๐Ÿ‡ฎ๐Ÿ‡น") - // 1 - - // Combining characters stay attached - let cafe_len = str.length("cafรฉ") - // 4 (even with combining accent) -} - -pub fn contains_examples() { - // Grapheme-aware contains - let found = str.contains("hello world", "world") - // True - - let not_found = str.contains("hello", "x") - // False - - // Works correctly with emoji - let emoji_found = str.contains("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ family", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") - // True -} - -pub fn prefix_suffix_examples() { - // Grapheme-aware starts_with - let starts = str.starts_with("hello", "he") - // True - - // Empty prefix always matches - let empty_prefix = str.starts_with("hello", "") - // True - - // Works with emoji on grapheme boundaries - let emoji_starts = str.starts_with("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆabc", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") - // True - - // Grapheme-aware ends_with - let ends = str.ends_with("hello.txt", ".txt") - // True - - let emoji_ends = str.ends_with("abc๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ", "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ") - // True -} - -pub fn empty_check_examples() { - // is_empty check - let empty = str.is_empty("") - // True - - let not_empty = str.is_empty(" ") - // False (whitespace is not empty) - - // Combine with is_blank for whitespace check - let blank = str.is_blank(" ") - // True -} -``` - -### Replace First/Last Occurrence - -```gleam -import str - -pub fn replace_examples() { - // Replace only first occurrence (stdlib only has replace all) - let text = "hello hello hello" - let first = str.replace_first(text, "hello", "hi") - // "hi hello hello" - - let 
last = str.replace_last(text, "hello", "bye") - // "hello hello bye" -} -``` - -### HTML Escaping for Web Applications - -```gleam -import str - -pub fn html_examples() { - // Escape user input before rendering - let user_input = "" - let safe = str.escape_html(user_input) - // "<script>alert('xss')</script>" - - // Unescape for display - let escaped = "<div>Hello</div>" - let original = str.unescape_html(escaped) - // "
<div>Hello</div>
" -} -``` - -### String Validation - -```gleam -import str - -pub fn validation_examples() { - // Case validation (ignores non-letter characters) - assert str.is_uppercase("HELLO123") == True - assert str.is_lowercase("hello_world") == True - - // Title Case validation - assert str.is_title_case("Hello World") == True - assert str.is_title_case("hello World") == False - assert str.is_title_case("Hello 123 World") == True // numbers ignored - - // ASCII validation - assert str.is_ascii("hello!@#") == True - assert str.is_ascii("cafรฉ") == False - - // Hex validation (useful for color codes, UUIDs, etc.) - assert str.is_hex("DEADBEEF") == True - assert str.is_hex("ff00ff") == True - - // Printable check (no control characters) - assert str.is_printable("hello") == True - assert str.is_printable("hello\n") == False -} -``` - -### String Similarity and Distance - -```gleam -import str - -pub fn similarity_examples() { - // Levenshtein distance (edit operations needed) - let dist = str.distance("kitten", "sitting") - // 3 - - // Similarity as percentage (0.0 to 1.0) - let sim = str.similarity("hello", "hallo") - // 0.8 (80% similar) - - // Hamming distance (same length strings only) - let ham = str.hamming_distance("karolin", "kathrin") - // Ok(3) -} -``` - -### Take/Drop from Right - -```gleam -import str - -pub fn take_drop_examples() { - // Get last N graphemes - let last3 = str.take_right("hello world", 3) - // "rld" - - // Drop last N graphemes - let without_ext = str.drop_right("file.txt", 4) - // "file" - - // Works with emoji too! 
- let emoji_end = str.take_right("Hello ๐Ÿ‘‹๐Ÿฝ", 1) - // "๐Ÿ‘‹๐Ÿฝ" (single grapheme cluster with skin tone) -} -``` - -### Capitalize and Case Manipulation - -```gleam -import str - -pub fn capitalize_examples() { - // Capitalize: first letter uppercase, rest lowercase - let text = str.capitalize("hELLO wORLD") - // "Hello world" - - // Swap case - let swapped = str.swapcase("Hello World") - // "hELLO wORLD" -} -``` - -### Partition and Split - -```gleam -import str - -pub fn partition_examples() { - // Partition from first occurrence - let #(before, sep, after) = str.partition("a-b-c", "-") - // #("a", "-", "b-c") - - // Partition from LAST occurrence - // Note: if not found, returns #("", "", text) like Python - let #(before2, sep2, after2) = str.rpartition("a-b-c", "-") - // #("a-b", "-", "c") - - // Split with max parts limit - let parts = str.splitn("one-two-three-four", "-", 2) - // ["one", "two-three-four"] - - let parts3 = str.splitn("a:b:c:d", ":", 3) - // ["a", "b", "c:d"] -} -``` - -### Padding and Filling - -```gleam -import str - -pub fn padding_examples() { - // Standard padding - let padded = str.pad_left("42", 5, "0") - // "00042" - - // Flexible fill with position type - let left_fill = str.fill("x", 5, "-", str.Left) - // "----x" - - let right_fill = str.fill("x", 5, "-", str.Right) - // "x----" - - let center_fill = str.fill("x", 5, "-", str.Both) - // "--x--" -} -``` - -### Chunking Strings - -```gleam -import str - -pub fn chunk_examples() { - // Split into fixed-size chunks - let chunks = str.chunk("abcdefg", 3) - // ["abc", "def", "g"] - - let pairs = str.chunk("abcdef", 2) - // ["ab", "cd", "ef"] - - // Works with emoji (grapheme-aware!) 
- let emoji_chunks = str.chunk("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆab", 2) - // ["๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆa", "b"] -} -``` - -### Prefix/Suffix Checking - -```gleam -import str - -pub fn prefix_suffix_examples() { - // Check multiple prefixes at once - let is_greeting = str.starts_with_any("hello world", ["hi", "hello", "hey"]) - // True - - // Check multiple suffixes at once - let is_image = str.ends_with_any("photo.png", [".jpg", ".png", ".gif"]) - // True - - let is_code = str.ends_with_any("main.gleam", [".gleam", ".erl", ".ex"]) - // True -} -``` - -### Whitespace Normalization - -```gleam -import str - -pub fn whitespace_examples() { - // Collapse all whitespace to single spaces - let normalized = str.normalize_whitespace(" hello world \n\t test ") - // "hello world test" - - // Great for cleaning user input - let clean = str.normalize_whitespace(" John Doe ") - // "John Doe" -} -``` - -### Text Utilities - -```gleam -import str - -pub fn utility_examples() { - // Reverse word order - let reversed = str.reverse_words("hello beautiful world") - // "world beautiful hello" - - // Extract initials - let init = str.initials("John Fitzgerald Kennedy") - // "JFK" - - // Regex escaping for pattern matching - let pattern = str.escape_regex("hello.world[test]") - // "hello\\.world\\[test\\]" -} -``` - -## OTP-based Unicode Normalization - -### Implementation Location - -Define Unicode normalization helpers in your application code (not in the `str` library). These helpers should implement the `String -> String` signature and can be passed to any `str` function that accepts a normalizer parameter. - -### Example Implementation - -```gleam -// file: src/normalize.gleam (in your app, not in `str`) -pub fn otp_nfd(s: String) -> String { - // Call OTP from your app via Erlang interop. 
Example (conceptual): - // :unicode.characters_to_nfd_binary(s) - s -} - -// Use it when calling into `str`: -let folded = str.ascii_fold_with_normalizer("Crรจme Brรปlรฉe", otp_nfd) -let slug = str.slugify_opts_with_normalizer("Crรจme Brรปlรฉe", 0, "-", False, otp_nfd) -``` - -Notes: -- Put the code above in your application so the `str` package remains - free of OTP as a hard dependency. -- The exact Erlang interop call depends on your project setup and - runtime; the example above is conceptual. - -## 2) Convenience alias `slugify_with_normalizer` - -A short wrapper is available for convenience. Example usage: - -```gleam -// short alias: uses default separator `-` and no token limit -let s = "Cafรฉ โค๏ธ Gleam" -let slug = str.slugify_with_normalizer(s, otp_nfd) -``` - -## 3) No-decompose variants - -If you prefer not to run the library's limited Latin decomposer you can -call the `_no_decompose_` variants and still pass a normalizer: - -```gleam -let folded = str.ascii_fold_no_decompose_with_normalizer(s, otp_nfd) -``` - -This gives you full control over decomposition/normalization order. - -## 4) Testing locally (use `gleam test`) - -The project uses Gleam's test runner. Example commands: - -```sh -# run all tests -gleam test - -# run a single test file (shell navigation) -cd /path/to/project && gleam test -``` - -## 5) Regenerating character tables (docs) - -If you extend `src/str/internal/translit.gleam` or -`src/str/internal/decompose.gleam`, regenerate the JSON used by the -docs: - -```sh -python3 scripts/generate_character_tables.py -``` - -## 6) Example of a small fake normalizer (useful for tests) - -In tests it's handy to simulate NFD/NFC without OTP. 
Example: - -```gleam -let fake_nfd = fn(x) { string.replace(x, "รฉ", "e\u{0301}") } -let slug = str.slugify_opts_with_normalizer("Cafรฉ", 0, "-", False, fake_nfd) -assert slug == "cafe" -``` - -## 7) Where to put NFC/NFD helpers (application-side) - -If you want to provide explicit `nfc`/`nfd` helpers that call OTP, put -them in your application (not in the `str` library). Example (commented): - -```gleam -// file: src/normalize.gleam (in your app) -// pub fn nfd(s: String) -> String { -// // :unicode.characters_to_nfd_binary(s) -// } -// -// pub fn nfc(s: String) -> String { -// // :unicode.characters_to_nfc_binary(s) -// } -``` - -## 8) Tokenization reference - -If you need a pure-Gleam tokenizer for special processing, see -`src/str/internal/tokenize.gleam` which provides a pedagogic reference -implementation. Access the tokenizer via the public API: - -```gleam -import str - -let clusters = str.chars("cafรฉ") -// -> ["c", "a", "f", "รฉ"] - -let stdlib_clusters = str.chars_stdlib("cafรฉ") -// -> ["c", "a", "f", "รฉ"] -``` From 6d6914412fd8e9615117efae0d941dfc7fdd9d0e Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:06:29 +0200 Subject: [PATCH 2/8] Add comprehensive str docs and examples Add documentation for the str module: a full API reference (docs/api_reference.md) detailing functions for case conversion, grapheme handling, search/replace, splitting, padding, validation, escaping, similarity, manipulation, slug generation, and ASCII folding; an examples file (docs/examples.md) with copy-pasteable Gleam snippets and usage guidance (including KMP caching and grapheme-aware behavior); and an OTP integration guide (docs/otp_integration.md) explaining how to provide NFC/NFD normalizers, usage with ascii_fold/slugify, and testing without OTP. These docs link to internal references and explain rationale for keeping OTP normalizers out of the library to preserve JS and pure-Gleam targets. 
--- docs/api_reference.md | 223 ++++++++++++++++++++++++++++++++++++++++ docs/examples.md | 200 +++++++++++++++++++++++++++++++++++ docs/otp_integration.md | 71 +++++++++++++ 3 files changed, 494 insertions(+) create mode 100644 docs/api_reference.md create mode 100644 docs/examples.md create mode 100644 docs/otp_integration.md diff --git a/docs/api_reference.md b/docs/api_reference.md new file mode 100644 index 0000000..7ea5f4b --- /dev/null +++ b/docs/api_reference.md @@ -0,0 +1,223 @@ +# str โ€” API Reference + +Complete function reference for `import str`. For module internals see +[str_core.md](str_core.md) and [str_extra.md](str_extra.md). + +--- + +## Case & Capitalization + +| Function | Example | Result | +|----------|---------|--------| +| `capitalize(text)` | `"hELLO wORLD"` | `"Hello world"` | +| `swapcase(text)` | `"Hello World"` | `"hELLO wORLD"` | +| `is_uppercase(text)` | `"HELLO123"` | `True` | +| `is_lowercase(text)` | `"hello_world"` | `True` | +| `is_title_case(text)` | `"Hello World"` | `True` | +| `is_mixed_case(text)` | `"helloWorld"` | `True` | + +### Case Conversions + +```gleam +str.to_snake_case("Hello World") // โ†’ "hello_world" +str.to_camel_case("hello world") // โ†’ "helloWorld" +str.to_pascal_case("hello world") // โ†’ "HelloWorld" +str.to_kebab_case("Hello World") // โ†’ "hello-world" +str.to_title_case("hello world") // โ†’ "Hello World" +str.camel_to_snake("camelCase") // โ†’ "camel_case" +str.pascal_to_snake("PascalCase") // โ†’ "pascal_case" // alias for camel_to_snake +str.snake_to_camel("snake_case") // โ†’ "snakeCase" +str.snake_to_pascal("snake_case") // โ†’ "SnakeCase" +``` + +--- + +## Grapheme Extraction + +| Function | Example | Result | +|----------|---------|--------| +| `take(text, n)` | `take("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆabc", 2)` | `"๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆa"` | +| `drop(text, n)` | `drop("hello", 2)` | `"llo"` | +| `take_right(text, n)` | `take_right("hello", 3)` | `"llo"` | +| `drop_right(text, n)` | 
`drop_right("hello", 2)` | `"hel"` | +| `at(text, index)` | `at("hello", 1)` | `Ok("e")` | +| `chunk(text, size)` | `chunk("abcdef", 2)` | `["ab", "cd", "ef"]` | +| `length(text)` | `length("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ")` | `1` | +| `reverse(text)` | `reverse("Hello ๐Ÿ‘‹")` | `"๐Ÿ‘‹ olleH"` | + +--- + +## Search & Replace + +| Function | Example | Result | +|----------|---------|--------| +| `index_of(text, needle)` | `"hello world", "world"` | `Ok(6)` | +| `last_index_of(text, needle)` | `"hello hello", "hello"` | `Ok(6)` | +| `contains(text, needle)` | `"hello world", "world"` | `True` | +| `contains_any(text, needles)` | `"hello", ["x", "e"]` | `True` | +| `contains_all(text, needles)` | `"hello", ["h", "e"]` | `True` | +| `replace_first(text, old, new)` | `"aaa", "a", "b"` | `"baa"` | +| `replace_last(text, old, new)` | `"aaa", "a", "b"` | `"aab"` | + +### Search Strategies (experimental) + +Automatic heuristic selection between KMP (long/repetitive patterns) and +sliding window (short patterns, zero allocations). Configure thresholds in +`str/config`. + +| Function | Description | +|----------|-------------| +| `index_of_auto(text, pattern)` | Heuristic algorithm selection | +| `index_of_strategy(text, pattern, Kmp\|Sliding)` | Explicit algorithm | +| `count_auto(text, pattern, overlapping)` | Heuristic for counting | +| `count_strategy(text, pattern, overlapping, Kmp\|Sliding)` | Explicit count | + +For pre-built KMP maps (hot loops), use `import str/advanced`. See +[str_core.md](str_core.md) for details. 
+ +--- + +## Splitting & Partitioning + +| Function | Example | Result | +|----------|---------|--------| +| `partition(text, sep)` | `"a-b-c", "-"` | `#("a", "-", "b-c")` | +| `rpartition(text, sep)` | `"a-b-c", "-"` | `#("a-b", "-", "c")` | +| `splitn(text, sep, n)` | `"a-b-c-d", "-", 2` | `["a", "b-c-d"]` | +| `words(text)` | `"hello world"` | `["hello", "world"]` | +| `lines(text)` | `"a\nb\nc"` | `["a", "b", "c"]` | + +--- + +## Padding & Filling + +| Function | Example | Result | +|----------|---------|--------| +| `pad_left(text, width, pad)` | `"42", 5, "0"` | `"00042"` | +| `pad_right(text, width, pad)` | `"hi", 5, "*"` | `"hi***"` | +| `center(text, width, pad)` | `"hi", 6, "-"` | `"--hi--"` | +| `fill(text, width, pad, pos)` | `"x", 5, "-", Both` | `"--x--"` | + +`FillPosition` values: `Left`, `Right`, `Both`. + +--- + +## Prefix & Suffix + +| Function | Example | Result | +|----------|---------|--------| +| `starts_with(text, prefix)` | `"hello", "he"` | `True` | +| `ends_with(text, suffix)` | `"file.txt", ".txt"` | `True` | +| `starts_with_any(text, list)` | `"hello", ["hi", "he"]` | `True` | +| `ends_with_any(text, list)` | `"file.txt", [".txt"]` | `True` | +| `remove_prefix(text, prefix)` | `"hello world", "hello "` | `"world"` | +| `remove_suffix(text, suffix)` | `"file.txt", ".txt"` | `"file"` | +| `ensure_prefix(text, prefix)` | `"world", "hello "` | `"hello world"` | +| `ensure_suffix(text, suffix)` | `"file", ".txt"` | `"file.txt"` | +| `common_prefix(strings)` | `["abc", "abd"]` | `"ab"` | +| `common_suffix(strings)` | `["abc", "xbc"]` | `"bc"` | + +--- + +## Validation + +| Function | Description | +|----------|-------------| +| `is_empty(text)` | Empty string | +| `is_blank(text)` | Whitespace only | +| `is_numeric(text)` | Digits only (0โ€“9) | +| `is_alpha(text)` | Letters only | +| `is_alphanumeric(text)` | Letters and digits | +| `is_ascii(text)` | ASCII only (0x00โ€“0x7F) | +| `is_printable(text)` | Printable ASCII (0x20โ€“0x7E) | +| 
`is_hex(text)` | Hexadecimal (0โ€“9, aโ€“f, Aโ€“F) | +| `is_uppercase(text)` | All cased chars uppercase | +| `is_lowercase(text)` | All cased chars lowercase | +| `is_title_case(text)` | Title Case format | +| `is_mixed_case(text)` | Both cases present | + +--- + +## Escaping + +| Function | Example | Result | +|----------|---------|--------| +| `escape_html(text)` | `"
<div>"` | `"&lt;div&gt;"` | +| `unescape_html(text)` | `"&lt;div&gt;"` | `"<div>
"` | +| `escape_regex(text)` | `"a.b*c"` | `"a\\.b\\*c"` | + +--- + +## Similarity & Distance + +| Function | Example | Result | +|----------|---------|--------| +| `distance(a, b)` | `"kitten", "sitting"` | `3` | +| `similarity(a, b)` | `"hello", "hallo"` | `0.8` | +| `hamming_distance(a, b)` | `"karolin", "kathrin"` | `Ok(3)` | + +`similarity` uses normalized Levenshtein: `1.0 - distance(a,b) / max(len(a), len(b))`. + +--- + +## Text Manipulation + +| Function | Description | +|----------|-------------| +| `truncate(text, len, suffix)` | Truncate grapheme-aware | +| `truncate_strict(text, len, suffix)` | Truncate โ€” may split emoji | +| `truncate_preserve(text, len, suffix)` | Truncate โ€” keeps emoji whole | +| `ellipsis(text, len)` | Truncate with `โ€ฆ` | +| `normalize_whitespace(text)` | Collapse to single spaces | +| `strip(text, chars)` | Remove char set from ends | +| `squeeze(text, char)` | Collapse consecutive chars | +| `chomp(text)` | Remove trailing newline | +| `surround(text, pre, suf)` | Add prefix and suffix | +| `unwrap(text, pre, suf)` | Remove prefix and suffix | +| `strip_affixes(text, pre, suf)` | Alias for `unwrap` (preferred) | +| `reverse_words(text)` | Reverse word order | +| `initials(text)` | Extract initials | + +`strip(text, chars)` โ€” `chars` is a **set of graphemes**, not a literal substring. + +--- + +## Line Operations + +| Function | Description | +|----------|-------------| +| `lines(text)` | Split into lines | +| `dedent(text)` | Remove common indentation | +| `indent(text, spaces)` | Add indentation | +| `wrap_at(text, width)` | Word wrap | + +--- + +## ASCII Folding + +```gleam +str.ascii_fold("Crรจme Brรปlรฉe") // โ†’ "Creme Brulee" +str.ascii_fold("straรŸe") // โ†’ "strasse" +str.ascii_fold("รฆon") // โ†’ "aeon" +``` + +`ascii_fold_no_decompose` applies only the precomposed replacement table, +skipping NFD decomposition. Use it when the input is already NFC-normalized +and you want to avoid re-decomposing. 
+ +For OTP-based NFC/NFD normalization, see [otp_integration.md](otp_integration.md). + +--- + +## Slug Generation + +```gleam +str.slugify("Hello, World!") // โ†’ "hello-world" +str.slugify_opts("one two three", 2, "-", False) // โ†’ "one-two" +str.slugify_opts("Hello World", -1, "_", False) // โ†’ "hello_world" +``` + +`slugify_opts` parameters: +- `max_len` โ€” max tokens. `-1` = no limit. +- `sep` โ€” separator string. +- `preserve_unicode` โ€” `True` keeps non-ASCII chars instead of folding. diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..a5db74a --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,200 @@ +# Examples + +Copy-pasteable integration snippets. For full API see [api_reference.md](api_reference.md). + +--- + +## Grapheme-aware search + +```gleam +import str + +pub fn search_examples() { + // Emoji is ONE grapheme cluster โ€” index counts clusters, not bytes + let idx = str.index_of("Hello ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World", "World") + // Ok(8) + + let last = str.last_index_of("hello hello hello", "hello") + // Ok(12) + + let has_any = str.contains_any("hello world", ["foo", "world"]) + // True + + let has_all = str.contains_all("hello world", ["hello", "world"]) + // True +} +``` + +--- + +## KMP map caching (hot loops) + +```gleam +import str +import str/advanced + +pub fn cached_search() { + let pattern = "abababab" + let maps = advanced.build_kmp_maps(pattern) + let pmap = maps.0 + let pimap = maps.1 + + // Reuse pre-built maps across many texts โ€” avoids rebuilding prefix table + let idx1 = advanced.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) + let all = advanced.kmp_search_all_with_maps("another text...", pmap, pimap) +} +``` + +> Prefer `str/advanced` for explicit algorithm control. Use `index_of_auto` +> only for exploratory/non-critical paths. 
+ +--- + +## Grapheme length and boundaries + +```gleam +import str + +pub fn length_examples() { + str.length("Hello") // 5 + str.length("👨‍👩‍👧‍👦") // 1 — family emoji is one grapheme cluster + str.length("🇮🇹") // 1 — flag sequence + str.length("café") // 4 — combining accent stays attached +} +``` + +--- + +## HTML escaping + +```gleam +import str + +let safe = str.escape_html("<script>alert('xss')</script>") +// "&lt;script&gt;alert(&#39;xss&#39;)&lt;/script&gt;" + +let original = str.unescape_html("&lt;div&gt;Hello&lt;/div&gt;") +// "
<div>Hello</div>
" +``` + +--- + +## Validation + +```gleam +import str + +str.is_uppercase("HELLO123") // True +str.is_lowercase("hello_world") // True +str.is_title_case("Hello World") // True +str.is_title_case("hello World") // False +str.is_ascii("hello!@#") // True +str.is_ascii("cafรฉ") // False +str.is_hex("DEADBEEF") // True +str.is_printable("hello\n") // False +``` + +--- + +## Similarity and distance + +```gleam +import str + +str.distance("kitten", "sitting") // 3 +str.similarity("hello", "hallo") // 0.8 +str.hamming_distance("karolin", "kathrin") // Ok(3) +``` + +--- + +## Take / drop from right + +```gleam +import str + +str.take_right("hello world", 3) // "rld" +str.drop_right("file.txt", 4) // "file" +str.take_right("Hello ๐Ÿ‘‹๐Ÿฝ", 1) // "๐Ÿ‘‹๐Ÿฝ" โ€” skin-tone modifier preserved +``` + +--- + +## Partition and split + +```gleam +import str + +str.partition("a-b-c", "-") // #("a", "-", "b-c") +str.rpartition("a-b-c", "-") // #("a-b", "-", "c") +str.splitn("one-two-three-four", "-", 2) // ["one", "two-three-four"] +``` + +--- + +## Padding and fill + +```gleam +import str + +str.pad_left("42", 5, "0") // "00042" +str.fill("x", 5, "-", str.Left) // "----x" +str.fill("x", 5, "-", str.Right) // "x----" +str.fill("x", 5, "-", str.Both) // "--x--" +``` + +--- + +## String manipulation + +```gleam +import str + +str.replace_first("aaa", "a", "b") // "baa" +str.replace_last("aaa", "a", "b") // "aab" +str.normalize_whitespace(" hello world \n") // "hello world" +str.reverse_words("hello beautiful world") // "world beautiful hello" +str.initials("John Fitzgerald Kennedy") // "JFK" +str.strip("..hello..", ".") // "hello" +str.squeeze("aabbcc", "b") // "aabcc" +``` + +--- + +## Slugification + +```gleam +import str + +str.slugify("Crรจme Brรปlรฉe โ€” Recipe 2025!") +// โ†’ "creme-brulee-recipe-2025" + +str.slugify_opts("one two three four", 2, "-", False) +// โ†’ "one-two" + +// Preserve Unicode characters +str.slugify_opts("Hรฉllo Wรถrld", -1, "-", True) +// โ†’ 
"hรฉllo-wรถrld" +``` + +--- + +## Case conversions + +```gleam +import str + +str.to_snake_case("Hello World") // โ†’ "hello_world" +str.to_camel_case("hello world") // โ†’ "helloWorld" +str.to_pascal_case("hello world") // โ†’ "HelloWorld" +str.to_kebab_case("Hello World") // โ†’ "hello-world" +str.camel_to_snake("camelCase") // โ†’ "camel_case" +str.snake_to_camel("snake_case") // โ†’ "snakeCase" +``` + +--- + +## OTP-based Unicode normalization + +See [otp_integration.md](otp_integration.md) for passing NFC/NFD normalizers +to `ascii_fold_with_normalizer` and `slugify_with_normalizer`. diff --git a/docs/otp_integration.md b/docs/otp_integration.md new file mode 100644 index 0000000..9baf634 --- /dev/null +++ b/docs/otp_integration.md @@ -0,0 +1,71 @@ +# OTP Integration + +`str` is OTP-free by design. Its internal decomposer covers common Latin +scripts; for production-grade Unicode normalization (NFC/NFD) pass an OTP +normalizer function from your application. + +--- + +## Normalizer signature + +Any function with type `fn(String) -> String` works. 
+ +```gleam +// In your application (not in the str library) +pub fn otp_nfd(s: String) -> String { + // Call Erlang's :unicode module via FFI + // :unicode.characters_to_nfd_binary(s) + s // replace with actual FFI call +} +``` + +--- + +## Usage with ascii_fold + +```gleam +import str + +// Full pipeline: decompose โ†’ normalize via OTP โ†’ remove combining marks +str.ascii_fold_with_normalizer("Crรจme Brรปlรฉe", otp_nfd) +// โ†’ "Creme Brulee" + +// Skip decomposition, apply normalizer only +str.ascii_fold_no_decompose_with_normalizer("Cafรฉ", otp_nfd) +``` + +--- + +## Usage with slugify + +```gleam +// Default separator, no token limit +str.slugify_with_normalizer("Crรจme Brรปlรฉe", otp_nfd) +// โ†’ "creme-brulee" + +// Full options + normalizer +str.slugify_opts_with_normalizer("Crรจme Brรปlรฉe", 2, "-", False, otp_nfd) +// โ†’ "creme-brulee" +``` + +--- + +## Testing without OTP + +A fake normalizer is useful in tests to simulate NFD without Erlang interop: + +```gleam +import gleam/string + +let fake_nfd = fn(s) { string.replace(s, "รฉ", "e\u{0301}") } +let slug = str.slugify_opts_with_normalizer("Cafรฉ", -1, "-", False, fake_nfd) +// โ†’ "cafe" +``` + +--- + +## Placement note + +Keep normalizer helpers in your application, not in `str` itself. +`str` has no OTP dependency; adding one would break JavaScript targets and +pure-Gleam environments. From 37f99649423d511f31ebe918cf1403ac3d471579 Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:07:58 +0200 Subject: [PATCH 3/8] Enhance docs for str.gleam string utilities Improve and expand documentation for multiple string helper functions in src/str.gleam. Clarifies semantics and adds examples for: `strip` (treats `chars` as a set of graphemes removed from both ends), `similarity` (normalized Levenshtein formula and edge cases), `slugify_opts` (explain `max_len`, `sep`, `preserve_unicode` and examples), `ascii_fold_no_decompose` (behavior vs. 
`ascii_fold`, when to use it, and examples), `pascal_to_snake` (note that it's an alias of `camel_to_snake`), and `chars` (add experimental/compatibility warning and recommendation to use `chars_stdlib` for correctness). Also adds small example usages to clarify expected results. --- src/str.gleam | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) diff --git a/src/str.gleam b/src/str.gleam index ff92489..49b54ba 100644 --- a/src/str.gleam +++ b/src/str.gleam @@ -397,6 +397,19 @@ pub fn ensure_suffix(text: String, suffix: String) -> String { } /// Strips specified characters from both ends. +/// +/// `chars` is treated as a **set of individual graphemes** to remove, not +/// as a literal substring. Each grapheme in `chars` is removed independently +/// from both ends of `text` until a grapheme not in the set is found. +/// +/// ## Examples +/// +/// ```gleam +/// strip("..hello..", ".") +/// // -> "hello" +/// strip("xyxhelloxyxy", "xy") +/// // -> "hello" // removes any 'x' or 'y' from both ends +/// ``` pub fn strip(text: String, chars: String) -> String { core.strip(text, chars) } @@ -486,13 +499,20 @@ pub fn distance(a: String, b: String) -> Int { core.distance(a, b) } -/// Calculates similarity as a percentage (0.0 to 1.0). +/// Calculates similarity as a percentage (0.0 to 1.0) using normalized +/// Levenshtein distance: `1.0 - distance(a, b) / max(len(a), len(b))`. +/// Returns 1.0 for identical strings, 0.0 for fully different strings of +/// equal length. /// /// ## Examples /// /// ```gleam /// similarity("hello", "hallo") /// // -> 0.8 +/// similarity("abc", "xyz") +/// // -> 0.0 +/// similarity("", "") +/// // -> 1.0 /// ``` pub fn similarity(a: String, b: String) -> Float { core.similarity(a, b) @@ -700,6 +720,23 @@ pub fn slugify_with_normalizer(text: String, normalizer) -> String { } /// Creates slug with detailed options. +/// +/// - `max_len`: maximum number of tokens (words) to include. 
Pass `-1` for +/// no limit. Pass `0` to produce an empty string. +/// - `sep`: separator between tokens (default `"-"` in `slugify`). +/// - `preserve_unicode`: when `True`, keeps non-ASCII characters instead of +/// folding them to ASCII equivalents. +/// +/// ## Examples +/// +/// ```gleam +/// slugify_opts("one two three", 2, "-", False) +/// // -> "one-two" +/// slugify_opts("Hello World", -1, "_", False) +/// // -> "hello_world" +/// slugify_opts("Hรฉllo", -1, "-", True) +/// // -> "hรฉllo" +/// ``` pub fn slugify_opts( text: String, max_len: Int, @@ -739,6 +776,25 @@ pub fn ascii_fold(text: String) -> String { } /// ASCII folding without Unicode decomposition. +/// +/// Unlike `ascii_fold`, this function skips the NFD decomposition step and +/// applies only the direct replacement table. Use this when the input is +/// already NFC-normalized and you want to avoid re-decomposing it, or when +/// you need to preserve combining marks that decomposition would expose. +/// +/// Precomposed characters in the replacement table (e.g. `"รฉ"` โ†’ `"e"`) are +/// still converted. Characters not in the table are left unchanged. +/// +/// ## Examples +/// +/// ```gleam +/// ascii_fold_no_decompose("cafรฉ") +/// // -> "cafe" // "รฉ" is in the precomposed table +/// ascii_fold("straรŸe") +/// // -> "strasse" // ligature handled by decompose path +/// ascii_fold_no_decompose("straรŸe") +/// // -> "strasse" // also in the table directly +/// ``` pub fn ascii_fold_no_decompose(text: String) -> String { extra.ascii_fold_no_decompose(text) } @@ -797,7 +853,9 @@ pub fn camel_to_snake(text: String) -> String { extra.camel_to_snake(text) } -/// Alias for camel_to_snake. +/// Converts PascalCase to snake_case. Alias for `camel_to_snake` โ€” both +/// functions use the same implementation since the conversion rules are +/// identical for PascalCase and camelCase. 
pub fn pascal_to_snake(text: String) -> String { extra.pascal_to_snake(text) } @@ -818,8 +876,14 @@ pub fn snake_to_pascal(text: String) -> String { /// Pure Gleam grapheme tokenizer (approximates Unicode segmentation). /// -/// This is an experimental pure-Gleam implementation that approximates -/// Unicode grapheme cluster segmentation without external dependencies. +/// **WARNING: experimental.** This implementation approximates Unicode +/// grapheme cluster segmentation and **may produce incorrect results** on +/// complex sequences such as ZWJ emoji, skin-tone modifiers, or flag +/// sequences. For correctness, use `chars_stdlib` instead. +/// +/// Only use this function if you have a specific reason to avoid the BEAM +/// stdlib (e.g., compiling to JavaScript where BEAM primitives are absent) +/// and you have verified the edge cases that matter to you. pub fn chars(text: String) -> List(String) { tokenize.chars(text) } From 46e0ca4a62e576fbd17a1cf78027c8f2a4209241 Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:08:05 +0200 Subject: [PATCH 4/8] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index c71d395..51b8e53 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ tmp/ ### user-specific files artifacts/ benchmark/results/ + +### Editor / tooling config +.markdownlint.json From 93cfc1e52f74a1ccb32ea95034f3b4b9611ef262 Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:10:21 +0200 Subject: [PATCH 5/8] Add ROADMAP.md with release plan Add a ROADMAP.md describing planned releases and migration guidance. It documents v2.1.1 (documentation fixes), v2.2.0 (new API surface: TruncateMode, SlugifyOpts/slug_with, strip_affixes; multiple deprecations), and v3.0.0 (breaking cleanup removing deprecated APIs and consolidating public surface into str.gleam). 
--- ROADMAP.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 ROADMAP.md diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..882c24a --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,60 @@ +# Roadmap + +## v2.1.1 — Documentation fixes ✅ + +Patch release. No API changes. + +- Documented `similarity` formula (normalized Levenshtein) +- Clarified `strip` treats `chars` as a set, not a literal +- Clarified `ascii_fold_no_decompose` vs `ascii_fold` with examples +- Documented `slugify_opts` `max_len: -1` = no limit +- Promoted `chars` experimental warning +- Fixed README `fill` example (was showing string `"both"` instead of `Both`) + +--- + +## v2.2.0 — New API surface + deprecations + +Additive release. All existing code continues to work. + +**New API:** +- `TruncateMode` type (`Strict | Preserve`) replacing `truncate_strict` / + `truncate_preserve` / `truncate_with_flag` — collapses 4 functions into 1 +- `SlugifyOpts` record + `slug_with` replacing the positional `slugify_opts` + variants — eliminates opaque positional args and combinatorial `_with_normalizer` variants +- `strip_affixes` as the canonical name for `unwrap` — `unwrap` conflicts + with established FP semantics (extract from container or panic) + +**Deprecations** (removed in 3.0): +- `truncate_strict`, `truncate_preserve`, `truncate_default`, `truncate_with_flag` +- `slugify_opts`, `slugify_with_normalizer`, `slugify_opts_with_normalizer` +- `ascii_fold_with_normalizer`, `ascii_fold_no_decompose_with_normalizer` +- `unwrap`, `index_of_simple`, `count_simple` +- KMP/sliding re-exports in `str` main module → use `str/advanced` directly +- Config constant re-exports in `str` main module → use `str/config` directly + +**Also:** +- `MIGRATION.md` with full old → new mapping +- `chars` moved to `str/tokenize` or renamed `approximate_chars`; + `chars_stdlib` becomes the canonical `chars` + +--- + +## v3.0.0 — 
Breaking cleanup + +Removes everything deprecated in v2.2.0. Users who followed deprecation +notices have zero changes to make. + +**Removed:** +- All deprecated truncate variants → only `truncate` (3-arg, default behaviour) + and `truncate_mode` (4-arg with `TruncateMode`) remain +- All deprecated slugify/ascii_fold variants → `slugify`, `slug_with`, `ascii_fold`, + `ascii_fold_no_decompose` remain +- `unwrap` → `strip_affixes` +- `index_of_simple`, `count_simple` +- KMP/sliding/config re-exports from `str` main module +- Legacy modules `str/core`, `str/extra`, `str/tokenize` (deprecated since 2.0) + +**API surface goal:** `str.gleam` exposes only user-facing operations. +Algorithm plumbing lives exclusively in `str/advanced`. Configuration in +`str/config`. No internals leaking through the main module. From feef751b0b6db9fc26a1cf1d2380e58aaff3fbdd Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:11:33 +0200 Subject: [PATCH 6/8] Refactor README for concise docs and examples Rework README to be more concise and navigable: simplify the feature list, normalize wording, and replace large inline API and example tables with links to dedicated docs (docs/*.md). Trim experimental algorithm details and long examples, and provide short quick-start snippets demonstrating grapheme-safe truncation, slugify, case conversions, and similarity. Update development instructions to include JavaScript target testing and the script to regenerate transliteration tables. Clean up formatting, headings, and links (docs, Hex, UAX#29), and clarify module guidance and license/contributing references. --- README.md | 369 +++++++++--------------------------------------------- 1 file changed, 62 insertions(+), 307 deletions(-) diff --git a/README.md b/README.md index aca95ff..1fab2c7 100644 --- a/README.md +++ b/README.md @@ -15,28 +15,30 @@ License: MIT

-> **Production-ready** Gleam library providing Unicode-aware string operations with a focus on grapheme-cluster correctness, pragmatic ASCII transliteration, and URL-friendly slug generation. +> Production-ready Gleam library for Unicode-aware string operations. All +> operations work at grapheme cluster boundaries, correct behaviour for +> emoji, ZWJ sequences, combining marks, and flags. --- -## โœจ Features +## Features | Category | Highlights | |----------|------------| -| ๐ŸŽฏ **Grapheme-Aware** | All operations correctly handle Unicode grapheme clusters (emoji, ZWJ sequences, combining marks) | -| ๐Ÿ”ค **Case Conversions** | `snake_case`, `camelCase`, `kebab-case`, `PascalCase`, `Title Case`, `capitalize` | -| ๐Ÿ”— **Slug Generation** | Configurable `slugify` with token limits, custom separators, and Unicode preservation | -| ๐Ÿ” **Search & Replace** | `index_of`, `last_index_of`, `replace_first`, `replace_last`, `contains_any/all` | -| โœ… **Validation** | `is_uppercase`, `is_lowercase`, `is_title_case`, `is_ascii`, `is_hex`, `is_numeric`, `is_alpha` | -| ๐Ÿ›ก๏ธ **Escaping** | `escape_html`, `unescape_html`, `escape_regex` | -| ๐Ÿ“ **Similarity** | Levenshtein `distance`, percentage `similarity`, `hamming_distance` | -| ๐Ÿงฉ **Splitting** | `splitn`, `partition`, `rpartition`, `chunk`, `lines`, `words` | -| ๐Ÿ“ **Padding** | `pad_left`, `pad_right`, `center`, `fill` | -| ๐Ÿš€ **Minimal Dependencies** | Pure Gleam implementation with no OTP requirement | +| **Grapheme-Aware** | `take`, `drop`, `length`, `reverse`, `chunk` โ€” all grapheme-correct | +| **Case Conversions** | `snake_case`, `camelCase`, `kebab-case`, `PascalCase`, `Title Case` | +| **Slug Generation** | `slugify` with token limits, custom separators, Unicode preservation | +| **Search & Replace** | `index_of`, `last_index_of`, `replace_first/last`, `contains_any/all` | +| **Validation** | `is_uppercase/lowercase/title_case`, `is_ascii/hex/numeric/alpha` | +| **Escaping** | `escape_html`, 
`unescape_html`, `escape_regex` | +| **Similarity** | Levenshtein `distance`, normalized `similarity`, `hamming_distance` | +| **Splitting** | `splitn`, `partition`, `rpartition`, `chunk`, `lines`, `words` | +| **Padding** | `pad_left`, `pad_right`, `center`, `fill` | +| **Minimal deps** | No OTP requirement โ€” works on Erlang and JavaScript targets | --- -## ๐Ÿ“ฆ Installation +## Installation ```sh gleam add str @@ -44,339 +46,92 @@ gleam add str --- -## ๐Ÿš€ Quick Start +## Quick Start ```gleam import str -pub fn main() { - // ๐ŸŽฏ Grapheme-safe truncation preserves emoji - let text = "Hello ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World" - str.truncate(text, 10, "...") - // โ†’ "Hello ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ..." - - // ๐Ÿ”— ASCII transliteration and slugification - str.slugify("Crรจme Brรปlรฉe โ€” Recipe 2025!") - // โ†’ "creme-brulee-recipe-2025" - - // ๐Ÿ”ค Case conversions - str.to_camel_case("hello world") // โ†’ "helloWorld" - str.to_snake_case("Hello World") // โ†’ "hello_world" - str.capitalize("hELLO wORLD") // โ†’ "Hello world" - - // ๐Ÿ” Grapheme-aware search - str.index_of("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ family test", "family") - // โ†’ Ok(2) - counts grapheme clusters, not bytes! 
- - // ๐Ÿ“ String similarity - str.similarity("hello", "hallo") - // โ†’ 0.8 (80% similar) - - // ๐Ÿ›ก๏ธ HTML escaping - str.escape_html("") - // โ†’ "<script>alert('xss')</script>" -} -``` - ---- - -## ๐Ÿ“š API Reference - -### ๐Ÿ”ค Case & Capitalization - -| Function | Example | Result | -|----------|---------|--------| -| `capitalize(text)` | `"hELLO wORLD"` | `"Hello world"` | -| `swapcase(text)` | `"Hello World"` | `"hELLO wORLD"` | -| `is_uppercase(text)` | `"HELLO123"` | `True` | -| `is_lowercase(text)` | `"hello_world"` | `True` | -| `is_title_case(text)` | `"Hello World"` | `True` | -| `is_mixed_case(text)` | `"helloWorld"` | `True` | - -### โœ‚๏ธ Grapheme Extraction - -| Function | Example | Result | -|----------|---------|--------| -| `take(text, n)` | `take("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆabc", 2)` | `"๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆa"` | -| `drop(text, n)` | `drop("hello", 2)` | `"llo"` | -| `take_right(text, n)` | `take_right("hello", 3)` | `"llo"` | -| `drop_right(text, n)` | `drop_right("hello", 2)` | `"hel"` | -| `at(text, index)` | `at("hello", 1)` | `Ok("e")` | -| `chunk(text, size)` | `chunk("abcdef", 2)` | `["ab", "cd", "ef"]` | - -### ๐Ÿ” Search & Replace - -| Function | Example | Result | -|----------|---------|--------| -| `index_of(text, needle)` | `"hello world", "world"` | `Ok(6)` | -| `last_index_of(text, needle)` | `"hello hello", "hello"` | `Ok(6)` | -| `contains_any(text, needles)` | `"hello", ["x", "e", "z"]` | `True` | -| `contains_all(text, needles)` | `"hello", ["h", "e"]` | `True` | -| `replace_first(text, old, new)` | `"aaa", "a", "b"` | `"baa"` | -| `replace_last(text, old, new)` | `"aaa", "a", "b"` | `"aab"` | +// Grapheme-safe truncation preserves emoji sequences +str.truncate("Hello ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ World", 10, "...") +// โ†’ "Hello ๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ..." 
-### โš ๏ธ Experimental: Search Strategies +// ASCII transliteration and URL-friendly slugs +str.slugify("Crรจme Brรปlรฉe โ€” Recipe 2025!") +// โ†’ "creme-brulee-recipe-2025" -**Algorithms:** -- **KMP**: optimized for long/repetitive patterns -- **Sliding**: fast for short patterns, zero allocations +// Case conversions +str.to_camel_case("hello world") // โ†’ "helloWorld" +str.to_snake_case("Hello World") // โ†’ "hello_world" -**APIs:** +// Grapheme-aware search โ€” counts clusters, not bytes +str.index_of("๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ family test", "family") +// โ†’ Ok(2) -| Function | Description | -|----------|-------------| -| `index_of_auto(text, pattern)` | Auto-select algorithm (heuristic) | -| `index_of_strategy(text, pattern, Kmp\|Sliding)` | Explicit algorithm choice | -| `count_auto(text, pattern, overlapping)` | Auto-select for counting | -| `count_strategy(text, pattern, overlapping, Kmp\|Sliding)` | Explicit count algorithm | - -**Examples:** - -```gleam -// Force KMP explicitly -str.index_of_strategy("long text...", "pattern", str.Kmp) - -// Let heuristic decide (experimental) -str.index_of_auto("some text", "pat") +// Normalized Levenshtein similarity +str.similarity("hello", "hallo") +// โ†’ 0.8 ``` -> **Note:** `_auto` variants use heuristics and may not always choose optimally. For performance-critical code, use `_strategy` variants. Configure thresholds in `src/str/config.gleam`. 
- -### ๐Ÿงฉ Splitting & Partitioning - -| Function | Example | Result | -|----------|---------|--------| -| `partition(text, sep)` | `"a-b-c", "-"` | `#("a", "-", "b-c")` | -| `rpartition(text, sep)` | `"a-b-c", "-"` | `#("a-b", "-", "c")` | -| `splitn(text, sep, n)` | `"a-b-c-d", "-", 2` | `["a", "b-c-d"]` | -| `words(text)` | `"hello world"` | `["hello", "world"]` | -| `lines(text)` | `"a\nb\nc"` | `["a", "b", "c"]` | - -### ๐Ÿ“ Padding & Filling - -| Function | Example | Result | -|----------|---------|--------| -| `pad_left(text, width, pad)` | `"42", 5, "0"` | `"00042"` | -| `pad_right(text, width, pad)` | `"hi", 5, "*"` | `"hi***"` | -| `center(text, width, pad)` | `"hi", 6, "-"` | `"--hi--"` | -| `fill(text, width, pad, pos)` | `"x", 5, "-", "both"` | `"--x--"` | - -### โœ… Validation - -| Function | Description | -|----------|-------------| -| `is_numeric(text)` | Digits only (0-9) | -| `is_alpha(text)` | Letters only (a-z, A-Z) | -| `is_alphanumeric(text)` | Letters and digits | -| `is_ascii(text)` | ASCII only (0x00-0x7F) | -| `is_printable(text)` | Printable ASCII (0x20-0x7E) | -| `is_hex(text)` | Hexadecimal (0-9, a-f, A-F) | -| `is_blank(text)` | Whitespace only | -| `is_title_case(text)` | Title Case format | - -### ๐Ÿ”— Prefix & Suffix - -| Function | Example | Result | -|----------|---------|--------| -| `remove_prefix(text, prefix)` | `"hello world", "hello "` | `"world"` | -| `remove_suffix(text, suffix)` | `"file.txt", ".txt"` | `"file"` | -| `ensure_prefix(text, prefix)` | `"world", "hello "` | `"hello world"` | -| `ensure_suffix(text, suffix)` | `"file", ".txt"` | `"file.txt"` | -| `starts_with_any(text, list)` | `"hello", ["hi", "he"]` | `True` | -| `ends_with_any(text, list)` | `"file.txt", [".txt", ".md"]` | `True` | -| `common_prefix(strings)` | `["abc", "abd"]` | `"ab"` | -| `common_suffix(strings)` | `["abc", "xbc"]` | `"bc"` | - -### ๐Ÿ›ก๏ธ Escaping - -| Function | Example | Result | -|----------|---------|--------| -| 
`escape_html(text)` | `"
"` | `"<div>"` | -| `unescape_html(text)` | `"<div>"` | `"
"` | -| `escape_regex(text)` | `"a.b*c"` | `"a\\.b\\*c"` | - -### ๐Ÿ“ Similarity & Distance - -| Function | Example | Result | -|----------|---------|--------| -| `distance(a, b)` | `"kitten", "sitting"` | `3` | -| `similarity(a, b)` | `"hello", "hallo"` | `0.8` | -| `hamming_distance(a, b)` | `"karolin", "kathrin"` | `Ok(3)` | - -### ๐Ÿ“ Text Manipulation - -| Function | Description | -|----------|-------------| -| `truncate(text, len, suffix)` | Truncate with emoji preservation | -| `ellipsis(text, len)` | Truncate with โ€ฆ | -| `reverse(text)` | Grapheme-aware reversal | -| `reverse_words(text)` | Reverse word order | -| `initials(text)` | Extract initials (`"John Doe"` โ†’ `"JD"`) | -| `normalize_whitespace(text)` | Collapse whitespace | -| `strip(text, chars)` | Remove chars from ends | -| `squeeze(text, char)` | Collapse consecutive chars | -| `chomp(text)` | Remove trailing newline | - -### ๐Ÿ“„ Line Operations - -| Function | Description | -|----------|-------------| -| `lines(text)` | Split into lines | -| `dedent(text)` | Remove common indentation | -| `indent(text, spaces)` | Add indentation | -| `wrap_at(text, width)` | Word wrap | - --- -## ๐Ÿ”ค Case Conversions & ASCII Folding - -### Case Conversions - -```gleam -import str +## Documentation -str.to_snake_case("Hello World") // โ†’ "hello_world" -str.to_camel_case("hello world") // โ†’ "helloWorld" -str.to_pascal_case("hello world") // โ†’ "HelloWorld" -str.to_kebab_case("Hello World") // โ†’ "hello-world" -str.to_title_case("hello world") // โ†’ "Hello World" -str.camel_to_snake("camelCase") // โ†’ "camel_case" -str.snake_to_camel("snake_case") // โ†’ "snakeCase" -str.pascal_to_snake("PascalCase") // โ†’ "pascal_case" -str.snake_to_pascal("snake_case") // โ†’ "SnakeCase" -``` - -### ASCII Folding (Deburr) - -```gleam -str.ascii_fold("Crรจme Brรปlรฉe") // โ†’ "Creme Brulee" -str.ascii_fold("straรŸe") // โ†’ "strasse" -str.ascii_fold("รฆon") // โ†’ "aeon" -``` +| Document | Description | 
+|----------|-------------| +| [API Reference](docs/api_reference.md) | Complete function reference with examples | +| [Examples](docs/examples.md) | Integration snippets and patterns | -### Slug Generation - -```gleam -str.slugify("Hello, World!") // โ†’ "hello-world" -str.slugify_opts("one two three", 2, "-", False) // โ†’ "one-two" -str.slugify_opts("Hello World", 0, "_", False) // โ†’ "hello_world" -``` +| [OTP Integration](docs/otp_integration.md) | NFC/NFD normalization via Erlang | +| [Core internals](docs/str_core.md) | Grapheme-aware core operations | +| [Extra internals](docs/str_extra.md) | ASCII folding and slug generation | +| [Tokenizer](docs/str_tokenize.md) | Pure-Gleam grapheme tokenizer reference | --- -## ๐Ÿ—๏ธ Module Guide - -### Which module should I use? - -| Module | When to use | Import | -|--------|-------------|--------| -| **`str`** | All string operations (recommended) | `import str` | -| **`str/advanced`** | Low-level KMP algorithms, caching | `import str/advanced` | -| **`str/config`** | Search heuristics configuration | `import str/config` | - -**Quick start:** Use `import str` for all your needs. The main `str` module provides the complete public API including grapheme operations, ASCII folding, slugs, and case conversions. - -**Advanced users:** Import `str/advanced` for explicit control over search algorithms and KMP map caching. 
+## Module Guide -### Module structure +| Module | Use when | Import | +|--------|----------|--------| +| `str` | Everything โ€” recommended entry point | `import str` | +| `str/advanced` | Explicit KMP algorithm control, map caching | `import str/advanced` | +| `str/config` | Tune search heuristic thresholds | `import str/config` | -``` +```text str/ -โ”œโ”€โ”€ str.gleam # Main module (complete public API) +โ”œโ”€โ”€ str.gleam # Main module โ€” complete public API โ”œโ”€โ”€ advanced.gleam # Low-level search algorithms -โ”œโ”€โ”€ config.gleam # Search heuristics configuration -โ””โ”€โ”€ internal/ # Implementation details (not public API) +โ”œโ”€โ”€ config.gleam # Search heuristic configuration +โ””โ”€โ”€ internal/ # Implementation details (not part of public API) ``` --- -## ๐Ÿ“– Documentation - -| Document | Description | -|----------|-------------| -| [Core API](docs/str_core.md) | Grapheme-aware string operations | -| [Extra API](docs/str_extra.md) | ASCII folding and slug generation | -| [Tokenizer](docs/str_tokenize.md) | Pure-Gleam tokenizer reference | -| [Examples](EXAMPLES.md) | Integration examples and OTP patterns | -| [Character Tables](docs/character_tables.json) | Machine-readable transliteration data | - ---- - -## โšก Optional OTP Integration - -The library core is OTP-free by design. 
For production Unicode normalization (NFC/NFD): - -```gleam -import str - -// In your application code: -pub fn otp_nfd(s: String) -> String { - // Call Erlang's :unicode module - s -} - -// Use with str: -str.ascii_fold_with_normalizer("Crรจme", otp_nfd) -str.slugify_with_normalizer("Cafรฉ", otp_nfd) -``` - ---- - -## ๐Ÿงช Development +## Development ```sh -# Run the test suite -gleam test - -# Regenerate character tables documentation -python3 scripts/generate_character_tables.py +gleam test # run test suite (Erlang target) +gleam test --target javascript # run on JavaScript target +python3 scripts/generate_character_tables.py # regenerate transliteration tables ``` -Note: as of **2.0.0**, `escape_html` now uses the `houdini` library for fast, allocationโ€‘friendly escaping, and `unescape_html` uses `odysseus` for comprehensive entity support (named, decimal and hex numeric entities). See [CHANGELOG.md](CHANGELOG.md) for details. - ---- - -## ๐Ÿ“Š Test Coverage - -- **tests** covering all public functions -- Unicode edge cases (emoji, ZWJ, combining marks) -- Grapheme cluster boundary handling -- Cross-module integration tests - --- -## ๐Ÿค Contributing +## Contributing -Contributions welcome! Areas for improvement: - -- Expanding character transliteration tables -- Additional test cases for edge cases -- Documentation improvements -- Performance optimizations - -```sh -gleam test # Ensure tests pass before submitting PRs -``` +Contributions welcome. See [CONTRIBUTING.md](CONTRIBUTING.md). +Run `gleam test` before submitting PRs. --- -## ๐Ÿ“„ License +## License -MIT License โ€” see [LICENSE](LICENSE) for details. +MIT โ€” see [LICENSE](LICENSE). 
--- -## 🔗 Links +## Links - [Gleam Language](https://gleam.run/) -- [Unicode Grapheme Clusters (UAX #29)](https://unicode.org/reports/tr29/) +- [Unicode Grapheme Clusters — UAX #29](https://unicode.org/reports/tr29/) - [Hex Package](https://hex.pm/packages/str) -- [Hex Documentation](https://hexdocs.pm/str/) - ---- - -
- -**Made with 💜 for the Gleam community** - -
+- [Hex Docs](https://hexdocs.pm/str/) From ae62e9269285001b2c82a9516667a8cf31cda38d Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:11:59 +0200 Subject: [PATCH 7/8] Update CHANGELOG for 2.1.1 Add 2.1.1 release notes documenting behavior and examples for several API changes --- CHANGELOG.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a4609d..029d1a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,30 @@ All notable changes to this project are documented in this file. +## [2.1.1] - 2026-04-22 + +### Documentation + +- **`similarity`**: documented the formula (normalized Levenshtein: + `1.0 - distance(a, b) / max(len(a), len(b))`). Added edge-case examples + (`""` / `""` → 1.0, fully different → 0.0). +- **`ascii_fold_no_decompose`**: explained when to use it vs `ascii_fold` — + skips NFD decomposition, applies only the precomposed replacement table. + Added concrete examples. +- **`strip`**: clarified that `chars` is a **set of graphemes**, not a + literal substring (analogous to Python's `str.strip`). +- **`slugify_opts`**: documented that `max_len: -1` means no limit; + `max_len: 0` produces an empty string. Added examples for all options. +- **`chars`**: promoted the experimental warning to be more prominent — + function may produce incorrect results on complex Unicode sequences. + Directs users to `chars_stdlib` for correctness. +- **`pascal_to_snake`**: noted explicitly that it is an alias for + `camel_to_snake` with identical behaviour. +- **README**: fixed `fill` example — was showing string `"both"` instead + of the `Both` type constructor. + +--- + ## [2.1.0] - 2026-03-31 ### Added From 02a5dffcf7038ddca756556f54f0f841ccc21024 Mon Sep 17 00:00:00 2001 From: Daniele Date: Wed, 22 Apr 2026 09:13:40 +0200 Subject: [PATCH 8/8] Bump package version to 2.1.1 Update gleam.toml to set version from 2.1.0 to 2.1.1. 
--- gleam.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gleam.toml b/gleam.toml index 39523c0..9268f2d 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,5 +1,5 @@ name = "str" -version = "2.1.0" +version = "2.1.1" # Project metadata (fill or replace placeholders before publishing) description = "Unicode-aware string utilities for Gleam: grapheme-safe operations, pragmatic ASCII transliteration, and slug generation."