diff --git a/.gitignore b/.gitignore index c71d395..51b8e53 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ tmp/ ### user-specific files artifacts/ benchmark/results/ + +### Editor / tooling config +.markdownlint.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a4609d..029d1a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,30 @@ All notable changes to this project are documented in this file. +## [2.1.1] - 2026-04-22 + +### Documentation + +- **`similarity`**: documented the formula (normalized Levenshtein: + `1.0 - distance(a, b) / max(len(a), len(b))`). Added edge-case examples + (`""` / `""` → 1.0, fully different → 0.0). +- **`ascii_fold_no_decompose`**: explained when to use it vs `ascii_fold` — + skips NFD decomposition, applies only the precomposed replacement table. + Added concrete examples. +- **`strip`**: clarified that `chars` is a **set of graphemes**, not a + literal substring (analogous to Python's `str.strip`). +- **`slugify_opts`**: documented that `max_len: -1` means no limit; + `max_len: 0` produces an empty string. Added examples for all options. +- **`chars`**: promoted the experimental warning to be more prominent — + function may produce incorrect results on complex Unicode sequences. + Directs users to `chars_stdlib` for correctness. +- **`pascal_to_snake`**: noted explicitly that it is an alias for + `camel_to_snake` with identical behaviour. +- **README**: fixed `fill` example — was showing string `"both"` instead + of the `Both` type constructor. + +--- + ## [2.1.0] - 2026-03-31 ### Added diff --git a/EXAMPLES.md b/EXAMPLES.md deleted file mode 100644 index 626b8ff..0000000 --- a/EXAMPLES.md +++ /dev/null @@ -1,492 +0,0 @@ -# Examples — Integration snippets for `str` - -This file collects short, copy-pasteable examples showing how to integrate -and extend the `str` library from an application. 
Keep in mind that the -`str` package itself intentionally does not depend on Erlang/OTP; any OTP -interop should live in the *integrating application* (not in `src/str/*`). - -## Core Function Examples - -### Grapheme-Aware Indexing and Search - -```gleam -import str - -pub fn search_examples() { - // Find first occurrence (grapheme-aware!) - let idx = str.index_of("Hello 👨‍👩‍👧‍👦 World", "World") - // Ok(8) - the emoji is ONE grapheme cluster! - - // Find last occurrence - let last = str.last_index_of("hello hello hello", "hello") - // Ok(12) - - // Check for multiple needles - let has_any = str.contains_any("hello world", ["foo", "world"]) - // True - - let has_all = str.contains_all("hello world", ["hello", "world"]) - // True -} -``` - -### Experimental Search Strategies & Caching - -```gleam -import str -import str/advanced - -pub fn search_strategy_examples() { - // 1) Use the automatic heuristic (experimental) - // The heuristic chooses between a sliding matcher and KMP based on - // pattern/text characteristics. It is opt-in and may choose a - // non-optimal strategy in some cases. - let auto = str.index_of_auto("some long text...", "pat") - - // 2) Force a specific strategy: use this when performance is critical - // and you know which algorithm is better for your input shape. - let forced_kmp = str.index_of_strategy("long text...", "pattern", str.Kmp) - let forced_sliding = str.index_of_strategy("short text", "pat", str.Sliding) - - // 3) Caching KMP maps: precompute pattern maps once and reuse them - // across multiple searches to avoid rebuilding prefix tables. - let pattern = "abababab..." 
- let maps = advanced.build_kmp_maps(pattern) - let pmap = maps.0 - let pimap = maps.1 - - // Reuse maps across many texts - let idx1 = advanced.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) - let occurrences = advanced.kmp_search_all_with_maps("another text...", pmap, pimap) - - // Guidance: prefer explicit strategy or caching in hot loops; use - // `index_of_auto` for convenience and exploratory testing. -} -``` - -> Note: `index_of_auto` is experimental and its behavior depends on tunable -> thresholds in `src/str/config.gleam`. For production-critical paths, -> prefer `index_of_strategy` or precomputing maps via `build_kmp_maps`. - -### Grapheme-Aware Length and String Checks - -```gleam -import str - -pub fn length_examples() { - // Grapheme-aware length - // Unlike standard string length, counts grapheme clusters correctly - let len = str.length("Hello") - // 5 - - // Family emoji is a SINGLE grapheme cluster - let emoji_len = str.length("👨‍👩‍👧‍👦") - // 1 - - // Flag is also a single grapheme - let flag_len = str.length("🇮🇹") - // 1 - - // Combining characters stay attached - let cafe_len = str.length("café") - // 4 (even with combining accent) -} - -pub fn contains_examples() { - // Grapheme-aware contains - let found = str.contains("hello world", "world") - // True - - let not_found = str.contains("hello", "x") - // False - - // Works correctly with emoji - let emoji_found = str.contains("👨‍👩‍👧‍👦 family", "👨‍👩‍👧‍👦") - // True -} - -pub fn prefix_suffix_examples() { - // Grapheme-aware starts_with - let starts = str.starts_with("hello", "he") - // True - - // Empty prefix always matches - let empty_prefix = str.starts_with("hello", "") - // True - - // Works with emoji on grapheme boundaries - let emoji_starts = str.starts_with("👨‍👩‍👧‍👦abc", "👨‍👩‍👧‍👦") - // True - - // Grapheme-aware ends_with - let ends = str.ends_with("hello.txt", ".txt") - // True - - let emoji_ends = str.ends_with("abc👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦") - // True -} - -pub fn 
empty_check_examples() { - // is_empty check - let empty = str.is_empty("") - // True - - let not_empty = str.is_empty(" ") - // False (whitespace is not empty) - - // Combine with is_blank for whitespace check - let blank = str.is_blank(" ") - // True -} -``` - -### Replace First/Last Occurrence - -```gleam -import str - -pub fn replace_examples() { - // Replace only first occurrence (stdlib only has replace all) - let text = "hello hello hello" - let first = str.replace_first(text, "hello", "hi") - // "hi hello hello" - - let last = str.replace_last(text, "hello", "bye") - // "hello hello bye" -} -``` - -### HTML Escaping for Web Applications - -```gleam -import str - -pub fn html_examples() { - // Escape user input before rendering - let user_input = "" - let safe = str.escape_html(user_input) - // "<script>alert('xss')</script>" - - // Unescape for display - let escaped = "<div>Hello</div>" - let original = str.unescape_html(escaped) - // "
Hello
" -} -``` - -### String Validation - -```gleam -import str - -pub fn validation_examples() { - // Case validation (ignores non-letter characters) - assert str.is_uppercase("HELLO123") == True - assert str.is_lowercase("hello_world") == True - - // Title Case validation - assert str.is_title_case("Hello World") == True - assert str.is_title_case("hello World") == False - assert str.is_title_case("Hello 123 World") == True // numbers ignored - - // ASCII validation - assert str.is_ascii("hello!@#") == True - assert str.is_ascii("café") == False - - // Hex validation (useful for color codes, UUIDs, etc.) - assert str.is_hex("DEADBEEF") == True - assert str.is_hex("ff00ff") == True - - // Printable check (no control characters) - assert str.is_printable("hello") == True - assert str.is_printable("hello\n") == False -} -``` - -### String Similarity and Distance - -```gleam -import str - -pub fn similarity_examples() { - // Levenshtein distance (edit operations needed) - let dist = str.distance("kitten", "sitting") - // 3 - - // Similarity as percentage (0.0 to 1.0) - let sim = str.similarity("hello", "hallo") - // 0.8 (80% similar) - - // Hamming distance (same length strings only) - let ham = str.hamming_distance("karolin", "kathrin") - // Ok(3) -} -``` - -### Take/Drop from Right - -```gleam -import str - -pub fn take_drop_examples() { - // Get last N graphemes - let last3 = str.take_right("hello world", 3) - // "rld" - - // Drop last N graphemes - let without_ext = str.drop_right("file.txt", 4) - // "file" - - // Works with emoji too! 
- let emoji_end = str.take_right("Hello 👋🏽", 1) - // "👋🏽" (single grapheme cluster with skin tone) -} -``` - -### Capitalize and Case Manipulation - -```gleam -import str - -pub fn capitalize_examples() { - // Capitalize: first letter uppercase, rest lowercase - let text = str.capitalize("hELLO wORLD") - // "Hello world" - - // Swap case - let swapped = str.swapcase("Hello World") - // "hELLO wORLD" -} -``` - -### Partition and Split - -```gleam -import str - -pub fn partition_examples() { - // Partition from first occurrence - let #(before, sep, after) = str.partition("a-b-c", "-") - // #("a", "-", "b-c") - - // Partition from LAST occurrence - // Note: if not found, returns #("", "", text) like Python - let #(before2, sep2, after2) = str.rpartition("a-b-c", "-") - // #("a-b", "-", "c") - - // Split with max parts limit - let parts = str.splitn("one-two-three-four", "-", 2) - // ["one", "two-three-four"] - - let parts3 = str.splitn("a:b:c:d", ":", 3) - // ["a", "b", "c:d"] -} -``` - -### Padding and Filling - -```gleam -import str - -pub fn padding_examples() { - // Standard padding - let padded = str.pad_left("42", 5, "0") - // "00042" - - // Flexible fill with position type - let left_fill = str.fill("x", 5, "-", str.Left) - // "----x" - - let right_fill = str.fill("x", 5, "-", str.Right) - // "x----" - - let center_fill = str.fill("x", 5, "-", str.Both) - // "--x--" -} -``` - -### Chunking Strings - -```gleam -import str - -pub fn chunk_examples() { - // Split into fixed-size chunks - let chunks = str.chunk("abcdefg", 3) - // ["abc", "def", "g"] - - let pairs = str.chunk("abcdef", 2) - // ["ab", "cd", "ef"] - - // Works with emoji (grapheme-aware!) 
- let emoji_chunks = str.chunk("👨‍👩‍👧‍👦ab", 2) - // ["👨‍👩‍👧‍👦a", "b"] -} -``` - -### Prefix/Suffix Checking - -```gleam -import str - -pub fn prefix_suffix_examples() { - // Check multiple prefixes at once - let is_greeting = str.starts_with_any("hello world", ["hi", "hello", "hey"]) - // True - - // Check multiple suffixes at once - let is_image = str.ends_with_any("photo.png", [".jpg", ".png", ".gif"]) - // True - - let is_code = str.ends_with_any("main.gleam", [".gleam", ".erl", ".ex"]) - // True -} -``` - -### Whitespace Normalization - -```gleam -import str - -pub fn whitespace_examples() { - // Collapse all whitespace to single spaces - let normalized = str.normalize_whitespace(" hello world \n\t test ") - // "hello world test" - - // Great for cleaning user input - let clean = str.normalize_whitespace(" John Doe ") - // "John Doe" -} -``` - -### Text Utilities - -```gleam -import str - -pub fn utility_examples() { - // Reverse word order - let reversed = str.reverse_words("hello beautiful world") - // "world beautiful hello" - - // Extract initials - let init = str.initials("John Fitzgerald Kennedy") - // "JFK" - - // Regex escaping for pattern matching - let pattern = str.escape_regex("hello.world[test]") - // "hello\\.world\\[test\\]" -} -``` - -## OTP-based Unicode Normalization - -### Implementation Location - -Define Unicode normalization helpers in your application code (not in the `str` library). These helpers should implement the `String -> String` signature and can be passed to any `str` function that accepts a normalizer parameter. - -### Example Implementation - -```gleam -// file: src/normalize.gleam (in your app, not in `str`) -pub fn otp_nfd(s: String) -> String { - // Call OTP from your app via Erlang interop. 
Example (conceptual): - // :unicode.characters_to_nfd_binary(s) - s -} - -// Use it when calling into `str`: -let folded = str.ascii_fold_with_normalizer("Crème Brûlée", otp_nfd) -let slug = str.slugify_opts_with_normalizer("Crème Brûlée", 0, "-", False, otp_nfd) -``` - -Notes: -- Put the code above in your application so the `str` package remains - free of OTP as a hard dependency. -- The exact Erlang interop call depends on your project setup and - runtime; the example above is conceptual. - -## 2) Convenience alias `slugify_with_normalizer` - -A short wrapper is available for convenience. Example usage: - -```gleam -// short alias: uses default separator `-` and no token limit -let s = "Café ❤️ Gleam" -let slug = str.slugify_with_normalizer(s, otp_nfd) -``` - -## 3) No-decompose variants - -If you prefer not to run the library's limited Latin decomposer you can -call the `_no_decompose_` variants and still pass a normalizer: - -```gleam -let folded = str.ascii_fold_no_decompose_with_normalizer(s, otp_nfd) -``` - -This gives you full control over decomposition/normalization order. - -## 4) Testing locally (use `gleam test`) - -The project uses Gleam's test runner. Example commands: - -```sh -# run all tests -gleam test - -# run a single test file (shell navigation) -cd /path/to/project && gleam test -``` - -## 5) Regenerating character tables (docs) - -If you extend `src/str/internal/translit.gleam` or -`src/str/internal/decompose.gleam`, regenerate the JSON used by the -docs: - -```sh -python3 scripts/generate_character_tables.py -``` - -## 6) Example of a small fake normalizer (useful for tests) - -In tests it's handy to simulate NFD/NFC without OTP. 
Example: - -```gleam -let fake_nfd = fn(x) { string.replace(x, "é", "e\u{0301}") } -let slug = str.slugify_opts_with_normalizer("Café", 0, "-", False, fake_nfd) -assert slug == "cafe" -``` - -## 7) Where to put NFC/NFD helpers (application-side) - -If you want to provide explicit `nfc`/`nfd` helpers that call OTP, put -them in your application (not in the `str` library). Example (commented): - -```gleam -// file: src/normalize.gleam (in your app) -// pub fn nfd(s: String) -> String { -// // :unicode.characters_to_nfd_binary(s) -// } -// -// pub fn nfc(s: String) -> String { -// // :unicode.characters_to_nfc_binary(s) -// } -``` - -## 8) Tokenization reference - -If you need a pure-Gleam tokenizer for special processing, see -`src/str/internal/tokenize.gleam` which provides a pedagogic reference -implementation. Access the tokenizer via the public API: - -```gleam -import str - -let clusters = str.chars("café") -// -> ["c", "a", "f", "é"] - -let stdlib_clusters = str.chars_stdlib("café") -// -> ["c", "a", "f", "é"] -``` diff --git a/README.md b/README.md index aca95ff..1fab2c7 100644 --- a/README.md +++ b/README.md @@ -15,28 +15,30 @@ License: MIT

-> **Production-ready** Gleam library providing Unicode-aware string operations with a focus on grapheme-cluster correctness, pragmatic ASCII transliteration, and URL-friendly slug generation. +> Production-ready Gleam library for Unicode-aware string operations. All +> operations work at grapheme cluster boundaries, giving correct behaviour for +> emoji, ZWJ sequences, combining marks, and flags. --- -## ✨ Features +## Features | Category | Highlights | |----------|------------| -| 🎯 **Grapheme-Aware** | All operations correctly handle Unicode grapheme clusters (emoji, ZWJ sequences, combining marks) | -| 🔤 **Case Conversions** | `snake_case`, `camelCase`, `kebab-case`, `PascalCase`, `Title Case`, `capitalize` | -| 🔗 **Slug Generation** | Configurable `slugify` with token limits, custom separators, and Unicode preservation | -| 🔍 **Search & Replace** | `index_of`, `last_index_of`, `replace_first`, `replace_last`, `contains_any/all` | -| ✅ **Validation** | `is_uppercase`, `is_lowercase`, `is_title_case`, `is_ascii`, `is_hex`, `is_numeric`, `is_alpha` | -| 🛡️ **Escaping** | `escape_html`, `unescape_html`, `escape_regex` | -| 📏 **Similarity** | Levenshtein `distance`, percentage `similarity`, `hamming_distance` | -| 🧩 **Splitting** | `splitn`, `partition`, `rpartition`, `chunk`, `lines`, `words` | -| 📐 **Padding** | `pad_left`, `pad_right`, `center`, `fill` | -| 🚀 **Minimal Dependencies** | Pure Gleam implementation with no OTP requirement | +| **Grapheme-Aware** | `take`, `drop`, `length`, `reverse`, `chunk` — all grapheme-correct | +| **Case Conversions** | `snake_case`, `camelCase`, `kebab-case`, `PascalCase`, `Title Case` | +| **Slug Generation** | `slugify` with token limits, custom separators, Unicode preservation | +| **Search & Replace** | `index_of`, `last_index_of`, `replace_first/last`, `contains_any/all` | +| **Validation** | `is_uppercase/lowercase/title_case`, `is_ascii/hex/numeric/alpha` | +| **Escaping** | `escape_html`, `unescape_html`, `escape_regex` | +| 
**Similarity** | Levenshtein `distance`, normalized `similarity`, `hamming_distance` | +| **Splitting** | `splitn`, `partition`, `rpartition`, `chunk`, `lines`, `words` | +| **Padding** | `pad_left`, `pad_right`, `center`, `fill` | +| **Minimal deps** | No OTP requirement — works on Erlang and JavaScript targets | --- -## 📦 Installation +## Installation ```sh gleam add str @@ -44,339 +46,92 @@ gleam add str --- -## 🚀 Quick Start +## Quick Start ```gleam import str -pub fn main() { - // 🎯 Grapheme-safe truncation preserves emoji - let text = "Hello 👩‍👩‍👧‍👦 World" - str.truncate(text, 10, "...") - // → "Hello 👩‍👩‍👧‍👦..." - - // 🔗 ASCII transliteration and slugification - str.slugify("Crème Brûlée — Recipe 2025!") - // → "creme-brulee-recipe-2025" - - // 🔤 Case conversions - str.to_camel_case("hello world") // → "helloWorld" - str.to_snake_case("Hello World") // → "hello_world" - str.capitalize("hELLO wORLD") // → "Hello world" - - // 🔍 Grapheme-aware search - str.index_of("👨‍👩‍👧‍👦 family test", "family") - // → Ok(2) - counts grapheme clusters, not bytes! 
- - // 📏 String similarity - str.similarity("hello", "hallo") - // → 0.8 (80% similar) - - // 🛡️ HTML escaping - str.escape_html("") - // → "<script>alert('xss')</script>" -} -``` - ---- - -## 📚 API Reference - -### 🔤 Case & Capitalization - -| Function | Example | Result | -|----------|---------|--------| -| `capitalize(text)` | `"hELLO wORLD"` | `"Hello world"` | -| `swapcase(text)` | `"Hello World"` | `"hELLO wORLD"` | -| `is_uppercase(text)` | `"HELLO123"` | `True` | -| `is_lowercase(text)` | `"hello_world"` | `True` | -| `is_title_case(text)` | `"Hello World"` | `True` | -| `is_mixed_case(text)` | `"helloWorld"` | `True` | - -### ✂️ Grapheme Extraction - -| Function | Example | Result | -|----------|---------|--------| -| `take(text, n)` | `take("👨‍👩‍👧‍👦abc", 2)` | `"👨‍👩‍👧‍👦a"` | -| `drop(text, n)` | `drop("hello", 2)` | `"llo"` | -| `take_right(text, n)` | `take_right("hello", 3)` | `"llo"` | -| `drop_right(text, n)` | `drop_right("hello", 2)` | `"hel"` | -| `at(text, index)` | `at("hello", 1)` | `Ok("e")` | -| `chunk(text, size)` | `chunk("abcdef", 2)` | `["ab", "cd", "ef"]` | - -### 🔍 Search & Replace - -| Function | Example | Result | -|----------|---------|--------| -| `index_of(text, needle)` | `"hello world", "world"` | `Ok(6)` | -| `last_index_of(text, needle)` | `"hello hello", "hello"` | `Ok(6)` | -| `contains_any(text, needles)` | `"hello", ["x", "e", "z"]` | `True` | -| `contains_all(text, needles)` | `"hello", ["h", "e"]` | `True` | -| `replace_first(text, old, new)` | `"aaa", "a", "b"` | `"baa"` | -| `replace_last(text, old, new)` | `"aaa", "a", "b"` | `"aab"` | +// Grapheme-safe truncation preserves emoji sequences +str.truncate("Hello 👩‍👩‍👧‍👦 World", 10, "...") +// → "Hello 👩‍👩‍👧‍👦..." 
-### ⚠️ Experimental: Search Strategies +// ASCII transliteration and URL-friendly slugs +str.slugify("Crème Brûlée — Recipe 2025!") +// → "creme-brulee-recipe-2025" -**Algorithms:** -- **KMP**: optimized for long/repetitive patterns -- **Sliding**: fast for short patterns, zero allocations +// Case conversions +str.to_camel_case("hello world") // → "helloWorld" +str.to_snake_case("Hello World") // → "hello_world" -**APIs:** +// Grapheme-aware search — counts clusters, not bytes +str.index_of("👨‍👩‍👧‍👦 family test", "family") +// → Ok(2) -| Function | Description | -|----------|-------------| -| `index_of_auto(text, pattern)` | Auto-select algorithm (heuristic) | -| `index_of_strategy(text, pattern, Kmp\|Sliding)` | Explicit algorithm choice | -| `count_auto(text, pattern, overlapping)` | Auto-select for counting | -| `count_strategy(text, pattern, overlapping, Kmp\|Sliding)` | Explicit count algorithm | - -**Examples:** - -```gleam -// Force KMP explicitly -str.index_of_strategy("long text...", "pattern", str.Kmp) - -// Let heuristic decide (experimental) -str.index_of_auto("some text", "pat") +// Normalized Levenshtein similarity +str.similarity("hello", "hallo") +// → 0.8 ``` -> **Note:** `_auto` variants use heuristics and may not always choose optimally. For performance-critical code, use `_strategy` variants. Configure thresholds in `src/str/config.gleam`. 
- -### 🧩 Splitting & Partitioning - -| Function | Example | Result | -|----------|---------|--------| -| `partition(text, sep)` | `"a-b-c", "-"` | `#("a", "-", "b-c")` | -| `rpartition(text, sep)` | `"a-b-c", "-"` | `#("a-b", "-", "c")` | -| `splitn(text, sep, n)` | `"a-b-c-d", "-", 2` | `["a", "b-c-d"]` | -| `words(text)` | `"hello world"` | `["hello", "world"]` | -| `lines(text)` | `"a\nb\nc"` | `["a", "b", "c"]` | - -### 📐 Padding & Filling - -| Function | Example | Result | -|----------|---------|--------| -| `pad_left(text, width, pad)` | `"42", 5, "0"` | `"00042"` | -| `pad_right(text, width, pad)` | `"hi", 5, "*"` | `"hi***"` | -| `center(text, width, pad)` | `"hi", 6, "-"` | `"--hi--"` | -| `fill(text, width, pad, pos)` | `"x", 5, "-", "both"` | `"--x--"` | - -### ✅ Validation - -| Function | Description | -|----------|-------------| -| `is_numeric(text)` | Digits only (0-9) | -| `is_alpha(text)` | Letters only (a-z, A-Z) | -| `is_alphanumeric(text)` | Letters and digits | -| `is_ascii(text)` | ASCII only (0x00-0x7F) | -| `is_printable(text)` | Printable ASCII (0x20-0x7E) | -| `is_hex(text)` | Hexadecimal (0-9, a-f, A-F) | -| `is_blank(text)` | Whitespace only | -| `is_title_case(text)` | Title Case format | - -### 🔗 Prefix & Suffix - -| Function | Example | Result | -|----------|---------|--------| -| `remove_prefix(text, prefix)` | `"hello world", "hello "` | `"world"` | -| `remove_suffix(text, suffix)` | `"file.txt", ".txt"` | `"file"` | -| `ensure_prefix(text, prefix)` | `"world", "hello "` | `"hello world"` | -| `ensure_suffix(text, suffix)` | `"file", ".txt"` | `"file.txt"` | -| `starts_with_any(text, list)` | `"hello", ["hi", "he"]` | `True` | -| `ends_with_any(text, list)` | `"file.txt", [".txt", ".md"]` | `True` | -| `common_prefix(strings)` | `["abc", "abd"]` | `"ab"` | -| `common_suffix(strings)` | `["abc", "xbc"]` | `"bc"` | - -### 🛡️ Escaping - -| Function | Example | Result | -|----------|---------|--------| -| `escape_html(text)` | `"
"` | `"<div>"` | -| `unescape_html(text)` | `"<div>"` | `"
"` | -| `escape_regex(text)` | `"a.b*c"` | `"a\\.b\\*c"` | - -### 📏 Similarity & Distance - -| Function | Example | Result | -|----------|---------|--------| -| `distance(a, b)` | `"kitten", "sitting"` | `3` | -| `similarity(a, b)` | `"hello", "hallo"` | `0.8` | -| `hamming_distance(a, b)` | `"karolin", "kathrin"` | `Ok(3)` | - -### 📝 Text Manipulation - -| Function | Description | -|----------|-------------| -| `truncate(text, len, suffix)` | Truncate with emoji preservation | -| `ellipsis(text, len)` | Truncate with … | -| `reverse(text)` | Grapheme-aware reversal | -| `reverse_words(text)` | Reverse word order | -| `initials(text)` | Extract initials (`"John Doe"` → `"JD"`) | -| `normalize_whitespace(text)` | Collapse whitespace | -| `strip(text, chars)` | Remove chars from ends | -| `squeeze(text, char)` | Collapse consecutive chars | -| `chomp(text)` | Remove trailing newline | - -### 📄 Line Operations - -| Function | Description | -|----------|-------------| -| `lines(text)` | Split into lines | -| `dedent(text)` | Remove common indentation | -| `indent(text, spaces)` | Add indentation | -| `wrap_at(text, width)` | Word wrap | - --- -## 🔤 Case Conversions & ASCII Folding - -### Case Conversions - -```gleam -import str +## Documentation -str.to_snake_case("Hello World") // → "hello_world" -str.to_camel_case("hello world") // → "helloWorld" -str.to_pascal_case("hello world") // → "HelloWorld" -str.to_kebab_case("Hello World") // → "hello-world" -str.to_title_case("hello world") // → "Hello World" -str.camel_to_snake("camelCase") // → "camel_case" -str.snake_to_camel("snake_case") // → "snakeCase" -str.pascal_to_snake("PascalCase") // → "pascal_case" -str.snake_to_pascal("snake_case") // → "SnakeCase" -``` - -### ASCII Folding (Deburr) - -```gleam -str.ascii_fold("Crème Brûlée") // → "Creme Brulee" -str.ascii_fold("straße") // → "strasse" -str.ascii_fold("æon") // → "aeon" -``` +| Document | Description | +|----------|-------------| +| [API 
Reference](docs/api_reference.md) | Complete function reference with examples | +| [Examples](docs/examples.md) | Integration snippets and patterns | -### Slug Generation - -```gleam -str.slugify("Hello, World!") // → "hello-world" -str.slugify_opts("one two three", 2, "-", False) // → "one-two" -str.slugify_opts("Hello World", 0, "_", False) // → "hello_world" -``` +| [OTP Integration](docs/otp_integration.md) | NFC/NFD normalization via Erlang | +| [Core internals](docs/str_core.md) | Grapheme-aware core operations | +| [Extra internals](docs/str_extra.md) | ASCII folding and slug generation | +| [Tokenizer](docs/str_tokenize.md) | Pure-Gleam grapheme tokenizer reference | --- -## 🏗️ Module Guide - -### Which module should I use? - -| Module | When to use | Import | -|--------|-------------|--------| -| **`str`** | All string operations (recommended) | `import str` | -| **`str/advanced`** | Low-level KMP algorithms, caching | `import str/advanced` | -| **`str/config`** | Search heuristics configuration | `import str/config` | - -**Quick start:** Use `import str` for all your needs. The main `str` module provides the complete public API including grapheme operations, ASCII folding, slugs, and case conversions. - -**Advanced users:** Import `str/advanced` for explicit control over search algorithms and KMP map caching. 
+## Module Guide -### Module structure +| Module | Use when | Import | +|--------|----------|--------| +| `str` | Everything — recommended entry point | `import str` | +| `str/advanced` | Explicit KMP algorithm control, map caching | `import str/advanced` | +| `str/config` | Tune search heuristic thresholds | `import str/config` | -``` +```text str/ -├── str.gleam # Main module (complete public API) +├── str.gleam # Main module — complete public API ├── advanced.gleam # Low-level search algorithms -├── config.gleam # Search heuristics configuration -└── internal/ # Implementation details (not public API) +├── config.gleam # Search heuristic configuration +└── internal/ # Implementation details (not part of public API) ``` --- -## 📖 Documentation - -| Document | Description | -|----------|-------------| -| [Core API](docs/str_core.md) | Grapheme-aware string operations | -| [Extra API](docs/str_extra.md) | ASCII folding and slug generation | -| [Tokenizer](docs/str_tokenize.md) | Pure-Gleam tokenizer reference | -| [Examples](EXAMPLES.md) | Integration examples and OTP patterns | -| [Character Tables](docs/character_tables.json) | Machine-readable transliteration data | - ---- - -## ⚡ Optional OTP Integration - -The library core is OTP-free by design. 
For production Unicode normalization (NFC/NFD): - -```gleam -import str - -// In your application code: -pub fn otp_nfd(s: String) -> String { - // Call Erlang's :unicode module - s -} - -// Use with str: -str.ascii_fold_with_normalizer("Crème", otp_nfd) -str.slugify_with_normalizer("Café", otp_nfd) -``` - ---- - -## 🧪 Development +## Development ```sh -# Run the test suite -gleam test - -# Regenerate character tables documentation -python3 scripts/generate_character_tables.py +gleam test # run test suite (Erlang target) +gleam test --target javascript # run on JavaScript target +python3 scripts/generate_character_tables.py # regenerate transliteration tables ``` -Note: as of **2.0.0**, `escape_html` now uses the `houdini` library for fast, allocation‑friendly escaping, and `unescape_html` uses `odysseus` for comprehensive entity support (named, decimal and hex numeric entities). See [CHANGELOG.md](CHANGELOG.md) for details. - ---- - -## 📊 Test Coverage - -- **tests** covering all public functions -- Unicode edge cases (emoji, ZWJ, combining marks) -- Grapheme cluster boundary handling -- Cross-module integration tests - --- -## 🤝 Contributing +## Contributing -Contributions welcome! Areas for improvement: - -- Expanding character transliteration tables -- Additional test cases for edge cases -- Documentation improvements -- Performance optimizations - -```sh -gleam test # Ensure tests pass before submitting PRs -``` +Contributions welcome. See [CONTRIBUTING.md](CONTRIBUTING.md). +Run `gleam test` before submitting PRs. --- -## 📄 License +## License -MIT License — see [LICENSE](LICENSE) for details. +MIT — see [LICENSE](LICENSE). --- -## 🔗 Links +## Links - [Gleam Language](https://gleam.run/) -- [Unicode Grapheme Clusters (UAX #29)](https://unicode.org/reports/tr29/) +- [Unicode Grapheme Clusters — UAX #29](https://unicode.org/reports/tr29/) - [Hex Package](https://hex.pm/packages/str) -- [Hex Documentation](https://hexdocs.pm/str/) - ---- - -
- -**Made with 💜 for the Gleam community** - -
+- [Hex Docs](https://hexdocs.pm/str/) diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..882c24a --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,60 @@ +# Roadmap + +## v2.1.1 — Documentation fixes ✅ + +Patch release. No API changes. + +- Documented `similarity` formula (normalized Levenshtein) +- Clarified `strip` treats `chars` as a set, not a literal +- Clarified `ascii_fold_no_decompose` vs `ascii_fold` with examples +- Documented `slugify_opts` `max_len: -1` = no limit +- Promoted `chars` experimental warning +- Fixed README `fill` example (was showing string `"both"` instead of `Both`) + +--- + +## v2.2.0 — New API surface + deprecations + +Additive release. All existing code continues to work. + +**New API:** +- `TruncateMode` type (`Strict | Preserve`) replacing `truncate_strict` / + `truncate_preserve` / `truncate_default` / `truncate_with_flag` — collapses 4 functions into 1 +- `SlugifyOpts` record + `slug_with` replacing the positional `slugify_opts` + variants — eliminates opaque positional args and combinatorial `_with_normalizer` variants +- `strip_affixes` as the canonical name for `unwrap` — `unwrap` conflicts + with established FP semantics (extract from container or panic) + +**Deprecations** (removed in 3.0): +- `truncate_strict`, `truncate_preserve`, `truncate_default`, `truncate_with_flag` +- `slugify_opts`, `slugify_with_normalizer`, `slugify_opts_with_normalizer` +- `ascii_fold_with_normalizer`, `ascii_fold_no_decompose_with_normalizer` +- `unwrap`, `index_of_simple`, `count_simple` +- KMP/sliding re-exports in `str` main module → use `str/advanced` directly +- Config constant re-exports in `str` main module → use `str/config` directly + +**Also:** +- `MIGRATION.md` with full old → new mapping +- `chars` moved to `str/tokenize` or renamed `approximate_chars`; + `chars_stdlib` becomes the canonical `chars` + +--- + +## v3.0.0 — Breaking cleanup + +Removes everything deprecated in v2.2.0. 
Users who followed deprecation +notices have zero changes to make. + +**Removed:** +- All deprecated truncate variants → only `truncate` (3-arg, default behaviour) + and `truncate_mode` (4-arg with `TruncateMode`) remain +- All deprecated slugify/ascii_fold variants → `slugify`, `slug_with`, `ascii_fold`, + `ascii_fold_no_decompose` remain +- `unwrap` → `strip_affixes` +- `index_of_simple`, `count_simple` +- KMP/sliding/config re-exports from `str` main module +- Legacy modules `str/core`, `str/extra`, `str/tokenize` (deprecated since 2.0) + +**API surface goal:** `str.gleam` exposes only user-facing operations. +Algorithm plumbing lives exclusively in `str/advanced`. Configuration in +`str/config`. No internals leaking through the main module. diff --git a/docs/api_reference.md b/docs/api_reference.md new file mode 100644 index 0000000..7ea5f4b --- /dev/null +++ b/docs/api_reference.md @@ -0,0 +1,223 @@ +# str — API Reference + +Complete function reference for `import str`. For module internals see +[str_core.md](str_core.md) and [str_extra.md](str_extra.md). 
+ +--- + +## Case & Capitalization + +| Function | Example | Result | +|----------|---------|--------| +| `capitalize(text)` | `"hELLO wORLD"` | `"Hello world"` | +| `swapcase(text)` | `"Hello World"` | `"hELLO wORLD"` | +| `is_uppercase(text)` | `"HELLO123"` | `True` | +| `is_lowercase(text)` | `"hello_world"` | `True` | +| `is_title_case(text)` | `"Hello World"` | `True` | +| `is_mixed_case(text)` | `"helloWorld"` | `True` | + +### Case Conversions + +```gleam +str.to_snake_case("Hello World") // → "hello_world" +str.to_camel_case("hello world") // → "helloWorld" +str.to_pascal_case("hello world") // → "HelloWorld" +str.to_kebab_case("Hello World") // → "hello-world" +str.to_title_case("hello world") // → "Hello World" +str.camel_to_snake("camelCase") // → "camel_case" +str.pascal_to_snake("PascalCase") // → "pascal_case" // alias for camel_to_snake +str.snake_to_camel("snake_case") // → "snakeCase" +str.snake_to_pascal("snake_case") // → "SnakeCase" +``` + +--- + +## Grapheme Extraction + +| Function | Example | Result | +|----------|---------|--------| +| `take(text, n)` | `take("👨‍👩‍👧‍👦abc", 2)` | `"👨‍👩‍👧‍👦a"` | +| `drop(text, n)` | `drop("hello", 2)` | `"llo"` | +| `take_right(text, n)` | `take_right("hello", 3)` | `"llo"` | +| `drop_right(text, n)` | `drop_right("hello", 2)` | `"hel"` | +| `at(text, index)` | `at("hello", 1)` | `Ok("e")` | +| `chunk(text, size)` | `chunk("abcdef", 2)` | `["ab", "cd", "ef"]` | +| `length(text)` | `length("👨‍👩‍👧‍👦")` | `1` | +| `reverse(text)` | `reverse("Hello 👋")` | `"👋 olleH"` | + +--- + +## Search & Replace + +| Function | Example | Result | +|----------|---------|--------| +| `index_of(text, needle)` | `"hello world", "world"` | `Ok(6)` | +| `last_index_of(text, needle)` | `"hello hello", "hello"` | `Ok(6)` | +| `contains(text, needle)` | `"hello world", "world"` | `True` | +| `contains_any(text, needles)` | `"hello", ["x", "e"]` | `True` | +| `contains_all(text, needles)` | `"hello", ["h", "e"]` | `True` | +| 
`replace_first(text, old, new)` | `"aaa", "a", "b"` | `"baa"` | +| `replace_last(text, old, new)` | `"aaa", "a", "b"` | `"aab"` | + +### Search Strategies (experimental) + +Automatic heuristic selection between KMP (long/repetitive patterns) and +sliding window (short patterns, zero allocations). Configure thresholds in +`str/config`. + +| Function | Description | +|----------|-------------| +| `index_of_auto(text, pattern)` | Heuristic algorithm selection | +| `index_of_strategy(text, pattern, Kmp\|Sliding)` | Explicit algorithm | +| `count_auto(text, pattern, overlapping)` | Heuristic for counting | +| `count_strategy(text, pattern, overlapping, Kmp\|Sliding)` | Explicit count | + +For pre-built KMP maps (hot loops), use `import str/advanced`. See +[str_core.md](str_core.md) for details. + +--- + +## Splitting & Partitioning + +| Function | Example | Result | +|----------|---------|--------| +| `partition(text, sep)` | `"a-b-c", "-"` | `#("a", "-", "b-c")` | +| `rpartition(text, sep)` | `"a-b-c", "-"` | `#("a-b", "-", "c")` | +| `splitn(text, sep, n)` | `"a-b-c-d", "-", 2` | `["a", "b-c-d"]` | +| `words(text)` | `"hello world"` | `["hello", "world"]` | +| `lines(text)` | `"a\nb\nc"` | `["a", "b", "c"]` | + +--- + +## Padding & Filling + +| Function | Example | Result | +|----------|---------|--------| +| `pad_left(text, width, pad)` | `"42", 5, "0"` | `"00042"` | +| `pad_right(text, width, pad)` | `"hi", 5, "*"` | `"hi***"` | +| `center(text, width, pad)` | `"hi", 6, "-"` | `"--hi--"` | +| `fill(text, width, pad, pos)` | `"x", 5, "-", Both` | `"--x--"` | + +`FillPosition` values: `Left`, `Right`, `Both`. 
+ +--- + +## Prefix & Suffix + +| Function | Example | Result | +|----------|---------|--------| +| `starts_with(text, prefix)` | `"hello", "he"` | `True` | +| `ends_with(text, suffix)` | `"file.txt", ".txt"` | `True` | +| `starts_with_any(text, list)` | `"hello", ["hi", "he"]` | `True` | +| `ends_with_any(text, list)` | `"file.txt", [".txt"]` | `True` | +| `remove_prefix(text, prefix)` | `"hello world", "hello "` | `"world"` | +| `remove_suffix(text, suffix)` | `"file.txt", ".txt"` | `"file"` | +| `ensure_prefix(text, prefix)` | `"world", "hello "` | `"hello world"` | +| `ensure_suffix(text, suffix)` | `"file", ".txt"` | `"file.txt"` | +| `common_prefix(strings)` | `["abc", "abd"]` | `"ab"` | +| `common_suffix(strings)` | `["abc", "xbc"]` | `"bc"` | + +--- + +## Validation + +| Function | Description | +|----------|-------------| +| `is_empty(text)` | Empty string | +| `is_blank(text)` | Whitespace only | +| `is_numeric(text)` | Digits only (0–9) | +| `is_alpha(text)` | Letters only | +| `is_alphanumeric(text)` | Letters and digits | +| `is_ascii(text)` | ASCII only (0x00–0x7F) | +| `is_printable(text)` | Printable ASCII (0x20–0x7E) | +| `is_hex(text)` | Hexadecimal (0–9, a–f, A–F) | +| `is_uppercase(text)` | All cased chars uppercase | +| `is_lowercase(text)` | All cased chars lowercase | +| `is_title_case(text)` | Title Case format | +| `is_mixed_case(text)` | Both cases present | + +--- + +## Escaping + +| Function | Example | Result | +|----------|---------|--------| +| `escape_html(text)` | `"
<div>"` | `"&lt;div&gt;"` | +| `unescape_html(text)` | `"&lt;div&gt;"` | `"<div>
"` | +| `escape_regex(text)` | `"a.b*c"` | `"a\\.b\\*c"` | + +--- + +## Similarity & Distance + +| Function | Example | Result | +|----------|---------|--------| +| `distance(a, b)` | `"kitten", "sitting"` | `3` | +| `similarity(a, b)` | `"hello", "hallo"` | `0.8` | +| `hamming_distance(a, b)` | `"karolin", "kathrin"` | `Ok(3)` | + +`similarity` uses normalized Levenshtein: `1.0 - distance(a,b) / max(len(a), len(b))`. + +--- + +## Text Manipulation + +| Function | Description | +|----------|-------------| +| `truncate(text, len, suffix)` | Truncate grapheme-aware | +| `truncate_strict(text, len, suffix)` | Truncate — may split emoji | +| `truncate_preserve(text, len, suffix)` | Truncate — keeps emoji whole | +| `ellipsis(text, len)` | Truncate with `…` | +| `normalize_whitespace(text)` | Collapse to single spaces | +| `strip(text, chars)` | Remove char set from ends | +| `squeeze(text, char)` | Collapse consecutive chars | +| `chomp(text)` | Remove trailing newline | +| `surround(text, pre, suf)` | Add prefix and suffix | +| `unwrap(text, pre, suf)` | Remove prefix and suffix | +| `strip_affixes(text, pre, suf)` | Alias for `unwrap` (preferred) | +| `reverse_words(text)` | Reverse word order | +| `initials(text)` | Extract initials | + +`strip(text, chars)` — `chars` is a **set of graphemes**, not a literal substring. + +--- + +## Line Operations + +| Function | Description | +|----------|-------------| +| `lines(text)` | Split into lines | +| `dedent(text)` | Remove common indentation | +| `indent(text, spaces)` | Add indentation | +| `wrap_at(text, width)` | Word wrap | + +--- + +## ASCII Folding + +```gleam +str.ascii_fold("Crème Brûlée") // → "Creme Brulee" +str.ascii_fold("straße") // → "strasse" +str.ascii_fold("æon") // → "aeon" +``` + +`ascii_fold_no_decompose` applies only the precomposed replacement table, +skipping NFD decomposition. Use it when the input is already NFC-normalized +and you want to avoid re-decomposing. 
+ +For OTP-based NFC/NFD normalization, see [otp_integration.md](otp_integration.md). + +--- + +## Slug Generation + +```gleam +str.slugify("Hello, World!") // → "hello-world" +str.slugify_opts("one two three", 2, "-", False) // → "one-two" +str.slugify_opts("Hello World", -1, "_", False) // → "hello_world" +``` + +`slugify_opts` parameters: +- `max_len` — max tokens. `-1` = no limit. +- `sep` — separator string. +- `preserve_unicode` — `True` keeps non-ASCII chars instead of folding. diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..a5db74a --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,200 @@ +# Examples + +Copy-pasteable integration snippets. For full API see [api_reference.md](api_reference.md). + +--- + +## Grapheme-aware search + +```gleam +import str + +pub fn search_examples() { + // Emoji is ONE grapheme cluster — index counts clusters, not bytes + let idx = str.index_of("Hello 👨‍👩‍👧‍👦 World", "World") + // Ok(8) + + let last = str.last_index_of("hello hello hello", "hello") + // Ok(12) + + let has_any = str.contains_any("hello world", ["foo", "world"]) + // True + + let has_all = str.contains_all("hello world", ["hello", "world"]) + // True +} +``` + +--- + +## KMP map caching (hot loops) + +```gleam +import str +import str/advanced + +pub fn cached_search() { + let pattern = "abababab" + let maps = advanced.build_kmp_maps(pattern) + let pmap = maps.0 + let pimap = maps.1 + + // Reuse pre-built maps across many texts — avoids rebuilding prefix table + let idx1 = advanced.kmp_index_of_with_maps("first long text...", pattern, pmap, pimap) + let all = advanced.kmp_search_all_with_maps("another text...", pmap, pimap) +} +``` + +> Prefer `str/advanced` for explicit algorithm control. Use `index_of_auto` +> only for exploratory/non-critical paths. 
+ +--- + +## Grapheme length and boundaries + +```gleam +import str + +pub fn length_examples() { + str.length("Hello") // 5 + str.length("👨‍👩‍👧‍👦") // 1 — family emoji is one grapheme cluster + str.length("🇮🇹") // 1 — flag sequence + str.length("café") // 4 — combining accent stays attached +} +``` + +--- + +## HTML escaping + +```gleam +import str + +let safe = str.escape_html("<script>alert('xss')</script>") +// "&lt;script&gt;alert(&#39;xss&#39;)&lt;/script&gt;" + +let original = str.unescape_html("&lt;div&gt;Hello&lt;/div&gt;") +// "
<div>Hello</div>
" +``` + +--- + +## Validation + +```gleam +import str + +str.is_uppercase("HELLO123") // True +str.is_lowercase("hello_world") // True +str.is_title_case("Hello World") // True +str.is_title_case("hello World") // False +str.is_ascii("hello!@#") // True +str.is_ascii("café") // False +str.is_hex("DEADBEEF") // True +str.is_printable("hello\n") // False +``` + +--- + +## Similarity and distance + +```gleam +import str + +str.distance("kitten", "sitting") // 3 +str.similarity("hello", "hallo") // 0.8 +str.hamming_distance("karolin", "kathrin") // Ok(3) +``` + +--- + +## Take / drop from right + +```gleam +import str + +str.take_right("hello world", 3) // "rld" +str.drop_right("file.txt", 4) // "file" +str.take_right("Hello 👋🏽", 1) // "👋🏽" — skin-tone modifier preserved +``` + +--- + +## Partition and split + +```gleam +import str + +str.partition("a-b-c", "-") // #("a", "-", "b-c") +str.rpartition("a-b-c", "-") // #("a-b", "-", "c") +str.splitn("one-two-three-four", "-", 2) // ["one", "two-three-four"] +``` + +--- + +## Padding and fill + +```gleam +import str + +str.pad_left("42", 5, "0") // "00042" +str.fill("x", 5, "-", str.Left) // "----x" +str.fill("x", 5, "-", str.Right) // "x----" +str.fill("x", 5, "-", str.Both) // "--x--" +``` + +--- + +## String manipulation + +```gleam +import str + +str.replace_first("aaa", "a", "b") // "baa" +str.replace_last("aaa", "a", "b") // "aab" +str.normalize_whitespace(" hello world \n") // "hello world" +str.reverse_words("hello beautiful world") // "world beautiful hello" +str.initials("John Fitzgerald Kennedy") // "JFK" +str.strip("..hello..", ".") // "hello" +str.squeeze("aabbcc", "b") // "aabcc" +``` + +--- + +## Slugification + +```gleam +import str + +str.slugify("Crème Brûlée — Recipe 2025!") +// → "creme-brulee-recipe-2025" + +str.slugify_opts("one two three four", 2, "-", False) +// → "one-two" + +// Preserve Unicode characters +str.slugify_opts("Héllo Wörld", -1, "-", True) +// → "héllo-wörld" +``` + +--- + +## Case 
conversions + +```gleam +import str + +str.to_snake_case("Hello World") // → "hello_world" +str.to_camel_case("hello world") // → "helloWorld" +str.to_pascal_case("hello world") // → "HelloWorld" +str.to_kebab_case("Hello World") // → "hello-world" +str.camel_to_snake("camelCase") // → "camel_case" +str.snake_to_camel("snake_case") // → "snakeCase" +``` + +--- + +## OTP-based Unicode normalization + +See [otp_integration.md](otp_integration.md) for passing NFC/NFD normalizers +to `ascii_fold_with_normalizer` and `slugify_with_normalizer`. diff --git a/docs/otp_integration.md b/docs/otp_integration.md new file mode 100644 index 0000000..9baf634 --- /dev/null +++ b/docs/otp_integration.md @@ -0,0 +1,71 @@ +# OTP Integration + +`str` is OTP-free by design. Its internal decomposer covers common Latin +scripts; for production-grade Unicode normalization (NFC/NFD) pass an OTP +normalizer function from your application. + +--- + +## Normalizer signature + +Any function with type `fn(String) -> String` works. 
+ +```gleam +// In your application (not in the str library) +pub fn otp_nfd(s: String) -> String { + // Call Erlang's :unicode module via FFI + // :unicode.characters_to_nfd_binary(s) + s // replace with actual FFI call +} +``` + +--- + +## Usage with ascii_fold + +```gleam +import str + +// Full pipeline: decompose → normalize via OTP → remove combining marks +str.ascii_fold_with_normalizer("Crème Brûlée", otp_nfd) +// → "Creme Brulee" + +// Skip decomposition, apply normalizer only +str.ascii_fold_no_decompose_with_normalizer("Café", otp_nfd) +``` + +--- + +## Usage with slugify + +```gleam +// Default separator, no token limit +str.slugify_with_normalizer("Crème Brûlée", otp_nfd) +// → "creme-brulee" + +// Full options + normalizer +str.slugify_opts_with_normalizer("Crème Brûlée", 2, "-", False, otp_nfd) +// → "creme-brulee" +``` + +--- + +## Testing without OTP + +A fake normalizer is useful in tests to simulate NFD without Erlang interop: + +```gleam +import gleam/string + +let fake_nfd = fn(s) { string.replace(s, "é", "e\u{0301}") } +let slug = str.slugify_opts_with_normalizer("Café", -1, "-", False, fake_nfd) +// → "cafe" +``` + +--- + +## Placement note + +Keep normalizer helpers in your application, not in `str` itself. +`str` has no OTP dependency; adding one would break JavaScript targets and +pure-Gleam environments. diff --git a/gleam.toml b/gleam.toml index 39523c0..9268f2d 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,5 +1,5 @@ name = "str" -version = "2.1.0" +version = "2.1.1" # Project metadata (fill or replace placeholders before publishing) description = "Unicode-aware string utilities for Gleam: grapheme-safe operations, pragmatic ASCII transliteration, and slug generation." diff --git a/src/str.gleam b/src/str.gleam index ff92489..49b54ba 100644 --- a/src/str.gleam +++ b/src/str.gleam @@ -397,6 +397,19 @@ pub fn ensure_suffix(text: String, suffix: String) -> String { } /// Strips specified characters from both ends. 
+/// +/// `chars` is treated as a **set of individual graphemes** to remove, not +/// as a literal substring. Each grapheme in `chars` is removed independently +/// from both ends of `text` until a grapheme not in the set is found. +/// +/// ## Examples +/// +/// ```gleam +/// strip("..hello..", ".") +/// // -> "hello" +/// strip("xyxhelloxyxy", "xy") +/// // -> "hello" // removes any 'x' or 'y' from both ends +/// ``` pub fn strip(text: String, chars: String) -> String { core.strip(text, chars) } @@ -486,13 +499,20 @@ pub fn distance(a: String, b: String) -> Int { core.distance(a, b) } -/// Calculates similarity as a percentage (0.0 to 1.0). +/// Calculates similarity as a percentage (0.0 to 1.0) using normalized +/// Levenshtein distance: `1.0 - distance(a, b) / max(len(a), len(b))`. +/// Returns 1.0 for identical strings, 0.0 for fully different strings of +/// equal length. /// /// ## Examples /// /// ```gleam /// similarity("hello", "hallo") /// // -> 0.8 +/// similarity("abc", "xyz") +/// // -> 0.0 +/// similarity("", "") +/// // -> 1.0 /// ``` pub fn similarity(a: String, b: String) -> Float { core.similarity(a, b) @@ -700,6 +720,23 @@ pub fn slugify_with_normalizer(text: String, normalizer) -> String { } /// Creates slug with detailed options. +/// +/// - `max_len`: maximum number of tokens (words) to include. Pass `-1` for +/// no limit. Pass `0` to produce an empty string. +/// - `sep`: separator between tokens (default `"-"` in `slugify`). +/// - `preserve_unicode`: when `True`, keeps non-ASCII characters instead of +/// folding them to ASCII equivalents. 
+/// +/// ## Examples +/// +/// ```gleam +/// slugify_opts("one two three", 2, "-", False) +/// // -> "one-two" +/// slugify_opts("Hello World", -1, "_", False) +/// // -> "hello_world" +/// slugify_opts("Héllo", -1, "-", True) +/// // -> "héllo" +/// ``` pub fn slugify_opts( text: String, max_len: Int, @@ -739,6 +776,25 @@ pub fn ascii_fold(text: String) -> String { } /// ASCII folding without Unicode decomposition. +/// +/// Unlike `ascii_fold`, this function skips the NFD decomposition step and +/// applies only the direct replacement table. Use this when the input is +/// already NFC-normalized and you want to avoid re-decomposing it, or when +/// you need to preserve combining marks that decomposition would expose. +/// +/// Precomposed characters in the replacement table (e.g. `"é"` → `"e"`) are +/// still converted. Characters not in the table are left unchanged. +/// +/// ## Examples +/// +/// ```gleam +/// ascii_fold_no_decompose("café") +/// // -> "cafe" // "é" is in the precomposed table +/// ascii_fold("straße") +/// // -> "strasse" // ligature handled by decompose path +/// ascii_fold_no_decompose("straße") +/// // -> "strasse" // also in the table directly +/// ``` pub fn ascii_fold_no_decompose(text: String) -> String { extra.ascii_fold_no_decompose(text) } @@ -797,7 +853,9 @@ pub fn camel_to_snake(text: String) -> String { extra.camel_to_snake(text) } -/// Alias for camel_to_snake. +/// Converts PascalCase to snake_case. Alias for `camel_to_snake` — both +/// functions use the same implementation since the conversion rules are +/// identical for PascalCase and camelCase. pub fn pascal_to_snake(text: String) -> String { extra.pascal_to_snake(text) } @@ -818,8 +876,14 @@ pub fn snake_to_pascal(text: String) -> String { /// Pure Gleam grapheme tokenizer (approximates Unicode segmentation). /// -/// This is an experimental pure-Gleam implementation that approximates -/// Unicode grapheme cluster segmentation without external dependencies. 
+/// **WARNING: experimental.** This implementation approximates Unicode +/// grapheme cluster segmentation and **may produce incorrect results** on +/// complex sequences such as ZWJ emoji, skin-tone modifiers, or flag +/// sequences. For correctness, use `chars_stdlib` instead. +/// +/// Only use this function if you have a specific reason to avoid the BEAM +/// stdlib (e.g., compiling to JavaScript where BEAM primitives are absent) +/// and you have verified the edge cases that matter to you. pub fn chars(text: String) -> List(String) { tokenize.chars(text) }