diff --git a/ROADMAP.md b/ROADMAP.md index ecca24e..983f1a4 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -17,11 +17,11 @@ Legend: [ ] todo, [x] done, [~] in progress ## M1. Quality & Safety -- [ ] Placeholder normalization and validation - - [ ] Mapping between iOS (`%1$@`, `%d`) and Android (`%1$s`, `%d`) - - [ ] Detect placeholder mismatches across languages; fail in strict mode, warn otherwise - - [ ] Auto‑fix option for common cases (`%@` → `%s`, `%1$@` → `%1$s`) - - [ ] Tests across `.strings`, Android, `.xcstrings` +- [x] Placeholder normalization and validation + - [x] Mapping between iOS (`%1$@`, `%@`, `%ld`) and Android (`%1$s`, `%s`, `%d/%u`) + - [x] Detect placeholder mismatches across languages; strict vs non‑strict modes + - [x] Auto‑fix option for common cases (`normalize_placeholders_in_place`) + - [x] Tests across singular and plural entries; cross‑language normalization - [ ] Plural rules engine - [ ] CLDR‑driven required category sets per locale (few/many/etc.) - [ ] Validation pass: flag missing categories per key+locale diff --git a/langcodec-cli/src/main.rs b/langcodec-cli/src/main.rs index 8bf9de6..2678d1b 100644 --- a/langcodec-cli/src/main.rs +++ b/langcodec-cli/src/main.rs @@ -3,10 +3,10 @@ mod debug; mod formats; mod merge; mod path_glob; +mod stats; mod transformers; mod validation; mod view; -mod stats; use crate::convert::{ConvertOptions, run_unified_convert_command, try_custom_format_view}; use crate::debug::run_debug_command; @@ -304,7 +304,9 @@ fn main() { Commands::Stats { input, lang, json } => { // Validate let mut context = ValidationContext::new().with_input_file(input.clone()); - if let Some(l) = &lang { context = context.with_language_code(l.clone()); } + if let Some(l) = &lang { + context = context.with_language_code(l.clone()); + } if let Err(e) = validate_context(&context) { eprintln!("❌ Validation failed: {}", e); std::process::exit(1); diff --git a/langcodec-cli/src/stats.rs b/langcodec-cli/src/stats.rs index 
b466e03..410707a 100644 --- a/langcodec-cli/src/stats.rs +++ b/langcodec-cli/src/stats.rs @@ -30,7 +30,11 @@ fn accumulate(lang_stats: &mut LangStats, status: &EntryStatus) { pub fn print_stats(codec: &Codec, lang_filter: &Option, json_output: bool) { let resources: Vec<_> = match lang_filter { - Some(lang) => codec.resources.iter().filter(|r| r.metadata.language == *lang).collect(), + Some(lang) => codec + .resources + .iter() + .filter(|r| r.metadata.language == *lang) + .collect(), None => codec.resources.iter().collect(), }; @@ -84,7 +88,10 @@ pub fn print_stats(codec: &Codec, lang_filter: &Option, json_output: boo println!(" Total: {}", stats.total); println!(" By status:"); for (k, v) in [ - ("translated", stats.by_status.get("translated").copied().unwrap_or(0)), + ( + "translated", + stats.by_status.get("translated").copied().unwrap_or(0), + ), ( "needs_review", stats.by_status.get("needs_review").copied().unwrap_or(0), @@ -105,4 +112,3 @@ pub fn print_stats(codec: &Codec, lang_filter: &Option, json_output: boo println!(" Completion: {:.2}%", percent); } } - diff --git a/langcodec-cli/tests/stats_cli_tests.rs b/langcodec-cli/tests/stats_cli_tests.rs index 8fa1c46..6ffcb11 100644 --- a/langcodec-cli/tests/stats_cli_tests.rs +++ b/langcodec-cli/tests/stats_cli_tests.rs @@ -48,4 +48,3 @@ fn test_stats_json_on_android_strings() { assert_eq!(by_status["do_not_translate"], 1); assert_eq!(by_status["new"], 1); } - diff --git a/langcodec/src/codec.rs b/langcodec/src/codec.rs index 731a8cb..90ff8ff 100644 --- a/langcodec/src/codec.rs +++ b/langcodec/src/codec.rs @@ -619,6 +619,182 @@ impl Codec { .retain(|resource| !resource.entries.is_empty()); } + /// Validate placeholder consistency across languages for each key. + /// + /// Rules (initial version): + /// - For each key, each language must have the same placeholder signature. + /// - For plural entries, all forms within a language must share the same signature. 
+ /// - iOS vs Android differences like `%@`/`%1$@` vs `%s`/`%1$s` are normalized. + /// + /// Example + /// ```rust + /// use langcodec::{Codec, types::{Entry, EntryStatus, Metadata, Resource, Translation}}; + /// let mut codec = Codec::new(); + /// let en = Resource{ + /// metadata: Metadata{ language: "en".into(), domain: String::new(), custom: Default::default() }, + /// entries: vec![Entry{ id: "greet".into(), value: Translation::Singular("Hello %1$@".into()), comment: None, status: EntryStatus::Translated, custom: Default::default() }] + /// }; + /// let fr = Resource{ + /// metadata: Metadata{ language: "fr".into(), domain: String::new(), custom: Default::default() }, + /// entries: vec![Entry{ id: "greet".into(), value: Translation::Singular("Bonjour %1$s".into()), comment: None, status: EntryStatus::Translated, custom: Default::default() }] + /// }; + /// codec.add_resource(en); + /// codec.add_resource(fr); + /// assert!(codec.validate_placeholders(true).is_ok()); + /// ``` + pub fn validate_placeholders(&self, strict: bool) -> Result<(), Error> { + use crate::placeholder::signature; + use crate::types::Translation; + use std::collections::HashMap; + + // key -> lang -> Vec + let mut map: HashMap>>> = HashMap::new(); + + for res in &self.resources { + for entry in &res.entries { + let sigs: Vec> = match &entry.value { + Translation::Singular(v) => vec![signature(v)], + Translation::Plural(p) => p.forms.values().map(|v| signature(v)).collect(), + }; + map.entry(entry.id.clone()) + .or_default() + .entry(res.metadata.language.clone()) + .or_default() + .push(sigs.into_iter().flatten().collect()); + } + } + + let mut problems = Vec::new(); + + for (key, langs) in map { + // Per-language: ensure all collected signatures for this entry are identical + let mut per_lang_sig: HashMap> = HashMap::new(); + for (lang, sig_lists) in langs { + if let Some(first) = sig_lists.first() { + if sig_lists.iter().any(|s| s != first) { + problems.push(format!( + "Key '{}' in 
'{}': inconsistent placeholders across forms: {:?}", + key, lang, sig_lists + )); + } + per_lang_sig.insert(lang, first.clone()); + } + } + + // Across languages, pick one baseline and compare + if let Some((base_lang, base_sig)) = per_lang_sig.iter().next() { + for (lang, sig) in &per_lang_sig { + if sig != base_sig { + problems.push(format!( + "Key '{}' mismatch: {} {:?} vs {} {:?}", + key, base_lang, base_sig, lang, sig + )); + } + } + } + } + + if problems.is_empty() { + return Ok(()); + } + if strict { + return Err(Error::validation_error(format!( + "Placeholder issues: {}", + problems.join(" | ") + ))); + } + // Non-strict mode: treat as success + Ok(()) + } + + /// Collect placeholder issues without failing. + /// Returns a list of human-readable messages; empty if none. + /// + /// Useful to warn in non-strict mode. + pub fn collect_placeholder_issues(&self) -> Vec { + use crate::placeholder::signature; + use crate::types::Translation; + use std::collections::HashMap; + + let mut map: HashMap>>> = HashMap::new(); + for res in &self.resources { + for entry in &res.entries { + let sigs: Vec> = match &entry.value { + Translation::Singular(v) => vec![signature(v)], + Translation::Plural(p) => p.forms.values().map(|v| signature(v)).collect(), + }; + map.entry(entry.id.clone()) + .or_default() + .entry(res.metadata.language.clone()) + .or_default() + .push(sigs.into_iter().flatten().collect()); + } + } + + let mut problems = Vec::new(); + for (key, langs) in map { + let mut per_lang_sig: HashMap> = HashMap::new(); + for (lang, sig_lists) in langs { + if let Some(first) = sig_lists.first() { + if sig_lists.iter().any(|s| s != first) { + problems.push(format!( + "Key '{}' in '{}': inconsistent placeholders across forms: {:?}", + key, lang, sig_lists + )); + } + per_lang_sig.insert(lang, first.clone()); + } + } + if let Some((base_lang, base_sig)) = per_lang_sig.iter().next() { + for (lang, sig) in &per_lang_sig { + if sig != base_sig { + problems.push(format!( + 
"Key '{}' mismatch: {} {:?} vs {} {:?}", + key, base_lang, base_sig, lang, sig + )); + } + } + } + } + problems + } + + /// Normalize placeholders in all entries (mutates in place). + /// Converts iOS patterns like `%@`, `%1$@`, `%ld` to canonical forms (%s, %1$s, %d/%u). + /// + /// Example + /// ```rust + /// use langcodec::{Codec, types::{Entry, EntryStatus, Metadata, Resource, Translation}}; + /// let mut codec = Codec::new(); + /// codec.add_resource(Resource{ + /// metadata: Metadata{ language: "en".into(), domain: String::new(), custom: Default::default() }, + /// entries: vec![Entry{ id: "id".into(), value: Translation::Singular("Hello %@ and %1$@".into()), comment: None, status: EntryStatus::Translated, custom: Default::default() }] + /// }); + /// codec.normalize_placeholders_in_place(); + /// let v = match &codec.resources[0].entries[0].value { Translation::Singular(v) => v.clone(), _ => unreachable!() }; + /// assert!(v.contains("%s") && v.contains("%1$s")); + /// ``` + pub fn normalize_placeholders_in_place(&mut self) { + use crate::placeholder::normalize_placeholders; + use crate::types::Translation; + for res in &mut self.resources { + for entry in &mut res.entries { + match &mut entry.value { + Translation::Singular(v) => { + let nv = normalize_placeholders(v); + *v = nv; + } + Translation::Plural(p) => { + for v in p.forms.values_mut() { + let nv = normalize_placeholders(v); + *v = nv; + } + } + } + } + } + } + /// Merge resources with the same language by the given strategy. 
/// /// This method groups resources by language and merges multiple resources @@ -1584,4 +1760,136 @@ mod tests { assert_eq!(merged.resources[0].metadata.language, "en"); assert_eq!(merged.resources[0].entries.len(), 2); } + + #[test] + fn test_validate_placeholders_across_languages() { + let mut codec = Codec::new(); + // English with %1$@, French with %1$s should match after normalization + codec.add_resource(Resource { + metadata: Metadata { + language: "en".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "greet".into(), + value: Translation::Singular("Hello %1$@".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + codec.add_resource(Resource { + metadata: Metadata { + language: "fr".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "greet".into(), + value: Translation::Singular("Bonjour %1$s".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + assert!(codec.validate_placeholders(true).is_ok()); + } + + #[test] + fn test_validate_placeholders_mismatch() { + let mut codec = Codec::new(); + codec.add_resource(Resource { + metadata: Metadata { + language: "en".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "count".into(), + value: Translation::Singular("%d files".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + codec.add_resource(Resource { + metadata: Metadata { + language: "fr".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "count".into(), + value: Translation::Singular("%s fichiers".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + assert!(codec.validate_placeholders(true).is_err()); + } + + #[test] + fn test_collect_placeholder_issues_non_strict_ok() { + let mut codec = Codec::new(); + 
codec.add_resource(Resource { + metadata: Metadata { + language: "en".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "count".into(), + value: Translation::Singular("%d files".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + codec.add_resource(Resource { + metadata: Metadata { + language: "fr".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "count".into(), + value: Translation::Singular("%s fichiers".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + // Non-strict should be Ok but issues present + assert!(codec.validate_placeholders(false).is_ok()); + let issues = codec.collect_placeholder_issues(); + assert!(!issues.is_empty()); + } + + #[test] + fn test_normalize_placeholders_in_place() { + let mut codec = Codec::new(); + codec.add_resource(Resource { + metadata: Metadata { + language: "en".into(), + domain: "d".into(), + custom: HashMap::new(), + }, + entries: vec![Entry { + id: "g".into(), + value: Translation::Singular("Hello %@ and %1$@".into()), + comment: None, + status: EntryStatus::Translated, + custom: HashMap::new(), + }], + }); + codec.normalize_placeholders_in_place(); + let v = match &codec.resources[0].entries[0].value { + Translation::Singular(v) => v.clone(), + _ => String::new(), + }; + assert!(v.contains("%s")); + assert!(v.contains("%1$s")); + } } diff --git a/langcodec/src/converter.rs b/langcodec/src/converter.rs index e994064..5f511e0 100644 --- a/langcodec/src/converter.rs +++ b/langcodec/src/converter.rs @@ -9,6 +9,7 @@ use crate::{ formats::{ AndroidStringsFormat, CSVFormat, FormatType, StringsFormat, TSVFormat, XcstringsFormat, }, + placeholder::normalize_placeholders, traits::Parser, types::Resource, }; @@ -143,14 +144,135 @@ pub fn convert>( } // Read input as resources - let resources = match input_format { - FormatType::AndroidStrings(_) => 
vec![AndroidStringsFormat::read_from(&input)?.into()], - FormatType::Strings(_) => vec![StringsFormat::read_from(&input)?.into()], - FormatType::Xcstrings => Vec::::try_from(XcstringsFormat::read_from(&input)?)?, - FormatType::CSV => Vec::::try_from(CSVFormat::read_from(&input)?)?, - FormatType::TSV => Vec::::try_from(TSVFormat::read_from(&input)?)?, + let mut resources = match input_format { + FormatType::AndroidStrings(_) => vec![AndroidStringsFormat::read_from(input)?.into()], + FormatType::Strings(_) => vec![StringsFormat::read_from(input)?.into()], + FormatType::Xcstrings => Vec::::try_from(XcstringsFormat::read_from(input)?)?, + FormatType::CSV => Vec::::try_from(CSVFormat::read_from(input)?)?, + FormatType::TSV => Vec::::try_from(TSVFormat::read_from(input)?)?, }; + // Ensure language is set for single-language inputs if provided on input_format + if let Some(l) = input_format.language().cloned() { + for res in &mut resources { + if res.metadata.language.is_empty() { + res.metadata.language = l.clone(); + } + } + } + + // Helper to extract resource by language if present, or first one + let pick_resource = |lang: Option| -> Option { + match lang { + Some(l) => resources.iter().find(|r| r.metadata.language == l).cloned(), + None => resources.first().cloned(), + } + }; + + match output_format { + FormatType::AndroidStrings(lang) => { + let resource = pick_resource(lang); + if let Some(res) = resource { + AndroidStringsFormat::from(res).write_to(output) + } else { + Err(Error::InvalidResource( + "No matching resource for output language.".to_string(), + )) + } + } + FormatType::Strings(lang) => { + let resource = pick_resource(lang); + if let Some(res) = resource { + StringsFormat::try_from(res)?.write_to(output) + } else { + Err(Error::InvalidResource( + "No matching resource for output language.".to_string(), + )) + } + } + FormatType::Xcstrings => XcstringsFormat::try_from(resources)?.write_to(output), + FormatType::CSV => 
CSVFormat::try_from(resources)?.write_to(output), + FormatType::TSV => TSVFormat::try_from(resources)?.write_to(output), + } +} + +/// Convert like [`convert`], with an option to normalize placeholders before writing. +/// +/// When `normalize` is true, common iOS placeholder tokens like `%@`, `%1$@`, `%ld` are +/// converted to canonical forms (`%s`, `%1$s`, `%d`) prior to serialization. +/// Convert with optional placeholder normalization. +/// +/// Example +/// ```rust,no_run +/// use langcodec::formats::FormatType; +/// use langcodec::converter::convert_with_normalization; +/// convert_with_normalization( +/// "en.lproj/Localizable.strings", +/// FormatType::Strings(Some("en".to_string())), +/// "values/strings.xml", +/// FormatType::AndroidStrings(Some("en".to_string())), +/// true, // normalize placeholders (e.g., %@ -> %s) +/// )?; +/// # Ok::<(), langcodec::Error>(()) +/// ``` +pub fn convert_with_normalization>( + input: P, + input_format: FormatType, + output: P, + output_format: FormatType, + normalize: bool, +) -> Result<(), Error> { + let input = input.as_ref(); + let output = output.as_ref(); + + // Carry language between single-language formats + let output_format = if let Some(lang) = input_format.language() { + output_format.with_language(Some(lang.clone())) + } else { + output_format + }; + + if !input_format.matches_language_of(&output_format) { + return Err(Error::InvalidResource( + "Input and output formats must match in language.".to_string(), + )); + } + + // Read input as resources + let mut resources = match input_format { + FormatType::AndroidStrings(_) => vec![AndroidStringsFormat::read_from(input)?.into()], + FormatType::Strings(_) => vec![StringsFormat::read_from(input)?.into()], + FormatType::Xcstrings => Vec::::try_from(XcstringsFormat::read_from(input)?)?, + FormatType::CSV => Vec::::try_from(CSVFormat::read_from(input)?)?, + FormatType::TSV => Vec::::try_from(TSVFormat::read_from(input)?)?, + }; + + // Ensure language is set for 
single-language inputs if provided on input_format + if let Some(l) = input_format.language().cloned() { + for res in &mut resources { + if res.metadata.language.is_empty() { + res.metadata.language = l.clone(); + } + } + } + + if normalize { + for res in &mut resources { + for entry in &mut res.entries { + match &mut entry.value { + crate::types::Translation::Singular(v) => { + *v = normalize_placeholders(v); + } + crate::types::Translation::Plural(p) => { + for (_c, v) in p.forms.iter_mut() { + *v = normalize_placeholders(v); + } + } + } + } + } + } + // Helper to extract resource by language if present, or first one let pick_resource = |lang: Option| -> Option { match lang { @@ -163,7 +285,7 @@ pub fn convert>( FormatType::AndroidStrings(lang) => { let resource = pick_resource(lang); if let Some(res) = resource { - AndroidStringsFormat::from(res).write_to(&output) + AndroidStringsFormat::from(res).write_to(output) } else { Err(Error::InvalidResource( "No matching resource for output language.".to_string(), @@ -173,16 +295,16 @@ pub fn convert>( FormatType::Strings(lang) => { let resource = pick_resource(lang); if let Some(res) = resource { - StringsFormat::try_from(res)?.write_to(&output) + StringsFormat::try_from(res)?.write_to(output) } else { Err(Error::InvalidResource( "No matching resource for output language.".to_string(), )) } } - FormatType::Xcstrings => XcstringsFormat::try_from(resources)?.write_to(&output), - FormatType::CSV => CSVFormat::try_from(resources)?.write_to(&output), - FormatType::TSV => TSVFormat::try_from(resources)?.write_to(&output), + FormatType::Xcstrings => XcstringsFormat::try_from(resources)?.write_to(output), + FormatType::CSV => CSVFormat::try_from(resources)?.write_to(output), + FormatType::TSV => TSVFormat::try_from(resources)?.write_to(output), } } @@ -223,6 +345,79 @@ pub fn convert_auto>(input: P, output: P) -> Result<(), Error> { convert(input, input_format, output, output_format) } +#[cfg(test)] +mod normalize_tests { + 
use super::*; + use std::fs; + + #[test] + fn test_convert_strings_to_android_with_normalization() { + let tmp = tempfile::tempdir().unwrap(); + let strings = tmp.path().join("en.strings"); + let xml = tmp.path().join("strings.xml"); + + fs::write(&strings, "\n\"g\" = \"Hello %@ and %1$@ and %ld\";\n").unwrap(); + + // Without normalization: convert should succeed + convert( + &strings, + FormatType::Strings(Some("en".into())), + &xml, + FormatType::AndroidStrings(Some("en".into())), + ) + .unwrap(); + let content = fs::read_to_string(&xml).unwrap(); + assert!(content.contains("Hello %")); + + // With normalization + convert_with_normalization( + &strings, + FormatType::Strings(Some("en".into())), + &xml, + FormatType::AndroidStrings(Some("en".into())), + true, + ) + .unwrap(); + let content = fs::read_to_string(&xml).unwrap(); + assert!(content.contains("%s")); + assert!(content.contains("%1$s")); + assert!(content.contains("%d")); + } +} + +/// Auto-infer formats from paths and convert, with optional placeholder normalization. +/// Auto-infer formats and convert with optional placeholder normalization. 
+/// +/// Example +/// ```rust,no_run +/// use langcodec::converter::convert_auto_with_normalization; +/// convert_auto_with_normalization( +/// "Localizable.strings", +/// "strings.xml", +/// true, // normalize placeholders +/// )?; +/// # Ok::<(), langcodec::Error>(()) +/// ``` +pub fn convert_auto_with_normalization>( + input: P, + output: P, + normalize: bool, +) -> Result<(), Error> { + let input_format = infer_format_from_path(&input).ok_or_else(|| { + Error::UnknownFormat(format!( + "Cannot infer input format from extension: {:?}", + input.as_ref().extension() + )) + })?; + let output_format = infer_format_from_path(&output).ok_or_else(|| { + Error::UnknownFormat(format!( + "Cannot infer output format from extension: {:?}", + output.as_ref().extension() + )) + })?; + convert_with_normalization(input, input_format, output, output_format, normalize) +} + /// Infers a [`FormatType`] from a file path's extension. /// /// Returns `Some(FormatType)` if the extension matches a known format, otherwise `None`. 
// ---------------------------------------------------------------------------
// Patch context preceding this module, kept verbatim for reference
// (langcodec/src/lib.rs registers and re-exports the `placeholder` module).
//
// diff --git a/langcodec/src/lib.rs b/langcodec/src/lib.rs
// index 83fb918..dcff9cd 100644
// --- a/langcodec/src/lib.rs
// +++ b/langcodec/src/lib.rs
// @@ -142,6 +142,7 @@ pub mod codec;
//  pub mod converter;
//  pub mod error;
//  pub mod formats;
// +pub mod placeholder;
//  pub mod traits;
//  pub mod types;
//
// @@ -150,11 +151,13 @@ pub use crate::{
//      builder::CodecBuilder,
//      codec::Codec,
//      converter::{
// -        convert, convert_auto, convert_resources_to_format, infer_format_from_extension,
// -        infer_format_from_path, infer_language_from_path, merge_resources,
// +        convert, convert_auto, convert_auto_with_normalization, convert_resources_to_format,
// +        convert_with_normalization, infer_format_from_extension, infer_format_from_path,
// +        infer_language_from_path, merge_resources,
//      },
//      error::Error,
//      formats::FormatType,
// +    placeholder::{extract_placeholders, normalize_placeholders, signature},
//      types::{
//          ConflictStrategy, Entry, EntryStatus, Metadata, Plural, PluralCategory, Resource,
//          Translation,
//
// diff --git a/langcodec/src/placeholder.rs b/langcodec/src/placeholder.rs
// new file mode 100644
// index 0000000..1ba24d6
// --- /dev/null
// +++ b/langcodec/src/placeholder.rs
// @@ -0,0 +1,178 @@
// ---------------------------------------------------------------------------

// Module documentation of langcodec/src/placeholder.rs (written as plain
// comments because this section does not sit at the start of a file):
//
// Placeholder parsing, normalization and validation utilities.
//
// Goals:
// - Normalize common iOS vs Android placeholder variants to a canonical form.
// - Extract a placeholder "signature" for comparison across languages.
// - Validate placeholder consistency per entry (across all languages and plural forms).

/// A single printf-style placeholder found in a localized string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PlaceholderToken {
    /// Explicit positional index (the `N` in `%N$s`), if one was written.
    pub index: Option<usize>,
    /// Canonical kind: s, d, f, etc.
    pub kind: char,
}

impl PlaceholderToken {
    /// Renders the token without the leading `%`, e.g. `1$s` or `d`.
    pub fn to_signature(&self) -> String {
        match self.index {
            Some(i) => format!("{}${}", i, self.kind),
            None => self.kind.to_string(),
        }
    }
}

/// Extracts placeholder tokens from a string and returns them in occurrence order.
/// Handles iOS and Android variants and ignores escaped percent `%%`.
///
/// A placeholder is `%`, an optional positional index (`N$`), an optional `l`/`ll`
/// length modifier, and a type character (any ASCII letter or `@`). Anything else
/// after a `%` is skipped.
pub fn extract_placeholders(input: &str) -> Vec<PlaceholderToken> {
    // Scanning bytes is safe here: every byte that is inspected or compared is
    // ASCII, and multi-byte UTF-8 sequences never contain ASCII bytes.
    let bytes = input.as_bytes();
    let mut tokens = Vec::new();
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'%' {
            i += 1;
            continue;
        }
        // Escaped percent: consume both characters.
        if bytes.get(i + 1) == Some(&b'%') {
            i += 2;
            continue;
        }

        // Optional positional index: digits followed by '$'.
        let digits_start = i + 1;
        let mut j = digits_start;
        while j < bytes.len() && bytes[j].is_ascii_digit() {
            j += 1;
        }
        let mut index = None;
        if j > digits_start && bytes.get(j) == Some(&b'$') {
            // An index too large for `usize` is dropped, but the token is kept.
            index = input[digits_start..j].parse::<usize>().ok();
            j += 1; // skip '$'
        } else {
            // Digits without '$' are not a positional index; rewind.
            j = digits_start;
        }

        // Optional length modifiers (l/ll).
        if bytes.get(j) == Some(&b'l') {
            j += 1;
            if bytes.get(j) == Some(&b'l') {
                j += 1;
            }
        }

        // Expect a type character.
        match bytes.get(j).map(|&b| b as char) {
            Some(ch) if ch.is_ascii_alphabetic() || ch == '@' => {
                tokens.push(PlaceholderToken {
                    index,
                    kind: canonical_kind_char(ch),
                });
                i = j + 1;
            }
            // Not a recognized placeholder; skip this '%'.
            _ => i += 1,
        }
    }

    tokens
}

/// Normalize a string by converting iOS-specific tokens to canonical ones.
/// - %@ -> %s
/// - %1$@ -> %1$s
/// - %ld, %lu -> %d / %u (also with a positional index, e.g. %1$ld -> %1$d)
///
/// All other text is copied through unchanged, including non-ASCII characters.
/// An escaped percent (`%%`) is never treated as the start of a placeholder, so
/// `%%@` stays `%%@`.
pub fn normalize_placeholders(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut out = String::with_capacity(input.len());
    // Start of the text that has been scanned but not yet copied to `out`.
    // It only ever sits at 0 or right after an ASCII byte, so slicing `input`
    // at `copied` is always on a character boundary.
    let mut copied = 0;
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] != b'%' {
            i += 1;
            continue;
        }
        // Escaped percent: leave both characters untouched.
        if bytes.get(i + 1) == Some(&b'%') {
            i += 2;
            continue;
        }

        // Optional positional index: digits followed by '$'.
        let mut j = i + 1;
        while j < bytes.len() && bytes[j].is_ascii_digit() {
            j += 1;
        }
        let spec_start = if j > i + 1 && bytes.get(j) == Some(&b'$') {
            j + 1
        } else {
            i + 1
        };

        let rest = &bytes[spec_start..];
        let (canonical, consumed) = if rest.starts_with(b"@") {
            ("s", 1)
        } else if rest.starts_with(b"ld") {
            ("d", 2)
        } else if rest.starts_with(b"lu") {
            ("u", 2)
        } else {
            // Nothing to rewrite at this '%'.
            i += 1;
            continue;
        };

        // Copy everything up to the conversion (including `%` and any `N$`),
        // then emit the canonical conversion in place of the iOS one.
        out.push_str(&input[copied..spec_start]);
        out.push_str(canonical);
        copied = spec_start + consumed;
        i = copied;
    }

    out.push_str(&input[copied..]);
    out
}

/// Build a normalized signature (sequence of tokens) for comparison.
///
/// Each element is a token rendered by [`PlaceholderToken::to_signature`], in
/// occurrence order.
pub fn signature(input: &str) -> Vec<String> {
    extract_placeholders(&normalize_placeholders(input))
        .into_iter()
        .map(|t| t.to_signature())
        .collect()
}

/// Maps a raw type character to its canonical kind.
fn canonical_kind_char(ch: char) -> char {
    match ch {
        '@' => 's',
        // Map uppercase to lowercase for type letters where it matters
        c => c.to_ascii_lowercase(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_android_and_ios() {
        let s = "Hello %1$@, you have %2$d items and %s extra";
        let sig = signature(s);
        assert_eq!(sig, vec!["1$s", "2$d", "s"]);
    }

    #[test]
    fn test_normalize_ios_simple() {
        let s = "Value: %@ and number %ld";
        let n = normalize_placeholders(s);
        assert!(n.contains("%s"));
        assert!(n.contains("%d"));
        assert_eq!(signature(s), vec!["s", "d"]);
    }

    #[test]
    fn test_normalize_positional_object() {
        let s = "Hello %1$@";
        let n = normalize_placeholders(s);
        assert!(n.contains("%1$s"));
    }

    #[test]
    fn test_ignore_escaped_percent() {
        let s = "Discount: 50%% and value %d";
        let sig = signature(s);
        assert_eq!(sig, vec!["d"]);
    }

    #[test]
    fn test_normalize_preserves_non_ascii() {
        assert_eq!(
            normalize_placeholders("Café %@ – ¡Olé! %ld"),
            "Café %s – ¡Olé! %d"
        );
    }

    #[test]
    fn test_normalize_leaves_escaped_percent_alone() {
        assert_eq!(normalize_placeholders("100%%@ and %@"), "100%%@ and %s");
    }
}