From 9759d283c9127599d7c426a4e3c42ad93961ac92 Mon Sep 17 00:00:00 2001
From: oguna
Date: Sun, 12 May 2024 15:47:51 +0900
Subject: [PATCH] add dictionary builder

---
Review notes (text between the three-dash line and the first "diff --git"
line is commentary and does not become part of the commit message):

 * What the change does: adds compact_dictionary_builder.rs, whose
   `build` takes a reading -> candidate-words map, drops every key that
   contains a character `encode_char` cannot encode, builds one LOUDS
   trie over the keys and one over the distinct values, and serializes
   key trie, value trie and the key -> value mapping big-endian into a
   byte vector. It also changes the two initial edges in
   LoudsTrie::build from 0x30 to 0x20 and registers the new module.

 * Restoration: this copy was rebuilt from an extraction that had
   collapsed all line breaks and indentation and had dropped every
   angle-bracketed span. Line structure, 4-space indentation and the
   generic parameters below were reconstructed from the surrounding
   code; the new file is again 162 lines, matching the hunk header.
   Indentation of the context lines in louds_trie.rs and mod.rs is
   assumed, so verify it (or apply with --ignore-whitespace).

 * Still incomplete: the author address on the "From:" line is missing
   (presumably stripped the same way), and the two context lines
   `let mut memo: Vec = ...` and `let mut child_sizes: Vec = ...` in the
   louds_trie.rs hunk have lost their element type. Nothing in this
   patch determines those types; restore them from louds_trie.rs before
   applying.

 * NOTE(review): `encode_char` maps U+30FC to 0x30fc - 0x3040 + 0xa0 =
   0x15c, and `build` writes each key edge with `compact_char as u8`,
   so the stored byte is 0x5c, the same byte that '\' (U+005C) encodes
   to. Confirm against the decoder in compact_dictionary.rs that this
   is intended.

 * NOTE(review): the key filter in `build` pushes and prints a key once
   per unencodable character, so a key with several such characters is
   reported several times. Breaking after the first hit (or using
   HashMap::retain) would report each skipped key once.

 * NOTE(review): `mod tests` has no #[cfg(test)] attribute, so its
   `use` items are compiled into non-test builds as well.

 src/migemo/compact_dictionary_builder.rs | 162 +++++++++++++++++++++++
 src/migemo/louds_trie.rs                 |   2 +-
 src/migemo/mod.rs                        |   3 +-
 3 files changed, 165 insertions(+), 2 deletions(-)
 create mode 100644 src/migemo/compact_dictionary_builder.rs

diff --git a/src/migemo/compact_dictionary_builder.rs b/src/migemo/compact_dictionary_builder.rs
new file mode 100644
index 0000000..5488617
--- /dev/null
+++ b/src/migemo/compact_dictionary_builder.rs
@@ -0,0 +1,162 @@
+use std::collections::HashMap;
+
+use byteorder::{BigEndian, WriteBytesExt};
+
+use super::{bit_list::BitList, louds_trie::LoudsTrie};
+
+fn encode_char(c: char) -> Option<u16> {
+    if c == '\u{00}' {
+        return Some(0);
+    }
+    if '\u{20}' <= c && c <= '\u{7e}' {
+        return Some(c as u16);
+    }
+    if '\u{3041}' <= c && c <= '\u{3096}' {
+        return Some((c as u16) - 0x3040 + 0xa0);
+    }
+    if '\u{30fc}' == c {
+        return Some((c as u16) - 0x3040 + 0xa0);
+    }
+    return None;
+}
+
+pub fn build(mut dict: HashMap<String, Vec<String>>) -> Vec<u8> {
+    // remove some keys
+    let mut keys_to_remove = Vec::new();
+    for key in dict.keys() {
+        for c in key.chars() {
+            let encoded = encode_char(c);
+            match encoded {
+                Some(_) => {}
+                None => {
+                    keys_to_remove.push(key.clone());
+                    println!("skipped the word: {}", key);
+                }
+            }
+        }
+    }
+    for key in keys_to_remove {
+        dict.remove(&key);
+    }
+
+    // build key trie
+    let mut keys: Vec<Vec<u16>> = dict.keys().map(|s| s.encode_utf16().collect()).collect();
+    keys.sort();
+    let key_trie = LoudsTrie::build(&keys).0;
+
+    // build value trie
+    let mut values_set = std::collections::HashSet::new();
+    for value in dict.values() {
+        for v in value {
+            values_set.insert(v.clone());
+        }
+    }
+    let mut values: Vec<Vec<u16>> = values_set.iter().map(|s| s.encode_utf16().collect()).collect();
+    values.sort();
+    let value_trie = LoudsTrie::build(&values).0;
+
+    // build trie mapping
+    let mut mapping_count = 0;
+    for i in dict.values() {
+        mapping_count += i.len();
+    }
+    let mut mapping: Vec<u32> = vec![0; mapping_count];
+    let mut mapping_index = 0;
+    let mut mapping_bit_list = BitList::new();
+    for i in 1..=key_trie.size() + 1 {
+        let key = key_trie.get_key(i);
+        mapping_bit_list.add(false);
+        if let Some(values) = dict.get(&String::from_utf16_lossy(&key)) {
+            for j in 0..values.len() {
+                mapping_bit_list.add(true);
+                let a: Vec<u16> = values[j].encode_utf16().collect();
+                mapping[mapping_index] = value_trie.get(&a).unwrap() as u32;
+                mapping_index += 1;
+            }
+        }
+    }
+
+    // calculate output size
+    let key_trie_data_size =
+        8 + key_trie.edges.len() + ((key_trie.bit_vector.size() + 63) >> 6) * 8;
+    let value_trie_data_size =
+        8 + value_trie.edges.len() * 2 + ((value_trie.bit_vector.size() + 63) >> 6) * 8;
+    let mapping_data_size = 8 + ((mapping_bit_list.size + 63) >> 6) * 8 + mapping.len() * 4;
+    let output_data_size = key_trie_data_size + value_trie_data_size + mapping_data_size;
+
+    // ready output
+    let mut output_data: Vec<u8> = Vec::with_capacity(output_data_size);
+
+    // output key trie
+    output_data.write_i32::<BigEndian>(key_trie.edges.len() as i32).unwrap();
+    for edge in key_trie.edges {
+        let compact_char = encode_char(char::from_u32(edge as u32).unwrap()).unwrap();
+        output_data.write_u8(compact_char as u8).unwrap();
+    }
+    output_data.write_i32::<BigEndian>(key_trie.bit_vector.size() as i32).unwrap();
+    for word in key_trie.bit_vector.words {
+        output_data.write_u64::<BigEndian>(word).unwrap();
+    }
+
+    // output value trie
+    output_data.write_i32::<BigEndian>(value_trie.edges.len() as i32).unwrap();
+    for edge in value_trie.edges {
+        output_data.write_u16::<BigEndian>(edge).unwrap();
+    }
+    output_data.write_i32::<BigEndian>(value_trie.bit_vector.size() as i32).unwrap();
+    for word in value_trie.bit_vector.words {
+        output_data.write_u64::<BigEndian>(word).unwrap();
+    }
+
+    // output mapping
+    output_data.write_i32::<BigEndian>(mapping_bit_list.size as i32).unwrap();
+    let mapping_words_len = (mapping_bit_list.size + 63) >> 6;
+    for i in 0..mapping_words_len {
+        output_data.write_u64::<BigEndian>(mapping_bit_list.words[i]).unwrap();
+    }
+    output_data.write_i32::<BigEndian>(mapping.len() as i32).unwrap();
+    for value in mapping {
+        output_data.write_u32::<BigEndian>(value).unwrap();
+    }
+
+    // check data size
+    let data_view_index = output_data.len();
+    if data_view_index != output_data_size {
+        panic!("file size is not valid: expected={}, actual={}", output_data_size, data_view_index);
+    }
+    return output_data;
+}
+
+mod tests {
+    use std::{collections::HashMap};
+
+    use crate::migemo::compact_dictionary::CompactDictionary;
+
+    use super::build;
+
+    #[test]
+    fn test_1() {
+        let mut dict = HashMap::new();
+        dict.insert("けんさ".to_string(), vec!["検査".to_string()]);
+        dict.insert("けんさく".to_string(), vec!["検索".to_string(),"研削".to_string()]);
+        let buffer = build(dict);
+        let compact_dict = CompactDictionary::new(&buffer);
+        let a: Vec<u16> = "けんさく".encode_utf16().collect();
+        let mut result: Vec<String> = Vec::new();
+        for s in compact_dict.search(&a) {
+            result.push(String::from_utf16(&s).unwrap());
+        }
+        assert_eq!(result[0], "検索");
+        assert_eq!(result[1], "研削");
+        assert_eq!(result.len(), 2);
+
+        let expected_buffer: Vec<u8> = vec![
+            0x00, 0x00, 0x00, 0x06, 0x20, 0x20, 0xB1, 0xF3, 0xB5, 0xAF, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x01, 0x55, 0x00, 0x00, 0x00, 0x07, 0x00, 0x20, 0x00, 0x20, 0x69, 0x1C,
+            0x78, 0x14, 0x67, 0xFB, 0x7D, 0x22, 0x52, 0x4A, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x01, 0x6D, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xD0,
+            0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06
+        ];
+        assert_eq!(buffer, expected_buffer);
+    }
+}
diff --git a/src/migemo/louds_trie.rs b/src/migemo/louds_trie.rs
index 9d55087..d28029d 100644
--- a/src/migemo/louds_trie.rs
+++ b/src/migemo/louds_trie.rs
@@ -126,7 +126,7 @@ impl LoudsTrie {
         let mut memo: Vec = vec![1; keys.len()];
         let mut offset = 0;
         let mut current_node: usize = 1;
-        let mut edges = vec![0x30, 0x30];
+        let mut edges = vec![0x20, 0x20];
         let mut child_sizes: Vec = vec![0; 128];
         loop {
             let mut last_char = 0;
diff --git a/src/migemo/mod.rs b/src/migemo/mod.rs
index 6aaef9d..129dc0a 100644
--- a/src/migemo/mod.rs
+++ b/src/migemo/mod.rs
@@ -7,4 +7,5 @@ pub mod romaji_processor;
 pub mod query;
 pub mod bit_list;
 pub mod ternary_regex_generator;
-pub mod simple_dictionary;
\ No newline at end of file
+pub mod simple_dictionary;
+pub mod compact_dictionary_builder;
\ No newline at end of file