From 6cbc2d893f0d499292d0f488accf6c18198beb00 Mon Sep 17 00:00:00 2001
From: Mathew Sanders
Date: Mon, 2 Jan 2017 15:55:49 -0500
Subject: [PATCH] Change from token type to tokenizer type (#2)

* Update API

Switch internal terminology from use of `Token` to `Tokenizer`.
Change external API from `matches(:)` back to the original `tokens(:)`.
Add a convenience method `components(:)` that returns just substrings rather than array of `Token`.

* Update file name.

* Update README to show `Token` type.

* Include `import Mustard` in code snippet.
---
 Documentation/Expressive matching.md          |  82 +++++-----
 .../Greedy tokens and tokenizer order.md      |  58 +++----
 Documentation/Matching emoji.md               |  12 +-
 Documentation/TallyType protocol.md           | 114 -------------
 Documentation/TokenizerType protocol.md       | 126 +++++++++++++++
 Documentation/Tokens with internal state.md   |  62 +++----
 Mustard/Mustard.xcodeproj/project.pbxproj     |   8 +-
 Mustard/Mustard/CharacterSet+Mustard.swift    |  30 ++--
 Mustard/Mustard/Mustard.swift                 |  78 ++++-----
 Mustard/Mustard/TokenType.swift               | 141 ----------------
 Mustard/Mustard/TokenizerType.swift           | 153 ++++++++++++++++++
 .../MustardTests/CharacterSetTokenTests.swift |  26 +--
 Mustard/MustardTests/CustomTokenTests.swift   |  32 ++--
 Mustard/MustardTests/EmojiTokenTests.swift    |  24 +--
 .../MustardTests/FallbackTokenizerTests.swift |  28 ++--
 .../MustardTests/FuzzyMatchTokenTests.swift   |  42 ++---
 Mustard/MustardTests/LiteralTokenTests.swift  |  22 +--
 Mustard/MustardTests/MixedTokenTests.swift    |  70 ++++----
 README.md                                     |  73 +++++----
 19 files changed, 614 insertions(+), 567 deletions(-)
 delete mode 100644 Documentation/TallyType protocol.md
 create mode 100644 Documentation/TokenizerType protocol.md
 delete mode 100644 Mustard/Mustard/TokenType.swift
 create mode 100644 Mustard/Mustard/TokenizerType.swift

diff --git a/Documentation/Expressive matching.md b/Documentation/Expressive matching.md
index badb092..ace006e 100644
--- a/Documentation/Expressive matching.md
+++ b/Documentation/Expressive matching.md
@@ -1,83 +1,79 @@
# Example: expressive matching

-The results returned by `matches(from:)`returns an array tuples with the signature `(tokenizer: TokenType, text: String, range: Range<String.Index>)`
+The `tokens(matchedWith:)` method returns an array of `Token`, which in turn is a tuple with the signature `(tokenizer: TokenizerType, text: String, range: Range<String.Index>)`

To make use of the `tokenizer` element, you need to either use type casting (using `as?`) or type checking (using `is`) on the `tokenizer` element.

-Maybe we want to filter out only tokens that are numbers:
+Maybe we want to keep only the tokens that were matched with a number tokenizer:

````Swift
import Mustard

-let messy = "123Hello world&^45.67"
-let matches = messy.matches(from: .decimalDigits, .letters)
-// matches.count -> 5
+let tokens = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters)
+// tokens.count -> 5

-let numbers = matches.filter({ $0.tokenizer is NumberToken })
-// numbers.count -> 0
+let numberTokens = tokens.filter({ $0.tokenizer is NumberTokenizer })
+// numberTokens.count -> 0
````

-This can lead to bugs in your logic-- in the example above `numberTokens` will be empty because the tokenizers used were the character sets `.decimalDigits`, and `.letters`, so the filter won't match any of the tokens.
+This can lead to bugs in your logic-- in the example above `numberTokens` will be empty because the tokenizers used were `CharacterSet.decimalDigits`, and `CharacterSet.letters`, so the filter won't match any of the tokens.

This may seem like an obvious error, but it's the type of unexpected bug that can slip in when we're using loosely typed results.

-Thankfully, Mustard can return a strongly typed set of matches if a single `TokenType` is used:
+Thankfully, Mustard can return a strongly typed set of matches if a single `TokenizerType` is used:

````Swift
import Mustard

-let messy = "123Hello world&^45.67"
-
-// call `matches()` method on string to get matching tokens from string
-let numberMatches: [NumberToken.Match] = messy.matches()
-// numberMatches.count -> 2
+// call `tokens()` method on `String` to get matching tokens from the string
+let numberTokens: [NumberTokenizer.Token] = "123Hello world&^45.67".tokens()
+// numberTokens.count -> 2
````

-Used in this way, this isn't very useful, but it does allow for multiple `TokenType` to be bundled together as a single `TokenType` by implementing a TokenType using an `enum`.
+Used in this way, this isn't very useful, but it does allow multiple tokenizers to be bundled together as a single `TokenizerType` implemented with an `enum`.

-An enum token type can either manage it's own internal state, or potentially act as a lightweight wrapper to existing tokenizers.
-Here's an example `TokenType` that acts as a wrapper for word, number, and emoji tokenizers:
+An enum tokenizer can either manage its own internal state, or act as a lightweight wrapper around other existing tokenizers.

-````Swift
+Here's an example `TokenizerType` that acts as a wrapper for word, number, and emoji tokenizers:

-enum MixedToken: TokenType {
+````Swift
+enum MixedTokenizer: TokenizerType {

    case word
    case number
    case emoji
    case none // 'none' case not strictly needed, and
              // in this implementation will never be matched
-
    init() { self = .none }

-    static let wordToken = WordToken()
-    static let numberToken = NumberToken()
-    static let emojiToken = EmojiToken()
+    static let wordTokenizer = WordTokenizer()
+    static let numberTokenizer = NumberTokenizer()
+    static let emojiTokenizer = EmojiTokenizer()

-    func canAppend(next scalar: UnicodeScalar) -> Bool {
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
        switch self {
-        case .word: return MixedToken.wordToken.canAppend(next: scalar)
-        case .number: return MixedToken.numberToken.canAppend(next: scalar)
-        case .emoji: return MixedToken.emojiToken.canAppend(next: scalar)
+        case .word: return MixedTokenizer.wordTokenizer.tokenCanTake(scalar)
+        case .number: return MixedTokenizer.numberTokenizer.tokenCanTake(scalar)
+        case .emoji: return MixedTokenizer.emojiTokenizer.tokenCanTake(scalar)
        case .none: return false
        }
    }

-    func token(startingWith scalar: UnicodeScalar) -> TokenType? {
+    func token(startingWith scalar: UnicodeScalar) -> TokenizerType? {
-        if let _ = MixedToken.wordToken.token(startingWith: scalar) {
-            return MixedToken.word
+        if let _ = MixedTokenizer.wordTokenizer.token(startingWith: scalar) {
+            return MixedTokenizer.word
        }
-        else if let _ = MixedToken.numberToken.token(startingWith: scalar) {
-            return MixedToken.number
+        else if let _ = MixedTokenizer.numberTokenizer.token(startingWith: scalar) {
+            return MixedTokenizer.number
        }
-        else if let _ = MixedToken.emojiToken.token(startingWith: scalar) {
-            return MixedToken.emoji
+        else if let _ = MixedTokenizer.emojiTokenizer.token(startingWith: scalar) {
+            return MixedTokenizer.emoji
        }
        else {
            return nil
@@ -90,25 +86,25 @@ Mustard defines a default typealias for `Token` that exposes the specific type i
results tuple.

````Swift
-public extension TokenType {
-    typealias Match = (tokenizer: Self, text: String, range: Range<String.Index>)
+public extension TokenizerType {
+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
}
````

-Setting your results array to this type gives you the option to use the shorter `matches()` method,
+Setting your results array to this type gives you the option to use the shorter `tokens()` method,
where Mustard uses the inferred type to perform tokenization.

-Since the matches array is strongly typed, you can be more expressive with the results, and the
+Since the tokens array is strongly typed, you can be more expressive with the results, and the
compiler can give you more hints to prevent you from making mistakes.

````Swift
-// use the `matches()` method to grab matching substrings using a single tokenizer
-let matches: [MixedToken.Match] = "123πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦Hello worldπŸ‘Ά againπŸ‘ΆπŸΏ 45.67".matches()
-// matches.count -> 8
+// use the `tokens()` method to grab matching substrings using a single tokenizer
+let tokens: [MixedTokenizer.Token] = "123πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦Hello worldπŸ‘Ά againπŸ‘ΆπŸΏ 45.67".tokens()
+// tokens.count -> 8

-matches.forEach({ match in
-    switch (match.tokenizer, match.text) {
+tokens.forEach({ token in
+    switch (token.tokenizer, token.text) {
    case (.word, let word): print("word:", word)
    case (.number, let number): print("number:", number)
    case (.emoji, let emoji): print("emoji:", emoji)
diff --git a/Documentation/Greedy tokens and tokenizer order.md b/Documentation/Greedy tokens and tokenizer order.md
index 98ea25b..2c891a1 100644
--- a/Documentation/Greedy tokens and tokenizer order.md
+++ b/Documentation/Greedy tokens and tokenizer order.md
@@ -1,43 +1,43 @@
# Greedy tokens and tokenizer order

-Tokenizers are greedy. The order that tokenizers are passed into the `matches(from: TokenType...)` will effect how substrings are matched.
+Tokenizers are greedy. The order that tokenizers are passed into `tokens(matchedWith: TokenizerType...)` will affect how substrings are matched.

-Here's an example using the `CharacterSet.decimalDigits` tokenizer and the custom tokenizer `DateToken` that matches dates in the format `MM/dd/yy` ([see example](Tokens with internal state.md) for implementation).
+Here's an example using the `CharacterSet.decimalDigits` tokenizer and the custom tokenizer `DateTokenizer` that matches dates in the format `MM/dd/yy` ([see example](Tokens with internal state.md) for implementation).
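Since both snippets below pass `DateTokenizer.defaultTokenizer`, note that this static property is just the semantic alias that `TokenizerType` provides for the default initializer. As a minimal sketch (assuming only that `DateTokenizer` has the `init()` the protocol requires), the first call could equally be written as:

````Swift
import Mustard

// equivalent to passing DateTokenizer.defaultTokenizer in the example below
let tokens = "03/29/17 36".tokens(matchedWith: CharacterSet.decimalDigits, DateTokenizer())
````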
````Swift
import Mustard

let numbers = "03/29/17 36"
-let matches = numbers.matches(from: CharacterSet.decimalDigits, DateToken.tokenizer)
-// matches.count -> 4
+let tokens = numbers.tokens(matchedWith: CharacterSet.decimalDigits, DateTokenizer.defaultTokenizer)
+// tokens.count -> 4
//
-// matches[0].text -> "03"
-// matches[0].tokenizer -> CharacterSet.decimalDigits
+// tokens[0].text -> "03"
+// tokens[0].tokenizer -> CharacterSet.decimalDigits
//
-// matches[1].text -> "29"
-// matches[1].tokenizer -> CharacterSet.decimalDigits
+// tokens[1].text -> "29"
+// tokens[1].tokenizer -> CharacterSet.decimalDigits
//
-// matches[2].text -> "17"
-// matches[2].tokenizer -> CharacterSet.decimalDigits
+// tokens[2].text -> "17"
+// tokens[2].tokenizer -> CharacterSet.decimalDigits
//
-// matches[3].text -> "36"
-// matches[3].tokenizer -> CharacterSet.decimalDigits
+// tokens[3].text -> "36"
+// tokens[3].tokenizer -> CharacterSet.decimalDigits
````

To get the expected behavior, the `tokens` method should be called with more specific tokenizers placed before more general tokenizers:

````Swift
import Mustard

let numbers = "03/29/17 36"
-let matches = numbers.matches(from: DateToken.tokenizer, CharacterSet.decimalDigits)
-// matches.count -> 2
+let tokens = numbers.tokens(matchedWith: DateTokenizer.defaultTokenizer, CharacterSet.decimalDigits)
+// tokens.count -> 2
//
-// matches[0].text -> "03/29/17"
-// matches[0].tokenizer -> DateToken()
+// tokens[0].text -> "03/29/17"
+// tokens[0].tokenizer -> DateTokenizer()
//
-// matches[1].text -> "36"
-// matches[1].tokenizer -> CharacterSet.decimalDigits
+// tokens[1].text -> "36"
+// tokens[1].tokenizer -> CharacterSet.decimalDigits
````

If the more specific tokenizer fails to match a token, the more general tokenizers still have a chance to perform matches:

````Swift
import Mustard

let numbers = "99/99/99 36"
-let matches = numbers.matches(from: DateToken.tokenizer, CharacterSet.decimalDigits)
-// matches.count -> 4
+let tokens = numbers.tokens(matchedWith: DateTokenizer.defaultTokenizer, CharacterSet.decimalDigits)
+// tokens.count -> 4
//
-// matches[0].text -> "99"
-// matches[0].tokenizer -> CharacterSet.decimalDigits
+// tokens[0].text -> "99"
+// tokens[0].tokenizer -> CharacterSet.decimalDigits
//
-// matches[1].text -> "99"
-// matches[1].tokenizer -> CharacterSet.decimalDigits
+// tokens[1].text -> "99"
+// tokens[1].tokenizer -> CharacterSet.decimalDigits
//
-// matches[2].text -> "99"
-// matches[2].tokenizer -> CharacterSet.decimalDigits
+// tokens[2].text -> "99"
+// tokens[2].tokenizer -> CharacterSet.decimalDigits
//
-// matches[3].text -> "36"
-// matches[3].tokenizer -> CharacterSet.decimalDigits
+// tokens[3].text -> "36"
+// tokens[3].tokenizer -> CharacterSet.decimalDigits
````
diff --git a/Documentation/Matching emoji.md b/Documentation/Matching emoji.md
index e038e44..a7a246c 100644
--- a/Documentation/Matching emoji.md
+++ b/Documentation/Matching emoji.md
@@ -6,21 +6,21 @@ As an example, the character 'πŸ‘ΆπŸΏ' is comprised by two scalars: 'πŸ‘Ά', and
The rainbow flag character 'πŸ³οΈβ€πŸŒˆ' is again composed of two adjacent scalars '🏳' and '🌈'.
As a final example, the character 'πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦' is actually 7 scalars: 'πŸ‘¨' 'πŸ‘¨' 'πŸ‘§' 'πŸ‘¦' joined by three ZWJs (zero-width joiners).
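To see these scalar counts directly, here's a minimal sketch that assumes nothing beyond the standard library's `unicodeScalars` view:

````Swift
// a single rendered glyph, built from multiple unicode scalars
let family = "πŸ‘¨β€πŸ‘¨β€πŸ‘§β€πŸ‘¦"
let scalars = family.unicodeScalars.map({ String($0.value, radix: 16, uppercase: true) })
// scalars -> ["1F468", "200D", "1F468", "200D", "1F467", "200D", "1F466"]
// four 'person' scalars joined by three zero-width joiners (200D)
````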
-To create a TokenType that matches emoji we can instead check to see if a scalar falls within known range, or if it's a ZWJ.
+To create a `TokenizerType` that matches emoji we can instead check to see if a scalar falls within a known range, or if it's a ZWJ.

This isn't the most *accurate* emoji tokenizer because it would potentially match an emoji scalar followed by 100 zero-width joiners, but for basic use it might be enough.

````Swift
-struct EmojiToken: TokenType {
+struct EmojiTokenizer: TokenizerType {

    // (e.g. can't start with a ZWJ)
-    func canStart(with scalar: UnicodeScalar) -> Bool {
-        return EmojiToken.isEmojiScalar(scalar)
+    func tokenCanStart(with scalar: UnicodeScalar) -> Bool {
+        return EmojiTokenizer.isEmojiScalar(scalar)
    }

    // either in the known range for a emoji, or a ZWJ
-    func canTake(_ scalar: UnicodeScalar) -> Bool {
-        return EmojiToken.isEmojiScalar(scalar) || EmojiToken.isJoiner(scalar)
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
+        return EmojiTokenizer.isEmojiScalar(scalar) || EmojiTokenizer.isJoiner(scalar)
    }

    static func isJoiner(_ scalar: UnicodeScalar) -> Bool {
diff --git a/Documentation/TallyType protocol.md b/Documentation/TallyType protocol.md
deleted file mode 100644
index ef55aae..0000000
--- a/Documentation/TallyType protocol.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# TallyType protocol: implementing your own tokenizer
-
-You can create your own tokenizers by implementing the `TallyType` protocol.
-
-````Swift
-
-/// A tuple capturing information about a token match.
-///
-/// - tokenizer: The instance of `TokenType` that matched the token.
-/// - text: The text that the token matched.
-/// - range: The range of the matched text in the original input.
-public typealias Match = (tokenizer: TokenType, text: String, range: Range<String.Index>)
-
-public protocol TokenType {
-
-    /// Asks the token if it can start with the given scalar.
-    ///
-    /// The default implementation of this method is an alias for `canTake(_:)`.
-    /// Implement this method if the token has unique criteria for the first scalar to match.
-    ///
-    /// - Parameter scalar: The scalar to check.
-    ///
-    /// - Returns: `true` if the token can start with this scalar; otherwise, false.
-    func canStart(with scalar: UnicodeScalar) -> Bool
-
-    /// Asks the token if if can capture this scalar as a valid match.
-    ///
-    /// - Parameter scalar: The scalar to check using the token.
-    ///
-    /// - Returns: `true` if the token can take this this scalar; otherwise, false.
-    func canTake(_ scalar: UnicodeScalar) -> Bool
-
-    /// Returns a boolean value if the token is complete.
-    var isComplete: Bool { get }
-
-    /// Asks the token if it is invalid given context of the first scalar following this token.
-    ///
-    /// The default implementation of this method performs always returns `false`.
-    /// Implement this method to return `true` in situations where a token can not be followed
-    /// by certain scalars.
-    ///
-    /// - Parameter scalar: The first scalar following this token, or `nil` if the token has
-    ///   reached the end of the text.
-    ///
-    /// - Returns: `true` if the token is invalid with the following scalar; otherwise, false.
-    func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool
-
-    /// Ask the token to prepare itself to start matching a new series of scalars.
-    ///
-    /// The default implementation of this method does nothing.
-    /// Implement this method to reset the token if calls to `canTake(_:)` change the state
-    /// of the token.
-    func prepareForReuse()
-
-    /// Returns an instance of that can start with the given scalar,
-    /// or `nil` if type can't start with this scalar.
-    ///
-    /// The default implementation of this method returns itself if `canStart(with:)` returns true;
-    /// otherwise, nil.
-    func token(startingWith scalar: UnicodeScalar) -> TokenType?
-
-    /// Initialize an empty instance.
-    init()
-
-    /// Returns a new instance of a token that's a copy of the receiver.
-    ///
-    /// The object returned is set as the `tokenizer` element from a call to `matches()`
-    /// If the type implements NSCopying protocol, the default implementation returns the result of
-    /// `copy(with: nil)`; otherwise, returns self.
-    var tokenizerForMatch: TokenType { get }
-}
-
-````
-
-Default implementations are provided for all methods except for `canTake(_:)` which means many implementations may be trivial.
-As an example, here's the extension of `CharacterSet` allowing any character set to act as a `TokenType`.
-
-````Swift
-
-extension CharacterSet: TokenType {
-    public func canTake(_ scalar: UnicodeScalar) -> Bool {
-        return self.contains(scalar)
-    }
-}
-
-````
-
-Here's an example showing how to match individuals words identified by [camel case](https://en.wikipedia.org/wiki/Camel_case):
-
-````Swift
-struct CamelCaseToken: TokenType {
-
-    // start of token is identified by an uppercase letter
-    func canStart(with scalar: UnicodeScalar) -> Bool
-        return CharacterSet.uppercaseLetters.contains(scalar)
-    }
-
-    // all remaining characters must be lowercase letters
-    public func canTake(_ scalar: UnicodeScalar) -> Bool {
-        return CharacterSet.lowercaseLetters.contains(scalar)
-    }
-}
-````
-
-Mustard uses instances of TokenType to perform tokenization. If your `TokenType` uses the default initializer, you can use the static property `tokenizer` as a semantic alias.
-
-````Swift
-let words = "HelloWorld".matches(from: CamelCaseToken.tokenizer)
-// `CamelCaseToken.tokenizer` is equivalent to `CamelCaseToken()`
-
-// words.count -> 2
-// words[0].text -> "Hello"
-// words[1].text -> "World"
-````
diff --git a/Documentation/TokenizerType protocol.md b/Documentation/TokenizerType protocol.md
new file mode 100644
index 0000000..8cb6da1
--- /dev/null
+++ b/Documentation/TokenizerType protocol.md
@@ -0,0 +1,126 @@
+# TokenizerType protocol: implementing your own tokenizer
+
+You can create your own tokenizers by implementing the `TokenizerType` protocol.
+
+````Swift
+/// Token is a typealias for a tuple with the following named elements:
+///
+/// - tokenizer: An instance of `TokenizerType` that matched the token.
+/// - text: A substring that the tokenizer matched in the original string.
+/// - range: The range of the matched text in the original string.
+public typealias Token = (tokenizer: TokenizerType, text: String, range: Range<String.Index>)
+
+public protocol TokenizerType {
+
+    /// Returns an instance of a tokenizer that starts with the given scalar,
+    /// or `nil` if this type can't start with this scalar.
+    ///
+    /// The default implementation of this method returns `self` if `tokenCanStart(with:)` returns true;
+    /// otherwise, nil.
+    func token(startingWith scalar: UnicodeScalar) -> TokenizerType?
+
+    /// Checks if tokens of this type can start with the given scalar.
+    ///
+    /// The default implementation of this method is an alias for `tokenCanTake(_:)`.
+    /// Provide an alternate implementation if tokens have special starting criteria.
+    ///
+    /// - Parameter scalar: The scalar the token could start with.
+    ///
+    /// - Returns: `true` if the token can start with this scalar; otherwise, false.
+    func tokenCanStart(with scalar: UnicodeScalar) -> Bool
+
+    /// Checks if tokens can include this scalar as part of a token.
+    ///
+    /// This method is called multiple times for each subsequent scalar in a String until the tokenizer
+    /// returns `false`.
+    ///
+    /// - Parameter scalar: The scalar the token could include.
+    ///
+    /// - Returns: `true` if the token can take this scalar; otherwise, false.
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool
+
+    /// Returns a Boolean value indicating whether the token is considered complete.
+    ///
+    /// The default implementation returns `true`.
+    ///
+    /// Provide an alternate implementation if tokens have some internal criteria that need to be
+    /// satisfied before a token is complete.
+    var tokenIsComplete: Bool { get }
+
+    /// Checks if a complete token should be discarded given the context of the first scalar following this token.
+    ///
+    /// The default implementation of this method always returns `false`.
+    ///
+    /// Provide an alternate implementation to return `true` in situations where a token cannot be followed
+    /// by certain scalars.
+    ///
+    /// - Parameter scalar: The first scalar following this token, or `nil` if the tokenizer has
+    ///   matched a token that reaches the end of the text.
+    ///
+    /// - Returns: `true` if the token is invalid with the following scalar; otherwise, false.
+    func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool
+
+    /// Ask the tokenizer to prepare itself to start matching a new series of scalars.
+    ///
+    /// The default implementation of this method does nothing.
+    ///
+    /// Provide an alternate implementation if the tokenizer maintains an internal state that updates based on calls to
+    /// `tokenCanTake(_:)`.
+    func prepareForReuse()
+
+    /// Initialize an empty instance of the tokenizer.
+    init()
+
+    /// Returns an instance of the tokenizer that will be used as the `tokenizer` element in the `Token` tuple.
+    ///
+    /// If the tokenizer implements `NSCopying` protocol, the default implementation returns the result of
+    /// `copy(with: nil)`; otherwise, returns `self` which is suitable for structs.
+    ///
+    /// Provide an alternate implementation if the tokenizer is a reference type that does not implement `NSCopying`.
+    var tokenizerForMatch: TokenizerType { get }
+}
+
+````
+
+Default implementations are provided for all methods except `tokenCanTake(_:)`, which means many implementations may be trivial.
+
+As an example, here's the extension that Mustard uses to allow any `CharacterSet` to act as a tokenizer.
+
+````Swift
+
+extension CharacterSet: TokenizerType {
+    public func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
+        return self.contains(scalar)
+    }
+}
+
+````
+
+Here's an example showing how to match individual words identified by [camel case](https://en.wikipedia.org/wiki/Camel_case):
+
+````Swift
+struct CamelCaseTokenizer: TokenizerType {
+
+    // start of token is identified by an uppercase letter
+    func tokenCanStart(with scalar: UnicodeScalar) -> Bool {
+        return CharacterSet.uppercaseLetters.contains(scalar)
+    }
+
+    // all remaining characters must be lowercase letters
+    public func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
+        return CharacterSet.lowercaseLetters.contains(scalar)
+    }
+}
+````
+
+Mustard uses instances of `TokenizerType` to perform tokenization.
If your `TokenizerType` uses the default +initializer, you have the option of using the static property `defaultTokenizer` as a semantic alias. + +````Swift +let words = "HelloWorld".tokens(matchedWith: CamelCaseTokenizer.defaultTokenizer) +// `CamelCaseTokenizer.defaultTokenizer` is equivalent to `CamelCaseTokenizer()` + +// words.count -> 2 +// words[0].text -> "Hello" +// words[1].text -> "World" +```` diff --git a/Documentation/Tokens with internal state.md b/Documentation/Tokens with internal state.md index 660a6bb..7c7edb6 100644 --- a/Documentation/Tokens with internal state.md +++ b/Documentation/Tokens with internal state.md @@ -6,30 +6,30 @@ In examples so far, token types have looked at individual scalars without contex Without keeping some internal state of what's been matched so far, it's not possible to create a token that matches *cat* but not *cta* since they both start with the same scalar, and have the same set of characters. -A `LiteralToken` is a more complex `TokenType` that uses a target `String` as the basis for tokenization: +A `LiteralTokenizer` is a more complex `TokenizerType` that uses a target `String` as the basis for tokenization: ````Swift -// implementing as class rather than struct since `canTake(_:)` will have mutating effect. -class LiteralToken: TokenType { +// implementing as class rather than struct since `tokenCanTake(_:)` will have mutating effect. +class LiteralTokenizer: TokenizerType { private let target: String private var position: String.UnicodeScalarIndex - // required by the TokenType protocol, but non-sensical to use + // required by the TokenizerType protocol, but non-sensical to use required convenience init() { self.init(target: "") } - // instead, we should initalize instance with the target String we're looking for + // instead, we should initialize instance with the target String we're looking for init(target: String) { self.target = target self.position = target.unicodeScalars.startIndex } // instead of looking at a set of scalars, the order that the scalar occurs - // is relevent for the token - func canTake(_ scalar: UnicodeScalar) -> Bool { + // is relevant for the token + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { guard position < target.unicodeScalars.endIndex else { return false @@ -48,7 +48,7 @@ class LiteralToken: TokenType { // this token is only complete when we've called `canTake(_:)` with the correct sequence // of scalars such that `position` has advanced to the endIndex of the target - var isComplete: Bool { + var tokenIsComplete: Bool { return position == target.unicodeScalars.endIndex } @@ -63,7 +63,7 @@ class LiteralToken: TokenType { } } - // token instances are re-used, in most cases this doesn't matter, but because we keep + // tokenizer instances are re-used, in most cases this doesn't matter, but because we keep // an internal state, we need to reset this instance to start matching again func prepareForReuse() { position = target.unicodeScalars.startIndex @@ -71,9 +71,9 @@ class LiteralToken: TokenType { } extension String { - // a convenience to allow us to use `"cat".literalToken` instead of `LiteralToken("cat")` - var literalToken: LiteralToken { - return LiteralToken(target: self) + // a convenience to allow us to use `"cat".literalToken` instead of `LiteralTokenizer("cat")` + var literalToken: LiteralTokenizer { + return LiteralTokenizer(target: self) } } ```` @@ -82,11 +82,11 @@ This allows us to match tokens by specific words. 
Note in this example that the

````Swift
let input = "the cat and the catastrophe duck"
-let matches = input.matches(from: "cat".literalToken, "duck".literalToken)
-matches.count // -> 2
+let tokens = input.tokens(matchedWith: "cat".literalToken, "duck".literalToken)
+tokens.count // -> 2

-for match in matches {
-    print("-", "'\(match.text)'")
+for token in tokens {
+    print("-", "'\(token.text)'")
}
// prints ->
// - 'cat'
@@ -98,7 +98,7 @@

Another useful pattern would be to allow us to look for a matching sequence of scalars, but using a template rather than a literal match.

-A `DateMatch` is a more complex `TokenType` that uses an internal template as the basis for tokenization:
+A `DateTokenizer` is a more complex `TokenizerType` that uses an internal template as the basis for tokenization:

````Swift

@@ -108,7 +108,7 @@
func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool {
    return option.contains(input)
}

-class DateToken: TokenType {
+class DateTokenizer: TokenizerType {

    // private properties
    private let _template = "00/00/00"
@@ -134,7 +134,7 @@ class DateToken: TokenType {
        _dateText = ""
    }

-    func canTake(_ scalar: UnicodeScalar) -> Bool {
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {

        guard _position < _template.unicodeScalars.endIndex else {
            // we've matched all of the template
@@ -154,7 +154,7 @@
        }
    }

-    var isComplete: Bool {
+    var tokenIsComplete: Bool {
        if _position == _template.unicodeScalars.endIndex,
            let date = DateTokenizer.dateFormatter.date(from: _dateText) {
            // we've reached the end of the template
@@ -194,26 +194,26 @@

This will match tokens for any text that has the format of three pairs of numbers joined with the '/' character, but it will ignore sequences that match this format without forming a valid date.

-Combined with the technique used in the [expressive matching example](Documentation/3. Expressive matching.md) where tokenizing using a single TokenType returns results of the actual type used, we can even access the `Date` object associated with the token.
+Combined with the technique used in the [expressive matching example](Documentation/Expressive matching.md) where tokenizing using a single `TokenizerType` returns results of the actual type used, we can even access the `Date` object associated with the token.
````Swift import Mustard let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/03/27 (Arrived) ref: 99/99/99" -let dateMatches: [DateToken.Match] = messyInput.matches() -// dateMatches.count -> 2 -// ('99/99/99' is not matched by `DateToken`) +let dateTokens: [DateTokenizer.Token] = messyInput.tokens() +// dateTokens.count -> 2 +// ('99/99/99' is not matched by `DateTokenizer`) // // first date -// dateMatches[0].text -> "12/01/27" -// dateMatches[0].tokenizer -> DateToken() -// dateMatches[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000) +// dateTokens[0].text -> "12/01/27" +// dateTokens[0].tokenizer -> DateTokenizer() +// dateTokens[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000) // // last date -// dateMatches[1].text -> "12/03/27" -// dateMatches[1].tokenizer -> DateToken() -// dateMatches[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000) +// dateTokens[1].text -> "12/03/27" +// dateTokens[1].tokenizer -> DateTokenizer() +// dateTokens[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000) ```` -See [FuzzyMatchTokenTests.swift](/Mustard/MustardTests/FuzzyMatchTokenTests.swift) for a unit test that includes fuzzy matching of a literal String, but ignoring certain characters. +See [FuzzyMatchTokenTests.swift](/Mustard/MustardTests/FuzzyMatchTokenTests.swift) for a unit test that includes literal matching of a literal String, but fuzzy in the sense that it ignores certain characters. diff --git a/Mustard/Mustard.xcodeproj/project.pbxproj b/Mustard/Mustard.xcodeproj/project.pbxproj index 2f37018..59107b7 100644 --- a/Mustard/Mustard.xcodeproj/project.pbxproj +++ b/Mustard/Mustard.xcodeproj/project.pbxproj @@ -12,7 +12,7 @@ AF5DE7641E16EBD8007E2D49 /* EmojiTokenTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF5DE7631E16EBD8007E2D49 /* EmojiTokenTests.swift */; }; AF5DE7661E16F71D007E2D49 /* MixedTokenTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF5DE7651E16F71D007E2D49 /* MixedTokenTests.swift */; }; AF77D0361E18336D007287DC /* LiteralTokenTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF77D0351E18336D007287DC /* LiteralTokenTests.swift */; }; - AF77D03C1E18439D007287DC /* TokenType.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF77D03B1E18439D007287DC /* TokenType.swift */; }; + AF77D03C1E18439D007287DC /* TokenizerType.swift in Sources */ = {isa = PBXBuildFile; fileRef = AF77D03B1E18439D007287DC /* TokenizerType.swift */; }; AFC3B9271E16E379005B4A99 /* Mustard.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = AFC3B91D1E16E379005B4A99 /* Mustard.framework */; }; AFC3B92C1E16E379005B4A99 /* CharacterSetTokenTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = AFC3B92B1E16E379005B4A99 /* CharacterSetTokenTests.swift */; }; AFC3B92E1E16E379005B4A99 /* Mustard.h in Headers */ = {isa = PBXBuildFile; fileRef = AFC3B9201E16E379005B4A99 /* Mustard.h */; settings = {ATTRIBUTES = (Public, ); }; }; @@ -37,7 +37,7 @@ AF5DE7631E16EBD8007E2D49 /* EmojiTokenTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = EmojiTokenTests.swift; sourceTree = ""; }; AF5DE7651E16F71D007E2D49 /* MixedTokenTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MixedTokenTests.swift; sourceTree = ""; }; AF77D0351E18336D007287DC /* LiteralTokenTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = LiteralTokenTests.swift; sourceTree = ""; }; - AF77D03B1E18439D007287DC /* TokenType.swift */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TokenType.swift; sourceTree = ""; }; + AF77D03B1E18439D007287DC /* TokenizerType.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TokenizerType.swift; sourceTree = ""; }; AFC3B91D1E16E379005B4A99 /* Mustard.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Mustard.framework; sourceTree = BUILT_PRODUCTS_DIR; }; AFC3B9201E16E379005B4A99 /* Mustard.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = Mustard.h; sourceTree = ""; }; AFC3B9211E16E379005B4A99 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; @@ -90,7 +90,7 @@ isa = PBXGroup; children = ( AFC3B9201E16E379005B4A99 /* Mustard.h */, - AF77D03B1E18439D007287DC /* TokenType.swift */, + AF77D03B1E18439D007287DC /* TokenizerType.swift */, AFC3B9371E16E38D005B4A99 /* Mustard.swift */, AFC3B9391E16E3DE005B4A99 /* CharacterSet+Mustard.swift */, AFC3B9211E16E379005B4A99 /* Info.plist */, @@ -226,7 +226,7 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( - AF77D03C1E18439D007287DC /* TokenType.swift in Sources */, + AF77D03C1E18439D007287DC /* TokenizerType.swift in Sources */, AFC3B9381E16E38D005B4A99 /* Mustard.swift in Sources */, AFC3B93A1E16E3DE005B4A99 /* CharacterSet+Mustard.swift in Sources */, ); diff --git a/Mustard/Mustard/CharacterSet+Mustard.swift b/Mustard/Mustard/CharacterSet+Mustard.swift index 0793402..1bbe8ea 100644 --- a/Mustard/Mustard/CharacterSet+Mustard.swift +++ b/Mustard/Mustard/CharacterSet+Mustard.swift @@ -22,30 +22,38 @@ import Foundation -extension CharacterSet: TokenType { - public func canTake(_ scalar: UnicodeScalar) -> Bool { +extension CharacterSet: TokenizerType { + public func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { return self.contains(scalar) } } extension String { - /// Returns matches from the string found using tokenizers made from one or more CharacterSet. + /// Returns an array of `Token` in the `String` matched using tokenization based on one or + /// more characterSets. /// - /// - Parameter characterSets: One or more character sets to match substrings in the string. + /// - Parameter characterSets: One or more character sets to use as tokenziers to match + /// substrings in the `String`. /// - /// This method is an alias for calling `matches(from tokenizers: TokenType...) -> [Match]`. + /// This method is an alias for calling `tokens(matchedWith tokenizers: TokenizerType...) -> [Token]`. /// - /// Returns: An array of type `Match` which is a tuple containing an instance of the tokenizer - /// that matched the result, the substring that was matched, and the range of the matched - /// substring in this string. - public func matches(from characterSets: CharacterSet...) -> [Match] { - return matches(from: characterSets) + /// Returns: An array of `Token` where each token is a tuple containing a substring from the + /// `String`, the range of the substring in the `String`, and an instance of `TokenizerType` + /// that matched the substring. + public func tokens(matchedWith characterSets: CharacterSet...) -> [Token] { + return tokens(from: characterSets) + } + + /// Returns an array containing substrings from the `String` that have been matched by + /// tokenization using one or more character sets. + public func components(matchedWith characterSets: CharacterSet...) 
-> [String] {
+        return tokens(from: characterSets).map({ $0.text })
+    }
}

infix operator ~=

-public func ~= (option: CharacterSet, input: TokenType) -> Bool {
+public func ~= (option: CharacterSet, input: TokenizerType) -> Bool {
    if let characterSet = input as? CharacterSet {
        return characterSet == option
    }
diff --git a/Mustard/Mustard/Mustard.swift b/Mustard/Mustard/Mustard.swift
index ba89c27..00c2f01 100644
--- a/Mustard/Mustard/Mustard.swift
+++ b/Mustard/Mustard/Mustard.swift
@@ -24,69 +24,73 @@ import Foundation

public extension String {

-    /// Returns matches from the string found using a single tokenizer of type `TokenType`.
+    /// Returns an array of `Token` in the `String` from a single tokenizer of type `TokenizerType`.
+    /// Each Token contains a substring from the `String`, the range of the substring in the `String`,
+    /// and an instance of `TokenizerType` that matched the substring.
    ///
-    /// The type of TokenType that is used is inferred by the result type.
+    /// The `TokenizerType` is inferred by the result type of the method receiver.
    ///
    /// ~~~~
    /// // example usage:
-    /// // `WordToken` is a `TokenType` that matches any letter characters.
-    /// let input = "ab cd ef"
-    /// let matches: [WordToken.Match] = input.matches()
-    /// // matches.count -> 3
-    /// //
-    /// // matches[0] ->
-    /// // (tokenizer: WordToken(),
-    /// //  text: "ab",
-    /// //  range: Range(0, 2))
+    /// // `WordTokenizer` is a `TokenizerType` that matches any letter characters.
+    ///
+    /// let tokens: [WordTokenizer.Token] = "ab cd ef".tokens()
+    /// // tokens.count -> 3
+    ///
+    /// // tokens[0] ->
+    /// // (text: "ab",
+    /// //  range: Range(0, 2),
+    /// //  tokenizer: WordTokenizer())
    /// ~~~~
    ///
-    /// Note: Using this method initalizes the TokenType with the default initalizer `init()`.
-    /// If the tokenizer needs to use another initalizer, then use the `matches(from:)` method
-    /// to find matches instead.
+    /// Note: Using this method initializes a tokenizer with the default `init()` initializer.
+    ///
+    /// If the tokenizer needs to use another initializer, then use the alternate `tokens(matchedWith:)` method
+    /// instead.
    ///
-    /// Returns: An array of type `T.Match` where T is the generic `TokenType` used.
-    func matches<T: TokenType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {
+    /// Returns: An array of type `TokenizerType.Token`.
+    func tokens<T: TokenizerType>() -> [(tokenizer: T, text: String, range: Range<String.Index>)] {

-        return self.matches(from: T()).flatMap({
-            if let tokenType = $0.tokenizer as? T {
-                return (tokenizer: tokenType, text: $0.text, range: $0.range)
+        return self.tokens(matchedWith: T()).flatMap({
+            if let tokenizer = $0.tokenizer as? T {
+                return (tokenizer: tokenizer, text: $0.text, range: $0.range)
            }
            else {
                return nil
            }
        })
    }

-    /// Returns matches from the string found using one or more tokenizers of type `TokenType`.
-    ///
-    /// - Parameter tokenizers: One or more tokenizers to use to match substrings in the string.
+    /// Returns an array of `Token` in the `String` matched using one or more tokenizers of
+    /// type `TokenizerType`.
+    ///
+    /// - Parameter tokenizers: One or more tokenizers to use to match substrings in the `String`.
    ///
-    /// Tokenizers are greedy and are used in the order that they occur within `tokenizers`.
+    /// Note: Tokenizers are greedy and are used in the order that they occur within `tokenizers`.
    ///
    /// Typical behavior when using tokenizers that may match substrings in different ways is
    /// to call this method with the most specific tokenizers before more general tokenizers.
/// - /// If a specifc tokenzier fails to complete a match, the general tokenizer still has a - /// chance to match it later. + /// If a specifc tokenzier fails to complete a match, subsequent tokenizers will be given the + /// opportunity to match a substring. /// - /// Returns: An array of type `Match` which is a tuple containing an instance of the tokenizer - /// that matched the result, the substring that was matched, and the range of the matched - /// substring in this string. - func matches(from tokenizers: TokenType...) -> [Match] { - return matches(from: tokenizers) + /// Returns: An array of `Token` where each token is a tuple containing a substring from the + /// `String`, the range of the substring in the `String`, and an instance of `TokenizerType` + /// that matched the substring. + func tokens(matchedWith tokenizers: TokenizerType...) -> [Token] { + return tokens(from: tokenizers) } - internal func matches(from tokenizers: [TokenType]) -> [Match] { + internal func tokens(from tokenizers: [TokenizerType]) -> [Token] { guard !tokenizers.isEmpty else { return [] } let text = self - var matches: [Match] = [] + var tokens: [Token] = [] var tokenStartIndex = text.unicodeScalars.startIndex advanceTokenStart: while tokenStartIndex < text.unicodeScalars.endIndex { // prepare a backlog of tokens that can start with the current scalar - let possibleTokens = tokenizers.flatMap({ tokenizer -> TokenType? in + let possibleTokens = tokenizers.flatMap({ tokenizer -> TokenizerType? in tokenizer.prepareForReuse() return tokenizer.token(startingWith: text.unicodeScalars[tokenStartIndex]) }) @@ -104,12 +108,12 @@ public extension String { let currentIndex = text.unicodeScalars.index(after: tokenEndIndex) let scalar = (currentIndex == text.unicodeScalars.endIndex) ? nil : text.unicodeScalars[currentIndex] - if let scalar = scalar, token.canTake(scalar) { + if let scalar = scalar, token.tokenCanTake(scalar) { // the scalar is not nil, and the token can take the scalar: // - expand tokenEndIndex one position tokenEndIndex = text.unicodeScalars.index(after: tokenEndIndex) } - else if token.isComplete, token.isValid(whenNextScalarIs: scalar), + else if token.tokenIsComplete, token.tokenIsValid(whenNextScalarIs: scalar), let start = tokenStartIndex.samePosition(in: text), let next = currentIndex.samePosition(in: text) { // the scalar is either nil, or the token can not take it; and @@ -118,7 +122,7 @@ public extension String { // - advance tokenStartIndex to the currentIndex; and // - continue looking for tokens at new startIndex - matches.append( + tokens.append( (tokenizer: token.tokenizerForMatch, text: text[start..) - -public protocol TokenType { - - /// Asks the token if it can start with the given scalar. - /// - /// The default implementation of this method is an alias for `canTake(_:)`. - /// Implement this method if the token has unique criteria for the first scalar to match. - /// - /// - Parameter scalar: The scalar to check. - /// - /// - Returns: `true` if the token can start with this scalar; otherwise, false. - func canStart(with scalar: UnicodeScalar) -> Bool - - /// Asks the token if if can capture this scalar as a valid match. - /// - /// - Parameter scalar: The scalar to check using the token. - /// - /// - Returns: `true` if the token can take this this scalar; otherwise, false. - func canTake(_ scalar: UnicodeScalar) -> Bool - - /// Returns a boolean value if the token is complete. 
- var isComplete: Bool { get } - - /// Asks the token if it is invalid given context of the first scalar following this token. - /// - /// The default implementation of this method performs always returns `false`. - /// Implement this method to return `true` in situations where a token can not be followed - /// by certain scalars. - /// - /// - Parameter scalar: The first scalar following this token, or `nil` if the token has - /// reached the end of the text. - /// - /// - Returns: `true` if the token is invalid with the following scalar; otherwise, false. - func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool - - /// Ask the token to prepare itself to start matching a new series of scalars. - /// - /// The default implementation of this method does nothing. - /// Implement this method to reset the token if calls to `canTake(_:)` change the state - /// of the token. - func prepareForReuse() - - /// Returns an instance of that can start with the given scalar, - /// or `nil` if type can't start with this scalar. - /// - /// The default implementation of this method returns itself if `canStart(with:)` returns true; - /// otherwise, nil. - func token(startingWith scalar: UnicodeScalar) -> TokenType? - - /// Initialize an empty instance. - init() - - /// Returns a new instance of a token that's a copy of the receiver. - /// - /// The object returned is set as the `tokenizer` element from a call to `matches()` - /// If the type implements NSCopying protocol, the default implementation returns the result of - /// `copy(with: nil)`; otherwise, returns self. - var tokenizerForMatch: TokenType { get } -} - -public extension TokenType { - - /// A tuple capturing information about a token match. - /// - /// - tokenzier: The instance of Self that matched the token. - /// - text: The text that the token matched. - /// - range: The range of the matched text in the original input. - typealias Match = (tokenizer: Self, text: String, range: Range) - - /// The default tokenzier for this type. - /// Is equivilent to using the default initalizer `init()`. - static var tokenizer: TokenType { return Self() } - - func canStart(with scalar: UnicodeScalar) -> Bool { - return canTake(scalar) - } - - /// Returns a boolean value if the token is complete. - /// This default implementation returns `true`. - var isComplete: Bool { - return true - } - - func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool { - return false - } - - internal func isValid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool { - return !completeTokenIsInvalid(whenNextScalarIs: scalar) - } - - func token(startingWith scalar: UnicodeScalar) -> TokenType? { - return canStart(with: scalar) ? self : nil - } - - func prepareForReuse() {} - - /// Returns a new instance of a token that's a copy of the reciever. - /// - /// The object returned is set as the `tokenizer` element from a call to `matches()` - /// If the type implements NSCopying protocol, the default implementation returns the result of - /// `copy(with: nil)`; otherwise, returns self. - var tokenizerForMatch: TokenType { - if let copying = self as? NSCopying, let aCopy = copying.copy(with: nil) as? 
TokenType {
-            return aCopy
-        }
-        else {
-            return self
-        }
-    }
-}
diff --git a/Mustard/Mustard/TokenizerType.swift b/Mustard/Mustard/TokenizerType.swift
new file mode 100644
index 0000000..962aa60
--- /dev/null
+++ b/Mustard/Mustard/TokenizerType.swift
@@ -0,0 +1,153 @@
+// TokenizerType.swift
+//
+// Copyright (c) 2017 Mathew Sanders
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.

+import Foundation
+
+/// Token is a typealias for a tuple with the following named elements:
+///
+/// - tokenizer: An instance of `TokenizerType` that matched the token.
+/// - text: A substring that the tokenizer matched in the original string.
+/// - range: The range of the matched text in the original string.
+public typealias Token = (tokenizer: TokenizerType, text: String, range: Range<String.Index>)
+
+public protocol TokenizerType {
+
+    /// Returns an instance of a tokenizer that starts with the given scalar,
+    /// or `nil` if this type can't start with this scalar.
+    ///
+    /// The default implementation of this method returns `self` if `tokenCanStart(with:)` returns true;
+    /// otherwise, nil.
+    func token(startingWith scalar: UnicodeScalar) -> TokenizerType?
+
+    /// Checks if tokens of this type can start with the given scalar.
+    ///
+    /// The default implementation of this method is an alias for `tokenCanTake(_:)`.
+    /// Provide an alternate implementation if tokens have special starting criteria.
+    ///
+    /// - Parameter scalar: The scalar the token could start with.
+    ///
+    /// - Returns: `true` if the token can start with this scalar; otherwise, false.
+    func tokenCanStart(with scalar: UnicodeScalar) -> Bool
+
+    /// Checks if tokens can include this scalar as part of a token.
+    ///
+    /// This method is called multiple times for each subsequent scalar in a String until the tokenizer
+    /// returns `false`.
+    ///
+    /// - Parameter scalar: The scalar the token could include.
+    ///
+    /// - Returns: `true` if the token can take this scalar; otherwise, false.
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool
+
+    /// Returns a Boolean value indicating whether the token is considered complete.
+    ///
+    /// The default implementation returns `true`.
+    ///
+    /// Provide an alternate implementation if tokens have some internal criteria that need to be
+    /// satisfied before a token is complete.
+    var tokenIsComplete: Bool { get }
+
+    /// Checks if a complete token should be discarded given the context of the first scalar following this token.
+    ///
+    /// The default implementation of this method always returns `false`.
+    ///
+    /// Provide an alternate implementation to return `true` in situations where a token cannot be followed
+    /// by certain scalars.
+    ///
+    /// - Parameter scalar: The first scalar following this token, or `nil` if the tokenizer has
+    ///   matched a token that reaches the end of the text.
+    ///
+    /// - Returns: `true` if the token is invalid with the following scalar; otherwise, false.
+    func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool
+
+    /// Ask the tokenizer to prepare itself to start matching a new series of scalars.
+    ///
+    /// The default implementation of this method does nothing.
+    ///
+    /// Provide an alternate implementation if the tokenizer maintains an internal state that updates based on calls to
+    /// `tokenCanTake(_:)`.
+    func prepareForReuse()
+
+    /// Initialize an empty instance of the tokenizer.
+    init()
+
+    /// Returns an instance of the tokenizer that will be used as the `tokenizer` element in the `Token` tuple.
+    ///
+    /// If the tokenizer implements `NSCopying` protocol, the default implementation returns the result of
+    /// `copy(with: nil)`; otherwise, returns `self` which is suitable for structs.
+    ///
+    /// Provide an alternate implementation if the tokenizer is a reference type that does not implement `NSCopying`.
+    var tokenizerForMatch: TokenizerType { get }
+}
+
+public extension TokenizerType {
+
+    /// Token is a typealias for a tuple with the following named elements:
+    ///
+    /// - tokenizer: An instance of `Self` that matched the token.
+    /// - text: A substring that the tokenizer matched in the original string.
+    /// - range: The range of the matched text in the original string.
+    typealias Token = (tokenizer: Self, text: String, range: Range<String.Index>)
+
+    /// The default tokenizer for this type.
+    /// This is equivalent to using the default initializer `init()`.
+    static var defaultTokenizer: TokenizerType { return Self() }
+
+    func tokenCanStart(with scalar: UnicodeScalar) -> Bool {
+        return tokenCanTake(scalar)
+    }
+
+    /// Returns a Boolean value indicating whether the token is complete.
+    /// This default implementation returns `true`.
+    var tokenIsComplete: Bool {
+        return true
+    }
+
+    func completeTokenIsInvalid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool {
+        return false
+    }
+
+    internal func tokenIsValid(whenNextScalarIs scalar: UnicodeScalar?) -> Bool {
+        return !completeTokenIsInvalid(whenNextScalarIs: scalar)
+    }
+
+    func token(startingWith scalar: UnicodeScalar) -> TokenizerType? {
+        return tokenCanStart(with: scalar) ? self : nil
+    }
+
+    func prepareForReuse() {}
+
+    /// Returns an instance of the tokenizer that will be used as the `tokenizer` element in the `Token` tuple.
+    ///
+    /// If the tokenizer implements `NSCopying` protocol, the default implementation returns the result of
+    /// `copy(with: nil)`; otherwise, returns `self` which is suitable for structs.
+    ///
+    /// Provide an alternate implementation if the tokenizer is a reference type that does not implement `NSCopying`.
+    var tokenizerForMatch: TokenizerType {
+        if let copying = self as? NSCopying, let aCopy = copying.copy(with: nil) as?
TokenizerType { + return aCopy + } + else { + return self + } + } +} diff --git a/Mustard/MustardTests/CharacterSetTokenTests.swift b/Mustard/MustardTests/CharacterSetTokenTests.swift index 9e40342..263bca2 100644 --- a/Mustard/MustardTests/CharacterSetTokenTests.swift +++ b/Mustard/MustardTests/CharacterSetTokenTests.swift @@ -24,7 +24,7 @@ import XCTest import Mustard infix operator == -fileprivate func == (option: TokenType, input: CharacterSet) -> Bool { +fileprivate func == (option: TokenizerType, input: CharacterSet) -> Bool { if let characterSet = option as? CharacterSet { return characterSet == input } @@ -35,24 +35,24 @@ class CharacterSetTokenTests: XCTestCase { func testCharacterSetTokenizer() { - let matches = "123Hello world&^45.67".matches(from: .decimalDigits, .letters) + let tokens = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters) - XCTAssert(matches.count == 5, "Unexpected number of characterset matches [\(matches.count)]") + XCTAssert(tokens.count == 5, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].tokenizer == CharacterSet.decimalDigits) - XCTAssert(matches[0].text == "123") + XCTAssert(tokens[0].tokenizer == CharacterSet.decimalDigits) + XCTAssert(tokens[0].text == "123") - XCTAssert(matches[1].tokenizer == CharacterSet.letters) - XCTAssert(matches[1].text == "Hello") + XCTAssert(tokens[1].tokenizer == CharacterSet.letters) + XCTAssert(tokens[1].text == "Hello") - XCTAssert(matches[2].tokenizer == CharacterSet.letters) - XCTAssert(matches[2].text == "world") + XCTAssert(tokens[2].tokenizer == CharacterSet.letters) + XCTAssert(tokens[2].text == "world") - XCTAssert(matches[3].tokenizer == CharacterSet.decimalDigits) - XCTAssert(matches[3].text == "45") + XCTAssert(tokens[3].tokenizer == CharacterSet.decimalDigits) + XCTAssert(tokens[3].text == "45") - XCTAssert(matches[4].tokenizer == CharacterSet.decimalDigits) - XCTAssert(matches[4].text == "67") + XCTAssert(tokens[4].tokenizer == CharacterSet.decimalDigits) + XCTAssert(tokens[4].text == "67") } } diff --git a/Mustard/MustardTests/CustomTokenTests.swift b/Mustard/MustardTests/CustomTokenTests.swift index d76b993..1281d1e 100644 --- a/Mustard/MustardTests/CustomTokenTests.swift +++ b/Mustard/MustardTests/CustomTokenTests.swift @@ -23,25 +23,25 @@ import XCTest import Mustard -struct NumberToken: TokenType { +struct NumberTokenizer: TokenizerType { static private let numberCharacters = CharacterSet.decimalDigits.union(CharacterSet(charactersIn: ".")) // numbers must start with character 0...9 - func canStart(with scalar: UnicodeScalar) -> Bool { + func tokenCanStart(with scalar: UnicodeScalar) -> Bool { return CharacterSet.decimalDigits.contains(scalar) } // number token can include any character in 0...9 + '.' 
-    func canTake(_ scalar: UnicodeScalar) -> Bool {
-        return NumberToken.numberCharacters.contains(scalar)
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
+        return NumberTokenizer.numberCharacters.contains(scalar)
    }
}

-struct WordToken: TokenType {
+struct WordTokenizer: TokenizerType {

    // word token can include any character in a...z + A...Z
-    func canTake(_ scalar: UnicodeScalar) -> Bool {
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
        return CharacterSet.letters.contains(scalar)
    }
}
@@ -50,21 +50,21 @@ class CustomTokenTests: XCTestCase {

    func testNumberToken() {

-        let matches = "123Hello world&^45.67".matches(from: NumberToken.tokenizer, WordToken.tokenizer)
+        let tokens = "123Hello world&^45.67".tokens(matchedWith: NumberTokenizer.defaultTokenizer, WordTokenizer.defaultTokenizer)

-        XCTAssert(matches.count == 4, "Unexpected number of matches [\(matches.count)]")
+        XCTAssert(tokens.count == 4, "Unexpected number of tokens [\(tokens.count)]")

-        XCTAssert(matches[0].tokenizer is NumberToken)
-        XCTAssert(matches[0].text == "123")
+        XCTAssert(tokens[0].tokenizer is NumberTokenizer)
+        XCTAssert(tokens[0].text == "123")

-        XCTAssert(matches[1].tokenizer is WordToken)
-        XCTAssert(matches[1].text == "Hello")
+        XCTAssert(tokens[1].tokenizer is WordTokenizer)
+        XCTAssert(tokens[1].text == "Hello")

-        XCTAssert(matches[2].tokenizer is WordToken)
-        XCTAssert(matches[2].text == "world")
+        XCTAssert(tokens[2].tokenizer is WordTokenizer)
+        XCTAssert(tokens[2].text == "world")

-        XCTAssert(matches[3].tokenizer is NumberToken)
-        XCTAssert(matches[3].text == "45.67")
+        XCTAssert(tokens[3].tokenizer is NumberTokenizer)
+        XCTAssert(tokens[3].text == "45.67")
    }
}
diff --git a/Mustard/MustardTests/EmojiTokenTests.swift b/Mustard/MustardTests/EmojiTokenTests.swift
index b93f32e..7891403 100644
--- a/Mustard/MustardTests/EmojiTokenTests.swift
+++ b/Mustard/MustardTests/EmojiTokenTests.swift
@@ -23,16 +23,16 @@ import XCTest
import Mustard

-struct EmojiToken: TokenType {
+struct EmojiTokenizer: TokenizerType {

    // (e.g. can't start with a ZWJ)
can't start with a ZWJ) - func canStart(with scalar: UnicodeScalar) -> Bool { - return EmojiToken.isEmojiScalar(scalar) + func tokenCanStart(with scalar: UnicodeScalar) -> Bool { + return EmojiTokenizer.isEmojiScalar(scalar) } // either in the known range for a emoji, or a ZWJ - func canTake(_ scalar: UnicodeScalar) -> Bool { - return EmojiToken.isEmojiScalar(scalar) || EmojiToken.isJoiner(scalar) + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { + return EmojiTokenizer.isEmojiScalar(scalar) || EmojiTokenizer.isJoiner(scalar) } static func isJoiner(_ scalar: UnicodeScalar) -> Bool { @@ -80,15 +80,15 @@ class EmojiTokenTests: XCTestCase { // -> 7 (4 base, combied with 3 zero-width joiners \u{200D}) let sample = "baby:πŸ‘Ά baby:πŸ‘ΆπŸΏ flag:πŸ‡³πŸ‡Ώ flag:πŸ³οΈβ€πŸŒˆ family:πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦" - let matches: [EmojiToken.Match] = sample.matches() + let tokens: [EmojiTokenizer.Token] = sample.tokens() - XCTAssert(matches.count == 5, "Unexpected number of emoji matches [\(matches.count)]") + XCTAssert(tokens.count == 5, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].text == "πŸ‘Ά") - XCTAssert(matches[1].text == "πŸ‘ΆπŸΏ") - XCTAssert(matches[2].text == "πŸ‡³πŸ‡Ώ") - XCTAssert(matches[3].text == "πŸ³οΈβ€πŸŒˆ") - XCTAssert(matches[4].text == "πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦") + XCTAssert(tokens[0].text == "πŸ‘Ά") + XCTAssert(tokens[1].text == "πŸ‘ΆπŸΏ") + XCTAssert(tokens[2].text == "πŸ‡³πŸ‡Ώ") + XCTAssert(tokens[3].text == "πŸ³οΈβ€πŸŒˆ") + XCTAssert(tokens[4].text == "πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦") } } diff --git a/Mustard/MustardTests/FallbackTokenizerTests.swift b/Mustard/MustardTests/FallbackTokenizerTests.swift index 871f460..a11a571 100644 --- a/Mustard/MustardTests/FallbackTokenizerTests.swift +++ b/Mustard/MustardTests/FallbackTokenizerTests.swift @@ -28,26 +28,26 @@ class FallbackTokenizerTests: XCTestCase { func testFallback() { let input = "1.2 34 abc catastrophe cat 0.5" - let matches = input.matches(from: NumberToken.tokenizer, "cat".literalToken, CharacterSet.letters) + let tokens = input.tokens(matchedWith: NumberTokenizer.defaultTokenzier, "cat".literalTokenizer, CharacterSet.letters) - XCTAssert(matches.count == 6, "Unexpected number of tokens [\(matches.count)]") + XCTAssert(tokens.count == 6, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].tokenizer is NumberToken) - XCTAssert(matches[0].text == "1.2") + XCTAssert(tokens[0].tokenizer is NumberTokenizer) + XCTAssert(tokens[0].text == "1.2") - XCTAssert(matches[1].tokenizer is NumberToken) - XCTAssert(matches[1].text == "34") + XCTAssert(tokens[1].tokenizer is NumberTokenizer) + XCTAssert(tokens[1].text == "34") - XCTAssert(matches[2].tokenizer is CharacterSet) - XCTAssert(matches[2].text == "abc") + XCTAssert(tokens[2].tokenizer is CharacterSet) + XCTAssert(tokens[2].text == "abc") - XCTAssert(matches[3].tokenizer is CharacterSet) - XCTAssert(matches[3].text == "catastrophe") + XCTAssert(tokens[3].tokenizer is CharacterSet) + XCTAssert(tokens[3].text == "catastrophe") - XCTAssert(matches[4].tokenizer is LiteralToken) - XCTAssert(matches[4].text == "cat") + XCTAssert(tokens[4].tokenizer is LiteralTokenizer) + XCTAssert(tokens[4].text == "cat") - XCTAssert(matches[5].tokenizer is NumberToken) - XCTAssert(matches[5].text == "0.5") + XCTAssert(tokens[5].tokenizer is NumberTokenizer) + XCTAssert(tokens[5].text == "0.5") } } diff --git a/Mustard/MustardTests/FuzzyMatchTokenTests.swift b/Mustard/MustardTests/FuzzyMatchTokenTests.swift index dc72169..b26750c 100644 --- 
a/Mustard/MustardTests/FuzzyMatchTokenTests.swift +++ b/Mustard/MustardTests/FuzzyMatchTokenTests.swift @@ -28,7 +28,7 @@ func ~= (option: CharacterSet, input: UnicodeScalar) -> Bool { return option.contains(input) } -class FuzzyLiteralMatch: TokenType { +class FuzzyLiteralMatch: TokenizerType { let target: String private let exclusions: CharacterSet @@ -44,7 +44,7 @@ class FuzzyLiteralMatch: TokenType { self.exclusions = exclusions } - func canTake(_ scalar: UnicodeScalar) -> Bool { + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { guard position < target.unicodeScalars.endIndex else { // we've matched all of the target @@ -81,7 +81,7 @@ class FuzzyLiteralMatch: TokenType { } } - var isComplete: Bool { + var tokenIsComplete: Bool { return position == target.unicodeScalars.endIndex } @@ -90,7 +90,7 @@ class FuzzyLiteralMatch: TokenType { } } -class DateToken: TokenType { +class DateTokenizer: TokenizerType { // private properties private let _template = "00/00/00" @@ -116,7 +116,7 @@ class DateToken: TokenType { _dateText = "" } - func canTake(_ scalar: UnicodeScalar) -> Bool { + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { guard _position < _template.unicodeScalars.endIndex else { // we've matched all of the template @@ -136,9 +136,9 @@ class DateToken: TokenType { } } - var isComplete: Bool { + var tokenIsComplete: Bool { if _position == _template.unicodeScalars.endIndex, - let date = DateToken.dateFormatter.date(from: _dateText) { + let date = DateTokenizer.dateFormatter.date(from: _dateText) { // we've reached the end of the template // and the date text collected so far represents a valid // date format (e.g. not 99/99/99) @@ -161,8 +161,8 @@ class DateToken: TokenType { // return an instance of tokenizer to return in matching tokens // we return a copy so that the instance keeps reference to the // dateText that has been matched, and the date that was parsed - var tokenizerForMatch: TokenType { - return DateToken(text: _dateText, date: _date) + var tokenizerForMatch: TokenizerType { + return DateTokenizer(text: _dateText, date: _date) } // only used by `tokenizerForMatch` @@ -181,29 +181,29 @@ class FuzzyMatchTokenTests: XCTestCase { let fuzzyTokenzier = FuzzyLiteralMatch(target: "#YF1942B", ignoring: CharacterSet.whitespaces.union(.punctuationCharacters)) - let matches = messyInput.matches(from: fuzzyTokenzier, DateToken.tokenizer) + let tokens = messyInput.tokens(matchedWith: fuzzyTokenzier, DateTokenizer.defaultTokenzier) - XCTAssert(matches.count == 3, "Unexpected number of matches [\(matches.count)]") + XCTAssert(tokens.count == 3, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].tokenizer is FuzzyLiteralMatch) - XCTAssert(matches[0].text == "#YF 1942-b") + XCTAssert(tokens[0].tokenizer is FuzzyLiteralMatch) + XCTAssert(tokens[0].text == "#YF 1942-b") - XCTAssert(matches[1].tokenizer is DateToken) - XCTAssert(matches[1].text == "12/01/27") + XCTAssert(tokens[1].tokenizer is DateTokenizer) + XCTAssert(tokens[1].text == "12/01/27") } func testDateMatches() { let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/02/27 (Arrived) ref: 99/99/99" - let matches: [DateToken.Match] = messyInput.matches() + let tokens: [DateTokenizer.Token] = messyInput.tokens() - XCTAssert(matches.count == 2, "Unexpected number of matches [\(matches.count)]") + XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].text == "12/01/27") - XCTAssert(matches[0].tokenizer.date == DateToken.dateFormatter.date(from: 
matches[0].text)) + XCTAssert(tokens[0].text == "12/01/27") + XCTAssert(tokens[0].tokenizer.date == DateTokenizer.dateFormatter.date(from: tokens[0].text)) - XCTAssert(matches[1].text == "12/02/27") - XCTAssert(matches[1].tokenizer.date == DateToken.dateFormatter.date(from: matches[1].text)) + XCTAssert(tokens[1].text == "12/02/27") + XCTAssert(tokens[1].tokenizer.date == DateTokenizer.dateFormatter.date(from: tokens[1].text)) } } diff --git a/Mustard/MustardTests/LiteralTokenTests.swift b/Mustard/MustardTests/LiteralTokenTests.swift index e00669a..f9b034c 100644 --- a/Mustard/MustardTests/LiteralTokenTests.swift +++ b/Mustard/MustardTests/LiteralTokenTests.swift @@ -24,7 +24,7 @@ import XCTest import Mustard // implementing as class rather than struct since `canTake(_:)` will have mutating effect. -class LiteralToken: TokenType { +class LiteralTokenizer: TokenizerType { private let target: String private var position: String.UnicodeScalarIndex @@ -42,7 +42,7 @@ class LiteralToken: TokenType { // instead of looking at a set of scalars, the order that the scalar occurs // is relevent for the token - func canTake(_ scalar: UnicodeScalar) -> Bool { + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { guard position < target.unicodeScalars.endIndex else { return false @@ -61,7 +61,7 @@ class LiteralToken: TokenType { // this token is only complete when we've called `canTake(_:)` with the correct sequence // of scalars such that `position` has advanced to the endIndex of the target - var isComplete: Bool { + var tokenIsComplete: Bool { return position == target.unicodeScalars.endIndex } @@ -86,8 +86,8 @@ class LiteralToken: TokenType { extension String { // a convenience to allow us to use `"cat".literalToken` instead of `LiteralToken("cat")` - var literalToken: LiteralToken { - return LiteralToken(target: self) + var literalTokenizer: LiteralTokenizer { + return LiteralTokenizer(target: self) } } @@ -97,15 +97,15 @@ class LiteralTokenTests: XCTestCase { func testGetCatAndDuck() { let input = "the cat and the catastrophe duck" - let matches = input.matches(from: "cat".literalToken, "duck".literalToken) + let tokens = input.tokens(matchedWith: "cat".literalTokenizer, "duck".literalTokenizer) - XCTAssert(matches.count == 2, "Unexpected number of matches [\(matches.count)]") + XCTAssert(tokens.count == 2, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].tokenizer is LiteralToken) - XCTAssert(matches[0].text == "cat") + XCTAssert(tokens[0].tokenizer is LiteralTokenizer) + XCTAssert(tokens[0].text == "cat") - XCTAssert(matches[1].tokenizer is LiteralToken) - XCTAssert(matches[1].text == "duck") + XCTAssert(tokens[1].tokenizer is LiteralTokenizer) + XCTAssert(tokens[1].text == "duck") } } diff --git a/Mustard/MustardTests/MixedTokenTests.swift b/Mustard/MustardTests/MixedTokenTests.swift index ec2e06d..f9bb183 100644 --- a/Mustard/MustardTests/MixedTokenTests.swift +++ b/Mustard/MustardTests/MixedTokenTests.swift @@ -23,41 +23,41 @@ import XCTest import Mustard -enum MixedToken: TokenType { +enum MixedTokenizer: TokenizerType { case word case number case emoji - case none - + case none // 'none' case not strictly needed, and + // in this implementation will never be matched init() { self = .none } - static let wordToken = WordToken() - static let numberToken = NumberToken() - static let emojiToken = EmojiToken() + static let wordTokenizer = WordTokenizer() + static let numberTokenizer = NumberTokenizer() + static let emojiTokenizer = EmojiTokenizer() - func canTake(_ scalar: 
UnicodeScalar) -> Bool { + func tokenCanTake(_ scalar: UnicodeScalar) -> Bool { switch self { - case .word: return MixedToken.wordToken.canTake(scalar) - case .number: return MixedToken.numberToken.canTake(scalar) - case .emoji: return MixedToken.emojiToken.canTake(scalar) + case .word: return MixedTokenizer.wordTokenizer.tokenCanTake(scalar) + case .number: return MixedTokenizer.numberTokenizer.tokenCanTake(scalar) + case .emoji: return MixedTokenizer.emojiTokenizer.tokenCanTake(scalar) case .none: return false } } - func token(startingWith scalar: UnicodeScalar) -> TokenType? { + func token(startingWith scalar: UnicodeScalar) -> TokenizerType? { - if let _ = MixedToken.wordToken.token(startingWith: scalar) { - return MixedToken.word + if let _ = MixedTokenizer.wordTokenizer.token(startingWith: scalar) { + return MixedTokenizer.word } - else if let _ = MixedToken.numberToken.token(startingWith: scalar) { - return MixedToken.number + else if let _ = MixedTokenizer.numberTokenizer.token(startingWith: scalar) { + return MixedTokenizer.number } - else if let _ = MixedToken.emojiToken.token(startingWith: scalar) { - return MixedToken.emoji + else if let _ = MixedTokenizer.emojiTokenizer.token(startingWith: scalar) { + return MixedTokenizer.emoji } else { return nil @@ -69,33 +69,33 @@ class MixedTokenTests: XCTestCase { func testMixedTokens() { - let matches: [MixedToken.Match] = "123πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦Hello worldπŸ‘ΆagainπŸ‘ΆπŸΏ45.67".matches() + let tokens: [MixedTokenizer.Token] = "123πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦Hello worldπŸ‘ΆagainπŸ‘ΆπŸΏ45.67".tokens() - XCTAssert(matches.count == 8, "Unexpected number of matches [\(matches.count)]") + XCTAssert(tokens.count == 8, "Unexpected number of tokens [\(tokens.count)]") - XCTAssert(matches[0].tokenizer == .number) - XCTAssert(matches[0].text == "123") + XCTAssert(tokens[0].tokenizer == .number) + XCTAssert(tokens[0].text == "123") - XCTAssert(matches[1].tokenizer == .emoji) - XCTAssert(matches[1].text == "πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦") + XCTAssert(tokens[1].tokenizer == .emoji) + XCTAssert(tokens[1].text == "πŸ‘©β€πŸ‘©β€πŸ‘¦β€πŸ‘¦") - XCTAssert(matches[2].tokenizer == .word) - XCTAssert(matches[2].text == "Hello") + XCTAssert(tokens[2].tokenizer == .word) + XCTAssert(tokens[2].text == "Hello") - XCTAssert(matches[3].tokenizer == .word) - XCTAssert(matches[3].text == "world") + XCTAssert(tokens[3].tokenizer == .word) + XCTAssert(tokens[3].text == "world") - XCTAssert(matches[4].tokenizer == .emoji) - XCTAssert(matches[4].text == "πŸ‘Ά") + XCTAssert(tokens[4].tokenizer == .emoji) + XCTAssert(tokens[4].text == "πŸ‘Ά") - XCTAssert(matches[5].tokenizer == .word) - XCTAssert(matches[5].text == "again") + XCTAssert(tokens[5].tokenizer == .word) + XCTAssert(tokens[5].text == "again") - XCTAssert(matches[6].tokenizer == .emoji) - XCTAssert(matches[6].text == "πŸ‘ΆπŸΏ") + XCTAssert(tokens[6].tokenizer == .emoji) + XCTAssert(tokens[6].text == "πŸ‘ΆπŸΏ") - XCTAssert(matches[7].tokenizer == .number) - XCTAssert(matches[7].text == "45.67") + XCTAssert(tokens[7].tokenizer == .number) + XCTAssert(tokens[7].text == "45.67") } } diff --git a/README.md b/README.md index 9ebb413..d3b31e5 100644 --- a/README.md +++ b/README.md @@ -4,63 +4,78 @@ Mustard is a Swift library for tokenizing strings when splitting by whitespace d ## Quick start using character sets -Mustard extends `String` with the method `matches(from: CharacterSet...)` which allows you to pass in one -or more character sets to use criteria to find substring matches using one or more character 
sets as tokenizers. +Foundation includes the `String` method [`components(separatedBy:)`](https://developer.apple.com/reference/swift/string/1690777-components) that allows us to get substrings divided up by certain characters: -Here's an example that extracts any sequence of characters that are either letters or digits: +````Swift +let sentence = "hello 2007 year" +let words = sentence.components(separatedBy: .whitespace) +// words.count -> 3 +// words = ["hello", "2007", "year"] +```` + +Mustard provides a similar feature, but with the opposite approach, where instead of matching by separators you can match by one or more character sets, which is useful if separators simply don't exist: + +````Swift +import Mustard + +let sentence = "hello2007year" +let words = sentence.components(matchedWith: .letters, .decimalDigits) +// words.count -> 3 +// words = ["hello", "2007", "year"] +```` + +If you want more than just the substrings, you can use the `tokens(matchedWith: CharacterSet...)` method which returns a tuple with the substring, range, and the CharacterSet responsible for matching the substring: ````Swift import Mustard -let matches = "123Hello world&^45.67".matches(from: .decimalDigits, .letters) -// matches.count -> 5 -// matches: [(tokenizer: TokenType, text: String, range: Range)] -// matches is an array of tuples which contains an instance of the TokenType that -// is responsible for the match, the actual text that was matched, and the range of the token -// in the original input. +let tokens: [Token] = "123Hello world&^45.67".tokens(matchedWith: .decimalDigits, .letters) +// typealias Token = (text: String, range: Range, tokenizer: TokenizerType) +// tokens.count -> 5 (characters '&', '^', and '.' are ignored) // // second token.. -// matches[1].tokenizer -> CharacterSet.letters -// matches[1].text -> "Hello" -// matches[1].range -> Range(3..<8) +// token[1].text -> "Hello" +// token[1].range -> Range(3..<8) +// token[1].tokenizer -> CharacterSet.letters // // last token.. -// matches[4].tokenizer -> CharacterSet.decimalDigits -// matches[4].text -> "67" -// matches[4].range -> Range(19..<21) +// tokens[4].text -> "67" +// tokens[4].range -> Range(19..<21) +// tokens[4].tokenizer -> CharacterSet.decimalDigits ```` ## Expressive use with custom tokenizers -By creating types that implement the `TokenType` protocol we can create tokenizers with more sophisticated behaviors. +Rather than being limited to matching substrings from character sets, you can create your own tokenizers with more +sophisticated behavior by implementing the `TokenizerType` protocol. -Here's some usage of a `DateToken` type ([see example](Documentation/Tokens with internal state.md) for implementation) -that matches tokens with the a valid `MM/dd/yy` format, and at the same time exposes a `date` property allowing access to a -corresponding `Date` object. 
+Here's an example of using `DateTokenizer` ([see example](Documentation/Tokens with internal state.md) for implementation)
+that matches substrings with a valid `MM/dd/yy` format, and at the same time exposes a `Date` object corresponding to the date represented by the substring:
 
 ````Swift
 import Mustard
 
 let messyInput = "Serial: #YF 1942-b 12/01/27 (Scanned) 12/03/27 (Arrived) ref: 99/99/99"
-let matches: [DateToken.Match] = messyInput.matches()
-// matches.count -> 2
-// ('99/99/99' is *not* matched by `DateToken`)
+
+let tokens: [DateTokenizer.Token] = messyInput.tokens()
+// tokens.count -> 2
+// ('99/99/99' is *not* matched by `DateTokenizer`)
 //
 // first date
-// matches[0].text -> "12/01/27"
-// matches[0].tokenizer -> DateToken()
-// matches[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000)
+// tokens[0].text -> "12/01/27"
+// tokens[0].tokenizer -> DateTokenizer()
+// tokens[0].tokenizer.date -> Date(2027-12-01 05:00:00 +0000)
 //
 // last date
-// matches[1].text -> "12/03/27"
-// matches[1].tokenizer -> DateToken()
-// matches[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000)
+// tokens[1].text -> "12/03/27"
+// tokens[1].tokenizer -> DateTokenizer()
+// tokens[1].tokenizer.date -> Date(2027-12-03 05:00:00 +0000)
 ````
 
 ## Documentation & Examples
 
 - [Greedy tokens and tokenizer order](Documentation/Greedy tokens and tokenizer order.md)
-- [TallyType protocol: implementing your own tokenizer](Documentation/TallyType protocol.md)
+- [TokenizerType: implementing your own tokenizer](Documentation/TokenizerType protocol.md)
 - [Example: matching emoji](Documentation/Matching emoji.md)
 - [Example: expressive matching](Documentation/Expressive matching.md)
 - [Example: literal and template matching using tokens with internal state](Documentation/Tokens with internal state.md)
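+
+## Sketch: a minimal custom tokenizer
+
+Here's a minimal sketch of the smallest useful `TokenizerType` conformance. It assumes the protocol provides default implementations for everything other than `tokenCanTake(_:)` -- including `tokenCanStart(with:)`, `tokenIsComplete`, and the static `defaultTokenzier` property used in the examples above -- and `HexTokenizer` is a made-up type used only for illustration:
+
+````Swift
+import Mustard
+
+// a hypothetical tokenizer that matches runs of hexadecimal digits;
+// it implements only `tokenCanTake(_:)` and assumes the protocol's
+// default implementations cover the remaining requirements
+struct HexTokenizer: TokenizerType {
+    static let hexCharacters = CharacterSet(charactersIn: "0123456789abcdefABCDEF")
+    func tokenCanTake(_ scalar: UnicodeScalar) -> Bool {
+        return HexTokenizer.hexCharacters.contains(scalar)
+    }
+}
+
+let tokens = "ff00cc, 0a0a0a!".tokens(matchedWith: HexTokenizer.defaultTokenzier)
+// tokens.count -> 2
+// tokens[0].text -> "ff00cc"
+// tokens[1].text -> "0a0a0a"
+````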