Permalink
Browse files

Add a strict HTML block parser

  • Loading branch information...
1 parent 6c0c548 commit e47c1a2dbb683841363dd180d9026d3d0bc862f3 @mdiep committed Feb 22, 2014
Showing with 148 additions and 27 deletions.
  1. +132 −25 Source/MMHTMLParser.m
  2. +16 −2 Tests/MMHTMLTests.m
View
@@ -38,31 +38,13 @@ @implementation MMHTMLParser
- (MMElement *)parseBlockTagWithScanner:(MMScanner *)scanner
{
- // which starts with a '<'
- if ([scanner nextCharacter] != '<')
- return nil;
- [scanner advance];
-
- NSSet *htmlBlockTags = [NSSet setWithObjects:
- @"p", @"div", @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
- @"blockquote", @"pre", @"table", @"dl", @"ol", @"ul",
- @"script", @"noscript", @"form", @"fieldset", @"iframe",
- @"math", @"ins", @"del", nil];
- NSString *tagName = [scanner nextWord];
- if (![htmlBlockTags containsObject:tagName])
- return nil;
-
- // Skip lines until we come across a blank line
- while (![scanner atEndOfLine])
- {
- [scanner advanceToNextLine];
- }
-
- MMElement *element = [MMElement new];
- element.type = MMElementTypeHTML;
- element.range = NSMakeRange(scanner.startLocation, scanner.location-scanner.startLocation);
+ [scanner beginTransaction];
+ MMElement *element = [self _parseStrictBlockTagWithScanner:scanner];
+ [scanner commitTransaction:element != nil];
+ if (element)
+ return element;
- return element;
+ return [self _parseLenientBlockTagWithScanner:scanner];
}
- (MMElement *)parseCommentWithScanner:(MMScanner *)scanner
@@ -107,7 +89,6 @@ - (MMElement *)parseInlineTagWithScanner:(MMScanner *)scanner
return nil;
[self _parseAttributesWithScanner:scanner];
-
[scanner skipWhitespace];
if ([scanner nextCharacter] == '/')
@@ -131,6 +112,132 @@ - (MMElement *)parseInlineTagWithScanner:(MMScanner *)scanner
#pragma mark Private Methods
//==================================================================================================
+/**
+ * Attempts to parse a complete, well-formed HTML block: an opening tag whose
+ * name is in the block-tag list, its content, and the matching end tag.
+ *
+ * The content may span several lines and may contain nested block tags,
+ * comments and inline tags. Parsing fails (returns nil) when:
+ *   - the next character is not '<' or the tag name is not a known block tag,
+ *   - the opening tag is not terminated by '>',
+ *   - the end of the string is reached before the matching end tag, or
+ *   - a '<' is found that starts none of the constructs tried below.
+ *
+ * This method does not rewind the scanner itself on failure; the call sites
+ * visible in this file wrap it in a scanner transaction for that purpose.
+ *
+ * @param scanner The scanner positioned where the opening tag should start.
+ * @return An MMElementTypeHTML element ranging from the scanner's
+ *         startLocation to the location just after the end tag, or nil.
+ */
+- (MMElement *)_parseStrictBlockTagWithScanner:(MMScanner *)scanner
+{
+    // An HTML block has to open with '<'
+    if ([scanner nextCharacter] != '<')
+    {
+        return nil;
+    }
+    [scanner advance];
+
+    NSSet *blockTagNames = [[NSSet alloc] initWithObjects:
+                            @"p", @"div", @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
+                            @"blockquote", @"pre", @"table", @"dl", @"ol", @"ul",
+                            @"script", @"noscript", @"form", @"fieldset", @"iframe",
+                            @"math", @"ins", @"del", nil];
+    NSString *tagName = [scanner nextWord];
+    if (![blockTagNames containsObject:tagName])
+    {
+        return nil;
+    }
+    // Move the scanner past the tag name explicitly
+    scanner.location += tagName.length;
+
+    // The rest of the opening tag: optional attributes, whitespace, then '>'
+    [self _parseAttributesWithScanner:scanner];
+    [scanner skipWhitespace];
+    if ([scanner nextCharacter] != '>')
+    {
+        return nil;
+    }
+    [scanner advance];
+
+    // Every character except '<' is plain content that can be skipped over
+    NSCharacterSet *openAngle     = [NSCharacterSet characterSetWithCharactersInString:@"<"];
+    NSCharacterSet *plainContent  = [openAngle invertedSet];
+
+    BOOL foundEndTag = NO;
+    while (!foundEndTag)
+    {
+        // Running out of input before the end tag means the block is unclosed
+        if ([scanner atEndOfString])
+        {
+            return nil;
+        }
+
+        [scanner skipCharactersFromSet:plainContent];
+        if ([scanner atEndOfLine])
+        {
+            [scanner advanceToNextLine];
+            continue;
+        }
+
+        // The scanner did not reach the end of the line, so it stopped at a '<'.
+        // First see whether it is the end tag for this block.
+        [scanner beginTransaction];
+        foundEndTag = [self _parseEndTag:tagName withScanner:scanner];
+        [scanner commitTransaction:foundEndTag];
+        if (foundEndTag)
+        {
+            continue;
+        }
+
+        // Otherwise it must be a nested block, a comment or an inline tag,
+        // tried in that order; each attempt is rolled back if it fails.
+        MMElement *child = nil;
+
+        [scanner beginTransaction];
+        child = [self _parseStrictBlockTagWithScanner:scanner];
+        [scanner commitTransaction:(child != nil)];
+
+        if (child == nil)
+        {
+            [scanner beginTransaction];
+            child = [self parseCommentWithScanner:scanner];
+            [scanner commitTransaction:(child != nil)];
+        }
+
+        if (child == nil)
+        {
+            [scanner beginTransaction];
+            child = [self parseInlineTagWithScanner:scanner];
+            [scanner commitTransaction:(child != nil)];
+        }
+
+        // A '<' that starts nothing recognisable makes the block invalid
+        if (child == nil)
+        {
+            return nil;
+        }
+    }
+
+    NSUInteger blockStart = scanner.startLocation;
+
+    MMElement *htmlBlock = [[MMElement alloc] init];
+    htmlBlock.type  = MMElementTypeHTML;
+    htmlBlock.range = NSMakeRange(blockStart, scanner.location - blockStart);
+
+    return htmlBlock;
+}
+
+/**
+ * Consumes an end tag of the form "</" [whitespace] tagName [whitespace] ">".
+ *
+ * The scanner is advanced as each piece matches and is not rewound here when
+ * a later piece fails; the visible caller wraps this in a scanner transaction.
+ *
+ * @param tagName The tag name the end tag must carry.
+ * @param scanner The scanner positioned where the end tag should start.
+ * @return YES if a complete end tag for tagName was consumed, NO otherwise.
+ */
+- (BOOL)_parseEndTag:(NSString *)tagName withScanner:(MMScanner *)scanner
+{
+    // The tag has to open with '<' immediately followed by '/'
+    const unichar opener[] = { '<', '/' };
+    for (NSUInteger idx = 0; idx < sizeof(opener) / sizeof(opener[0]); idx++)
+    {
+        if ([scanner nextCharacter] != opener[idx])
+        {
+            return NO;
+        }
+        [scanner advance];
+    }
+
+    // Whitespace is tolerated on either side of the tag name
+    [scanner skipWhitespace];
+    BOOL nameMatched = [scanner matchString:tagName];
+    if (!nameMatched)
+    {
+        return NO;
+    }
+    [scanner skipWhitespace];
+
+    // ...and the tag has to be closed with '>'
+    if ([scanner nextCharacter] != '>')
+    {
+        return NO;
+    }
+    [scanner advance];
+
+    return YES;
+}
+
+/**
+ * Fallback HTML block parser that does not validate the markup.
+ *
+ * Once a '<' followed by a known block tag name is seen, lines are consumed
+ * until the scanner sits at the end of a line right after advancing (a blank
+ * line, per the original comment), and everything consumed becomes the block.
+ *
+ * @param scanner The scanner positioned where the opening tag should start.
+ * @return An MMElementTypeHTML element ranging from the scanner's
+ *         startLocation to its final location, or nil if the input does not
+ *         start with '<' plus a known block tag name.
+ */
+- (MMElement *)_parseLenientBlockTagWithScanner:(MMScanner *)scanner
+{
+    // An HTML block has to open with '<'
+    if ([scanner nextCharacter] != '<')
+    {
+        return nil;
+    }
+    [scanner advance];
+
+    NSSet *blockTagNames = [[NSSet alloc] initWithObjects:
+                            @"p", @"div", @"h1", @"h2", @"h3", @"h4", @"h5", @"h6",
+                            @"blockquote", @"pre", @"table", @"dl", @"ol", @"ul",
+                            @"script", @"noscript", @"form", @"fieldset", @"iframe",
+                            @"math", @"ins", @"del", nil];
+    NSString *tagName = [scanner nextWord];
+    if (![blockTagNames containsObject:tagName])
+    {
+        return nil;
+    }
+
+    // Swallow whole lines until the scanner is at the end of a line
+    for (;;)
+    {
+        if ([scanner atEndOfLine])
+        {
+            break;
+        }
+        [scanner advanceToNextLine];
+    }
+
+    NSUInteger blockStart = scanner.startLocation;
+
+    MMElement *htmlBlock = [[MMElement alloc] init];
+    htmlBlock.type  = MMElementTypeHTML;
+    htmlBlock.range = NSMakeRange(blockStart, scanner.location - blockStart);
+
+    return htmlBlock;
+}
+
- (NSRange)_parseNameWithScanner:(MMScanner *)scanner
{
NSMutableCharacterSet *nameSet = [NSMutableCharacterSet alphanumericCharacterSet];
View
@@ -194,7 +194,6 @@ - (void)testBlockHTMLOnASingleLine
MMAssertMarkdownEqualsHTML(@"<div>A test.</div>", @"<div>A test.</div>");
}
-#if RUN_KNOWN_FAILURES
- (void)testBlockHTMLBlankLineBetweenCloseTags
{
// Primitive HTML handling might end the HTML block after the first div, since it's a close tag
@@ -207,7 +206,22 @@ - (void)testBlockHTMLBlankLineBetweenCloseTags
"</div>\n";
MMAssertMarkdownEqualsHTML(html, html);
}
-#endif
+
+// The outer <div> is never closed, so the input is not a well-formed HTML
+// block. The expectation is that the HTML is passed through up to the blank
+// line and that the text after the blank line is rendered as a paragraph.
+- (void)testBlockHTMLWithUnclosedTag
+{
+    NSString *input    = @"<div>\n"
+                          "<div>\n"
+                          "A\n"
+                          "</div>\n"
+                          "\n"
+                          "div\n";
+    NSString *expected = @"<div>\n"
+                          "<div>\n"
+                          "A\n"
+                          "</div>\n"
+                          "<p>div</p>\n";
+    MMAssertMarkdownEqualsString(input, expected);
+}
- (void)testBlockHTMLCommentWithSpans
{

0 comments on commit e47c1a2

Please sign in to comment.