Added .rl files to generate HpricotScanner.cs using Ragel

nrk · Feb 22, 2009 · c6f2802 · c6f2802
1 parent 9143b59
commit c6f2802
Show file tree

Hide file tree

Showing 3 changed files with 497 additions and 12 deletions.
diff --git a/TODO b/TODO
@@ -1,13 +1,8 @@
- * The current version of ironruby-hpricot is based off of hpricot 0.6.164, 
-   but I'm already starting to work on aligning its code base to new features 
-   from 0.6.207 (see _why's git repository http://github.com/why/hpricot/).
-
- * HpricotScanner.rl is still missing: I think I'll give an higher priority to 
-   this task to be able to easily test the generation of HpricotScanner.cs by 
-   passing -F1 and -G1 flags to Ragel, just to see which one is the best 
-   speed and memory wise.
-
- * README needs to be populated with notes and building instructions. The code 
-   needs the right dose of comments.
-
+ * The current version of ironruby-hpricot is based off of hpricot 0.6.164, 
+   but I'm already starting to work on aligning its code base to new features 
+   from 0.6.207 (see _why's git repository http://github.com/why/hpricot/).
+
+ * README needs to be populated with notes and building instructions. The code 
+   needs the right dose of comments.
+
  * Add tests
diff --git a/src/Ragel/HpricotScanner.common.rl b/src/Ragel/HpricotScanner.common.rl
@@ -0,0 +1,76 @@
+%%{
+
+  machine hpricot_common;  # grammar definitions for the Hpricot scanner
+
+  #
+  # HTML tokens
+  # (a blatant rip from HTree)
+  #
+  newline = '\n' @{curline += 1;} ;  # each '\n' bumps the curline counter
+  NameChar = [\-A-Za-z0-9._:?] ;  # characters allowed inside a name
+  Name = [A-Za-z_:] NameChar* ;  # a letter, '_' or ':' followed by NameChars
+  StartComment = "<!--" ;
+  EndComment = "-->" ;
+  StartCdata = "<![CDATA[" ;
+  EndCdata = "]]>" ;
+
+  NameCap = Name >_tag %tag;  # Name with _tag on entry and tag on exit
+  NameAttr = NameChar+ >_akey %akey ;  # attribute key: _akey on entry, akey on exit
+  Q1Char = ( "\\\'" | [^'] ) ;  # backslash-escaped ' or any char except '
+  Q1Attr = Q1Char* >_aval %aval ;  # body of a single-quoted value
+  Q2Char = ( "\\\"" | [^"] ) ;  # backslash-escaped " or any char except "
+  Q2Attr = Q2Char* >_aval %aval ;  # body of a double-quoted value
+  UnqAttr = ( space >_aval | [^ \t\r\n<>"'] >_aval [^ \t\r\n<>]* %aunq ) ;  # unquoted value; only the non-space branch fires aunq
+  Nmtoken = NameChar+ >_akey %akey ;  # same pattern and actions as NameAttr
+
+  Attr =  NameAttr space* "=" space* ('"' Q2Attr '"' | "'" Q1Attr "'" | UnqAttr space+ ) space* ;  # key = value; an unquoted value must be followed by whitespace
+  AttrEnd = ( NameAttr space* "=" space* UnqAttr? | Nmtoken >new_attr %save_attr ) ;  # used right before the tag closer: key = [unquoted value], or a bare token
+  AttrSet = ( Attr >new_attr %save_attr | Nmtoken >new_attr space+ %save_attr ) ;  # an Attr, or a bare token followed by whitespace
+  StartTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? ">" | "<" NameCap ">";  # opening tag, with or without attributes
+  EmptyTag = "<" NameCap space+ AttrSet* (AttrEnd >new_attr %save_attr)? "/>" | "<" NameCap "/>" ;  # same shape as StartTag but closed with "/>"
+
+  EndTag = "</" NameCap space* ">" ;  # closing tag; whitespace allowed before ">"
+  XmlVersionNum = [a-zA-Z0-9_.:\-]+ >_aval %xmlver ;  # version value: _aval on entry, xmlver on exit
+  XmlVersionInfo = space+ "version" space* "=" space* ("'" XmlVersionNum "'" | '"' XmlVersionNum '"' ) ;
+  XmlEncName = [A-Za-z] >_aval [A-Za-z0-9._\-]* %xmlenc ;  # encoding name; must start with a letter
+  XmlEncodingDecl = space+ "encoding" space* "=" space* ("'" XmlEncName "'" | '"' XmlEncName '"' ) ;
+  XmlYesNo = ("yes" | "no") >_aval %xmlsd ;  # standalone value: _aval on entry, xmlsd on exit
+  XmlSDDecl = space+ "standalone" space* "=" space* ("'" XmlYesNo "'" | '"' XmlYesNo '"') ;
+  XmlDecl = "<?xml" XmlVersionInfo XmlEncodingDecl? XmlSDDecl? space* "?"? ">" ;  # version is required; encoding, standalone and the "?" before ">" are optional
+
+  SystemLiteral = '"' [^"]* >_aval %sysid '"' | "'" [^']* >_aval %sysid "'" ;  # quoted string; sysid fires when the quoted text ends
+  PubidLiteral = '"' [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*  >_aval %pubid '"' |
+    "'" [\t a-zA-Z0-9\-'()+,./:=?;!*\#@$_%]* >_aval %pubid "'" ;
+  ExternalID = ( "SYSTEM" | "PUBLIC" space+ PubidLiteral ) (space+ SystemLiteral)? ;  # SYSTEM, or PUBLIC plus a pubid; the system literal is optional
+  DocType = "<!DOCTYPE" space+ NameCap (space+ ExternalID)? space* ("[" [^\]]* "]" space*)? ">" ;  # optional ExternalID, then an optional bracketed [...] section
+  StartXmlProcIns = "<?" Name >{ TEXT_PASS(); } space+ ;  # TEXT_PASS() runs on entering Name
+  EndXmlProcIns = "?"? ">" ;  # ">" optionally preceded by "?"
+
+  html_comment := |*  # scanner used after StartComment is seen in main
+    EndComment @{ EBLK(comment, 3); fgoto main; };  # NOTE(review): 3 presumably is the length of EndComment - confirm against EBLK
+    any | newline { TEXT_PASS(); };  # everything else goes through TEXT_PASS()
+  *|;
+
+  html_cdata := |*  # scanner used after StartCdata is seen in main
+    EndCdata @{ EBLK(cdata, 3); fgoto main; };  # NOTE(review): 3 presumably is the length of EndCdata - confirm against EBLK
+    any | newline { TEXT_PASS(); };  # everything else goes through TEXT_PASS()
+  *|;
+
+  html_procins := |*  # scanner used after StartXmlProcIns is seen in main
+    EndXmlProcIns @{ EBLK(procins, 2); fgoto main; };  # NOTE(review): 2 presumably is the length of "?>" - confirm against EBLK
+    any | newline { TEXT_PASS(); };  # everything else goes through TEXT_PASS()
+  *|;
+
+  main := |*  # top-level scanner; every markup pattern runs newEle on entry
+    XmlDecl >newEle { ELE(xmldecl); };
+    DocType >newEle { ELE(doctype); };
+    StartXmlProcIns >newEle { fgoto html_procins; };  # hand off to html_procins
+    StartTag >newEle { ELE(stag); };
+    EndTag >newEle { ELE(etag); };
+    EmptyTag >newEle { ELE(emptytag); };
+    StartComment >newEle { fgoto html_comment; };  # hand off to html_comment
+    StartCdata >newEle { fgoto html_cdata; };  # hand off to html_cdata
+    any | newline { TEXT_PASS(); };  # anything that is not markup goes through TEXT_PASS()
+  *|;
+
+}%%;