HTML Parser should produce 8bit substrings for inline style and script elements

https://bugs.webkit.org/show_bug.cgi?id=93742 Reviewed by Benjamin Poulain. Source/WebCore: Currently all data associated with a token is stored and processed as UChars. Added code to determine whether the contents of the token data are all 8 bit by keeping the bitwise OR of all prior characters. Also added a flag that the parser can set to indicate that, when the token data is converted to a String, we want to make an 8 bit string if possible. Enabled this handling for script, style, iframe, noembed, noframes, noscript and xmp tags. No new tests. Existing tests provide coverage. * html/parser/HTMLTokenizer.cpp: (WebCore::HTMLTokenizer::nextToken): * html/parser/HTMLTreeBuilder.cpp: (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer): (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData): (HTMLTreeBuilder::ExternalCharacterTokenBuffer): (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining): * xml/parser/MarkupTokenBase.h: (WebCore::MarkupTokenBase::clear): (WebCore::MarkupTokenBase::appendToCharacter): (MarkupTokenBase): (WebCore::MarkupTokenBase::eraseCharacters): (WebCore::MarkupTokenBase::setConvertTo8BitIfPossible): (WebCore::MarkupTokenBase::isAll8BitData): (WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase): (WebCore::AtomicMarkupTokenBase::isAll8BitData): (AtomicMarkupTokenBase): (WebCore::AtomicMarkupTokenBase::clearExternalCharacters): Source/WTF: Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator that takes a pointer to a UChar array that is known to contain only 8 bit characters (LChars). Added new helper method to copy the contents of a UChar buffer to an LChar buffer. The helper method uses SSE2 intrinsics on Darwin x86 and x86-64 builds for performance. 
* wtf/Alignment.h: (WTF::isAlignedTo): * wtf/text/ASCIIFastPath.h: (WTF::copyLCharsFromUCharSource): * wtf/text/WTFString.cpp: (WTF::String::make8BitFrom16BitSource): * wtf/text/WTFString.h: (String): (WTF::isAllSpecialCharacters): (WTF::String::isAllSpecialCharacters): git-svn-id: http://svn.webkit.org/repository/webkit/trunk@125846 268f45cc-cd09-0410-ab3c-d52691b4dbfc
mikewest · Aug 17, 2012 · 3e121cf · 3e121cf
1 parent b1ab1b7
commit 3e121cf
Show file tree

Hide file tree

Showing 9 changed files with 186 additions and 5 deletions.
diff --git a/Source/WTF/ChangeLog b/Source/WTF/ChangeLog
@@ -1,3 +1,27 @@
+2012-08-16  Michael Saboff  <msaboff@apple.com>
+
+        HTML Parser should produce 8bit substrings for inline style and script elements
+        https://bugs.webkit.org/show_bug.cgi?id=93742
+
+        Reviewed by Benjamin Poulain.
+
+        Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator
+        that takes a pointer to a UChar array that is known to contain only 8 bit
+        characters (LChars). Added new helper method to copy the contents of a
+        UChar buffer to an LChar buffer. The helper method uses SSE2 intrinsics
+        on Darwin x86 and x86-64 builds for performance.
+
+        * wtf/Alignment.h:
+        (WTF::isAlignedTo):
+        * wtf/text/ASCIIFastPath.h:
+        (WTF::copyLCharsFromUCharSource):
+        * wtf/text/WTFString.cpp:
+        (WTF::String::make8BitFrom16BitSource):
+        * wtf/text/WTFString.h:
+        (String):
+        (WTF::isAllSpecialCharacters):
+        (WTF::String::isAllSpecialCharacters):
+
 2012-08-16  Benjamin Poulain  <bpoulain@apple.com>
 
         Use initialization from literals for StringStatics

diff --git a/Source/WTF/wtf/Alignment.h b/Source/WTF/wtf/Alignment.h
@@ -23,6 +23,8 @@
 
 #include <wtf/Platform.h>
 #include <algorithm>
+#include <stdint.h>
+
 
 namespace WTF {
 
@@ -58,6 +60,11 @@ namespace WTF {
             std::swap(a.buffer[i], b.buffer[i]);
     }
 
+    template <uintptr_t mask> // mask: the address bits that must all be zero, e.g. 15 to test 16-byte alignment.
+    inline bool isAlignedTo(const void* pointer) // Returns true when none of the mask bits are set in pointer's address.
+    {
+        return !(reinterpret_cast<uintptr_t>(pointer) & mask);
+    }
 }
 
 #endif // WTF_Alignment_h
diff --git a/Source/WTF/wtf/text/ASCIIFastPath.h b/Source/WTF/wtf/text/ASCIIFastPath.h
@@ -22,7 +22,11 @@
 #ifndef ASCIIFastPath_h
 #define ASCIIFastPath_h
 
+#if OS(DARWIN) && (CPU(X86) || CPU(X86_64))
+#include <emmintrin.h>
+#endif
 #include <stdint.h>
+#include <wtf/Alignment.h>
 #include <wtf/unicode/Unicode.h>
 
 namespace WTF {
@@ -95,6 +99,45 @@ inline bool charactersAreAllASCII(const CharacterType* characters, size_t length
     return !(allCharBits & nonASCIIBitMask);
 }
 
+inline void copyLCharsFromUCharSource(LChar* destination, const UChar* source, size_t length) // Narrows length UChars from source into destination; every source value must fit in 8 bits (ASSERTed below).
+{
+#if OS(DARWIN) && (CPU(X86) || CPU(X86_64)) // Vectorized path, compiled only for Darwin on x86 / x86-64.
+    const uintptr_t memoryAccessSize = 16; // Memory accesses on 16 byte (128 bit) alignment
+    const uintptr_t memoryAccessMask = memoryAccessSize - 1; // Address bits that are zero when a pointer is 16-byte aligned.
+
+    size_t i = 0;
+    for (;i < length && !isAlignedTo<memoryAccessMask>(&source[i]); ++i) { // Scalar prologue: copy one character at a time until &source[i] is 16-byte aligned.
+        ASSERT(!(source[i] & 0xff00));
+        destination[i] = static_cast<LChar>(source[i]);
+    }
+
+    const uintptr_t sourceLoadSize = 32; // Process 32 bytes (16 UChars) each iteration
+    const unsigned ucharsPerLoop = sourceLoadSize / sizeof(UChar); // Number of UChars consumed per vector iteration.
+    if (length > ucharsPerLoop) { // Guard keeps the unsigned subtraction below from wrapping.
+        const unsigned endLength = length - ucharsPerLoop + 1; // A full block of ucharsPerLoop characters still fits while i < endLength.
+        for (; i < endLength; i += ucharsPerLoop) {
+#ifndef NDEBUG // Debug builds re-check the 8-bit precondition for every character of the block.
+            for (unsigned checkIndex = 0; checkIndex < ucharsPerLoop; checkIndex++)
+                ASSERT(!(source[i+checkIndex] & 0xff00));
+#endif
+            __m128i first8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i])); // Aligned 128-bit load; relies on the prologue having aligned &source[i].
+            __m128i second8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i+8])); // Next 16 bytes, so the same alignment holds.
+            __m128i packedChars = _mm_packus_epi16(first8UChars, second8UChars); // Packs sixteen 16-bit lanes into sixteen bytes with unsigned saturation; exact for values <= 0xff.
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(&destination[i]), packedChars); // Unaligned store: nothing here establishes destination's alignment.
+        }
+    }
+
+    for (; i < length; ++i) { // Scalar epilogue for the characters left after the last full block.
+        ASSERT(!(source[i] & 0xff00));
+        destination[i] = static_cast<LChar>(source[i]);
+    }
+#else // Portable fallback: plain element-wise narrowing copy.
+    for (size_t i = 0; i < length; ++i) {
+        ASSERT(!(source[i] & 0xff00));
+        destination[i] = static_cast<LChar>(source[i]);
+    }
+#endif
+}
 
 } // namespace WTF
 

diff --git a/Source/WTF/wtf/text/WTFString.cpp b/Source/WTF/wtf/text/WTFString.cpp
@@ -32,6 +32,10 @@
 #include <wtf/dtoa.h>
 #include <wtf/unicode/UTF8.h>
 #include <wtf/unicode/Unicode.h>
+#if OS(DARWIN) && (CPU(X86) || CPU(X86_64)) // SSE headers are x86-only; use the same guard as ASCIIFastPath.h so non-x86 Darwin targets still build.
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
 
 using namespace std;
 
@@ -775,6 +779,19 @@ CString String::utf8(bool strict) const
     return CString(bufferVector.data(), buffer - bufferVector.data());
 }
 
+String String::make8BitFrom16BitSource(const UChar* source, size_t length) // Builds a String with LChar storage from length UChars; the copy helper ASSERTs that each one fits in 8 bits.
+{
+    if (!length)
+        return String(); // Zero length: return a default-constructed String.
+
+    LChar* destination;
+    String result = String::createUninitialized(length, destination); // NOTE(review): length is size_t; confirm createUninitialized cannot truncate it (the adjacent fromUTF8 checks against numeric_limits<unsigned>::max()).
+
+    copyLCharsFromUCharSource(destination, source, length); // Narrowing copy into the buffer that createUninitialized exposed through destination.
+
+    return result;
+}
+
 String String::fromUTF8(const LChar* stringStart, size_t length)
 {
     if (length > numeric_limits<unsigned>::max())

diff --git a/Source/WTF/wtf/text/WTFString.h b/Source/WTF/wtf/text/WTFString.h
@@ -102,7 +102,8 @@ enum FloatConversionFlags {
     ShouldTruncateTrailingZeros = 1 << 2
 };
 
-template<bool isSpecialCharacter(UChar)> bool isAllSpecialCharacters(const UChar*, size_t);
+template<bool isSpecialCharacter(UChar), typename CharacterType>
+bool isAllSpecialCharacters(const CharacterType*, size_t);
 
 class String {
 public:
@@ -404,6 +405,8 @@ class String {
     operator BlackBerry::WebKit::WebString() const;
 #endif
 
+    WTF_EXPORT_STRING_API static String make8BitFrom16BitSource(const UChar*, size_t);
+
     // String::fromUTF8 will return a null string if
     // the input data contains invalid UTF-8 sequences.
     WTF_EXPORT_STRING_API static String fromUTF8(const LChar*, size_t);
@@ -578,7 +581,8 @@ inline void appendNumber(Vector<CharacterType>& vector, unsigned char number)
     }
 }
 
-template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(const UChar* characters, size_t length)
+template<bool isSpecialCharacter(UChar), typename CharacterType>
+inline bool isAllSpecialCharacters(const CharacterType* characters, size_t length)
 {
     for (size_t i = 0; i < length; ++i) {
         if (!isSpecialCharacter(characters[i]))
@@ -587,9 +591,17 @@ template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(cons
     return true;
 }
 
-template<bool isSpecialCharacter(UChar)> inline bool String::isAllSpecialCharacters() const
+template<bool isSpecialCharacter(UChar)>
+inline bool String::isAllSpecialCharacters() const // True when isSpecialCharacter accepts every character of this string; true for an empty string.
 {
-    return WTF::isAllSpecialCharacters<isSpecialCharacter>(characters(), length());
+    size_t len = length();
+
+    if (!len)
+        return true; // Empty string: there is nothing to test.
+
+    if (is8Bit())
+        return WTF::isAllSpecialCharacters<isSpecialCharacter, LChar>(characters8(), len); // 8-bit path: scan the LChar buffer from characters8().
+    return WTF::isAllSpecialCharacters<isSpecialCharacter, UChar>(characters(), len); // Otherwise scan the UChar buffer from characters().
 }
 
 // StringHash is the default hash for String

diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
@@ -1,3 +1,38 @@
+2012-08-16  Michael Saboff  <msaboff@apple.com>
+
+        HTML Parser should produce 8bit substrings for inline style and script elements
+        https://bugs.webkit.org/show_bug.cgi?id=93742
+
+        Reviewed by Benjamin Poulain.
+
+        Currently all data associated with a token is stored and processed as UChars.
+        Added code to determine whether the contents of the token data are all 8 bit by
+        keeping the bitwise OR of all prior characters. Also added a flag that the parser
+        can set to indicate that, when the token data is converted to a String, we want
+        to make an 8 bit string if possible. Enabled this handling for script, style,
+        iframe, noembed, noframes, noscript and xmp tags.
+
+        No new tests. Existing tests provide coverage.
+
+        * html/parser/HTMLTokenizer.cpp:
+        (WebCore::HTMLTokenizer::nextToken):
+        * html/parser/HTMLTreeBuilder.cpp:
+        (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer):
+        (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData):
+        (HTMLTreeBuilder::ExternalCharacterTokenBuffer):
+        (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining):
+        * xml/parser/MarkupTokenBase.h:
+        (WebCore::MarkupTokenBase::clear):
+        (WebCore::MarkupTokenBase::appendToCharacter):
+        (MarkupTokenBase):
+        (WebCore::MarkupTokenBase::eraseCharacters):
+        (WebCore::MarkupTokenBase::setConvertTo8BitIfPossible):
+        (WebCore::MarkupTokenBase::isAll8BitData):
+        (WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase):
+        (WebCore::AtomicMarkupTokenBase::isAll8BitData):
+        (AtomicMarkupTokenBase):
+        (WebCore::AtomicMarkupTokenBase::clearExternalCharacters):
+
 2012-08-16  Michelangelo De Simone  <michelangelo@webkit.org>
 
         [Part 3] Parse the custom() function in -webkit-filter: parse the 3d-transforms parameters

diff --git a/Source/WebCore/html/parser/HTMLTokenizer.cpp b/Source/WebCore/html/parser/HTMLTokenizer.cpp
@@ -474,6 +474,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
             } else if (cc == '>') {
                 if (isAppropriateEndTag()) {
                     m_temporaryBuffer.append(cc);
+                    m_token->setConvertTo8BitIfPossible();
                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
                 }
             }
@@ -543,6 +544,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
             } else if (cc == '>') {
                 if (isAppropriateEndTag()) {
                     m_temporaryBuffer.append(cc);
+                    m_token->setConvertTo8BitIfPossible();
                     return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
                 }
             }

diff --git a/Source/WebCore/html/parser/HTMLTreeBuilder.cpp b/Source/WebCore/html/parser/HTMLTreeBuilder.cpp
@@ -249,13 +249,15 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
     explicit ExternalCharacterTokenBuffer(AtomicHTMLToken* token) // Sets [m_current, m_end) to span the token's character vector; ASSERTs it is not empty.
         : m_current(token->characters().data())
         , m_end(m_current + token->characters().size())
+        , m_isAll8BitData(token->isAll8BitData()) // Carry over the token's all-8-bit flag.
     {
         ASSERT(!isEmpty());
     }
 
     explicit ExternalCharacterTokenBuffer(const String& string) // Sets [m_current, m_end) to span string.characters(); ASSERTs it is not empty.
         : m_current(string.characters())
         , m_end(m_current + string.length())
+        , m_isAll8BitData(string.length() && string.is8Bit()) // Only a non-empty String reporting is8Bit() is flagged as all-8-bit data.
     {
         ASSERT(!isEmpty());
     }
@@ -267,6 +269,8 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
 
     bool isEmpty() const { return m_current == m_end; }
 
+    bool isAll8BitData() const { return m_isAll8BitData; } // Flag captured at construction; takeRemaining() uses it to choose the 8-bit String path.
+
     void skipAtMostOneLeadingNewline()
     {
         ASSERT(!isEmpty());
@@ -294,7 +298,12 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
         ASSERT(!isEmpty());
         const UChar* start = m_current;
         m_current = m_end;
-        return String(start, m_current - start);
+        size_t length = m_current - start;
+
+        if (isAll8BitData())
+            return String::make8BitFrom16BitSource(start, length);
+
+        return String(start, length);
     }
 
     void giveRemainingTo(StringBuilder& recipient)
@@ -344,6 +353,7 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
 
     const UChar* m_current;
     const UChar* m_end;
+    bool m_isAll8BitData;
 };
 
 

diff --git a/Source/WebCore/xml/parser/MarkupTokenBase.h b/Source/WebCore/xml/parser/MarkupTokenBase.h
@@ -97,6 +97,8 @@ class MarkupTokenBase {
         m_range.m_end = 0;
         m_baseOffset = 0;
         m_data.clear();
+        m_orAllData = 0;
+        m_convertTo8BitIfPossible = false;
     }
 
     bool isUninitialized() { return m_type == TypeSet::Uninitialized; }
@@ -172,6 +174,13 @@ class MarkupTokenBase {
         m_data.append(character);
     }
 
+    void appendToCharacter(UChar character) // Appends one character to a Character token's data and folds it into the 8-bit tracking accumulator.
+    {
+        ASSERT(m_type == TypeSet::Character);
+        m_data.append(character);
+        m_orAllData |= character; // Bitwise OR of everything appended; isAll8BitData() compares it against 0xff.
+    }
+
     template<typename T>
     void appendToCharacter(T characters)
     {
@@ -274,6 +283,7 @@ class MarkupTokenBase {
     {
         ASSERT(m_type == TypeSet::Character);
         m_data.clear();
+        m_orAllData = 0;
     }
 
     void eraseValueOfAttribute(size_t i)
@@ -294,6 +304,16 @@ class MarkupTokenBase {
         return m_data;
     }
 
+    void setConvertTo8BitIfPossible() // Opts this token in to 8-bit conversion; the flag is reset to false by clear().
+    {
+        m_convertTo8BitIfPossible = true;
+    }
+
+    bool isAll8BitData() const // True only when conversion was requested and the accumulated OR of appended characters is <= 0xff.
+    {
+        return m_convertTo8BitIfPossible && (m_orAllData <= 0xff); // NOTE(review): only the UChar overload of appendToCharacter is seen updating m_orAllData; confirm the other append paths do too.
+    }
+
     // FIXME: Distinguish between a missing public identifier and an empty one.
     const WTF::Vector<UChar>& publicIdentifier() const
     {
@@ -370,6 +390,8 @@ class MarkupTokenBase {
     typename Attribute::Range m_range; // Always starts at zero.
     int m_baseOffset;
     DataVector m_data;
+    UChar m_orAllData;
+    bool m_convertTo8BitIfPossible;
 
     // For DOCTYPE
     OwnPtr<DoctypeData> m_doctypeData;
@@ -413,6 +435,7 @@ class AtomicMarkupTokenBase {
             break;
         case Token::Type::Character:
             m_externalCharacters = &token->characters();
+            m_isAll8BitData = token->isAll8BitData();
             break;
         default:
             break;
@@ -423,6 +446,7 @@ class AtomicMarkupTokenBase {
         : m_type(type)
         , m_name(name)
         , m_externalCharacters(0)
+        , m_isAll8BitData(false)
         , m_attributes(attributes)
     {
         ASSERT(usesName());
@@ -472,6 +496,11 @@ class AtomicMarkupTokenBase {
         return *m_externalCharacters;
     }
 
+    bool isAll8BitData() const // Returns the flag copied from the source token for Character tokens; false after clearExternalCharacters().
+    {
+        return m_isAll8BitData;
+    }
+
     const String& comment() const
     {
         ASSERT(m_type == Token::Type::Comment);
@@ -495,6 +524,7 @@ class AtomicMarkupTokenBase {
     void clearExternalCharacters() // Drops the pointer to the external character vector and the flag that described it.
     {
         m_externalCharacters = 0;
+        m_isAll8BitData = false; // The flag described the characters just released, so reset it with them.
     }
 
 protected:
@@ -522,6 +552,7 @@ class AtomicMarkupTokenBase {
     // FIXME: Add a mechanism for "internalizing" the characters when the
     //        HTMLToken is destructed.
     const typename Token::DataVector* m_externalCharacters;
+    bool m_isAll8BitData;
 
     // For DOCTYPE
     OwnPtr<typename Token::DoctypeData> m_doctypeData;