Permalink
Browse files

HTML Parser should produce 8bit substrings for inline style and script elements


https://bugs.webkit.org/show_bug.cgi?id=93742

Reviewed by Benjamin Poulain.

Source/WebCore: 

Currently all data associated with a token is stored and processed as UChars.
Added code to determine whether the contents of the token data are all 8 bit by
keeping the bitwise OR of all prior characters. Also added a flag that the parser
can set to indicate that, when the token data is converted to a String, we want
to make an 8 bit string if possible. Enabled this handling for script, style,
iframe, noembed, noframes, noscript and xmp tags.

No new tests. Existing tests provide coverage.

* html/parser/HTMLTokenizer.cpp:
(WebCore::HTMLTokenizer::nextToken):
* html/parser/HTMLTreeBuilder.cpp:
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData):
(HTMLTreeBuilder::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining):
* xml/parser/MarkupTokenBase.h:
(WebCore::MarkupTokenBase::clear):
(WebCore::MarkupTokenBase::appendToCharacter):
(MarkupTokenBase):
(WebCore::MarkupTokenBase::eraseCharacters):
(WebCore::MarkupTokenBase::setConvertTo8Bit):
(WebCore::MarkupTokenBase::isAll8BitData):
(WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::isAll8BitData):
(AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::clearExternalCharacters):

Source/WTF: 

Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator
that takes a pointer to a UChar array that is known to contain only 8 bit
characters (LChars). Added new helper method to copy the contents of a
UChar buffer to an LChar buffer. On Darwin x86 and x86-64 builds the helper
method uses SSE2 intrinsics for performance.

* wtf/Alignment.h:
(WTF::isAlignedTo):
* wtf/text/ASCIIFastPath.h:
(WTF::copyLCharsFromUCharSource):
* wtf/text/WTFString.cpp:
(WTF::String::make8BitFrom16BitSource):
* wtf/text/WTFString.h:
(String):
(WTF::isAllSpecialCharacters):
(WTF::String::isAllSpecialCharacters):


git-svn-id: http://svn.webkit.org/repository/webkit/trunk@125846 268f45cc-cd09-0410-ab3c-d52691b4dbfc
  • Loading branch information...
1 parent b1ab1b7 commit 3e121cfcfe475f37f11614b9c531056fe8eea4ac @msaboff msaboff committed Aug 17, 2012
View
@@ -1,3 +1,27 @@
+2012-08-16 Michael Saboff <msaboff@apple.com>
+
+ HTML Parser should produce 8bit substrings for inline style and script elements
+ https://bugs.webkit.org/show_bug.cgi?id=93742
+
+ Reviewed by Benjamin Poulain.
+
+ Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator
+ that takes a pointer to a UChar array that is known to contain only 8 bit
+ characters (LChars). Added new helper method to copy the contents of a
+ UChar buffer to an LChar buffer. On Darwin x86 and x86-64 builds the helper
+ method uses SSE2 intrinsics for performance.
+
+ * wtf/Alignment.h:
+ (WTF::isAlignedTo):
+ * wtf/text/ASCIIFastPath.h:
+ (WTF::copyLCharsFromUCharSource):
+ * wtf/text/WTFString.cpp:
+ (WTF::String::make8BitFrom16BitSource):
+ * wtf/text/WTFString.h:
+ (String):
+ (WTF::isAllSpecialCharacters):
+ (WTF::String::isAllSpecialCharacters):
+
2012-08-16 Benjamin Poulain <bpoulain@apple.com>
Use initialization from literals for StringStatics
@@ -23,6 +23,8 @@
#include <wtf/Platform.h>
#include <algorithm>
+#include <stdint.h>
+
namespace WTF {
@@ -58,6 +60,11 @@ namespace WTF {
std::swap(a.buffer[i], b.buffer[i]);
}
+ template <uintptr_t mask> // mask: bits tested against the pointer's address.
+ inline bool isAlignedTo(const void* pointer) // Returns true when none of the mask bits are set in the address held by pointer.
+ {
+ return !(reinterpret_cast<uintptr_t>(pointer) & mask);
+ }
}
#endif // WTF_Alignment_h
@@ -22,7 +22,11 @@
#ifndef ASCIIFastPath_h
#define ASCIIFastPath_h
+#if OS(DARWIN) && (CPU(X86) || CPU(X86_64))
+#include <emmintrin.h>
+#endif
#include <stdint.h>
+#include <wtf/Alignment.h>
#include <wtf/unicode/Unicode.h>
namespace WTF {
@@ -95,6 +99,45 @@ inline bool charactersAreAllASCII(const CharacterType* characters, size_t length
return !(allCharBits & nonASCIIBitMask);
}
+inline void copyLCharsFromUCharSource(LChar* destination, const UChar* source, size_t length) // Copies length elements from source to destination, narrowing each UChar to an LChar; every source value is expected to fit in 8 bits (checked with ASSERT).
+{
+#if OS(DARWIN) && (CPU(X86) || CPU(X86_64)) // Vectorized path using the intrinsics from <emmintrin.h>.
+ const uintptr_t memoryAccessSize = 16; // Memory accesses on 16 byte (128 bit) alignment
+ const uintptr_t memoryAccessMask = memoryAccessSize - 1;
+
+ size_t i = 0;
+ for (;i < length && !isAlignedTo<memoryAccessMask>(&source[i]); ++i) { // Scalar copy until &source[i] is 16-byte aligned, which the aligned loads below require.
+ ASSERT(!(source[i] & 0xff00));
+ destination[i] = static_cast<LChar>(source[i]);
+ }
+
+ const uintptr_t sourceLoadSize = 32; // Process 32 bytes (16 UChars) each iteration
+ const unsigned ucharsPerLoop = sourceLoadSize / sizeof(UChar);
+ if (length > ucharsPerLoop) { // Guard keeps the subtraction on the next line from wrapping below zero.
+ const unsigned endLength = length - ucharsPerLoop + 1; // i < endLength means a full group of ucharsPerLoop elements remains at source[i]. NOTE(review): length is size_t but endLength is unsigned, so the value is narrowed where size_t is wider.
+ for (; i < endLength; i += ucharsPerLoop) {
+#ifndef NDEBUG // Per-element 8-bit check of the group about to be packed.
+ for (unsigned checkIndex = 0; checkIndex < ucharsPerLoop; checkIndex++)
+ ASSERT(!(source[i+checkIndex] & 0xff00));
+#endif
+ __m128i first8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i]));
+ __m128i second8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i+8]));
+ __m128i packedChars = _mm_packus_epi16(first8UChars, second8UChars); // Packs the sixteen 16-bit lanes into sixteen bytes with unsigned saturation; no value changes when every lane is <= 0xff.
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&destination[i]), packedChars); // Unaligned store: the alignment of destination is never checked here.
+ }
+ }
+
+ for (; i < length; ++i) { // Scalar copy of whatever the vector loop left over.
+ ASSERT(!(source[i] & 0xff00));
+ destination[i] = static_cast<LChar>(source[i]);
+ }
+#else // Fallback for other platforms: element-by-element narrowing.
+ for (size_t i = 0; i < length; ++i) {
+ ASSERT(!(source[i] & 0xff00));
+ destination[i] = static_cast<LChar>(source[i]);
+ }
+#endif
+}
} // namespace WTF
@@ -32,6 +32,10 @@
#include <wtf/dtoa.h>
#include <wtf/unicode/UTF8.h>
#include <wtf/unicode/Unicode.h>
+#if OS(DARWIN)
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
using namespace std;
@@ -775,6 +779,19 @@ CString String::utf8(bool strict) const
return CString(bufferVector.data(), buffer - bufferVector.data());
}
+String String::make8BitFrom16BitSource(const UChar* source, size_t length) // Builds a String from length UChars by narrowing each one to an LChar with copyLCharsFromUCharSource(); the source is expected to hold only 8 bit values.
+{
+ if (!length) // Zero length yields a default-constructed String.
+ return String();
+
+ LChar* destination;
+ String result = String::createUninitialized(length, destination); // Presumably allocates an LChar buffer of length elements and points destination at it — confirm against String::createUninitialized.
+
+ copyLCharsFromUCharSource(destination, source, length);
+
+ return result;
+}
+
String String::fromUTF8(const LChar* stringStart, size_t length)
{
if (length > numeric_limits<unsigned>::max())
@@ -102,7 +102,8 @@ enum FloatConversionFlags {
ShouldTruncateTrailingZeros = 1 << 2
};
-template<bool isSpecialCharacter(UChar)> bool isAllSpecialCharacters(const UChar*, size_t);
+template<bool isSpecialCharacter(UChar), typename CharacterType>
+bool isAllSpecialCharacters(const CharacterType*, size_t);
class String {
public:
@@ -404,6 +405,8 @@ class String {
operator BlackBerry::WebKit::WebString() const;
#endif
+ WTF_EXPORT_STRING_API static String make8BitFrom16BitSource(const UChar*, size_t);
+
// String::fromUTF8 will return a null string if
// the input data contains invalid UTF-8 sequences.
WTF_EXPORT_STRING_API static String fromUTF8(const LChar*, size_t);
@@ -578,7 +581,8 @@ inline void appendNumber(Vector<CharacterType>& vector, unsigned char number)
}
}
-template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(const UChar* characters, size_t length)
+template<bool isSpecialCharacter(UChar), typename CharacterType>
+inline bool isAllSpecialCharacters(const CharacterType* characters, size_t length)
{
for (size_t i = 0; i < length; ++i) {
if (!isSpecialCharacter(characters[i]))
@@ -587,9 +591,17 @@ template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(cons
return true;
}
-template<bool isSpecialCharacter(UChar)> inline bool String::isAllSpecialCharacters() const
+template<bool isSpecialCharacter(UChar)>
+inline bool String::isAllSpecialCharacters() const // Returns true when isSpecialCharacter accepts every character of this string.
{
- return WTF::isAllSpecialCharacters<isSpecialCharacter>(characters(), length());
+ size_t len = length();
+
+ if (!len) // An empty string yields true.
+ return true;
+
+ if (is8Bit()) // 8-bit strings are scanned through characters8() as LChars.
+ return WTF::isAllSpecialCharacters<isSpecialCharacter, LChar>(characters8(), len);
+ return WTF::isAllSpecialCharacters<isSpecialCharacter, UChar>(characters(), len); // Otherwise scan the UChar data.
}
// StringHash is the default hash for String
View
@@ -1,3 +1,38 @@
+2012-08-16 Michael Saboff <msaboff@apple.com>
+
+ HTML Parser should produce 8bit substrings for inline style and script elements
+ https://bugs.webkit.org/show_bug.cgi?id=93742
+
+ Reviewed by Benjamin Poulain.
+
+ Currently all data associated with a token is stored and processed as UChars.
+ Added code to determine whether the contents of the token data are all 8 bit by
+ keeping the bitwise OR of all prior characters. Also added a flag that the parser
+ can set to indicate that, when the token data is converted to a String, we want
+ to make an 8 bit string if possible. Enabled this handling for script, style,
+ iframe, noembed, noframes, noscript and xmp tags.
+
+ No new tests. Existing tests provide coverage.
+
+ * html/parser/HTMLTokenizer.cpp:
+ (WebCore::HTMLTokenizer::nextToken):
+ * html/parser/HTMLTreeBuilder.cpp:
+ (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer):
+ (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData):
+ (HTMLTreeBuilder::ExternalCharacterTokenBuffer):
+ (WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining):
+ * xml/parser/MarkupTokenBase.h:
+ (WebCore::MarkupTokenBase::clear):
+ (WebCore::MarkupTokenBase::appendToCharacter):
+ (MarkupTokenBase):
+ (WebCore::MarkupTokenBase::eraseCharacters):
+ (WebCore::MarkupTokenBase::setConvertTo8Bit):
+ (WebCore::MarkupTokenBase::isAll8BitData):
+ (WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase):
+ (WebCore::AtomicMarkupTokenBase::isAll8BitData):
+ (AtomicMarkupTokenBase):
+ (WebCore::AtomicMarkupTokenBase::clearExternalCharacters):
+
2012-08-16 Michelangelo De Simone <michelangelo@webkit.org>
[Part 3] Parse the custom() function in -webkit-filter: parse the 3d-transforms parameters
@@ -474,6 +474,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
} else if (cc == '>') {
if (isAppropriateEndTag()) {
m_temporaryBuffer.append(cc);
+ m_token->setConvertTo8BitIfPossible();
return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
}
}
@@ -543,6 +544,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
} else if (cc == '>') {
if (isAppropriateEndTag()) {
m_temporaryBuffer.append(cc);
+ m_token->setConvertTo8BitIfPossible();
return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
}
}
@@ -249,13 +249,15 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
explicit ExternalCharacterTokenBuffer(AtomicHTMLToken* token) // Sets up a [m_current, m_end) range over the character data of token.
: m_current(token->characters().data())
, m_end(m_current + token->characters().size())
+ , m_isAll8BitData(token->isAll8BitData()) // Keep the token's all-8-bit answer for later queries.
{
ASSERT(!isEmpty()); // The token must carry at least one character.
}
explicit ExternalCharacterTokenBuffer(const String& string) // Sets up a [m_current, m_end) range over the characters of string.
: m_current(string.characters())
, m_end(m_current + string.length())
+ , m_isAll8BitData(string.length() && string.is8Bit()) // True only for a non-empty string that reports is8Bit().
{
ASSERT(!isEmpty()); // The string must not be empty.
}
@@ -267,6 +269,8 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
bool isEmpty() const { return m_current == m_end; }
+ bool isAll8BitData() const { return m_isAll8BitData; } // Returns the all-8-bit flag captured at construction.
+
void skipAtMostOneLeadingNewline()
{
ASSERT(!isEmpty());
@@ -294,7 +298,12 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
ASSERT(!isEmpty());
const UChar* start = m_current;
m_current = m_end;
- return String(start, m_current - start);
+ size_t length = m_current - start;
+
+ if (isAll8BitData())
+ return String::make8BitFrom16BitSource(start, length);
+
+ return String(start, length);
}
void giveRemainingTo(StringBuilder& recipient)
@@ -344,6 +353,7 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
const UChar* m_current;
const UChar* m_end;
+ bool m_isAll8BitData;
};
@@ -97,6 +97,8 @@ class MarkupTokenBase {
m_range.m_end = 0;
m_baseOffset = 0;
m_data.clear();
+ m_orAllData = 0;
+ m_convertTo8BitIfPossible = false;
}
bool isUninitialized() { return m_type == TypeSet::Uninitialized; }
@@ -172,6 +174,13 @@ class MarkupTokenBase {
m_data.append(character);
}
+ void appendToCharacter(UChar character) // Appends one UChar to the data of a Character token.
+ {
+ ASSERT(m_type == TypeSet::Character);
+ m_data.append(character);
+ m_orAllData |= character; // Accumulate the bitwise OR of appended characters; isAll8BitData() compares it against 0xff.
+ }
+
template<typename T>
void appendToCharacter(T characters)
{
@@ -274,6 +283,7 @@ class MarkupTokenBase {
{
ASSERT(m_type == TypeSet::Character);
m_data.clear();
+ m_orAllData = 0;
}
void eraseValueOfAttribute(size_t i)
@@ -294,6 +304,16 @@ class MarkupTokenBase {
return m_data;
}
+ void setConvertTo8BitIfPossible() // Sets the flag that isAll8BitData() requires before it can return true.
+ {
+ m_convertTo8BitIfPossible = true;
+ }
+
+ bool isAll8BitData() const // Reports whether the token data may be treated as 8 bit.
+ {
+ return m_convertTo8BitIfPossible && (m_orAllData <= 0xff); // True only when conversion was requested and the accumulated OR has no bit above 0xff.
+ }
+
// FIXME: Distinguish between a missing public identifier and an empty one.
const WTF::Vector<UChar>& publicIdentifier() const
{
@@ -370,6 +390,8 @@ class MarkupTokenBase {
typename Attribute::Range m_range; // Always starts at zero.
int m_baseOffset;
DataVector m_data;
+ UChar m_orAllData;
+ bool m_convertTo8BitIfPossible;
// For DOCTYPE
OwnPtr<DoctypeData> m_doctypeData;
@@ -413,6 +435,7 @@ class AtomicMarkupTokenBase {
break;
case Token::Type::Character:
m_externalCharacters = &token->characters();
+ m_isAll8BitData = token->isAll8BitData();
break;
default:
break;
@@ -423,6 +446,7 @@ class AtomicMarkupTokenBase {
: m_type(type)
, m_name(name)
, m_externalCharacters(0)
+ , m_isAll8BitData(false)
, m_attributes(attributes)
{
ASSERT(usesName());
@@ -472,6 +496,11 @@ class AtomicMarkupTokenBase {
return *m_externalCharacters;
}
+ bool isAll8BitData() const // Returns the stored m_isAll8BitData flag.
+ {
+ return m_isAll8BitData;
+ }
+
const String& comment() const
{
ASSERT(m_type == Token::Type::Comment);
@@ -495,6 +524,7 @@ class AtomicMarkupTokenBase {
void clearExternalCharacters() // Drops the pointer to the external character data and resets the 8-bit flag with it.
{
m_externalCharacters = 0;
+ m_isAll8BitData = false; // With no characters referenced, the flag no longer describes anything.
}
protected:
@@ -522,6 +552,7 @@ class AtomicMarkupTokenBase {
// FIXME: Add a mechanism for "internalizing" the characters when the
// HTMLToken is destructed.
const typename Token::DataVector* m_externalCharacters;
+ bool m_isAll8BitData;
// For DOCTYPE
OwnPtr<typename Token::DoctypeData> m_doctypeData;

0 comments on commit 3e121cf

Please sign in to comment.