Skip to content

Commit

Permalink
HTML Parser should produce 8bit substrings for inline style and scrip…
Browse files Browse the repository at this point in the history
…t elements

https://bugs.webkit.org/show_bug.cgi?id=93742

Reviewed by Benjamin Poulain.

Source/WebCore: 

Currently all data associated with a token is stored and processed as UChars.
Added code to determine whether the contents of the token data are all 8 bit by
keeping the bitwise OR of all prior characters. Also added a flag that the parser
can set to indicate that, when the token data is converted to a String, we want
to make an 8 bit string if possible. Enabled this handling for script, style,
iframe, noembed, noframes, noscript and xmp tags.

No new tests. Existing tests provide coverage.

* html/parser/HTMLTokenizer.cpp:
(WebCore::HTMLTokenizer::nextToken):
* html/parser/HTMLTreeBuilder.cpp:
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData):
(HTMLTreeBuilder::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining):
* xml/parser/MarkupTokenBase.h:
(WebCore::MarkupTokenBase::clear):
(WebCore::MarkupTokenBase::appendToCharacter):
(MarkupTokenBase):
(WebCore::MarkupTokenBase::eraseCharacters):
(WebCore::MarkupTokenBase::setConvertTo8Bit):
(WebCore::MarkupTokenBase::isAll8BitData):
(WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::isAll8BitData):
(AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::clearExternalCharacters):

Source/WTF: 

Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator
that takes a pointer to a UChar array that is known to contain only 8 bit
characters (LChars). Added new helper method to copy the contents of a
UChar buffer to an LChar buffer. The helper method includes x86 and x86-64
SSE-family intrinsics for performance.

* wtf/Alignment.h:
(WTF::isAlignedTo):
* wtf/text/ASCIIFastPath.h:
(WTF::copyLCharsFromUCharSource):
* wtf/text/WTFString.cpp:
(WTF::String::make8BitFrom16BitSource):
* wtf/text/WTFString.h:
(String):
(WTF::isAllSpecialCharacters):
(WTF::String::isAllSpecialCharacters):


git-svn-id: http://svn.webkit.org/repository/webkit/trunk@125846 268f45cc-cd09-0410-ab3c-d52691b4dbfc
  • Loading branch information
msaboff committed Aug 17, 2012
1 parent b1ab1b7 commit 3e121cf
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 5 deletions.
24 changes: 24 additions & 0 deletions Source/WTF/ChangeLog
@@ -1,3 +1,27 @@
2012-08-16 Michael Saboff <msaboff@apple.com>

HTML Parser should produce 8bit substrings for inline style and script elements
https://bugs.webkit.org/show_bug.cgi?id=93742

Reviewed by Benjamin Poulain.

Added 8 bit path to String::isAllSpecialCharacters(). Added new String creator
that takes a pointer to a UChar array that is known to contain only 8 bit
characters (LChars). Added new helper method to copy the contents of a
UChar buffer to an LChar buffer. The helper method includes x86 and x86-64
SSE-family intrinsics for performance.

* wtf/Alignment.h:
(WTF::isAlignedTo):
* wtf/text/ASCIIFastPath.h:
(WTF::copyLCharsFromUCharSource):
* wtf/text/WTFString.cpp:
(WTF::String::make8BitFrom16BitSource):
* wtf/text/WTFString.h:
(String):
(WTF::isAllSpecialCharacters):
(WTF::String::isAllSpecialCharacters):

2012-08-16 Benjamin Poulain <bpoulain@apple.com> 2012-08-16 Benjamin Poulain <bpoulain@apple.com>


Use initialization from literals for StringStatics Use initialization from literals for StringStatics
Expand Down
7 changes: 7 additions & 0 deletions Source/WTF/wtf/Alignment.h
Expand Up @@ -23,6 +23,8 @@


#include <wtf/Platform.h> #include <wtf/Platform.h>
#include <algorithm> #include <algorithm>
#include <stdint.h>



namespace WTF { namespace WTF {


Expand Down Expand Up @@ -58,6 +60,11 @@ namespace WTF {
std::swap(a.buffer[i], b.buffer[i]); std::swap(a.buffer[i], b.buffer[i]);
} }


// Reports whether |pointer|'s address has none of the bits in |mask| set.
// Callers pass (alignment - 1) as the mask, e.g. 15 to test 16-byte alignment.
template <uintptr_t mask>
inline bool isAlignedTo(const void* pointer)
{
    const uintptr_t address = reinterpret_cast<uintptr_t>(pointer);
    return (address & mask) == 0;
}
} }


#endif // WTF_Alignment_h #endif // WTF_Alignment_h
43 changes: 43 additions & 0 deletions Source/WTF/wtf/text/ASCIIFastPath.h
Expand Up @@ -22,7 +22,11 @@
#ifndef ASCIIFastPath_h #ifndef ASCIIFastPath_h
#define ASCIIFastPath_h #define ASCIIFastPath_h


#if OS(DARWIN) && (CPU(X86) || CPU(X86_64))
#include <emmintrin.h>
#endif
#include <stdint.h> #include <stdint.h>
#include <wtf/Alignment.h>
#include <wtf/unicode/Unicode.h> #include <wtf/unicode/Unicode.h>


namespace WTF { namespace WTF {
Expand Down Expand Up @@ -95,6 +99,45 @@ inline bool charactersAreAllASCII(const CharacterType* characters, size_t length
return !(allCharBits & nonASCIIBitMask); return !(allCharBits & nonASCIIBitMask);
} }


// Copies |length| characters from the 16-bit |source| buffer into the 8-bit
// |destination| buffer, keeping only the low byte of each character.
//
// Precondition: every source character fits in 8 bits (the high byte is zero).
// This is ASSERTed for each character but is not otherwise enforced.
//
// On Darwin x86/x86-64 the bulk of the copy is done 16 characters at a time
// with SSE2 intrinsics; every other configuration uses a plain scalar loop.
inline void copyLCharsFromUCharSource(LChar* destination, const UChar* source, size_t length)
{
#if OS(DARWIN) && (CPU(X86) || CPU(X86_64))
    const uintptr_t memoryAccessSize = 16; // Memory accesses on 16 byte (128 bit) alignment
    const uintptr_t memoryAccessMask = memoryAccessSize - 1;

    // Scalar prologue: copy one character at a time until the source address
    // is 16-byte aligned (or the input is exhausted), because the vector loop
    // below reads the source with aligned loads.
    size_t i = 0;
    for (; i < length && !isAlignedTo<memoryAccessMask>(&source[i]); ++i) {
        ASSERT(!(source[i] & 0xff00));
        destination[i] = static_cast<LChar>(source[i]);
    }

    const uintptr_t sourceLoadSize = 32; // Process 32 bytes (16 UChars) each iteration
    const size_t ucharsPerLoop = sourceLoadSize / sizeof(UChar);
    if (length > ucharsPerLoop) {
        // One past the last index at which a full 16-character group still fits.
        // Kept as size_t so it is never narrowed relative to |length|.
        const size_t endLength = length - ucharsPerLoop + 1;
        for (; i < endLength; i += ucharsPerLoop) {
#ifndef NDEBUG
            for (size_t checkIndex = 0; checkIndex < ucharsPerLoop; ++checkIndex)
                ASSERT(!(source[i + checkIndex] & 0xff00));
#endif
            // Two aligned 128-bit loads (8 UChars each), packed down to 16 bytes.
            // The pack narrows with unsigned saturation rather than truncation,
            // so it matches the scalar cast only while the precondition holds.
            __m128i first8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i]));
            __m128i second8UChars = _mm_load_si128(reinterpret_cast<const __m128i*>(&source[i + 8]));
            __m128i packedChars = _mm_packus_epi16(first8UChars, second8UChars);
            // The destination alignment is unknown, hence the unaligned store.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(&destination[i]), packedChars);
        }
    }

    // Scalar epilogue: whatever is left after the last full 16-character group.
    for (; i < length; ++i) {
        ASSERT(!(source[i] & 0xff00));
        destination[i] = static_cast<LChar>(source[i]);
    }
#else
    for (size_t i = 0; i < length; ++i) {
        ASSERT(!(source[i] & 0xff00));
        destination[i] = static_cast<LChar>(source[i]);
    }
#endif
}


} // namespace WTF } // namespace WTF


Expand Down
17 changes: 17 additions & 0 deletions Source/WTF/wtf/text/WTFString.cpp
Expand Up @@ -32,6 +32,10 @@
#include <wtf/dtoa.h> #include <wtf/dtoa.h>
#include <wtf/unicode/UTF8.h> #include <wtf/unicode/UTF8.h>
#include <wtf/unicode/Unicode.h> #include <wtf/unicode/Unicode.h>
#if OS(DARWIN)
#include <emmintrin.h>
#include <tmmintrin.h>
#endif


using namespace std; using namespace std;


Expand Down Expand Up @@ -775,6 +779,19 @@ CString String::utf8(bool strict) const
return CString(bufferVector.data(), buffer - bufferVector.data()); return CString(bufferVector.data(), buffer - bufferVector.data());
} }


// Builds an 8-bit String from |length| UChars at |source|. The characters are
// narrowed by copyLCharsFromUCharSource(); a zero |length| yields a
// default-constructed String.
String String::make8BitFrom16BitSource(const UChar* source, size_t length)
{
    if (length) {
        LChar* narrowBuffer;
        String narrowed = String::createUninitialized(length, narrowBuffer);

        // Fill the freshly created, still uninitialized 8-bit buffer.
        copyLCharsFromUCharSource(narrowBuffer, source, length);

        return narrowed;
    }

    return String();
}

String String::fromUTF8(const LChar* stringStart, size_t length) String String::fromUTF8(const LChar* stringStart, size_t length)
{ {
if (length > numeric_limits<unsigned>::max()) if (length > numeric_limits<unsigned>::max())
Expand Down
20 changes: 16 additions & 4 deletions Source/WTF/wtf/text/WTFString.h
Expand Up @@ -102,7 +102,8 @@ enum FloatConversionFlags {
ShouldTruncateTrailingZeros = 1 << 2 ShouldTruncateTrailingZeros = 1 << 2
}; };


template<bool isSpecialCharacter(UChar)> bool isAllSpecialCharacters(const UChar*, size_t); template<bool isSpecialCharacter(UChar), typename CharacterType>
bool isAllSpecialCharacters(const CharacterType*, size_t);


class String { class String {
public: public:
Expand Down Expand Up @@ -404,6 +405,8 @@ class String {
operator BlackBerry::WebKit::WebString() const; operator BlackBerry::WebKit::WebString() const;
#endif #endif


WTF_EXPORT_STRING_API static String make8BitFrom16BitSource(const UChar*, size_t);

// String::fromUTF8 will return a null string if // String::fromUTF8 will return a null string if
// the input data contains invalid UTF-8 sequences. // the input data contains invalid UTF-8 sequences.
WTF_EXPORT_STRING_API static String fromUTF8(const LChar*, size_t); WTF_EXPORT_STRING_API static String fromUTF8(const LChar*, size_t);
Expand Down Expand Up @@ -578,7 +581,8 @@ inline void appendNumber(Vector<CharacterType>& vector, unsigned char number)
} }
} }


template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(const UChar* characters, size_t length) template<bool isSpecialCharacter(UChar), typename CharacterType>
inline bool isAllSpecialCharacters(const CharacterType* characters, size_t length)
{ {
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
if (!isSpecialCharacter(characters[i])) if (!isSpecialCharacter(characters[i]))
Expand All @@ -587,9 +591,17 @@ template<bool isSpecialCharacter(UChar)> inline bool isAllSpecialCharacters(cons
return true; return true;
} }


template<bool isSpecialCharacter(UChar)> inline bool String::isAllSpecialCharacters() const template<bool isSpecialCharacter(UChar)>
inline bool String::isAllSpecialCharacters() const
{ {
return WTF::isAllSpecialCharacters<isSpecialCharacter>(characters(), length()); size_t len = length();

if (!len)
return true;

if (is8Bit())
return WTF::isAllSpecialCharacters<isSpecialCharacter, LChar>(characters8(), len);
return WTF::isAllSpecialCharacters<isSpecialCharacter, UChar>(characters(), len);
} }


// StringHash is the default hash for String // StringHash is the default hash for String
Expand Down
35 changes: 35 additions & 0 deletions Source/WebCore/ChangeLog
@@ -1,3 +1,38 @@
2012-08-16 Michael Saboff <msaboff@apple.com>

HTML Parser should produce 8bit substrings for inline style and script elements
https://bugs.webkit.org/show_bug.cgi?id=93742

Reviewed by Benjamin Poulain.

Currently all data associated with a token is stored and processed as UChars.
Added code to determine whether the contents of the token data are all 8 bit by
keeping the bitwise OR of all prior characters. Also added a flag that the parser
can set to indicate that, when the token data is converted to a String, we want
to make an 8 bit string if possible. Enabled this handling for script, style,
iframe, noembed, noframes, noscript and xmp tags.

No new tests. Existing tests provide coverage.

* html/parser/HTMLTokenizer.cpp:
(WebCore::HTMLTokenizer::nextToken):
* html/parser/HTMLTreeBuilder.cpp:
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::isAll8BitData):
(HTMLTreeBuilder::ExternalCharacterTokenBuffer):
(WebCore::HTMLTreeBuilder::ExternalCharacterTokenBuffer::takeRemaining):
* xml/parser/MarkupTokenBase.h:
(WebCore::MarkupTokenBase::clear):
(WebCore::MarkupTokenBase::appendToCharacter):
(MarkupTokenBase):
(WebCore::MarkupTokenBase::eraseCharacters):
(WebCore::MarkupTokenBase::setConvertTo8Bit):
(WebCore::MarkupTokenBase::isAll8BitData):
(WebCore::AtomicMarkupTokenBase::AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::isAll8BitData):
(AtomicMarkupTokenBase):
(WebCore::AtomicMarkupTokenBase::clearExternalCharacters):

2012-08-16 Michelangelo De Simone <michelangelo@webkit.org> 2012-08-16 Michelangelo De Simone <michelangelo@webkit.org>


[Part 3] Parse the custom() function in -webkit-filter: parse the 3d-transforms parameters [Part 3] Parse the custom() function in -webkit-filter: parse the 3d-transforms parameters
Expand Down
2 changes: 2 additions & 0 deletions Source/WebCore/html/parser/HTMLTokenizer.cpp
Expand Up @@ -474,6 +474,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
} else if (cc == '>') { } else if (cc == '>') {
if (isAppropriateEndTag()) { if (isAppropriateEndTag()) {
m_temporaryBuffer.append(cc); m_temporaryBuffer.append(cc);
m_token->setConvertTo8BitIfPossible();
return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState); return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
} }
} }
Expand Down Expand Up @@ -543,6 +544,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
} else if (cc == '>') { } else if (cc == '>') {
if (isAppropriateEndTag()) { if (isAppropriateEndTag()) {
m_temporaryBuffer.append(cc); m_temporaryBuffer.append(cc);
m_token->setConvertTo8BitIfPossible();
return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState); return flushEmitAndResumeIn(source, HTMLTokenizerState::DataState);
} }
} }
Expand Down
12 changes: 11 additions & 1 deletion Source/WebCore/html/parser/HTMLTreeBuilder.cpp
Expand Up @@ -249,13 +249,15 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
explicit ExternalCharacterTokenBuffer(AtomicHTMLToken* token) explicit ExternalCharacterTokenBuffer(AtomicHTMLToken* token)
: m_current(token->characters().data()) : m_current(token->characters().data())
, m_end(m_current + token->characters().size()) , m_end(m_current + token->characters().size())
, m_isAll8BitData(token->isAll8BitData())
{ {
ASSERT(!isEmpty()); ASSERT(!isEmpty());
} }


explicit ExternalCharacterTokenBuffer(const String& string) explicit ExternalCharacterTokenBuffer(const String& string)
: m_current(string.characters()) : m_current(string.characters())
, m_end(m_current + string.length()) , m_end(m_current + string.length())
, m_isAll8BitData(string.length() && string.is8Bit())
{ {
ASSERT(!isEmpty()); ASSERT(!isEmpty());
} }
Expand All @@ -267,6 +269,8 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {


bool isEmpty() const { return m_current == m_end; } bool isEmpty() const { return m_current == m_end; }


// Returns the 8-bit flag captured when this buffer was constructed: the
// token's isAll8BitData() for a token source, or "non-empty and is8Bit()" for
// a String source. takeRemaining() consults it to decide whether to build the
// result with String::make8BitFrom16BitSource().
bool isAll8BitData() const { return m_isAll8BitData; }

void skipAtMostOneLeadingNewline() void skipAtMostOneLeadingNewline()
{ {
ASSERT(!isEmpty()); ASSERT(!isEmpty());
Expand Down Expand Up @@ -294,7 +298,12 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {
ASSERT(!isEmpty()); ASSERT(!isEmpty());
const UChar* start = m_current; const UChar* start = m_current;
m_current = m_end; m_current = m_end;
return String(start, m_current - start); size_t length = m_current - start;

if (isAll8BitData())
return String::make8BitFrom16BitSource(start, length);

return String(start, length);
} }


void giveRemainingTo(StringBuilder& recipient) void giveRemainingTo(StringBuilder& recipient)
Expand Down Expand Up @@ -344,6 +353,7 @@ class HTMLTreeBuilder::ExternalCharacterTokenBuffer {


const UChar* m_current; const UChar* m_current;
const UChar* m_end; const UChar* m_end;
bool m_isAll8BitData;
}; };




Expand Down
31 changes: 31 additions & 0 deletions Source/WebCore/xml/parser/MarkupTokenBase.h
Expand Up @@ -97,6 +97,8 @@ class MarkupTokenBase {
m_range.m_end = 0; m_range.m_end = 0;
m_baseOffset = 0; m_baseOffset = 0;
m_data.clear(); m_data.clear();
m_orAllData = 0;
m_convertTo8BitIfPossible = false;
} }


bool isUninitialized() { return m_type == TypeSet::Uninitialized; } bool isUninitialized() { return m_type == TypeSet::Uninitialized; }
Expand Down Expand Up @@ -172,6 +174,13 @@ class MarkupTokenBase {
m_data.append(character); m_data.append(character);
} }


// Appends one 16-bit character to a Character token's data and folds it into
// m_orAllData, the running bitwise OR that isAll8BitData() later compares
// against 0xff.
void appendToCharacter(UChar character)
{
    ASSERT(m_type == TypeSet::Character);
    m_orAllData |= character;
    m_data.append(character);
}

template<typename T> template<typename T>
void appendToCharacter(T characters) void appendToCharacter(T characters)
{ {
Expand Down Expand Up @@ -274,6 +283,7 @@ class MarkupTokenBase {
{ {
ASSERT(m_type == TypeSet::Character); ASSERT(m_type == TypeSet::Character);
m_data.clear(); m_data.clear();
m_orAllData = 0;
} }


void eraseValueOfAttribute(size_t i) void eraseValueOfAttribute(size_t i)
Expand All @@ -294,6 +304,16 @@ class MarkupTokenBase {
return m_data; return m_data;
} }


// Opts this token in to 8-bit conversion: isAll8BitData() only ever returns
// true once this flag has been set. The flag is reset to false by clear().
void setConvertTo8BitIfPossible()
{
m_convertTo8BitIfPossible = true;
}

// True only when 8-bit conversion was requested via
// setConvertTo8BitIfPossible() and the accumulated OR of the characters
// (m_orAllData) has no bits set above the low byte.
bool isAll8BitData() const
{
    if (!m_convertTo8BitIfPossible)
        return false;

    return m_orAllData <= 0xff;
}

// FIXME: Distinguish between a missing public identifer and an empty one. // FIXME: Distinguish between a missing public identifer and an empty one.
const WTF::Vector<UChar>& publicIdentifier() const const WTF::Vector<UChar>& publicIdentifier() const
{ {
Expand Down Expand Up @@ -370,6 +390,8 @@ class MarkupTokenBase {
typename Attribute::Range m_range; // Always starts at zero. typename Attribute::Range m_range; // Always starts at zero.
int m_baseOffset; int m_baseOffset;
DataVector m_data; DataVector m_data;
UChar m_orAllData;
bool m_convertTo8BitIfPossible;


// For DOCTYPE // For DOCTYPE
OwnPtr<DoctypeData> m_doctypeData; OwnPtr<DoctypeData> m_doctypeData;
Expand Down Expand Up @@ -413,6 +435,7 @@ class AtomicMarkupTokenBase {
break; break;
case Token::Type::Character: case Token::Type::Character:
m_externalCharacters = &token->characters(); m_externalCharacters = &token->characters();
m_isAll8BitData = token->isAll8BitData();
break; break;
default: default:
break; break;
Expand All @@ -423,6 +446,7 @@ class AtomicMarkupTokenBase {
: m_type(type) : m_type(type)
, m_name(name) , m_name(name)
, m_externalCharacters(0) , m_externalCharacters(0)
, m_isAll8BitData(false)
, m_attributes(attributes) , m_attributes(attributes)
{ {
ASSERT(usesName()); ASSERT(usesName());
Expand Down Expand Up @@ -472,6 +496,11 @@ class AtomicMarkupTokenBase {
return *m_externalCharacters; return *m_externalCharacters;
} }


// Returns the 8-bit flag recorded for this token's external characters. It is
// copied from the source token's isAll8BitData() when a Character token is
// wrapped, initialized to false by the name/attributes constructor, and reset
// to false by clearExternalCharacters().
bool isAll8BitData() const
{
return m_isAll8BitData;
}

const String& comment() const const String& comment() const
{ {
ASSERT(m_type == Token::Type::Comment); ASSERT(m_type == Token::Type::Comment);
Expand All @@ -495,6 +524,7 @@ class AtomicMarkupTokenBase {
void clearExternalCharacters() void clearExternalCharacters()
{ {
m_externalCharacters = 0; m_externalCharacters = 0;
m_isAll8BitData = false;
} }


protected: protected:
Expand Down Expand Up @@ -522,6 +552,7 @@ class AtomicMarkupTokenBase {
// FIXME: Add a mechanism for "internalizing" the characters when the // FIXME: Add a mechanism for "internalizing" the characters when the
// HTMLToken is destructed. // HTMLToken is destructed.
const typename Token::DataVector* m_externalCharacters; const typename Token::DataVector* m_externalCharacters;
bool m_isAll8BitData;


// For DOCTYPE // For DOCTYPE
OwnPtr<typename Token::DoctypeData> m_doctypeData; OwnPtr<typename Token::DoctypeData> m_doctypeData;
Expand Down

0 comments on commit 3e121cf

Please sign in to comment.