diff --git a/third_party/WebKit/Source/core/frame/MHTMLTest.cpp b/third_party/WebKit/Source/core/frame/MHTMLTest.cpp
index cb1fa7011f768..37f992a46e98d 100644
--- a/third_party/WebKit/Source/core/frame/MHTMLTest.cpp
+++ b/third_party/WebKit/Source/core/frame/MHTMLTest.cpp
@@ -157,13 +157,32 @@ class MHTMLTest : public ::testing::Test {
     LineReader line_reader(
         std::string(mhtml_data->data(), mhtml_data->length()));
     std::string line;
-    while (line_reader.GetNextLine(&line) && line.length()) {
+    line_reader.GetNextLine(&line);
+    while (line.length()) {
+      // Peek at the following lines: a line that starts with a space or a tab
+      // is a folded continuation of the current header, so append it (minus
+      // the leading whitespace character) to the current line.
+      std::string next_line;
+      while (true) {
+        line_reader.GetNextLine(&next_line);
+        if (next_line.length() > 1 &&
+            (next_line[0] == ' ' || next_line[0] == '\t')) {
+          line.append(next_line, 1, std::string::npos);
+          continue;
+        }
+        break;
+      }
+
       std::string::size_type pos = line.find(':');
-      if (pos == std::string::npos)
-        continue;
-      std::string key = line.substr(0, pos);
-      std::string value = line.substr(pos + 2);
-      mhtml_headers.emplace(key, value);
+      // Lines without a colon are skipped. Do not 'continue' here: that would
+      // bypass the advance to |next_line| below and spin forever.
+      if (pos != std::string::npos) {
+        std::string key = line.substr(0, pos);
+        std::string value = line.substr(pos + 2);
+        mhtml_headers.emplace(key, value);
+      }
+
+      line = next_line;
     }
     return mhtml_headers;
   }
@@ -251,14 +270,16 @@ TEST_F(MHTMLTest, TestMHTMLHeadersWithTitleContainingAllPrintableCharacters) {
 
   EXPECT_EQ("", mhtml_headers["From"]);
   EXPECT_FALSE(mhtml_headers["Date"].empty());
-  EXPECT_EQ("multipart/related;", mhtml_headers["Content-Type"]);
+  EXPECT_EQ(
+      "multipart/related;type=\"text/html\";boundary=\"boundary-example\"",
+      mhtml_headers["Content-Type"]);
   EXPECT_EQ("abc", mhtml_headers["Subject"]);
   EXPECT_EQ(kURL, mhtml_headers["Snapshot-Content-Location"]);
 }
 
 TEST_F(MHTMLTest, TestMHTMLHeadersWithTitleContainingNonPrintableCharacters) {
   const char kURL[] = "http://www.example.com/";
-  const char kTitle[] = u8"abc=\u261D\U0001F3FB";
+  const char kTitle[] = "abc \t=\xe2\x98\x9d\xf0\x9f\x8f\xbb";
   AddTestResources();
   scoped_refptr data =
       Serialize(ToKURL(kURL), String::FromUTF8(kTitle), "text/html",
@@ -268,12 +289,43 @@ TEST_F(MHTMLTest, TestMHTMLHeadersWithTitleContainingNonPrintableCharacters) {
 
   EXPECT_EQ("", mhtml_headers["From"]);
   EXPECT_FALSE(mhtml_headers["Date"].empty());
-  EXPECT_EQ("multipart/related;", mhtml_headers["Content-Type"]);
-  EXPECT_EQ("=?utf-8?Q?abc=3D=E2=98=9D=F0=9F=8F=BB?=",
+  EXPECT_EQ(
+      "multipart/related;type=\"text/html\";boundary=\"boundary-example\"",
+      mhtml_headers["Content-Type"]);
+  EXPECT_EQ("=?utf-8?Q?abc=20=09=3D=E2=98=9D=F0=9F=8F=BB?=",
             mhtml_headers["Subject"]);
   EXPECT_EQ(kURL, mhtml_headers["Snapshot-Content-Location"]);
 }
 
+TEST_F(MHTMLTest,
+       TestMHTMLHeadersWithLongTitleContainingNonPrintableCharacters) {
+  const char kURL[] = "http://www.example.com/";
+  const char kTitle[] =
+      "01234567890123456789012345678901234567890123456789"
+      "01234567890123456789012345678901234567890123456789"
+      " \t=\xe2\x98\x9d\xf0\x9f\x8f\xbb";
+  AddTestResources();
+  scoped_refptr data =
+      Serialize(ToKURL(kURL), String::FromUTF8(kTitle), "text/html",
+                MHTMLArchive::kUseDefaultEncoding);
+
+  std::map mhtml_headers = ExtractMHTMLHeaders(data);
+
+  EXPECT_EQ("", mhtml_headers["From"]);
+  EXPECT_FALSE(mhtml_headers["Date"].empty());
+  EXPECT_EQ(
+      "multipart/related;type=\"text/html\";boundary=\"boundary-example\"",
+      mhtml_headers["Content-Type"]);
+  EXPECT_EQ(
+      "=?utf-8?Q?012345678901234567890123456789"
+      "012345678901234567890123456789012?="
+      "=?utf-8?Q?345678901234567890123456789"
+      "0123456789=20=09=3D=E2=98=9D=F0=9F?="
+      "=?utf-8?Q?=8F=BB?=",
+      mhtml_headers["Subject"]);
+  EXPECT_EQ(kURL, mhtml_headers["Snapshot-Content-Location"]);
+}
+
 TEST_F(MHTMLTest, TestMHTMLEncoding) {
   const char kURL[] = "http://www.example.com";
   AddTestResources();
@@ -466,4 +518,24 @@ TEST_F(MHTMLTest, FormControlElements) {
   EXPECT_FALSE(document->getElementById("fm")->IsDisabledFormControl());
 }
 
+TEST_F(MHTMLTest, LoadMHTMLContainingSoftLineBreaks) {
+  const char kURL[] = "http://www.example.com";
+
+  // Register the mocked frame and load it.
+  RegisterMockedURLLoad(kURL, "soft_line_break.mht");
+  LoadURLInTopFrame(ToKURL(kURL));
+  ASSERT_TRUE(GetPage());
+  LocalFrame* frame = ToLocalFrame(GetPage()->MainFrame());
+  ASSERT_TRUE(frame);
+  // Header lines separated by soft line breaks should be concatenated without
+  // problems.
+  Document* document = frame->GetDocument();
+  ASSERT_TRUE(document);
+
+  // Body lines separated by soft line breaks should be concatenated without
+  // problems.
+  EXPECT_TRUE(document->getElementById(
+      "AVeryLongID012345678901234567890123456789012345678901234567890End"));
+}
+
 }  // namespace blink
diff --git a/third_party/WebKit/Source/core/testing/data/mhtml/soft_line_break.mht b/third_party/WebKit/Source/core/testing/data/mhtml/soft_line_break.mht
new file mode 100644
index 0000000000000..fdd9c1d3f75ac
--- /dev/null
+++ b/third_party/WebKit/Source/core/testing/data/mhtml/soft_line_break.mht
@@ -0,0 +1,27 @@
+From: 
+Subject: =?utf-8?Q?012345678901234567890123456789012345678901234567890
+ 1234567890123456789012345678901234567890123456789
+ =3D=E2=98=9D=F0=9F=8F=BB?=
+Date: Thu, 4 Oct 2017 21:18:18 -0000
+MIME-Version: 1.0
+Content-Type: multipart/related;
+ type="text/html";
+ boundary="----MultipartBoundary--e77OylKXx1PBMEF67x53AwnQLf4DUmwdt037X9MjPK----"
+
+------MultipartBoundary--e77OylKXx1PBMEF67x53AwnQLf4DUmwdt037X9MjPK----
+Content-Type: text/html
+Content-ID: 
+Content-Transfer-Encoding: quoted-printable
+Content-Location: http://localhost/soft_line_break.html
+
+
+
+
+
+
+
+
+ ------MultipartBoundary--e77OylKXx1PBMEF67x53AwnQLf4DUmwdt037X9MjPK------
diff --git a/third_party/WebKit/Source/platform/mhtml/MHTMLArchive.cpp b/third_party/WebKit/Source/platform/mhtml/MHTMLArchive.cpp
index f4852126792c4..16a78823e9983 100644
--- a/third_party/WebKit/Source/platform/mhtml/MHTMLArchive.cpp
+++ b/third_party/WebKit/Source/platform/mhtml/MHTMLArchive.cpp
@@ -48,9 +48,78 @@
 
 namespace blink {
 
-const char* const kQuotedPrintable = "quoted-printable";
-const char* const kBase64 = "base64";
-const char* const kBinary = "binary";
+namespace {
+
+const size_t kMaximumLineLength = 76;
+const char kCrlfLineEnding[] = "\r\n";
+
+const char kRFC2047EncodingPrefix[] = "=?utf-8?Q?";
+const size_t kRFC2047EncodingPrefixLength = 10;  // Length of the prefix.
+const char kRFC2047EncodingSuffix[] = "?=";
+const size_t kRFC2047EncodingSuffixLength = 2;  // Length of the suffix.
+
+const char kQuotedPrintable[] = "quoted-printable";
+const char kBase64[] = "base64";
+const char kBinary[] = "binary";
+
+}  // namespace
+
+// Controls quoted-printable encoding of characters in a body, per RFC 2045.
+class QuotedPrintableEncodeBodyDelegate : public QuotedPrintableEncodeDelegate {
+ public:
+  QuotedPrintableEncodeBodyDelegate() = default;
+  ~QuotedPrintableEncodeBodyDelegate() override = default;
+
+  size_t GetMaxLineLengthForEncodedContent() const override {
+    return kMaximumLineLength;
+  }
+
+  bool ShouldEncodeWhiteSpaceCharacters(bool end_of_line) const override {
+    // Spaces and tabs are encoded only when they end a body line.
+    return end_of_line;
+  }
+
+  void DidStartLine(Vector& out) override {
+    // Body lines carry no prefix, so there is nothing to add.
+  }
+
+  void DidFinishLine(bool last_line, Vector& out) override {
+    if (!last_line) {
+      out.push_back('=');  // Soft line break: "=" followed by CRLF.
+      out.Append(kCrlfLineEnding, strlen(kCrlfLineEnding));
+    }
+  }
+};
+
+// Controls quoted-printable encoding of characters in headers, per RFC 2047.
+class QuotedPrintableEncodeHeaderDelegate
+    : public QuotedPrintableEncodeDelegate {
+ public:
+  QuotedPrintableEncodeHeaderDelegate() = default;
+  ~QuotedPrintableEncodeHeaderDelegate() override = default;
+
+  size_t GetMaxLineLengthForEncodedContent() const override {
+    return kMaximumLineLength - kRFC2047EncodingPrefixLength -
+           kRFC2047EncodingSuffixLength;
+  }
+
+  bool ShouldEncodeWhiteSpaceCharacters(bool end_of_line) const override {
+    // Spaces and tabs are always encoded, wherever they appear in a header.
+    return true;
+  }
+
+  void DidStartLine(Vector& out) override {
+    out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
+  }
+
+  void DidFinishLine(bool last_line, Vector& out) override {
+    out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
+    if (!last_line) {
+      out.Append(kCrlfLineEnding, strlen(kCrlfLineEnding));
+      out.push_back(' ');  // The continuation line starts with a space.
+    }
+  }
+};
 
 static String ConvertToPrintableCharacters(const String& text) {
   // If the text contains all printable ASCII characters, no need for encoding.
@@ -70,9 +139,11 @@ static String ConvertToPrintableCharacters(const String& text) {
   // where, "utf-8" is the chosen charset to represent the text and "Q" is the
   // Quoted-Printable format to convert to 7-bit printable ASCII characters.
   CString utf8_text = text.Utf8();
+  QuotedPrintableEncodeHeaderDelegate header_delegate;
   Vector encoded_text;
-  QuotedPrintableEncode(utf8_text.data(), utf8_text.length(), encoded_text);
-  return "=?utf-8?Q?" + String(encoded_text.data(), encoded_text.size()) + "?=";
+  QuotedPrintableEncode(utf8_text.data(), utf8_text.length(), &header_delegate,
+                        encoded_text);
+  return String(encoded_text.data(), encoded_text.size());
 }
 
 MHTMLArchive::MHTMLArchive() = default;
@@ -245,7 +316,8 @@ void MHTMLArchive::GenerateMHTMLPart(const String& boundary,
     size_t data_length = flat_data.size();
     Vector encoded_data;
     if (!strcmp(content_encoding, kQuotedPrintable)) {
-      QuotedPrintableEncode(data, data_length, encoded_data);
+      QuotedPrintableEncodeBodyDelegate body_delegate;
+      QuotedPrintableEncode(data, data_length, &body_delegate, encoded_data);
       output_buffer.Append(encoded_data.data(), encoded_data.size());
       output_buffer.Append("\r\n", 2u);
     } else {
@@ -253,7 +325,6 @@ void MHTMLArchive::GenerateMHTMLPart(const String& boundary,
       // We are not specifying insertLFs = true below as it would cut the lines
       // with LFs and MHTML requires CRLFs.
       Base64Encode(data, data_length, encoded_data);
-      const size_t kMaximumLineLength = 76;
       size_t index = 0;
       size_t encoded_data_length = encoded_data.size();
       do {
diff --git a/third_party/WebKit/Source/platform/text/QuotedPrintable.cpp b/third_party/WebKit/Source/platform/text/QuotedPrintable.cpp
index e0fe4664df572..9f80140a5afc3 100644
--- a/third_party/WebKit/Source/platform/text/QuotedPrintable.cpp
+++ b/third_party/WebKit/Source/platform/text/QuotedPrintable.cpp
@@ -34,8 +34,6 @@
 
 namespace blink {
 
-static const size_t kMaximumLineLength = 76;
-
 static const char kCrlfLineEnding[] = "\r\n";
 
 static size_t LengthOfLineEndingAtIndex(const char* input,
@@ -54,15 +52,13 @@ static size_t LengthOfLineEndingAtIndex(const char* input,
   return 0;
 }
 
-void QuotedPrintableEncode(const Vector& in, Vector& out) {
-  QuotedPrintableEncode(in.data(), in.size(), out);
-}
-
 void QuotedPrintableEncode(const char* input,
                            size_t input_length,
+                           QuotedPrintableEncodeDelegate* delegate,
                            Vector& out) {
   out.clear();
   out.ReserveCapacity(input_length);
+  delegate->DidStartLine(out);
   size_t current_line_length = 0;
   for (size_t i = 0; i < input_length; ++i) {
     bool is_last_character = (i == input_length - 1);
@@ -74,13 +70,14 @@ void QuotedPrintableEncode(const char* input,
         current_character != '\t')
       requires_encoding = true;
 
-    // Space and tab characters have to be encoded if they appear at the end of
-    // a line.
+    // Let the delegate decide whether this space or tab must be encoded.
     if (!requires_encoding &&
-        (current_character == '\t' || current_character == ' ') &&
-        (is_last_character ||
-         LengthOfLineEndingAtIndex(input, input_length, i + 1)))
-      requires_encoding = true;
+        (current_character == '\t' || current_character == ' ')) {
+      bool end_of_line = is_last_character ||
+                         LengthOfLineEndingAtIndex(input, input_length, i + 1);
+      requires_encoding =
+          delegate->ShouldEncodeWhiteSpaceCharacters(end_of_line);
+    }
 
     // End of line should be converted to CR-LF sequences.
     if (!is_last_character) {
@@ -103,10 +100,10 @@ void QuotedPrintableEncode(const char* input,
 
     // Insert a soft line break if necessary.
     if (current_line_length + length_of_encoded_character >
-        kMaximumLineLength) {
-      out.push_back('=');
-      out.Append(kCrlfLineEnding, strlen(kCrlfLineEnding));
+        delegate->GetMaxLineLengthForEncodedContent()) {
+      delegate->DidFinishLine(false /*last_line*/, out);
       current_line_length = 0;
+      delegate->DidStartLine(out);
     }
 
     // Finally, insert the actual character(s).
@@ -120,6 +117,7 @@ void QuotedPrintableEncode(const char* input,
       current_line_length++;
     }
   }
+  delegate->DidFinishLine(true /*last_line*/, out);
 }
 
 void QuotedPrintableDecode(const Vector& in, Vector& out) {
diff --git a/third_party/WebKit/Source/platform/text/QuotedPrintable.h b/third_party/WebKit/Source/platform/text/QuotedPrintable.h
index c37293e124df5..20369704bc205 100644
--- a/third_party/WebKit/Source/platform/text/QuotedPrintable.h
+++ b/third_party/WebKit/Source/platform/text/QuotedPrintable.h
@@ -36,10 +36,38 @@
 
 namespace blink {
 
-PLATFORM_EXPORT void QuotedPrintableEncode(const Vector&, Vector&);
-PLATFORM_EXPORT void QuotedPrintableEncode(const char*, size_t, Vector&);
+// Delegate for controlling the behavior of quoted-printable encoding. The
+// original characters may be encoded a bit differently depending on where
+// they live, header or body. For example, "=CRLF" should be used to break a
+// long line in the body while "CRLF+SPACE" should be used to break a long
+// line in a header.
+class PLATFORM_EXPORT QuotedPrintableEncodeDelegate {
+ public:
+  QuotedPrintableEncodeDelegate() = default;
+  virtual ~QuotedPrintableEncodeDelegate() = default;
+
+  // Returns the maximum number of characters allowed for an encoded line,
+  // excluding prefix and soft line break.
+  virtual size_t GetMaxLineLengthForEncodedContent() const = 0;
+
+  // Returns true if space and tab characters need to be encoded.
+  virtual bool ShouldEncodeWhiteSpaceCharacters(bool end_of_line) const = 0;
+
+  // Called when an encoded line starts. The delegate can take this chance to
+  // add any prefix to |out|.
+  virtual void DidStartLine(Vector& out) = 0;
+
+  // Called when an encoded line ends. The delegate can take this chance to add
+  // any suffix to |out|. If it is not the last line, a soft line break should
+  // also be added after the suffix.
+  virtual void DidFinishLine(bool last_line, Vector& out) = 0;
+};
+
+PLATFORM_EXPORT void QuotedPrintableEncode(const char*,
+                                           size_t,
+                                           QuotedPrintableEncodeDelegate*,
+                                           Vector&);
 
-PLATFORM_EXPORT void QuotedPrintableDecode(const Vector&, Vector&);
 PLATFORM_EXPORT void QuotedPrintableDecode(const char*, size_t, Vector&);
 
 }  // namespace blink