Skip to content

Commit

Permalink
Fixed encoding guessing: only search for meta tags in HTML bodies (#4566)
Browse files Browse the repository at this point in the history
  • Loading branch information
Prinzhorn committed Mar 17, 2022
1 parent a9283be commit e8ae38c
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
* Fix processing of `--set` options (#5067, @marwinxxii)
* Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils)
* Exit early if there are errors on startup (#4544, @mhils)
* Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn)

## 28 September 2021: mitmproxy 7.0.4

Expand Down
7 changes: 4 additions & 3 deletions mitmproxy/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,10 @@ def _guess_encoding(self, content: bytes = b"") -> str:
if "json" in self.headers.get("content-type", ""):
enc = "utf8"
if not enc:
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if "html" in self.headers.get("content-type", ""):
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc:
if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing.
Expand Down
1 change: 1 addition & 0 deletions test/mitmproxy/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,7 @@ def test_guess_json(self):
def test_guess_meta_charset(self):
# Exercises charset guessing from an HTML <meta> tag: r.text is expected to
# contain the trailing body bytes decoded as a CJK string.
# Body: a <meta http-equiv="content-type"> tag declaring charset=gb2312,
# followed by six raw bytes.
r = tresp(content=b'<meta http-equiv="content-type" '
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf')
# Mark the response as HTML: the <meta> charset search in _guess_encoding is
# only performed when the content-type header contains "html".
r.headers["content-type"] = "text/html"
# "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
# NOTE(review): the tag declares gb2312 while this comment refers to gb18030 —
# presumably gb2312 is decoded as gb18030 somewhere outside this view; confirm.
assert "鏄庝集" in r.text

Expand Down

0 comments on commit e8ae38c

Please sign in to comment.