diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fd45dd331..dce13fadbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ * Fix processing of `--set` options (#5067, @marwinxxii) * Lowercase user-added header names and emit a log message to notify the user when using HTTP/2 (#4746, @mhils) * Exit early if there are errors on startup (#4544, @mhils) +* Fixed encoding guessing: only search for meta tags in HTML bodies (#4566, @Prinzhorn) ## 28 September 2021: mitmproxy 7.0.4 diff --git a/mitmproxy/http.py b/mitmproxy/http.py index 11af7b3e37..743c46c99b 100644 --- a/mitmproxy/http.py +++ b/mitmproxy/http.py @@ -414,9 +414,10 @@ def _guess_encoding(self, content: bytes = b"") -> str: if "json" in self.headers.get("content-type", ""): enc = "utf8" if not enc: - meta_charset = re.search(rb"""]+charset=['"]?([^'">]+)""", content, re.IGNORECASE) - if meta_charset: - enc = meta_charset.group(1).decode("ascii", "ignore") + if "html" in self.headers.get("content-type", ""): + meta_charset = re.search(rb"""]+charset=['"]?([^'">]+)""", content, re.IGNORECASE) + if meta_charset: + enc = meta_charset.group(1).decode("ascii", "ignore") if not enc: if "text/css" in self.headers.get("content-type", ""): # @charset rule must be the very first thing. diff --git a/test/mitmproxy/test_http.py b/test/mitmproxy/test_http.py index ee8c9600d1..eb64821f3f 100644 --- a/test/mitmproxy/test_http.py +++ b/test/mitmproxy/test_http.py @@ -1098,6 +1098,7 @@ def test_guess_json(self): def test_guess_meta_charset(self): r = tresp(content=b'\xe6\x98\x8e\xe4\xbc\xaf') + r.headers["content-type"] = "text/html" # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030 assert "鏄庝集" in r.text