diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7db955d1f3..4aaab21f41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@
 
 ## Unreleased: mitmproxy next
 
+* Fix bug where response flows from HAR files had incorrect `content-length` headers
+  ([#6548](https://github.com/mitmproxy/mitmproxy/pull/6548), @zanieb)
 * Improved handling for `--allow-hosts`/`--ignore-hosts` options in WireGuard mode (#5930).
   ([#6513](https://github.com/mitmproxy/mitmproxy/pull/6513), @dsphper)
 * DNS resolution is now exempted from `--ignore-hosts` in WireGuard Mode.
diff --git a/mitmproxy/http.py b/mitmproxy/http.py
index 986e6b8983..bfffac567c 100644
--- a/mitmproxy/http.py
+++ b/mitmproxy/http.py
@@ -1,7 +1,6 @@
 import binascii
 import json
 import os
-import re
 import time
 import urllib.parse
 import warnings
@@ -27,6 +26,7 @@
 from mitmproxy.net.http import status_codes
 from mitmproxy.net.http import url
 from mitmproxy.net.http.headers import assemble_content_type
+from mitmproxy.net.http.headers import infer_content_encoding
 from mitmproxy.net.http.headers import parse_content_type
 from mitmproxy.utils import human
 from mitmproxy.utils import strutils
@@ -402,45 +402,11 @@ def get_content(self, strict: bool = True) -> bytes | None:
         else:
             return self.raw_content
 
-    def _get_content_type_charset(self) -> str | None:
-        ct = parse_content_type(self.headers.get("content-type", ""))
-        if ct:
-            return ct[2].get("charset")
-        return None
-
-    def _guess_encoding(self, content: bytes = b"") -> str:
-        enc = self._get_content_type_charset()
-        if not enc:
-            if "json" in self.headers.get("content-type", ""):
-                enc = "utf8"
-        if not enc:
-            if "html" in self.headers.get("content-type", ""):
-                meta_charset = re.search(
-                    rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
-                )
-                if meta_charset:
-                    enc = meta_charset.group(1).decode("ascii", "ignore")
-        if not enc:
-            if "text/css" in self.headers.get("content-type", ""):
-                # @charset rule must be the very first thing.
-                css_charset = re.match(
-                    rb"""@charset "([^"]+)";""", content, re.IGNORECASE
-                )
-                if css_charset:
-                    enc = css_charset.group(1).decode("ascii", "ignore")
-        if not enc:
-            enc = "latin-1"
-        # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
-        if enc.lower() in ("gb2312", "gbk"):
-            enc = "gb18030"
-
-        return enc
-
     def set_text(self, text: str | None) -> None:
         if text is None:
             self.content = None
             return
-        enc = self._guess_encoding()
+        enc = infer_content_encoding(self.headers.get("content-type", ""))
 
         try:
             self.content = cast(bytes, encoding.encode(text, enc))
@@ -464,7 +430,7 @@ def get_text(self, strict: bool = True) -> str | None:
         content = self.get_content(strict)
         if content is None:
             return None
-        enc = self._guess_encoding(content)
+        enc = infer_content_encoding(self.headers.get("content-type", ""), content)
         try:
             return cast(str, encoding.decode(content, enc))
         except ValueError:
diff --git a/mitmproxy/io/har.py b/mitmproxy/io/har.py
index cf2dc0ce3a..c3178214bc 100644
--- a/mitmproxy/io/har.py
+++ b/mitmproxy/io/har.py
@@ -7,6 +7,7 @@
 from mitmproxy import connection
 from mitmproxy import exceptions
 from mitmproxy import http
+from mitmproxy.net.http.headers import infer_content_encoding
 
 logger = logging.getLogger(__name__)
 
@@ -85,24 +86,52 @@ def request_to_flow(request_json: dict) -> http.HTTPFlow:
     # In Firefox HAR files images don't include response bodies
     response_content = request_json["response"]["content"].get("text", "")
     content_encoding = request_json["response"]["content"].get("encoding", None)
+    response_headers = fix_headers(request_json["response"]["headers"])
+
     if content_encoding == "base64":
         response_content = base64.b64decode(response_content)
-    response_headers = fix_headers(request_json["response"]["headers"])
+    elif isinstance(response_content, str):
+        # Convert text to bytes, as in `Response.set_text`
+        try:
+            response_content = http.encoding.encode(
+                response_content,
+                (
+                    content_encoding
+                    or infer_content_encoding(response_headers.get("content-type", ""))
+                ),
+            )
+        except ValueError:
+            # Fallback to UTF-8
+            response_content = response_content.encode(
+                "utf-8", errors="surrogateescape"
+            )
+
+    # Then encode the content, as in `Response.set_content`
+    response_content = http.encoding.encode(
+        response_content, response_headers.get("content-encoding") or "identity"
+    )
 
-    new_flow.response = http.Response.make(
-        response_code, response_content, response_headers
+    new_flow.response = http.Response(
+        b"HTTP/1.1",
+        response_code,
+        http.status_codes.RESPONSES.get(response_code, "").encode(),
+        response_headers,
+        response_content,
+        None,
+        timestamp_start,
+        timestamp_end,
     )
 
-    # Change time to match HAR file
+    # Update timestamps
+
     new_flow.request.timestamp_start = timestamp_start
     new_flow.request.timestamp_end = timestamp_end
 
-    new_flow.response.timestamp_start = timestamp_start
-    new_flow.response.timestamp_end = timestamp_end
-
     new_flow.client_conn.timestamp_start = timestamp_start
     new_flow.client_conn.timestamp_end = timestamp_end
 
+    # Update HTTP version
+
     match http_version_req:
         case "http/2.0":
             new_flow.request.http_version = "HTTP/2"
diff --git a/mitmproxy/net/http/headers.py b/mitmproxy/net/http/headers.py
index e87efc5032..7e14b2a77c 100644
--- a/mitmproxy/net/http/headers.py
+++ b/mitmproxy/net/http/headers.py
@@ -1,4 +1,5 @@
 import collections
+import re
 
 
 def parse_content_type(c: str) -> tuple[str, str, dict[str, str]] | None:
@@ -33,3 +34,39 @@ def assemble_content_type(type, subtype, parameters):
         return f"{type}/{subtype}"
     params = "; ".join(f"{k}={v}" for k, v in parameters.items())
     return f"{type}/{subtype}; {params}"
+
+
+def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
+    """
+    Infer the encoding of content from the content-type header.
+    """
+    # Use the charset from the header if possible
+    parsed_content_type = parse_content_type(content_type)
+    enc = parsed_content_type[2].get("charset") if parsed_content_type else None
+
+    # Otherwise, infer the encoding
+    if not enc and "json" in content_type:
+        enc = "utf8"
+
+    if not enc and "html" in content_type:
+        meta_charset = re.search(
+            rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
+        )
+        if meta_charset:
+            enc = meta_charset.group(1).decode("ascii", "ignore")
+
+    if not enc and "text/css" in content_type:
+        # @charset rule must be the very first thing.
+        css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
+        if css_charset:
+            enc = css_charset.group(1).decode("ascii", "ignore")
+
+    # Fallback to latin-1
+    if not enc:
+        enc = "latin-1"
+
+    # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+    if enc.lower() in ("gb2312", "gbk"):
+        enc = "gb18030"
+
+    return enc
diff --git a/test/mitmproxy/data/har_files/charles.json b/test/mitmproxy/data/har_files/charles.json
index d38a849a9a..c4e5de8226 100644
--- a/test/mitmproxy/data/har_files/charles.json
+++ b/test/mitmproxy/data/har_files/charles.json
@@ -81,7 +81,7 @@
            "headers": [
                [
                    "Content-Type",
-                    "text/html; charset=utf-8"
+                    "text/html"
                ],
                [
                    "Content-Length",
diff --git a/test/mitmproxy/data/har_files/chrome.json b/test/mitmproxy/data/har_files/chrome.json
index d7fdc4035d..7fbf26aa17 100644
--- a/test/mitmproxy/data/har_files/chrome.json
+++ b/test/mitmproxy/data/har_files/chrome.json
@@ -194,10 +194,6 @@
                [
                    "x-cache",
                    "Hit from cloudfront"
-                ],
-                [
-                    "content-length",
-                    "23866"
                ]
            ],
            "contentLength": 23866,
@@ -326,12 +322,7 @@
            "http_version": "HTTP/1.1",
            "status_code": 0,
            "reason": "",
-            "headers": [
-                [
-                    "content-length",
-                    "0"
-                ]
-            ],
+            "headers": [],
            "contentLength": 0,
            "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
            "timestamp_start": 1689251552.676,
diff --git a/test/mitmproxy/data/har_files/firefox.json b/test/mitmproxy/data/har_files/firefox.json
index 849cb79edc..fc7a4771b1 100644
--- a/test/mitmproxy/data/har_files/firefox.json
+++ b/test/mitmproxy/data/har_files/firefox.json
@@ -174,14 +174,6 @@
                [
                    "x-amz-cf-id",
                    "DPEkuUbeK1ZXMsRuUHqgk6iE4l7ShgyrJntkqIbLaSJ5646Ptc2Xew=="
-                ],
-                [
-                    "content-type",
-                    "text/plain; charset=utf-8"
-                ],
-                [
-                    "content-length",
-                    "23866"
                ]
            ],
            "contentLength": 23866,
@@ -282,12 +274,7 @@
            "http_version": "HTTP/1.1",
            "status_code": 200,
            "reason": "OK",
-            "headers": [
-                [
-                    "content-length",
-                    "0"
-                ]
-            ],
+            "headers": [],
            "contentLength": 0,
            "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
            "timestamp_start": 1680134339.418,
@@ -386,12 +373,7 @@
            "http_version": "HTTP/1.1",
            "status_code": 200,
            "reason": "OK",
-            "headers": [
-                [
-                    "content-length",
-                    "0"
-                ]
-            ],
+            "headers": [],
            "contentLength": 0,
            "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
            "timestamp_start": 1680134339.456,
@@ -490,12 +472,7 @@
            "http_version": "HTTP/1.1",
            "status_code": 200,
            "reason": "OK",
-            "headers": [
-                [
-                    "content-length",
-                    "0"
-                ]
-            ],
+            "headers": [],
            "contentLength": 0,
            "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
            "timestamp_start": 1680134339.457,
@@ -594,12 +571,7 @@
            "http_version": "HTTP/1.1",
            "status_code": 200,
            "reason": "OK",
-            "headers": [
-                [
-                    "content-length",
-                    "0"
-                ]
-            ],
+            "headers": [],
            "contentLength": 0,
            "contentHash":
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "timestamp_start": 1680134339.457, @@ -698,12 +670,7 @@ "http_version": "HTTP/1.1", "status_code": 200, "reason": "OK", - "headers": [ - [ - "content-length", - "0" - ] - ], + "headers": [], "contentLength": 0, "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "timestamp_start": 1680134339.458, @@ -878,10 +845,6 @@ [ "x-amz-cf-id", "EufbwzXQheESUTNil_OCjKK8cvVL51cQpYfmF7iZSHloCTvVhfZ8yQ==" - ], - [ - "content-length", - "3969" ] ], "contentLength": 3969, @@ -1058,10 +1021,6 @@ [ "x-amz-cf-id", "UuHHTyGhTqqTO07G_oZCw7rxjb9RJUGTN3OW0EUS77RH4GiQ-LkAvw==" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -1238,10 +1197,6 @@ [ "x-amz-cf-id", "TOLqHpQWMFHQDWnv2yHHFWI5xkA3R13TTJQDJe1ARViKrgihxZdhxA==" - ], - [ - "content-length", - "794" ] ], "contentLength": 794, @@ -1425,10 +1380,6 @@ [ "x-amz-cf-id", "nnIrWtgAMt42ua4HYBtNAao6m_iD9WjIzLAFyURb8mjOr5MriSQXRA==" - ], - [ - "content-length", - "9689" ] ], "contentLength": 9689, @@ -1612,10 +1563,6 @@ [ "x-amz-cf-id", "PMTLvP_yUCVocnhd1i1ir7_FRAJRw0ayMhK3KaZKELDO3pxxoqLWjg==" - ], - [ - "content-length", - "9689" ] ], "contentLength": 9689, @@ -1958,10 +1905,6 @@ [ "x-amz-cf-id", "0okGWJw6nYo7R-4egQWE-WfonThN2EXyRSLO9MlCNKyMfD-2v1AU0Q==" - ], - [ - "content-length", - "6986" ] ], "contentLength": 6986, diff --git a/test/mitmproxy/data/har_files/head-content-length.har b/test/mitmproxy/data/har_files/head-content-length.har new file mode 100644 index 0000000000..ec549cbe9d --- /dev/null +++ b/test/mitmproxy/data/har_files/head-content-length.har @@ -0,0 +1,179 @@ +{ + "log": { + "version": "1.2", + "creator": { + "name": "Firefox", + "version": "111.0.1" + }, + "browser": { + "name": "Firefox", + "version": "111.0.1" + }, + "pages": [ + { + "startedDateTime": "2023-03-29T16:58:59.303-07:00", + "id": "page_1", + "title": "mitmproxy - an interactive HTTPS proxy", + "pageTimings": { + "onContentLoad": 208, + "onLoad": 270 + } + } + ], + "entries": [ + { + "startedDateTime": "2023-12-12T16:23:16.067544+00:00", + "time": 101.56393051147461, + "request": { + "method": "HEAD", + "url": "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl", + "httpVersion": "HTTP/2.0", + "cookies": [], + "headers": [ + { + "name": "accept", + "value": "*/*" + }, + { + "name": "user-agent", + "value": "puffin" + }, + { + "name": "accept-encoding", + "value": "gzip, br" + } + ], + "queryString": [], + "headersSize": 91, + "bodySize": 0 + }, + "response": { + "status": 200, + "statusText": "", + "httpVersion": "HTTP/2.0", + "cookies": [], + "headers": [ + { + "name": "last-modified", + "value": "Thu, 10 Aug 2023 11:39:00 GMT" + }, + { + "name": "etag", + "value": "\"a296c6e224c118b0d08cd77e8c08f4b1\"" + }, + { + "name": "x-amz-request-id", + "value": "aeb4d3335548af85" + }, + { + "name": "x-amz-id-2", + "value": "aN65jxTFgNrlm8zEJMNdk7mYLYwUwTzh0" + }, + { + "name": "x-amz-version-id", + "value": "4_z179c51e67f11a0ad8f6c0018_f10789ff3151435c8_d20230810_m113900_c005_v0501001_t0045_u01691667540984" + }, + { + "name": "content-type", + "value": "application/octet-stream" + }, + { + "name": "cache-control", + "value": "max-age=365000000, immutable, public" + }, + { + "name": "accept-ranges", + "value": "bytes" + }, + { + "name": "date", + "value": "Tue, 12 Dec 2023 16:23:16 GMT" + }, + { + "name": "age", + "value": "2335275" + }, + { + "name": "x-served-by", + "value": 
"cache-iad-kcgs7200038-IAD, cache-lck10926-LCK" + }, + { + "name": "x-cache", + "value": "HIT, HIT" + }, + { + "name": "x-cache-hits", + "value": "21459, 139679" + }, + { + "name": "x-timer", + "value": "S1702398196.107663,VS0,VE0" + }, + { + "name": "strict-transport-security", + "value": "max-age=31536000; includeSubDomains; preload" + }, + { + "name": "x-frame-options", + "value": "deny" + }, + { + "name": "x-xss-protection", + "value": "1; mode=block" + }, + { + "name": "x-content-type-options", + "value": "nosniff" + }, + { + "name": "x-permitted-cross-domain-policies", + "value": "none" + }, + { + "name": "x-robots-header", + "value": "noindex" + }, + { + "name": "x-pypi-file-python-version", + "value": "py3" + }, + { + "name": "x-pypi-file-version", + "value": "4.66.1" + }, + { + "name": "x-pypi-file-package-type", + "value": "bdist_wheel" + }, + { + "name": "x-pypi-file-project", + "value": "tqdm" + }, + { + "name": "content-length", + "value": "78258" + } + ], + "content": { + "size": 0, + "compression": 0, + "mimeType": "application/octet-stream", + "text": "" + }, + "redirectURL": "", + "headersSize": 1188, + "bodySize": 0 + }, + "cache": {}, + "timings": { + "connect": 31.686782836914062, + "ssl": 35.33315658569336, + "send": 0.2961158752441406, + "receive": 0.6310939788818359, + "wait": 33.61678123474121 + }, + "serverIPAddress": "199.232.96.223" + } + ] + } +} \ No newline at end of file diff --git a/test/mitmproxy/data/har_files/head-content-length.json b/test/mitmproxy/data/har_files/head-content-length.json new file mode 100644 index 0000000000..b28343587f --- /dev/null +++ b/test/mitmproxy/data/har_files/head-content-length.json @@ -0,0 +1,193 @@ +[ + { + "id": "hardcoded_for_test", + "intercepted": false, + "is_replay": null, + "type": "http", + "modified": false, + "marked": "", + "comment": "", + "timestamp_created": 0, + "client_conn": { + "id": "hardcoded_for_test", + "peername": [ + "127.0.0.1", + 0 + ], + "sockname": [ + "127.0.0.1", + 0 + ], + "tls_established": false, + "cert": null, + "sni": null, + "cipher": null, + "alpn": null, + "tls_version": null, + "timestamp_start": 1702398196.067544, + "timestamp_tls_setup": null, + "timestamp_end": 1702398297.6314745 + }, + "server_conn": { + "id": "hardcoded_for_test", + "peername": null, + "sockname": null, + "address": [ + "199.232.96.223", + 443 + ], + "tls_established": false, + "cert": null, + "sni": null, + "cipher": null, + "alpn": null, + "tls_version": null, + "timestamp_start": null, + "timestamp_tcp_setup": null, + "timestamp_tls_setup": null, + "timestamp_end": null + }, + "request": { + "method": "HEAD", + "scheme": "https", + "host": "files.pythonhosted.org", + "port": 443, + "path": "/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl", + "http_version": "HTTP/1.1", + "headers": [ + [ + "accept", + "*/*" + ], + [ + "user-agent", + "puffin" + ], + [ + "accept-encoding", + "gzip, br" + ], + [ + "content-length", + "0" + ] + ], + "contentLength": 0, + "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "timestamp_start": 1702398196.067544, + "timestamp_end": 1702398297.6314745, + "pretty_host": "files.pythonhosted.org" + }, + "response": { + "http_version": "HTTP/1.1", + "status_code": 200, + "reason": "OK", + "headers": [ + [ + "last-modified", + "Thu, 10 Aug 2023 11:39:00 GMT" + ], + [ + "etag", + "\"a296c6e224c118b0d08cd77e8c08f4b1\"" + ], + [ + "x-amz-request-id", + "aeb4d3335548af85" + ], + [ + "x-amz-id-2", + 
"aN65jxTFgNrlm8zEJMNdk7mYLYwUwTzh0" + ], + [ + "x-amz-version-id", + "4_z179c51e67f11a0ad8f6c0018_f10789ff3151435c8_d20230810_m113900_c005_v0501001_t0045_u01691667540984" + ], + [ + "content-type", + "application/octet-stream" + ], + [ + "cache-control", + "max-age=365000000, immutable, public" + ], + [ + "accept-ranges", + "bytes" + ], + [ + "date", + "Tue, 12 Dec 2023 16:23:16 GMT" + ], + [ + "age", + "2335275" + ], + [ + "x-served-by", + "cache-iad-kcgs7200038-IAD, cache-lck10926-LCK" + ], + [ + "x-cache", + "HIT, HIT" + ], + [ + "x-cache-hits", + "21459, 139679" + ], + [ + "x-timer", + "S1702398196.107663,VS0,VE0" + ], + [ + "strict-transport-security", + "max-age=31536000; includeSubDomains; preload" + ], + [ + "x-frame-options", + "deny" + ], + [ + "x-xss-protection", + "1; mode=block" + ], + [ + "x-content-type-options", + "nosniff" + ], + [ + "x-permitted-cross-domain-policies", + "none" + ], + [ + "x-robots-header", + "noindex" + ], + [ + "x-pypi-file-python-version", + "py3" + ], + [ + "x-pypi-file-version", + "4.66.1" + ], + [ + "x-pypi-file-package-type", + "bdist_wheel" + ], + [ + "x-pypi-file-project", + "tqdm" + ], + [ + "content-length", + "78258" + ] + ], + "contentLength": 0, + "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "timestamp_start": 1702398196.067544, + "timestamp_end": 1702398297.6314745 + } + } +] \ No newline at end of file diff --git a/test/mitmproxy/data/har_files/safari.json b/test/mitmproxy/data/har_files/safari.json index f1a9b01898..299689158b 100644 --- a/test/mitmproxy/data/har_files/safari.json +++ b/test/mitmproxy/data/har_files/safari.json @@ -101,7 +101,7 @@ "headers": [ [ "Content-Type", - "text/html; charset=utf-8" + "text/html" ], [ "Last-Modified", @@ -150,10 +150,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "4946" ] ], "contentLength": 4946, @@ -282,10 +278,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "36819" ] ], "contentLength": 36819, @@ -926,10 +918,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "5167" ] ], "contentLength": 5167, @@ -1058,10 +1046,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -1190,10 +1174,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "794" ] ], "contentLength": 794, @@ -1334,10 +1314,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -1478,10 +1454,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -1622,10 +1594,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -1766,10 +1734,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3346" ] ], "contentLength": 3346, @@ -2282,10 +2246,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "3969" ] ], "contentLength": 3969, @@ -2596,10 +2556,6 @@ [ "x-cache", "Hit from cloudfront" - ], - [ - "content-length", - "1421" ] ], "contentLength": 1421, diff --git a/test/mitmproxy/net/http/test_headers.py b/test/mitmproxy/net/http/test_headers.py index 473b930f84..d7fb1999a3 100644 --- a/test/mitmproxy/net/http/test_headers.py +++ b/test/mitmproxy/net/http/test_headers.py @@ -1,6 +1,9 @@ import collections +import pytest + from mitmproxy.net.http.headers import assemble_content_type +from mitmproxy.net.http.headers import infer_content_encoding from mitmproxy.net.http.headers 
import parse_content_type
@@ -25,3 +28,34 @@ def test_assemble_content_type():
         )
         == "text/html; charset=utf8; foo=bar"
     )
+
+
+@pytest.mark.parametrize(
+    "content_type,content,expected",
+    [
+        ("", b"", "latin-1"),
+        ("", b"foo", "latin-1"),
+        ("", b"\xfc", "latin-1"),
+        ("", b"\xF0\xE2", "latin-1"),
+        ("text/html; charset=latin1", b"\xc3\xbc", "latin1"),
+        ("text/html; charset=utf8", b"\xc3\xbc", "utf8"),
+        # json
+        ("application/json", b'"\xc3\xbc"', "utf8"),
+        # meta charset
+        (
+            "text/html",
+            b'<meta http-equiv="content-type" content="text/html; charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
+            "gb18030",
+        ),
+        # css charset
+        (
+            "text/css",
+            b'@charset "gb2312";'
+            b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}',
+            "gb18030",
+        ),
+    ],
+)
+def test_infer_content_encoding(content_type, content, expected):
+    # Additional test coverage in `test_http::TestMessageText`
+    assert infer_content_encoding(content_type, content) == expected
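
Not part of the diff above: a minimal sketch of how the new `infer_content_encoding` helper behaves, assuming a mitmproxy checkout that already includes this change. The header values and byte strings below are invented for illustration only.

```python
from mitmproxy.net.http.headers import infer_content_encoding

# An explicit charset parameter on the content-type header always wins.
assert infer_content_encoding("text/html; charset=utf8") == "utf8"

# JSON defaults to UTF-8 even without a charset parameter.
assert infer_content_encoding("application/json") == "utf8"

# HTML falls back to a <meta charset=...> declaration in the body,
# and GB2312/GBK are widened to GB 18030.
html = b'<meta charset="gb2312"><p>\xe6\x98\x8e\xe4\xbc\xaf</p>'
assert infer_content_encoding("text/html", html) == "gb18030"

# Anything else falls back to latin-1.
assert infer_content_encoding("application/octet-stream", b"\x00\xff") == "latin-1"
```

The HAR importer now runs text bodies through this same fallback chain before re-encoding them, which is what keeps the stored body and the `content-length` header consistent for flows loaded from HAR files.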