diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7db955d1f3..4aaab21f41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@
## Unreleased: mitmproxy next
+* Fix bug where response flows from HAR files had incorrect `content-length` headers
+ ([#6548](https://github.com/mitmproxy/mitmproxy/pull/6548), @zanieb)
* Improved handling for `--allow-hosts`/`--ignore-hosts` options in WireGuard mode (#5930).
([#6513](https://github.com/mitmproxy/mitmproxy/pull/6513), @dsphper)
* DNS resolution is now exempted from `--ignore-hosts` in WireGuard Mode.
diff --git a/mitmproxy/http.py b/mitmproxy/http.py
index 986e6b8983..bfffac567c 100644
--- a/mitmproxy/http.py
+++ b/mitmproxy/http.py
@@ -1,7 +1,6 @@
import binascii
import json
import os
-import re
import time
import urllib.parse
import warnings
@@ -27,6 +26,7 @@
from mitmproxy.net.http import status_codes
from mitmproxy.net.http import url
from mitmproxy.net.http.headers import assemble_content_type
+from mitmproxy.net.http.headers import infer_content_encoding
from mitmproxy.net.http.headers import parse_content_type
from mitmproxy.utils import human
from mitmproxy.utils import strutils
@@ -402,45 +402,11 @@ def get_content(self, strict: bool = True) -> bytes | None:
else:
return self.raw_content
- def _get_content_type_charset(self) -> str | None:
- ct = parse_content_type(self.headers.get("content-type", ""))
- if ct:
- return ct[2].get("charset")
- return None
-
- def _guess_encoding(self, content: bytes = b"") -> str:
- enc = self._get_content_type_charset()
- if not enc:
- if "json" in self.headers.get("content-type", ""):
- enc = "utf8"
- if not enc:
- if "html" in self.headers.get("content-type", ""):
- meta_charset = re.search(
-                    rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
- )
- if meta_charset:
- enc = meta_charset.group(1).decode("ascii", "ignore")
- if not enc:
- if "text/css" in self.headers.get("content-type", ""):
- # @charset rule must be the very first thing.
- css_charset = re.match(
- rb"""@charset "([^"]+)";""", content, re.IGNORECASE
- )
- if css_charset:
- enc = css_charset.group(1).decode("ascii", "ignore")
- if not enc:
- enc = "latin-1"
- # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
- if enc.lower() in ("gb2312", "gbk"):
- enc = "gb18030"
-
- return enc
-
def set_text(self, text: str | None) -> None:
if text is None:
self.content = None
return
- enc = self._guess_encoding()
+ enc = infer_content_encoding(self.headers.get("content-type", ""))
try:
self.content = cast(bytes, encoding.encode(text, enc))
@@ -464,7 +430,7 @@ def get_text(self, strict: bool = True) -> str | None:
content = self.get_content(strict)
if content is None:
return None
- enc = self._guess_encoding(content)
+ enc = infer_content_encoding(self.headers.get("content-type", ""), content)
try:
return cast(str, encoding.decode(content, enc))
except ValueError:
diff --git a/mitmproxy/io/har.py b/mitmproxy/io/har.py
index cf2dc0ce3a..c3178214bc 100644
--- a/mitmproxy/io/har.py
+++ b/mitmproxy/io/har.py
@@ -7,6 +7,7 @@
from mitmproxy import connection
from mitmproxy import exceptions
from mitmproxy import http
+from mitmproxy.net.http.headers import infer_content_encoding
logger = logging.getLogger(__name__)
@@ -85,24 +86,52 @@ def request_to_flow(request_json: dict) -> http.HTTPFlow:
# In Firefox HAR files images don't include response bodies
response_content = request_json["response"]["content"].get("text", "")
content_encoding = request_json["response"]["content"].get("encoding", None)
+ response_headers = fix_headers(request_json["response"]["headers"])
+
if content_encoding == "base64":
response_content = base64.b64decode(response_content)
- response_headers = fix_headers(request_json["response"]["headers"])
+ elif isinstance(response_content, str):
+ # Convert text to bytes, as in `Response.set_text`
+ try:
+ response_content = http.encoding.encode(
+ response_content,
+ (
+ content_encoding
+ or infer_content_encoding(response_headers.get("content-type", ""))
+ ),
+ )
+ except ValueError:
+ # Fallback to UTF-8
+ response_content = response_content.encode(
+ "utf-8", errors="surrogateescape"
+ )
+
+ # Then encode the content, as in `Response.set_content`
+ response_content = http.encoding.encode(
+ response_content, response_headers.get("content-encoding") or "identity"
+ )
- new_flow.response = http.Response.make(
- response_code, response_content, response_headers
+ new_flow.response = http.Response(
+ b"HTTP/1.1",
+ response_code,
+ http.status_codes.RESPONSES.get(response_code, "").encode(),
+ response_headers,
+ response_content,
+ None,
+ timestamp_start,
+ timestamp_end,
)
- # Change time to match HAR file
+ # Update timestamps
+
new_flow.request.timestamp_start = timestamp_start
new_flow.request.timestamp_end = timestamp_end
- new_flow.response.timestamp_start = timestamp_start
- new_flow.response.timestamp_end = timestamp_end
-
new_flow.client_conn.timestamp_start = timestamp_start
new_flow.client_conn.timestamp_end = timestamp_end
+ # Update HTTP version
+
match http_version_req:
case "http/2.0":
new_flow.request.http_version = "HTTP/2"
diff --git a/mitmproxy/net/http/headers.py b/mitmproxy/net/http/headers.py
index e87efc5032..7e14b2a77c 100644
--- a/mitmproxy/net/http/headers.py
+++ b/mitmproxy/net/http/headers.py
@@ -1,4 +1,5 @@
import collections
+import re
def parse_content_type(c: str) -> tuple[str, str, dict[str, str]] | None:
@@ -33,3 +34,39 @@ def assemble_content_type(type, subtype, parameters):
return f"{type}/{subtype}"
params = "; ".join(f"{k}={v}" for k, v in parameters.items())
return f"{type}/{subtype}; {params}"
+
+
+def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
+ """
+ Infer the encoding of content from the content-type header.
+ """
+ # Use the charset from the header if possible
+ parsed_content_type = parse_content_type(content_type)
+ enc = parsed_content_type[2].get("charset") if parsed_content_type else None
+
+ # Otherwise, infer the encoding
+ if not enc and "json" in content_type:
+ enc = "utf8"
+
+ if not enc and "html" in content_type:
+ meta_charset = re.search(
+            rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
+ )
+ if meta_charset:
+ enc = meta_charset.group(1).decode("ascii", "ignore")
+
+ if not enc and "text/css" in content_type:
+ # @charset rule must be the very first thing.
+ css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
+ if css_charset:
+ enc = css_charset.group(1).decode("ascii", "ignore")
+
+ # Fallback to latin-1
+ if not enc:
+ enc = "latin-1"
+
+ # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
+ if enc.lower() in ("gb2312", "gbk"):
+ enc = "gb18030"
+
+ return enc
diff --git a/test/mitmproxy/data/har_files/charles.json b/test/mitmproxy/data/har_files/charles.json
index d38a849a9a..c4e5de8226 100644
--- a/test/mitmproxy/data/har_files/charles.json
+++ b/test/mitmproxy/data/har_files/charles.json
@@ -81,7 +81,7 @@
"headers": [
[
"Content-Type",
- "text/html; charset=utf-8"
+ "text/html"
],
[
"Content-Length",
diff --git a/test/mitmproxy/data/har_files/chrome.json b/test/mitmproxy/data/har_files/chrome.json
index d7fdc4035d..7fbf26aa17 100644
--- a/test/mitmproxy/data/har_files/chrome.json
+++ b/test/mitmproxy/data/har_files/chrome.json
@@ -194,10 +194,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "23866"
]
],
"contentLength": 23866,
@@ -326,12 +322,7 @@
"http_version": "HTTP/1.1",
"status_code": 0,
"reason": "",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1689251552.676,
diff --git a/test/mitmproxy/data/har_files/firefox.json b/test/mitmproxy/data/har_files/firefox.json
index 849cb79edc..fc7a4771b1 100644
--- a/test/mitmproxy/data/har_files/firefox.json
+++ b/test/mitmproxy/data/har_files/firefox.json
@@ -174,14 +174,6 @@
[
"x-amz-cf-id",
"DPEkuUbeK1ZXMsRuUHqgk6iE4l7ShgyrJntkqIbLaSJ5646Ptc2Xew=="
- ],
- [
- "content-type",
- "text/plain; charset=utf-8"
- ],
- [
- "content-length",
- "23866"
]
],
"contentLength": 23866,
@@ -282,12 +274,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.418,
@@ -386,12 +373,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.456,
@@ -490,12 +472,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -594,12 +571,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -698,12 +670,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
- "headers": [
- [
- "content-length",
- "0"
- ]
- ],
+ "headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.458,
@@ -878,10 +845,6 @@
[
"x-amz-cf-id",
"EufbwzXQheESUTNil_OCjKK8cvVL51cQpYfmF7iZSHloCTvVhfZ8yQ=="
- ],
- [
- "content-length",
- "3969"
]
],
"contentLength": 3969,
@@ -1058,10 +1021,6 @@
[
"x-amz-cf-id",
"UuHHTyGhTqqTO07G_oZCw7rxjb9RJUGTN3OW0EUS77RH4GiQ-LkAvw=="
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -1238,10 +1197,6 @@
[
"x-amz-cf-id",
"TOLqHpQWMFHQDWnv2yHHFWI5xkA3R13TTJQDJe1ARViKrgihxZdhxA=="
- ],
- [
- "content-length",
- "794"
]
],
"contentLength": 794,
@@ -1425,10 +1380,6 @@
[
"x-amz-cf-id",
"nnIrWtgAMt42ua4HYBtNAao6m_iD9WjIzLAFyURb8mjOr5MriSQXRA=="
- ],
- [
- "content-length",
- "9689"
]
],
"contentLength": 9689,
@@ -1612,10 +1563,6 @@
[
"x-amz-cf-id",
"PMTLvP_yUCVocnhd1i1ir7_FRAJRw0ayMhK3KaZKELDO3pxxoqLWjg=="
- ],
- [
- "content-length",
- "9689"
]
],
"contentLength": 9689,
@@ -1958,10 +1905,6 @@
[
"x-amz-cf-id",
"0okGWJw6nYo7R-4egQWE-WfonThN2EXyRSLO9MlCNKyMfD-2v1AU0Q=="
- ],
- [
- "content-length",
- "6986"
]
],
"contentLength": 6986,
diff --git a/test/mitmproxy/data/har_files/head-content-length.har b/test/mitmproxy/data/har_files/head-content-length.har
new file mode 100644
index 0000000000..ec549cbe9d
--- /dev/null
+++ b/test/mitmproxy/data/har_files/head-content-length.har
@@ -0,0 +1,179 @@
+{
+ "log": {
+ "version": "1.2",
+ "creator": {
+ "name": "Firefox",
+ "version": "111.0.1"
+ },
+ "browser": {
+ "name": "Firefox",
+ "version": "111.0.1"
+ },
+ "pages": [
+ {
+ "startedDateTime": "2023-03-29T16:58:59.303-07:00",
+ "id": "page_1",
+ "title": "mitmproxy - an interactive HTTPS proxy",
+ "pageTimings": {
+ "onContentLoad": 208,
+ "onLoad": 270
+ }
+ }
+ ],
+ "entries": [
+ {
+ "startedDateTime": "2023-12-12T16:23:16.067544+00:00",
+ "time": 101.56393051147461,
+ "request": {
+ "method": "HEAD",
+ "url": "https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl",
+ "httpVersion": "HTTP/2.0",
+ "cookies": [],
+ "headers": [
+ {
+ "name": "accept",
+ "value": "*/*"
+ },
+ {
+ "name": "user-agent",
+ "value": "puffin"
+ },
+ {
+ "name": "accept-encoding",
+ "value": "gzip, br"
+ }
+ ],
+ "queryString": [],
+ "headersSize": 91,
+ "bodySize": 0
+ },
+ "response": {
+ "status": 200,
+ "statusText": "",
+ "httpVersion": "HTTP/2.0",
+ "cookies": [],
+ "headers": [
+ {
+ "name": "last-modified",
+ "value": "Thu, 10 Aug 2023 11:39:00 GMT"
+ },
+ {
+ "name": "etag",
+ "value": "\"a296c6e224c118b0d08cd77e8c08f4b1\""
+ },
+ {
+ "name": "x-amz-request-id",
+ "value": "aeb4d3335548af85"
+ },
+ {
+ "name": "x-amz-id-2",
+ "value": "aN65jxTFgNrlm8zEJMNdk7mYLYwUwTzh0"
+ },
+ {
+ "name": "x-amz-version-id",
+ "value": "4_z179c51e67f11a0ad8f6c0018_f10789ff3151435c8_d20230810_m113900_c005_v0501001_t0045_u01691667540984"
+ },
+ {
+ "name": "content-type",
+ "value": "application/octet-stream"
+ },
+ {
+ "name": "cache-control",
+ "value": "max-age=365000000, immutable, public"
+ },
+ {
+ "name": "accept-ranges",
+ "value": "bytes"
+ },
+ {
+ "name": "date",
+ "value": "Tue, 12 Dec 2023 16:23:16 GMT"
+ },
+ {
+ "name": "age",
+ "value": "2335275"
+ },
+ {
+ "name": "x-served-by",
+ "value": "cache-iad-kcgs7200038-IAD, cache-lck10926-LCK"
+ },
+ {
+ "name": "x-cache",
+ "value": "HIT, HIT"
+ },
+ {
+ "name": "x-cache-hits",
+ "value": "21459, 139679"
+ },
+ {
+ "name": "x-timer",
+ "value": "S1702398196.107663,VS0,VE0"
+ },
+ {
+ "name": "strict-transport-security",
+ "value": "max-age=31536000; includeSubDomains; preload"
+ },
+ {
+ "name": "x-frame-options",
+ "value": "deny"
+ },
+ {
+ "name": "x-xss-protection",
+ "value": "1; mode=block"
+ },
+ {
+ "name": "x-content-type-options",
+ "value": "nosniff"
+ },
+ {
+ "name": "x-permitted-cross-domain-policies",
+ "value": "none"
+ },
+ {
+ "name": "x-robots-header",
+ "value": "noindex"
+ },
+ {
+ "name": "x-pypi-file-python-version",
+ "value": "py3"
+ },
+ {
+ "name": "x-pypi-file-version",
+ "value": "4.66.1"
+ },
+ {
+ "name": "x-pypi-file-package-type",
+ "value": "bdist_wheel"
+ },
+ {
+ "name": "x-pypi-file-project",
+ "value": "tqdm"
+ },
+ {
+ "name": "content-length",
+ "value": "78258"
+ }
+ ],
+ "content": {
+ "size": 0,
+ "compression": 0,
+ "mimeType": "application/octet-stream",
+ "text": ""
+ },
+ "redirectURL": "",
+ "headersSize": 1188,
+ "bodySize": 0
+ },
+ "cache": {},
+ "timings": {
+ "connect": 31.686782836914062,
+ "ssl": 35.33315658569336,
+ "send": 0.2961158752441406,
+ "receive": 0.6310939788818359,
+ "wait": 33.61678123474121
+ },
+ "serverIPAddress": "199.232.96.223"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/test/mitmproxy/data/har_files/head-content-length.json b/test/mitmproxy/data/har_files/head-content-length.json
new file mode 100644
index 0000000000..b28343587f
--- /dev/null
+++ b/test/mitmproxy/data/har_files/head-content-length.json
@@ -0,0 +1,193 @@
+[
+ {
+ "id": "hardcoded_for_test",
+ "intercepted": false,
+ "is_replay": null,
+ "type": "http",
+ "modified": false,
+ "marked": "",
+ "comment": "",
+ "timestamp_created": 0,
+ "client_conn": {
+ "id": "hardcoded_for_test",
+ "peername": [
+ "127.0.0.1",
+ 0
+ ],
+ "sockname": [
+ "127.0.0.1",
+ 0
+ ],
+ "tls_established": false,
+ "cert": null,
+ "sni": null,
+ "cipher": null,
+ "alpn": null,
+ "tls_version": null,
+ "timestamp_start": 1702398196.067544,
+ "timestamp_tls_setup": null,
+ "timestamp_end": 1702398297.6314745
+ },
+ "server_conn": {
+ "id": "hardcoded_for_test",
+ "peername": null,
+ "sockname": null,
+ "address": [
+ "199.232.96.223",
+ 443
+ ],
+ "tls_established": false,
+ "cert": null,
+ "sni": null,
+ "cipher": null,
+ "alpn": null,
+ "tls_version": null,
+ "timestamp_start": null,
+ "timestamp_tcp_setup": null,
+ "timestamp_tls_setup": null,
+ "timestamp_end": null
+ },
+ "request": {
+ "method": "HEAD",
+ "scheme": "https",
+ "host": "files.pythonhosted.org",
+ "port": 443,
+ "path": "/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl",
+ "http_version": "HTTP/1.1",
+ "headers": [
+ [
+ "accept",
+ "*/*"
+ ],
+ [
+ "user-agent",
+ "puffin"
+ ],
+ [
+ "accept-encoding",
+ "gzip, br"
+ ],
+ [
+ "content-length",
+ "0"
+ ]
+ ],
+ "contentLength": 0,
+ "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "timestamp_start": 1702398196.067544,
+ "timestamp_end": 1702398297.6314745,
+ "pretty_host": "files.pythonhosted.org"
+ },
+ "response": {
+ "http_version": "HTTP/1.1",
+ "status_code": 200,
+ "reason": "OK",
+ "headers": [
+ [
+ "last-modified",
+ "Thu, 10 Aug 2023 11:39:00 GMT"
+ ],
+ [
+ "etag",
+ "\"a296c6e224c118b0d08cd77e8c08f4b1\""
+ ],
+ [
+ "x-amz-request-id",
+ "aeb4d3335548af85"
+ ],
+ [
+ "x-amz-id-2",
+ "aN65jxTFgNrlm8zEJMNdk7mYLYwUwTzh0"
+ ],
+ [
+ "x-amz-version-id",
+ "4_z179c51e67f11a0ad8f6c0018_f10789ff3151435c8_d20230810_m113900_c005_v0501001_t0045_u01691667540984"
+ ],
+ [
+ "content-type",
+ "application/octet-stream"
+ ],
+ [
+ "cache-control",
+ "max-age=365000000, immutable, public"
+ ],
+ [
+ "accept-ranges",
+ "bytes"
+ ],
+ [
+ "date",
+ "Tue, 12 Dec 2023 16:23:16 GMT"
+ ],
+ [
+ "age",
+ "2335275"
+ ],
+ [
+ "x-served-by",
+ "cache-iad-kcgs7200038-IAD, cache-lck10926-LCK"
+ ],
+ [
+ "x-cache",
+ "HIT, HIT"
+ ],
+ [
+ "x-cache-hits",
+ "21459, 139679"
+ ],
+ [
+ "x-timer",
+ "S1702398196.107663,VS0,VE0"
+ ],
+ [
+ "strict-transport-security",
+ "max-age=31536000; includeSubDomains; preload"
+ ],
+ [
+ "x-frame-options",
+ "deny"
+ ],
+ [
+ "x-xss-protection",
+ "1; mode=block"
+ ],
+ [
+ "x-content-type-options",
+ "nosniff"
+ ],
+ [
+ "x-permitted-cross-domain-policies",
+ "none"
+ ],
+ [
+ "x-robots-header",
+ "noindex"
+ ],
+ [
+ "x-pypi-file-python-version",
+ "py3"
+ ],
+ [
+ "x-pypi-file-version",
+ "4.66.1"
+ ],
+ [
+ "x-pypi-file-package-type",
+ "bdist_wheel"
+ ],
+ [
+ "x-pypi-file-project",
+ "tqdm"
+ ],
+ [
+ "content-length",
+ "78258"
+ ]
+ ],
+ "contentLength": 0,
+ "contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
+ "timestamp_start": 1702398196.067544,
+ "timestamp_end": 1702398297.6314745
+ }
+ }
+]
\ No newline at end of file
diff --git a/test/mitmproxy/data/har_files/safari.json b/test/mitmproxy/data/har_files/safari.json
index f1a9b01898..299689158b 100644
--- a/test/mitmproxy/data/har_files/safari.json
+++ b/test/mitmproxy/data/har_files/safari.json
@@ -101,7 +101,7 @@
"headers": [
[
"Content-Type",
- "text/html; charset=utf-8"
+ "text/html"
],
[
"Last-Modified",
@@ -150,10 +150,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "4946"
]
],
"contentLength": 4946,
@@ -282,10 +278,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "36819"
]
],
"contentLength": 36819,
@@ -926,10 +918,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "5167"
]
],
"contentLength": 5167,
@@ -1058,10 +1046,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -1190,10 +1174,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "794"
]
],
"contentLength": 794,
@@ -1334,10 +1314,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -1478,10 +1454,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -1622,10 +1594,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -1766,10 +1734,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3346"
]
],
"contentLength": 3346,
@@ -2282,10 +2246,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "3969"
]
],
"contentLength": 3969,
@@ -2596,10 +2556,6 @@
[
"x-cache",
"Hit from cloudfront"
- ],
- [
- "content-length",
- "1421"
]
],
"contentLength": 1421,
diff --git a/test/mitmproxy/net/http/test_headers.py b/test/mitmproxy/net/http/test_headers.py
index 473b930f84..d7fb1999a3 100644
--- a/test/mitmproxy/net/http/test_headers.py
+++ b/test/mitmproxy/net/http/test_headers.py
@@ -1,6 +1,9 @@
import collections
+import pytest
+
from mitmproxy.net.http.headers import assemble_content_type
+from mitmproxy.net.http.headers import infer_content_encoding
from mitmproxy.net.http.headers import parse_content_type
@@ -25,3 +28,34 @@ def test_assemble_content_type():
)
== "text/html; charset=utf8; foo=bar"
)
+
+
+@pytest.mark.parametrize(
+ "content_type,content,expected",
+ [
+ ("", b"", "latin-1"),
+ ("", b"foo", "latin-1"),
+ ("", b"\xfc", "latin-1"),
+ ("", b"\xF0\xE2", "latin-1"),
+ ("text/html; charset=latin1", b"\xc3\xbc", "latin1"),
+ ("text/html; charset=utf8", b"\xc3\xbc", "utf8"),
+ # json
+ ("application/json", b'"\xc3\xbc"', "utf8"),
+ # meta charset
+ (
+ "text/html",
+            b'<meta charset="gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
+ "gb18030",
+ ),
+ # css charset
+ (
+ "text/css",
+ b'@charset "gb2312";' b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}',
+ "gb18030",
+ ),
+ ],
+)
+def test_infer_content_encoding(content_type, content, expected):
+ # Additional test coverage in `test_http::TestMessageText`
+ assert infer_content_encoding(content_type, content) == expected