Fix response content-length when reading from HAR files (mitmproxy#6548)

#### Description

Closes mitmproxy#6547

Responses in flows constructed from HAR files were built with the
`Response.make` utility, which injects a `content-length` header
computed from the response body. When a `content-length` header
already existed, this could cause failures during replay.
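
As a minimal sketch of the failure mode (the header values here are illustrative, and exact behavior depends on the mitmproxy version):

```python
from mitmproxy import http

# Headers copied straight from a HAR entry, content-length included.
har_headers = {"content-type": "text/html", "content-length": "23866"}

# `Response.make` assigns `.content`, which rewrites the content-length
# header from the body it actually stores, so the value recorded in the
# HAR is replaced (or injected where none existed at all).
resp = http.Response.make(200, b"<html>...</html>", har_headers)
print(resp.headers["content-length"])  # length of the 16-byte body, not "23866"
```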

#### Checklist

 - [x] I have updated tests where applicable.
 - [x] I have added an entry to the CHANGELOG.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
zanieb and autofix-ci[bot] committed Dec 12, 2023
1 parent bda9c4e commit 1fcd033
Showing 11 changed files with 492 additions and 162 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,8 @@

## Unreleased: mitmproxy next

* Fix bug where response flows from HAR files had incorrect `content-length` headers
([#6548](https://github.com/mitmproxy/mitmproxy/pull/6548), @zanieb)
* Improved handling for `--allow-hosts`/`--ignore-hosts` options in WireGuard mode (#5930).
([#6513](https://github.com/mitmproxy/mitmproxy/pull/6513), @dsphper)
* DNS resolution is now exempted from `--ignore-hosts` in WireGuard Mode.
40 changes: 3 additions & 37 deletions mitmproxy/http.py
@@ -1,7 +1,6 @@
import binascii
import json
import os
import re
import time
import urllib.parse
import warnings
@@ -27,6 +26,7 @@
from mitmproxy.net.http import status_codes
from mitmproxy.net.http import url
from mitmproxy.net.http.headers import assemble_content_type
from mitmproxy.net.http.headers import infer_content_encoding
from mitmproxy.net.http.headers import parse_content_type
from mitmproxy.utils import human
from mitmproxy.utils import strutils
@@ -402,45 +402,11 @@ def get_content(self, strict: bool = True) -> bytes | None:
else:
return self.raw_content

def _get_content_type_charset(self) -> str | None:
ct = parse_content_type(self.headers.get("content-type", ""))
if ct:
return ct[2].get("charset")
return None

def _guess_encoding(self, content: bytes = b"") -> str:
enc = self._get_content_type_charset()
if not enc:
if "json" in self.headers.get("content-type", ""):
enc = "utf8"
if not enc:
if "html" in self.headers.get("content-type", ""):
meta_charset = re.search(
rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc:
if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing.
css_charset = re.match(
rb"""@charset "([^"]+)";""", content, re.IGNORECASE
)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")
if not enc:
enc = "latin-1"
# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
if enc.lower() in ("gb2312", "gbk"):
enc = "gb18030"

return enc

def set_text(self, text: str | None) -> None:
if text is None:
self.content = None
return
enc = self._guess_encoding()
enc = infer_content_encoding(self.headers.get("content-type", ""))

try:
self.content = cast(bytes, encoding.encode(text, enc))
@@ -464,7 +430,7 @@ def get_text(self, strict: bool = True) -> str | None:
content = self.get_content(strict)
if content is None:
return None
enc = self._guess_encoding(content)
enc = infer_content_encoding(self.headers.get("content-type", ""), content)
try:
return cast(str, encoding.decode(content, enc))
except ValueError:
43 changes: 36 additions & 7 deletions mitmproxy/io/har.py
@@ -7,6 +7,7 @@
from mitmproxy import connection
from mitmproxy import exceptions
from mitmproxy import http
from mitmproxy.net.http.headers import infer_content_encoding

logger = logging.getLogger(__name__)

@@ -85,24 +86,52 @@ def request_to_flow(request_json: dict) -> http.HTTPFlow:
# In Firefox HAR files images don't include response bodies
response_content = request_json["response"]["content"].get("text", "")
content_encoding = request_json["response"]["content"].get("encoding", None)
response_headers = fix_headers(request_json["response"]["headers"])

if content_encoding == "base64":
response_content = base64.b64decode(response_content)
response_headers = fix_headers(request_json["response"]["headers"])
elif isinstance(response_content, str):
# Convert text to bytes, as in `Response.set_text`
try:
response_content = http.encoding.encode(
response_content,
(
content_encoding
or infer_content_encoding(response_headers.get("content-type", ""))
),
)
except ValueError:
# Fallback to UTF-8
response_content = response_content.encode(
"utf-8", errors="surrogateescape"
)

# Then encode the content, as in `Response.set_content`
response_content = http.encoding.encode(
response_content, response_headers.get("content-encoding") or "identity"
)

new_flow.response = http.Response.make(
response_code, response_content, response_headers
new_flow.response = http.Response(
b"HTTP/1.1",
response_code,
http.status_codes.RESPONSES.get(response_code, "").encode(),
response_headers,
response_content,
None,
timestamp_start,
timestamp_end,
)

# Change time to match HAR file
# Update timestamps

new_flow.request.timestamp_start = timestamp_start
new_flow.request.timestamp_end = timestamp_end

new_flow.response.timestamp_start = timestamp_start
new_flow.response.timestamp_end = timestamp_end

new_flow.client_conn.timestamp_start = timestamp_start
new_flow.client_conn.timestamp_end = timestamp_end

# Update HTTP version

match http_version_req:
case "http/2.0":
new_flow.request.http_version = "HTTP/2"
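
For orientation, the new conversion encodes the HAR `text` body in two steps, mirroring `Response.set_text` and then `Response.set_content` as the inline comments above note: first to bytes using the charset, then with the declared `content-encoding`. A hypothetical trace of those two `http.encoding.encode` calls, assuming a gzip-compressed UTF-8 response:

```python
from mitmproxy import http

text = "héllo"  # decoded text, as stored in the HAR "text" field

# Step 1: charset encode (explicit HAR encoding, or inferred from content-type).
raw = http.encoding.encode(text, "utf-8")   # b"h\xc3\xa9llo"

# Step 2: apply the declared content-encoding header.
body = http.encoding.encode(raw, "gzip")    # gzip-compressed bytes
```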
37 changes: 37 additions & 0 deletions mitmproxy/net/http/headers.py
@@ -1,4 +1,5 @@
import collections
import re


def parse_content_type(c: str) -> tuple[str, str, dict[str, str]] | None:
@@ -33,3 +34,39 @@ def assemble_content_type(type, subtype, parameters):
return f"{type}/{subtype}"
params = "; ".join(f"{k}={v}" for k, v in parameters.items())
return f"{type}/{subtype}; {params}"


def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
"""
Infer the encoding of content from the content-type header.
"""
# Use the charset from the header if possible
parsed_content_type = parse_content_type(content_type)
enc = parsed_content_type[2].get("charset") if parsed_content_type else None

# Otherwise, infer the encoding
if not enc and "json" in content_type:
enc = "utf8"

if not enc and "html" in content_type:
meta_charset = re.search(
rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")

if not enc and "text/css" in content_type:
# @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")

# Fallback to latin-1
if not enc:
enc = "latin-1"

# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
if enc.lower() in ("gb2312", "gbk"):
enc = "gb18030"

return enc
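
For reference, a few illustrative calls against the helper as defined above (the return values follow directly from the function body):

```python
from mitmproxy.net.http.headers import infer_content_encoding

infer_content_encoding("text/html; charset=utf-8")               # "utf-8"
infer_content_encoding("application/json")                       # "utf8" (JSON default)
infer_content_encoding("text/html", b'<meta charset="gb2312">')  # "gb18030" (sniffed, then widened)
infer_content_encoding("text/plain")                             # "latin-1" fallback
```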
2 changes: 1 addition & 1 deletion test/mitmproxy/data/har_files/charles.json
@@ -81,7 +81,7 @@
"headers": [
[
"Content-Type",
"text/html; charset=utf-8"
"text/html"
],
[
"Content-Length",
11 changes: 1 addition & 10 deletions test/mitmproxy/data/har_files/chrome.json
@@ -194,10 +194,6 @@
[
"x-cache",
"Hit from cloudfront"
],
[
"content-length",
"23866"
]
],
"contentLength": 23866,
@@ -326,12 +322,7 @@
"http_version": "HTTP/1.1",
"status_code": 0,
"reason": "",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1689251552.676,
67 changes: 5 additions & 62 deletions test/mitmproxy/data/har_files/firefox.json
@@ -174,14 +174,6 @@
[
"x-amz-cf-id",
"DPEkuUbeK1ZXMsRuUHqgk6iE4l7ShgyrJntkqIbLaSJ5646Ptc2Xew=="
],
[
"content-type",
"text/plain; charset=utf-8"
],
[
"content-length",
"23866"
]
],
"contentLength": 23866,
@@ -282,12 +274,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.418,
@@ -386,12 +373,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.456,
@@ -490,12 +472,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -594,12 +571,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -698,12 +670,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.458,
@@ -878,10 +845,6 @@
[
"x-amz-cf-id",
"EufbwzXQheESUTNil_OCjKK8cvVL51cQpYfmF7iZSHloCTvVhfZ8yQ=="
],
[
"content-length",
"3969"
]
],
"contentLength": 3969,
@@ -1058,10 +1021,6 @@
[
"x-amz-cf-id",
"UuHHTyGhTqqTO07G_oZCw7rxjb9RJUGTN3OW0EUS77RH4GiQ-LkAvw=="
],
[
"content-length",
"3346"
]
],
"contentLength": 3346,
@@ -1238,10 +1197,6 @@
[
"x-amz-cf-id",
"TOLqHpQWMFHQDWnv2yHHFWI5xkA3R13TTJQDJe1ARViKrgihxZdhxA=="
],
[
"content-length",
"794"
]
],
"contentLength": 794,
@@ -1425,10 +1380,6 @@
[
"x-amz-cf-id",
"nnIrWtgAMt42ua4HYBtNAao6m_iD9WjIzLAFyURb8mjOr5MriSQXRA=="
],
[
"content-length",
"9689"
]
],
"contentLength": 9689,
@@ -1612,10 +1563,6 @@
[
"x-amz-cf-id",
"PMTLvP_yUCVocnhd1i1ir7_FRAJRw0ayMhK3KaZKELDO3pxxoqLWjg=="
],
[
"content-length",
"9689"
]
],
"contentLength": 9689,
@@ -1958,10 +1905,6 @@
[
"x-amz-cf-id",
"0okGWJw6nYo7R-4egQWE-WfonThN2EXyRSLO9MlCNKyMfD-2v1AU0Q=="
],
[
"content-length",
"6986"
]
],
"contentLength": 6986,