Fix response content-length when reading from HAR files (mitmproxy#6548)

#### Description

Closes mitmproxy#6547

Responses in flows constructed from HAR files were built with the
`Response.make` utility, which injects a `content-length` header
computed from the response body. When a `content-length` header
already existed, this could cause failures during replay.
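
As a minimal sketch of the failure mode (the header values here are illustrative, and exact behavior depends on the mitmproxy version):

```python
from mitmproxy import http

# Headers copied straight from a HAR entry, content-length included.
har_headers = {"content-type": "text/html", "content-length": "23866"}

# `Response.make` assigns `.content`, which rewrites the content-length
# header from the body it actually stores, so the value recorded in the
# HAR is replaced (or injected where none existed at all).
resp = http.Response.make(200, b"<html>...</html>", har_headers)
print(resp.headers["content-length"])  # length of the 16-byte body, not "23866"
```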

#### Checklist

 - [x] I have updated tests where applicable.
 - [x] I have added an entry to the CHANGELOG.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
zanieb and autofix-ci[bot] committed Dec 12, 2023
1 parent bda9c4e commit 1fcd033
Showing 11 changed files with 492 additions and 162 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,8 @@

## Unreleased: mitmproxy next

* Fix bug where response flows from HAR files had incorrect `content-length` headers
([#6548](https://github.com/mitmproxy/mitmproxy/pull/6548), @zanieb)
* Improved handling for `--allow-hosts`/`--ignore-hosts` options in WireGuard mode (#5930).
([#6513](https://github.com/mitmproxy/mitmproxy/pull/6513), @dsphper)
* DNS resolution is now exempted from `--ignore-hosts` in WireGuard Mode.
40 changes: 3 additions & 37 deletions mitmproxy/http.py
@@ -1,7 +1,6 @@
import binascii
import json
import os
import re
import time
import urllib.parse
import warnings
@@ -27,6 +26,7 @@
from mitmproxy.net.http import status_codes
from mitmproxy.net.http import url
from mitmproxy.net.http.headers import assemble_content_type
from mitmproxy.net.http.headers import infer_content_encoding
from mitmproxy.net.http.headers import parse_content_type
from mitmproxy.utils import human
from mitmproxy.utils import strutils
@@ -402,45 +402,11 @@ def get_content(self, strict: bool = True) -> bytes | None:
else:
return self.raw_content

def _get_content_type_charset(self) -> str | None:
ct = parse_content_type(self.headers.get("content-type", ""))
if ct:
return ct[2].get("charset")
return None

def _guess_encoding(self, content: bytes = b"") -> str:
enc = self._get_content_type_charset()
if not enc:
if "json" in self.headers.get("content-type", ""):
enc = "utf8"
if not enc:
if "html" in self.headers.get("content-type", ""):
meta_charset = re.search(
rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc:
if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing.
css_charset = re.match(
rb"""@charset "([^"]+)";""", content, re.IGNORECASE
)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")
if not enc:
enc = "latin-1"
# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
if enc.lower() in ("gb2312", "gbk"):
enc = "gb18030"

return enc

def set_text(self, text: str | None) -> None:
if text is None:
self.content = None
return
enc = self._guess_encoding()
enc = infer_content_encoding(self.headers.get("content-type", ""))

try:
self.content = cast(bytes, encoding.encode(text, enc))
@@ -464,7 +430,7 @@ def get_text(self, strict: bool = True) -> str | None:
content = self.get_content(strict)
if content is None:
return None
enc = self._guess_encoding(content)
enc = infer_content_encoding(self.headers.get("content-type", ""), content)
try:
return cast(str, encoding.decode(content, enc))
except ValueError:
43 changes: 36 additions & 7 deletions mitmproxy/io/har.py
@@ -7,6 +7,7 @@
from mitmproxy import connection
from mitmproxy import exceptions
from mitmproxy import http
from mitmproxy.net.http.headers import infer_content_encoding

logger = logging.getLogger(__name__)

@@ -85,24 +86,52 @@ def request_to_flow(request_json: dict) -> http.HTTPFlow:
# In Firefox HAR files images don't include response bodies
response_content = request_json["response"]["content"].get("text", "")
content_encoding = request_json["response"]["content"].get("encoding", None)
response_headers = fix_headers(request_json["response"]["headers"])

if content_encoding == "base64":
response_content = base64.b64decode(response_content)
response_headers = fix_headers(request_json["response"]["headers"])
elif isinstance(response_content, str):
# Convert text to bytes, as in `Response.set_text`
try:
response_content = http.encoding.encode(
response_content,
(
content_encoding
or infer_content_encoding(response_headers.get("content-type", ""))
),
)
except ValueError:
# Fallback to UTF-8
response_content = response_content.encode(
"utf-8", errors="surrogateescape"
)

# Then encode the content, as in `Response.set_content`
response_content = http.encoding.encode(
response_content, response_headers.get("content-encoding") or "identity"
)

new_flow.response = http.Response.make(
response_code, response_content, response_headers
new_flow.response = http.Response(
b"HTTP/1.1",
response_code,
http.status_codes.RESPONSES.get(response_code, "").encode(),
response_headers,
response_content,
None,
timestamp_start,
timestamp_end,
)

# Change time to match HAR file
# Update timestamps

new_flow.request.timestamp_start = timestamp_start
new_flow.request.timestamp_end = timestamp_end

new_flow.response.timestamp_start = timestamp_start
new_flow.response.timestamp_end = timestamp_end

new_flow.client_conn.timestamp_start = timestamp_start
new_flow.client_conn.timestamp_end = timestamp_end

# Update HTTP version

match http_version_req:
case "http/2.0":
new_flow.request.http_version = "HTTP/2"
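
For orientation, the new conversion encodes the HAR `text` body in two steps, mirroring `Response.set_text` and then `Response.set_content` as the inline comments above note: first to bytes using the charset, then with the declared `content-encoding`. A hypothetical trace of those two `http.encoding.encode` calls, assuming a gzip-compressed UTF-8 response:

```python
from mitmproxy import http

text = "héllo"  # decoded text, as stored in the HAR "text" field

# Step 1: charset encode (explicit HAR encoding, or inferred from content-type).
raw = http.encoding.encode(text, "utf-8")   # b"h\xc3\xa9llo"

# Step 2: apply the declared content-encoding header.
body = http.encoding.encode(raw, "gzip")    # gzip-compressed bytes
```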
37 changes: 37 additions & 0 deletions mitmproxy/net/http/headers.py
@@ -1,4 +1,5 @@
import collections
import re


def parse_content_type(c: str) -> tuple[str, str, dict[str, str]] | None:
@@ -33,3 +34,39 @@ def assemble_content_type(type, subtype, parameters):
return f"{type}/{subtype}"
params = "; ".join(f"{k}={v}" for k, v in parameters.items())
return f"{type}/{subtype}; {params}"


def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
"""
Infer the encoding of content from the content-type header.
"""
# Use the charset from the header if possible
parsed_content_type = parse_content_type(content_type)
enc = parsed_content_type[2].get("charset") if parsed_content_type else None

# Otherwise, infer the encoding
if not enc and "json" in content_type:
enc = "utf8"

if not enc and "html" in content_type:
meta_charset = re.search(
rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")

if not enc and "text/css" in content_type:
# @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")

# Fallback to latin-1
if not enc:
enc = "latin-1"

# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
if enc.lower() in ("gb2312", "gbk"):
enc = "gb18030"

return enc
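
For reference, a few illustrative calls against the helper as defined above (the return values follow directly from the function body):

```python
from mitmproxy.net.http.headers import infer_content_encoding

infer_content_encoding("text/html; charset=utf-8")               # "utf-8"
infer_content_encoding("application/json")                       # "utf8" (JSON default)
infer_content_encoding("text/html", b'<meta charset="gb2312">')  # "gb18030" (sniffed, then widened)
infer_content_encoding("text/plain")                             # "latin-1" fallback
```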
2 changes: 1 addition & 1 deletion test/mitmproxy/data/har_files/charles.json
@@ -81,7 +81,7 @@
"headers": [
[
"Content-Type",
"text/html; charset=utf-8"
"text/html"
],
[
"Content-Length",
11 changes: 1 addition & 10 deletions test/mitmproxy/data/har_files/chrome.json
@@ -194,10 +194,6 @@
[
"x-cache",
"Hit from cloudfront"
],
[
"content-length",
"23866"
]
],
"contentLength": 23866,
@@ -326,12 +322,7 @@
"http_version": "HTTP/1.1",
"status_code": 0,
"reason": "",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1689251552.676,
67 changes: 5 additions & 62 deletions test/mitmproxy/data/har_files/firefox.json
@@ -174,14 +174,6 @@
[
"x-amz-cf-id",
"DPEkuUbeK1ZXMsRuUHqgk6iE4l7ShgyrJntkqIbLaSJ5646Ptc2Xew=="
],
[
"content-type",
"text/plain; charset=utf-8"
],
[
"content-length",
"23866"
]
],
"contentLength": 23866,
@@ -282,12 +274,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.418,
@@ -386,12 +373,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.456,
@@ -490,12 +472,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -594,12 +571,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.457,
@@ -698,12 +670,7 @@
"http_version": "HTTP/1.1",
"status_code": 200,
"reason": "OK",
"headers": [
[
"content-length",
"0"
]
],
"headers": [],
"contentLength": 0,
"contentHash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"timestamp_start": 1680134339.458,
@@ -878,10 +845,6 @@
[
"x-amz-cf-id",
"EufbwzXQheESUTNil_OCjKK8cvVL51cQpYfmF7iZSHloCTvVhfZ8yQ=="
],
[
"content-length",
"3969"
]
],
"contentLength": 3969,
@@ -1058,10 +1021,6 @@
[
"x-amz-cf-id",
"UuHHTyGhTqqTO07G_oZCw7rxjb9RJUGTN3OW0EUS77RH4GiQ-LkAvw=="
],
[
"content-length",
"3346"
]
],
"contentLength": 3346,
@@ -1238,10 +1197,6 @@
[
"x-amz-cf-id",
"TOLqHpQWMFHQDWnv2yHHFWI5xkA3R13TTJQDJe1ARViKrgihxZdhxA=="
],
[
"content-length",
"794"
]
],
"contentLength": 794,
@@ -1425,10 +1380,6 @@
[
"x-amz-cf-id",
"nnIrWtgAMt42ua4HYBtNAao6m_iD9WjIzLAFyURb8mjOr5MriSQXRA=="
],
[
"content-length",
"9689"
]
],
"contentLength": 9689,
@@ -1612,10 +1563,6 @@
[
"x-amz-cf-id",
"PMTLvP_yUCVocnhd1i1ir7_FRAJRw0ayMhK3KaZKELDO3pxxoqLWjg=="
],
[
"content-length",
"9689"
]
],
"contentLength": 9689,
@@ -1958,10 +1905,6 @@
[
"x-amz-cf-id",
"0okGWJw6nYo7R-4egQWE-WfonThN2EXyRSLO9MlCNKyMfD-2v1AU0Q=="
],
[
"content-length",
"6986"
]
],
"contentLength": 6986,