Skip to content

Commit 5e7fff8

Browse files
committed
fix: implement H1 container keepalive to prevent cold starts when H2 is unavailable
1 parent e521b0a commit 5e7fff8

2 files changed

Lines changed: 60 additions & 4 deletions

File tree

src/domain_fronter.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,10 @@ async def _warm_pool(self):
780780
# Start H2 connection (runs alongside H1 pool)
781781
if self._h2:
782782
self._spawn(self._h2_connect_and_warm())
783+
# H1 container keepalive — runs unconditionally so the Apps Script
784+
# container never goes cold even when H2 is unavailable. When H2 IS
785+
# active its _keepalive_loop skips the ping; they do not double-fire.
786+
self._spawn(self._h1_container_keepalive())
783787

784788
def _spawn(self, coro) -> asyncio.Task:
785789
"""Create a task and keep a strong reference for clean cancellation."""
@@ -913,6 +917,34 @@ async def _keepalive_loop(self):
913917
except Exception as e:
914918
log.debug("Keepalive failed: %s", e)
915919

920+
async def _h1_container_keepalive(self):
    """Keep the Apps Script container warm via H1 when H2 keepalive is absent.

    H2's _keepalive_loop handles pings when H2 is connected. When H2 is
    unavailable (library not installed, connection dropped) this coroutine
    takes over so the container never goes cold and causes slow cold-starts
    on the first video / streaming request after an idle period.
    """
    while True:
        try:
            await asyncio.sleep(240)  # same cadence as H2 keepalive
            if self._h2_available():
                # H2's own keepalive loop is already pinging — skip this round.
                continue
            ping = self._build_payload("HEAD", "http://example.com/", {}, b"")
            started = time.perf_counter()
            # _relay_payload_h1 enforces its own per-attempt timeout internally;
            # wrapping it in an outer wait_for with a shorter deadline would
            # cancel valid in-progress relays early, so none is applied here.
            await self._relay_payload_h1(ping)
            elapsed_ms = (time.perf_counter() - started) * 1000
            log.debug("H1 container keepalive: %.0fms", elapsed_ms)
        except asyncio.CancelledError:
            break
        except Exception as e:
            log.debug("H1 container keepalive failed: %s", e)
947+
916948
async def _do_warm(self):
917949
"""Open WARM_POOL_COUNT connections in parallel — failures are fine."""
918950
count = WARM_POOL_COUNT
@@ -1666,13 +1698,20 @@ async def _relay_with_retry(self, payload: dict) -> bytes:
16661698
log.debug("H2 relay failed (%s), reconnecting", e)
16671699
try:
16681700
await self._h2.reconnect()
1669-
self._record_h2_success()
1701+
# Do NOT record success here — only a successful relay
1702+
# response proves the connection works. Recording
1703+
# success after reconnect was resetting the failure
1704+
# streak and causing an infinite reconnect storm.
16701705
except Exception as reconnect_exc:
16711706
self._record_h2_failure(reconnect_exc)
16721707
log.warning("H2 reconnect failed, falling back to H1")
16731708
break
16741709
else:
1675-
raise
1710+
# Last H2 attempt failed — fall through to H1 rather
1711+
# than raising here, which would bypass H1 entirely.
1712+
log.debug("H2 relay failed on final attempt (%s), "
1713+
"falling back to H1", e)
1714+
break
16761715

16771716
# HTTP/1.1 fallback (pool-based)
16781717
async with self._semaphore:

src/h2_transport.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def __init__(self, connect_host: str, sni_host: str,
8686
self._connect_lock = asyncio.Lock()
8787
self._read_task: asyncio.Task | None = None
8888
self._conn_generation = 0
89+
self._last_reconnect_at: float = 0.0
8990

9091
# Per-stream tracking
9192
self._streams: dict[int, _StreamState] = {}
@@ -193,9 +194,19 @@ async def _do_connect(self):
193194
log.info("H2 connected → %s (SNI=%s, TCP_NODELAY=on)",
194195
self.connect_host, sni)
195196

197+
# Minimum seconds between successive reconnect() calls. Without this,
198+
# concurrent relay failures trigger a rapid reconnect storm that causes
199+
# repeated "H2 connected → H2 reader loop ended" within milliseconds.
200+
_RECONNECT_MIN_INTERVAL = 1.0
201+
196202
async def reconnect(self):
    """Close the current connection and re-establish it, with backoff.

    Successive calls are throttled to at most one reconnect per
    _RECONNECT_MIN_INTERVAL seconds (measured on the event-loop clock)
    so that concurrent relay failures cannot trigger a reconnect storm.
    """
    async with self._connect_lock:
        now = asyncio.get_running_loop().time
        # How long until the minimum spacing since the last reconnect elapses.
        remaining = self._RECONNECT_MIN_INTERVAL - (now() - self._last_reconnect_at)
        if remaining > 0:
            await asyncio.sleep(remaining)
        self._last_reconnect_at = now()
        await self._close_internal()
        await self._do_connect()
201212

@@ -382,7 +393,13 @@ async def _reader_loop(self, generation: int):
382393
else:
383394
log.error("H2 reader error: %s", e)
384395
except Exception as e:
385-
if "application data after close notify" in str(e).lower():
396+
# WinError 121 (semaphore timeout) — Windows OS-level socket
397+
# timeout meaning the TCP connection stalled and the OS closed
398+
# it. Harmless; treat as a normal drop. On non-Windows
399+
# platforms .winerror is absent so getattr returns None.
400+
if getattr(e, 'winerror', None) == 121:
401+
log.warning("H2 connection dropped (OS socket timeout)")
402+
elif "application data after close notify" in str(e).lower():
386403
log.debug("H2 reader closed after close_notify: %s", e)
387404
else:
388405
log.error("H2 reader error: %s", e)

0 commit comments

Comments (0)