Skip to content

Commit 5e7fff8

Browse files
committed
fix: implement H1 container keepalive to prevent cold starts when H2 is unavailable
1 parent e521b0a commit 5e7fff8

2 files changed

Lines changed: 60 additions & 4 deletions

File tree

src/domain_fronter.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,10 @@ async def _warm_pool(self):
780780
# Start H2 connection (runs alongside H1 pool)
781781
if self._h2:
782782
self._spawn(self._h2_connect_and_warm())
783+
# H1 container keepalive — runs unconditionally so the Apps Script
784+
# container never goes cold even when H2 is unavailable. When H2 IS
785+
# active its _keepalive_loop skips the ping; they do not double-fire.
786+
self._spawn(self._h1_container_keepalive())
783787

784788
def _spawn(self, coro) -> asyncio.Task:
785789
"""Create a task and keep a strong reference for clean cancellation."""
@@ -913,6 +917,34 @@ async def _keepalive_loop(self):
913917
except Exception as e:
914918
log.debug("Keepalive failed: %s", e)
915919

920+
async def _h1_container_keepalive(self):
    """Keep the Apps Script container warm via H1 when H2 keepalive is absent.

    H2's _keepalive_loop handles pings when H2 is connected. When H2 is
    unavailable (library not installed, connection dropped) this coroutine
    takes over so the container never goes cold and causes slow cold-starts
    on the first video / streaming request after an idle period.
    """
    while True:
        try:
            await asyncio.sleep(240)  # same cadence as H2 keepalive
            if self._h2_available():
                # H2's own keepalive loop is already pinging — skip this round.
                continue
            ping = self._build_payload("HEAD", "http://example.com/", {}, b"")
            started = time.perf_counter()
            # _relay_payload_h1 enforces its own per-attempt timeout internally;
            # wrapping it in an outer wait_for with a shorter deadline would
            # cancel valid in-progress relays early, so none is applied here.
            await self._relay_payload_h1(ping)
            elapsed_ms = (time.perf_counter() - started) * 1000
            log.debug("H1 container keepalive: %.0fms", elapsed_ms)
        except asyncio.CancelledError:
            break
        except Exception as e:
            log.debug("H1 container keepalive failed: %s", e)
947+
916948
async def _do_warm(self):
917949
"""Open WARM_POOL_COUNT connections in parallel — failures are fine."""
918950
count = WARM_POOL_COUNT
@@ -1666,13 +1698,20 @@ async def _relay_with_retry(self, payload: dict) -> bytes:
16661698
log.debug("H2 relay failed (%s), reconnecting", e)
16671699
try:
16681700
await self._h2.reconnect()
1669-
self._record_h2_success()
1701+
# Do NOT record success here — only a successful relay
1702+
# response proves the connection works. Recording
1703+
# success after reconnect was resetting the failure
1704+
# streak and causing an infinite reconnect storm.
16701705
except Exception as reconnect_exc:
16711706
self._record_h2_failure(reconnect_exc)
16721707
log.warning("H2 reconnect failed, falling back to H1")
16731708
break
16741709
else:
1675-
raise
1710+
# Last H2 attempt failed — fall through to H1 rather
1711+
# than raising here, which would bypass H1 entirely.
1712+
log.debug("H2 relay failed on final attempt (%s), "
1713+
"falling back to H1", e)
1714+
break
16761715

16771716
# HTTP/1.1 fallback (pool-based)
16781717
async with self._semaphore:

src/h2_transport.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def __init__(self, connect_host: str, sni_host: str,
8686
self._connect_lock = asyncio.Lock()
8787
self._read_task: asyncio.Task | None = None
8888
self._conn_generation = 0
89+
self._last_reconnect_at: float = 0.0
8990

9091
# Per-stream tracking
9192
self._streams: dict[int, _StreamState] = {}
@@ -193,9 +194,19 @@ async def _do_connect(self):
193194
log.info("H2 connected → %s (SNI=%s, TCP_NODELAY=on)",
194195
self.connect_host, sni)
195196

197+
# Minimum seconds between successive reconnect() calls. Without this,
198+
# concurrent relay failures trigger a rapid reconnect storm that causes
199+
# repeated "H2 connected → H2 reader loop ended" within milliseconds.
200+
_RECONNECT_MIN_INTERVAL = 1.0
201+
196202
async def reconnect(self):
    """Close the current connection and re-establish it, with backoff.

    Successive calls are throttled to at most one reconnect per
    _RECONNECT_MIN_INTERVAL seconds (measured on the event-loop clock)
    so that concurrent relay failures cannot trigger a reconnect storm.
    """
    async with self._connect_lock:
        now = asyncio.get_running_loop().time
        # How long until the minimum spacing since the last reconnect elapses.
        remaining = self._RECONNECT_MIN_INTERVAL - (now() - self._last_reconnect_at)
        if remaining > 0:
            await asyncio.sleep(remaining)
        self._last_reconnect_at = now()
        await self._close_internal()
        await self._do_connect()
201212

@@ -382,7 +393,13 @@ async def _reader_loop(self, generation: int):
382393
else:
383394
log.error("H2 reader error: %s", e)
384395
except Exception as e:
385-
if "application data after close notify" in str(e).lower():
396+
# WinError 121 (semaphore timeout) — Windows OS-level socket
397+
# timeout meaning the TCP connection stalled and the OS closed
398+
# it. Harmless; treat as a normal drop. On non-Windows
399+
# platforms .winerror is absent so getattr returns None.
400+
if getattr(e, 'winerror', None) == 121:
401+
log.warning("H2 connection dropped (OS socket timeout)")
402+
elif "application data after close notify" in str(e).lower():
386403
log.debug("H2 reader closed after close_notify: %s", e)
387404
else:
388405
log.error("H2 reader error: %s", e)

0 commit comments

Comments (0)