From 8d4559246d2cab16c065ce33223900ed7bd12161 Mon Sep 17 00:00:00 2001 From: matthew-pilot Date: Thu, 28 May 2026 15:42:48 +0000 Subject: [PATCH] fix(server): wrap JSON body decoders with MaxBytesReader to prevent OOM (PILOT-134) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit POST /admin/recrawl, /admin/recrawl-by-domain, /feedback, /admin/reembed, and /contents (batch) all decoded r.Body with json.NewDecoder without a size limit. A large JSON array (e.g. 100M-element URL list) would OOM the server during parsing — the len(req.URLs) > N guard ran only AFTER decode. Add maxRequestBodySize = 1 MiB and wrap all five decoder sites with http.MaxBytesReader(w, r.Body, maxRequestBodySize). Oversized bodies now fail at the transport layer before the JSON decoder allocates. TestAdminRecrawlRejectsOversizedBody: sends a 2 MiB body and confirms 400/413. Closes PILOT-134 --- internal/server/http.go | 14 +++++++++----- internal/server/http_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/internal/server/http.go b/internal/server/http.go index dd46ad9..b8fad6b 100644 --- a/internal/server/http.go +++ b/internal/server/http.go @@ -30,6 +30,10 @@ import ( "github.com/pilot-protocol/cosift/internal/store" ) +// maxRequestBodySize caps the JSON request body to prevent OOM on +// maliciously large payloads. 1 MiB is generous even for batch endpoints. +const maxRequestBodySize = 1 << 20 + // FetchFn fetches and parses a single URL. Used by /contents on store-miss. // Empty title/text + nil error is acceptable; callers can decide what to do. // Implementations are responsible for politeness — server doesn't rate-limit. 
@@ -497,7 +501,7 @@ func (s *Server) handleFeedback(w http.ResponseWriter, r *http.Request) { return } var req FeedbackRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, maxRequestBodySize)).Decode(&req); err != nil { writeProblem(w, http.StatusBadRequest, "invalid JSON body") return } @@ -683,7 +687,7 @@ type AdminRecrawlResponse struct { // next pass. This split keeps the API endpoint stateless and quick. func (s *Server) handleAdminRecrawl(w http.ResponseWriter, r *http.Request) { var req AdminRecrawlRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, maxRequestBodySize)).Decode(&req); err != nil { writeProblem(w, http.StatusBadRequest, "invalid JSON body") return } @@ -744,7 +748,7 @@ type AdminRecrawlByDomainResponse struct { // batch-/contents cap pattern). Larger sweeps should be split. func (s *Server) handleAdminRecrawlByDomain(w http.ResponseWriter, r *http.Request) { var req AdminRecrawlByDomainRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, maxRequestBodySize)).Decode(&req); err != nil { writeProblem(w, http.StatusBadRequest, "invalid JSON body") return } @@ -831,7 +835,7 @@ func (s *Server) handleAdminReembed(w http.ResponseWriter, r *http.Request) { } var req AdminReembedRequest if r.ContentLength > 0 { - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, maxRequestBodySize)).Decode(&req); err != nil { writeProblem(w, http.StatusBadRequest, "invalid JSON body") return } @@ -2189,7 +2193,7 @@ type ContentsBatchItem struct { func (s *Server) handleContentsBatch(w http.ResponseWriter, r *http.Request) { start := time.Now() var req ContentsBatchRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + if err := 
json.NewDecoder(http.MaxBytesReader(w, r.Body, maxRequestBodySize)).Decode(&req); err != nil { writeProblem(w, http.StatusBadRequest, fmt.Sprintf("invalid json body: %v", err)) return } diff --git a/internal/server/http_test.go b/internal/server/http_test.go index 2496c58..c4bdf1a 100644 --- a/internal/server/http_test.go +++ b/internal/server/http_test.go @@ -1,6 +1,7 @@ package server import ( + "bytes" "context" "encoding/json" "fmt" @@ -2728,6 +2729,37 @@ func TestAdminRecrawlBadInput(t *testing.T) { } } +func TestAdminRecrawlRejectsOversizedBody(t *testing.T) { + s, _ := store.OpenMemory() + t.Cleanup(func() { s.Close() }) + srv := New(s).WithAdminToken("k") + httpSrv := httptest.NewServer(srv.Handler()) + defer httpSrv.Close() + + // Send a body larger than the 1 MiB limit. + large := make([]byte, 2<<20) + for i := range large { + large[i] = ' ' + } + // Frame the padding as a JSON object so only the size limit can reject it. + large[0] = '{' + large[len(large)-1] = '}' + + req, _ := http.NewRequest("POST", httpSrv.URL+"/admin/recrawl", bytes.NewReader(large)) + req.Header.Set("Authorization", "Bearer k") + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("unexpected error sending request: %v", err) + } + resp.Body.Close() + // The handler reports the MaxBytesReader read error as 400 today; 413 is + // also accepted in case the handler later maps that error explicitly. + if resp.StatusCode != 400 && resp.StatusCode != 413 { + t.Errorf("oversized body: got %d, want 400 or 413", resp.StatusCode) + } +} + // paraphraseChat returns a fixed paraphrase JSON array — lets the expand path // be exercised without real LLM calls. type paraphraseChat struct {