From 4da76bae65e1c413db1fb3cc97e34dd4228d3b11 Mon Sep 17 00:00:00 2001 From: Liam Stanley Date: Sun, 22 Jan 2017 21:10:20 -0500 Subject: [PATCH] add better support for content length data, by actually reading and discarding asset body bytes --- scraper/scraper.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scraper/scraper.go b/scraper/scraper.go index b06085b..a93796e 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -7,6 +7,7 @@ package scraper import ( "bytes" "fmt" + "io" "io/ioutil" "log" "net/http" @@ -115,6 +116,12 @@ func (c *Crawler) fetchResource(rsrc *Resource) { } if resp.Body != nil { + // we don't care about the body, but we want to know how large it is. + // count the bytes but discard them. + if resp.ContentLength < 1 { + resp.ContentLength, _ = io.Copy(ioutil.Discard, resp.Body) + } + resp.Body.Close() // ensure the body stream is closed } @@ -192,6 +199,10 @@ func (c *Crawler) Fetch(res *FetchResult) { res.Response.Body = string(bbytes[:]) } + if res.Response.ContentLength < 1 { + res.Response.ContentLength = int64(len(buf)) + } + c.Log.Printf("fetched %s in %dms with status %d", res.Response.URL.String(), res.Time.Milli, res.Response.Code) resourceTime := utils.NewTimer()