From bb0d7776f9ad9f75e022bd37a483fb19b74d0e2a Mon Sep 17 00:00:00 2001
From: nylar
Date: Wed, 31 Dec 2014 20:33:53 -0500
Subject: [PATCH] Change ExtractLinks to use LinkWorker

---
 parser.go      | 12 +++---------
 parser_test.go | 20 ++++++++++++--------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/parser.go b/parser.go
index 1ce8d57..d18464f 100644
--- a/parser.go
+++ b/parser.go
@@ -54,22 +54,16 @@ func ExtractAuthor(doc *goquery.Document) string {
 // ExtractLinks all anchors (with href attributes) from a document and return a list
 // of the anchors. Should return an error but goquery.NewDocumentFromReader that
 // subsequently calls html.Parse doesn't like returning errors for bad markup.
-func ExtractLinks(doc *goquery.Document) []string {
-	links := []string{}
-	linkTracker := make(map[string]bool)
-
+func ExtractLinks(doc *goquery.Document, lw *LinkWorker) {
 	doc.Find("a").Each(func(i int, s *goquery.Selection) {
 		// Only interested in anchors that have a href attribute.
 		link, href := s.Attr("href")
 		if href {
-			if _, ok := linkTracker[link]; !ok {
-				links = append(links, link)
-				linkTracker[link] = true
-			}
+			lw.Push(Link(link))
 		}
 	})
 
-	return links
+	return
 }
 
 // ExtractText extracts all p tags from a page.
diff --git a/parser_test.go b/parser_test.go
index c2d9183..c3c29ea 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -186,10 +186,11 @@ func TestParser_ExtractAuthorPrecedence(t *testing.T) {
 
 func TestParser_ExtractLinks_Empty(t *testing.T) {
 	doc := NewDocument("")
+	lw := NewLinkWorker()
 
-	links := ExtractLinks(doc)
+	ExtractLinks(doc, lw)
 
-	assert.Equal(t, len(links), 0)
+	assert.Equal(t, lw.Len(), 0)
 }
 
 func TestParser_ExtractLinks_Valid(t *testing.T) {
@@ -201,10 +202,11 @@ func TestParser_ExtractLinks_Valid(t *testing.T) {
 
 `
 	doc := NewDocument(htmlSoup)
+	lw := NewLinkWorker()
 
-	links := ExtractLinks(doc)
+	ExtractLinks(doc, lw)
 
-	assert.Equal(t, len(links), 2)
+	assert.Equal(t, lw.Len(), 2)
 }
 
 func TestParser_ExtractLinks_Invalid(t *testing.T) {
@@ -212,9 +214,10 @@ func TestParser_ExtractLinks_Invalid(t *testing.T) {
 	invalidHTML := `>>qq>`
 	doc := NewDocument(invalidHTML)
 
-	links := ExtractLinks(doc)
+	lw := NewLinkWorker()
+	ExtractLinks(doc, lw)
 
-	assert.Equal(t, len(links), 0)
+	assert.Equal(t, lw.Len(), 0)
 }
 
 func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
@@ -227,10 +230,11 @@ func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
 
 `
 	doc := NewDocument(htmlWithDupes)
+	lw := NewLinkWorker()
 
-	links := ExtractLinks(doc)
+	ExtractLinks(doc, lw)
 
-	assert.Equal(t, len(links), 3)
+	assert.Equal(t, lw.Len(), 3)
 }
 
 func TestParser_ExtractTextEmpty(t *testing.T) {
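
Note (not part of the patch): LinkWorker, NewLinkWorker, Push, Len, and the Link type are referenced above but defined elsewhere in the repository. Because TestParser_ExtractLinks_NoDuplicates still expects 3 unique links after the dedup map was removed from ExtractLinks, the worker is assumed to deduplicate on Push. A minimal sketch of that assumed shape follows; the package name and the mutex-guarded slice are illustrative choices, not the repository's actual implementation.

package parser // assumed name; use whatever package parser.go declares

import "sync"

// Link is a single extracted href value.
type Link string

// LinkWorker collects links pushed by ExtractLinks, dropping duplicates.
type LinkWorker struct {
	mu    sync.Mutex
	seen  map[Link]bool
	links []Link
}

// NewLinkWorker returns an empty worker ready to accept links.
func NewLinkWorker() *LinkWorker {
	return &LinkWorker{seen: make(map[Link]bool)}
}

// Push records a link unless it has already been seen.
func (lw *LinkWorker) Push(l Link) {
	lw.mu.Lock()
	defer lw.mu.Unlock()
	if !lw.seen[l] {
		lw.seen[l] = true
		lw.links = append(lw.links, l)
	}
}

// Len reports the number of unique links collected so far.
func (lw *LinkWorker) Len() int {
	lw.mu.Lock()
	defer lw.mu.Unlock()
	return len(lw.links)
}

With a shape like this, moving deduplication behind Push keeps ExtractLinks a plain traversal, which is what the NoDuplicates test relies on after this change.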