From bb0d7776f9ad9f75e022bd37a483fb19b74d0e2a Mon Sep 17 00:00:00 2001
From: nylar
Date: Wed, 31 Dec 2014 20:33:53 -0500
Subject: [PATCH] Change ExtractLinks to use LinkWorker
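
ExtractLinks no longer builds and returns its own deduplicated slice of
links; it pushes each href it finds onto a LinkWorker supplied by the
caller, and the tests read the result back through lw.Len().

LinkWorker itself is not part of this patch, so the following is only a
minimal sketch of the shape the new code and tests assume (Link,
NewLinkWorker, Push, Len). Push deduplicating links is an assumption,
made because TestParser_ExtractLinks_NoDuplicates still expects 3
unique links, which the old linkTracker map used to guarantee:

    // Assumed: Link appears to be a string-based type, since the new
    // code converts with Link(link).
    type Link string

    // LinkWorker collects unique links pushed to it. Sketch only; the
    // real type may also hand links on to a crawler or a channel.
    type LinkWorker struct {
        links []Link
        seen  map[Link]bool
    }

    func NewLinkWorker() *LinkWorker {
        return &LinkWorker{seen: make(map[Link]bool)}
    }

    // Push records a link the first time it is seen; duplicates are
    // dropped, mirroring the old linkTracker behaviour.
    func (lw *LinkWorker) Push(link Link) {
        if !lw.seen[link] {
            lw.seen[link] = true
            lw.links = append(lw.links, link)
        }
    }

    // Len reports how many unique links have been pushed so far.
    func (lw *LinkWorker) Len() int {
        return len(lw.links)
    }

Under that assumption the call sites look like the tests below:
construct a worker, pass it to ExtractLinks, then inspect lw.Len() (or
the collected links) instead of a returned slice.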
---
parser.go | 17 +++++------------
parser_test.go | 20 ++++++++++++--------
2 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/parser.go b/parser.go
index 1ce8d57..d18464f 100644
--- a/parser.go
+++ b/parser.go
@@ -54,22 +54,15 @@ func ExtractAuthor(doc *goquery.Document) string {
-// ExtractLinks all anchors (with href attributes) from a document and return a list
-// of the anchors. Should return an error but goquery.NewDocumentFromReader that
-// subsequently calls html.Parse doesn't like returning errors for bad markup.
-func ExtractLinks(doc *goquery.Document) []string {
- links := []string{}
- linkTracker := make(map[string]bool)
-
+// ExtractLinks pushes every anchor that has an href attribute onto the given
+// LinkWorker. It should return an error, but goquery.NewDocumentFromReader,
+// which calls html.Parse, does not report errors for bad markup.
+func ExtractLinks(doc *goquery.Document, lw *LinkWorker) {
doc.Find("a").Each(func(i int, s *goquery.Selection) {
// Only interested in anchors that have a href attribute.
link, href := s.Attr("href")
if href {
- if _, ok := linkTracker[link]; !ok {
- links = append(links, link)
- linkTracker[link] = true
- }
+ lw.Push(Link(link))
}
})
- return links
}
// ExtractText extracts all p tags from a page.
diff --git a/parser_test.go b/parser_test.go
index c2d9183..c3c29ea 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -186,10 +186,11 @@ func TestParser_ExtractAuthorPrecedence(t *testing.T) {
func TestParser_ExtractLinks_Empty(t *testing.T) {
doc := NewDocument("")
+ lw := NewLinkWorker()
- links := ExtractLinks(doc)
+ ExtractLinks(doc, lw)
- assert.Equal(t, len(links), 0)
+ assert.Equal(t, lw.Len(), 0)
}
func TestParser_ExtractLinks_Valid(t *testing.T) {
@@ -201,10 +202,11 @@ func TestParser_ExtractLinks_Valid(t *testing.T) {
`
doc := NewDocument(htmlSoup)
+ lw := NewLinkWorker()
- links := ExtractLinks(doc)
+ ExtractLinks(doc, lw)
- assert.Equal(t, len(links), 2)
+ assert.Equal(t, lw.Len(), 2)
}
func TestParser_ExtractLinks_Invalid(t *testing.T) {
@@ -212,9 +214,10 @@ func TestParser_ExtractLinks_Invalid(t *testing.T) {
invalidHTML := `>>qq>`
doc := NewDocument(invalidHTML)
- links := ExtractLinks(doc)
+ lw := NewLinkWorker()
+ ExtractLinks(doc, lw)
- assert.Equal(t, len(links), 0)
+ assert.Equal(t, lw.Len(), 0)
}
func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
@@ -227,10 +230,11 @@ func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
`
doc := NewDocument(htmlWithDupes)
+ lw := NewLinkWorker()
- links := ExtractLinks(doc)
+ ExtractLinks(doc, lw)
- assert.Equal(t, len(links), 3)
+ assert.Equal(t, lw.Len(), 3)
}
func TestParser_ExtractTextEmpty(t *testing.T) {