Change ExtractLinks to use LinkWorker
nylar committed Jan 1, 2015
1 parent 27159b6 commit bb0d777
Showing 2 changed files with 15 additions and 17 deletions.
12 changes: 3 additions & 9 deletions parser.go
@@ -54,22 +54,16 @@ func ExtractAuthor(doc *goquery.Document) string {
 // ExtractLinks all anchors (with href attributes) from a document and return a list
 // of the anchors. Should return an error but goquery.NewDocumentFromReader that
 // subsequently calls html.Parse doesn't like returning errors for bad markup.
-func ExtractLinks(doc *goquery.Document) []string {
-    links := []string{}
-    linkTracker := make(map[string]bool)
-
+func ExtractLinks(doc *goquery.Document, lw *LinkWorker) {
     doc.Find("a").Each(func(i int, s *goquery.Selection) {
         // Only interested in anchors that have a href attribute.
         link, href := s.Attr("href")
         if href {
-            if _, ok := linkTracker[link]; !ok {
-                links = append(links, link)
-                linkTracker[link] = true
-            }
+            lw.Push(Link(link))
         }
     })
 
-    return links
+    return
 }
 
 // ExtractText extracts all p tags from a page.
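The LinkWorker type itself is not part of this commit, so everything below is a hedged reconstruction: the names Link, NewLinkWorker, Push, and Len come from the calls in the diff, while the fields and deduplication logic are assumptions inferred from the fact that the old linkTracker map was removed yet TestParser_ExtractLinks_NoDuplicates still expects 3 unique links. A minimal sketch of what such a worker might look like:

// Link is assumed here to be a simple string-based type for URLs.
type Link string

// LinkWorker collects links and drops duplicates; a hypothetical
// reconstruction, not the actual implementation from this repository.
type LinkWorker struct {
    links []Link
    seen  map[Link]bool
}

// NewLinkWorker returns an empty worker ready to receive links.
func NewLinkWorker() *LinkWorker {
    return &LinkWorker{seen: make(map[Link]bool)}
}

// Push records a link unless it has already been seen.
func (lw *LinkWorker) Push(l Link) {
    if lw.seen[l] {
        return
    }
    lw.seen[l] = true
    lw.links = append(lw.links, l)
}

// Len reports how many unique links have been collected.
func (lw *LinkWorker) Len() int {
    return len(lw.links)
}

Moving deduplication behind Push keeps ExtractLinks a plain traversal and lets a single worker accumulate unique links across many documents.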
20 changes: 12 additions & 8 deletions parser_test.go
@@ -186,10 +186,11 @@ func TestParser_ExtractAuthorPrecedence(t *testing.T) {
 
 func TestParser_ExtractLinks_Empty(t *testing.T) {
     doc := NewDocument("")
+    lw := NewLinkWorker()
 
-    links := ExtractLinks(doc)
+    ExtractLinks(doc, lw)
 
-    assert.Equal(t, len(links), 0)
+    assert.Equal(t, lw.Len(), 0)
 }
 
 func TestParser_ExtractLinks_Valid(t *testing.T) {
@@ -201,20 +202,22 @@ func TestParser_ExtractLinks_Valid(t *testing.T) {
     </p>`
 
     doc := NewDocument(htmlSoup)
+    lw := NewLinkWorker()
 
-    links := ExtractLinks(doc)
+    ExtractLinks(doc, lw)
 
-    assert.Equal(t, len(links), 2)
+    assert.Equal(t, lw.Len(), 2)
 }
 
 func TestParser_ExtractLinks_Invalid(t *testing.T) {
     // This should return an error but html.Parse doesn't seem to care.
     invalidHTML := `<html><body><aef<eqf>>>qq></body></ht>`
 
     doc := NewDocument(invalidHTML)
-    links := ExtractLinks(doc)
+    lw := NewLinkWorker()
+    ExtractLinks(doc, lw)
 
-    assert.Equal(t, len(links), 0)
+    assert.Equal(t, lw.Len(), 0)
 }
 
 func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
@@ -227,10 +230,11 @@ func TestParser_ExtractLinks_NoDuplicates(t *testing.T) {
     </p>`
 
     doc := NewDocument(htmlWithDupes)
+    lw := NewLinkWorker()
 
-    links := ExtractLinks(doc)
+    ExtractLinks(doc, lw)
 
-    assert.Equal(t, len(links), 3)
+    assert.Equal(t, lw.Len(), 3)
 }
 
 func TestParser_ExtractTextEmpty(t *testing.T) {
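For orientation, here is a hedged sketch of how the new signature might be used outside the tests. goquery.NewDocumentFromReader is the real goquery constructor (the one the ExtractLinks comment refers to); the helper name collectLinks and the fetching step are illustrative only, and the LinkWorker sketch above is assumed.

import (
    "net/http"

    "github.com/PuerkitoBio/goquery"
)

// collectLinks is an illustrative helper (not part of the repository): it
// fetches a page, parses it with goquery, and funnels every anchor href into
// a LinkWorker through the new ExtractLinks signature.
func collectLinks(url string) (int, error) {
    resp, err := http.Get(url)
    if err != nil {
        return 0, err
    }
    defer resp.Body.Close()

    // NewDocumentFromReader wraps html.Parse, which tolerates bad markup,
    // so this error is rarely non-nil (as the ExtractLinks comment notes).
    doc, err := goquery.NewDocumentFromReader(resp.Body)
    if err != nil {
        return 0, err
    }

    lw := NewLinkWorker()
    ExtractLinks(doc, lw)
    return lw.Len(), nil
}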
