fixed bug in result channel, at the return of the Crawl func there was no way to guarantee that processing of the result channel was finished (in tests). Added regex URL extraction for selector-based navigators
peterdeka committed Apr 9, 2016
1 parent a747446 commit 1537cf6
Showing 3 changed files with 37 additions and 9 deletions.
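The completion fix works by having Crawl emit one final CrawlResult with Done set, so whoever reads the result channel knows when every result has been delivered. Below is a minimal sketch of the intended consumption pattern, modeled on the test changes in this commit; consumeAll is a hypothetical helper (assumed to live in the listeater package with the log and sync imports) and is not part of the diff:

func consumeAll(le *ListEater, ec ElementCrawler, creds *LoginCredentials) ([]interface{}, error) {
	resChan := make(chan CrawlResult)
	var collected []interface{}
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for res := range resChan {
			if res.Done {
				return // Crawl has finished and every result has been sent
			}
			if res.Error != nil {
				log.Println("crawl error:", res.Error)
				continue
			}
			collected = append(collected, res.Element)
		}
	}()
	err := le.Crawl(resChan, ec, creds)
	wg.Wait() // the reader goroutine has drained resChan before we return
	return collected, err
}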
20 changes: 15 additions & 5 deletions listeater.go
@@ -11,12 +11,14 @@ import (
"net/http"
"net/http/cookiejar"
"net/url"
"regexp"
"sync"
)

var ErrInvalidConfig = errors.New("Invalid config, missing crawl ops.")
var ErrNoLoginCreds = errors.New("No login credentials provided, login needed.")
var ErrCannotLogin = errors.New("Could not login, check your credentials.")
var urlRegex = regexp.MustCompile("(http|https)://([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?")

type LoginDescriptor struct {
Url string `json:"url"`
@@ -38,6 +40,7 @@ type ListEaterConfig struct {
type CrawlResult struct {
Element interface{}
Error error
Done bool
}

//credentials for login (if needed)
@@ -78,6 +81,7 @@ func (le *ListEater) login(creds *LoginCredentials) error {

//the main listeater function, does the actual crawling
func (le *ListEater) Crawl(resChan chan CrawlResult, elementCrawler ElementCrawler, creds *LoginCredentials) error {
defer func() { resChan <- CrawlResult{Done: true} }() //signal the results reader (on resChan) that crawling has finished
if le.CrawlDesc == nil {
return ErrInvalidConfig
}
@@ -129,6 +133,7 @@ func (le *ListEater) Crawl(resChan chan CrawlResult, elementCrawler ElementCrawl
return crawlErr
}
}

return nil
}

@@ -142,7 +147,7 @@ func (le *ListEater) listPageCrawl(resp *http.Response, resChan chan CrawlResult
log.Println(err)
return err
}
//scoped function to follow the single eelemnt to be extracted
//scoped function to follow the single element to be extracted
asyncFollow := func(elUrl string) {
defer wg.Done()
r, err := le.Client.Get(elUrl)
@@ -159,11 +164,16 @@ func (le *ListEater) listPageCrawl(resp *http.Response, resChan chan CrawlResult
fUrl, exists := s.Attr("href")
if !exists {
log.Println("Warning no href in follow")
} else {
//log.Println("Following: " + fUrl)
wg.Add(1)
go asyncFollow(fUrl)
return
}
fUrl = urlRegex.FindString(fUrl)
if fUrl == "" {
log.Println("Warning no url in href in follow")
return
}
//log.Println("Following: " + fUrl)
wg.Add(1)
go asyncFollow(fUrl)
})
wg.Wait()

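A quick illustration (an assumption, not part of the commit) of what the new urlRegex extraction does for selector-based navigation: only an absolute http/https URL is kept from whatever the matched href contains, and an empty result makes the caller log a warning and skip the element instead of requesting a bogus URL.

// hypothetical href value with surrounding noise
href := `javascript:go("https://example.com/list?page=2&sort=asc")`
next := urlRegex.FindString(href)
// next == "https://example.com/list?page=2&sort=asc"
// an empty next means no absolute http/https URL was present in the href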
17 changes: 14 additions & 3 deletions listeater_test.go
@@ -12,7 +12,9 @@ import (
"net/url"
"os"
"strconv"
"sync"
"testing"
"time"
)

var oneElementTpl = `<html>
@@ -58,7 +60,6 @@ const userField = "username"
const pwField = "pass"
const user = "auser"
const pw = "somepassword"
const elPerPage = 8

func TestMain(m *testing.M) {
setup()
@@ -69,7 +70,8 @@ func TestMain(m *testing.M) {
//tests setup
func setup() {
//generate some random elements
sz := rand.Intn(30) + 8
rand.Seed(time.Now().Unix())
sz := rand.Intn(90) + 20
elements = make([]testElement, sz)
for i := 0; i < sz; i++ {
fs_n := rand.Intn(5) + 1
@@ -82,6 +84,7 @@ }
}

func TestLoginCrawl(t *testing.T) {
elPerPage := rand.Intn(len(elements)/2) + 8
require := require.New(t)
hostUrl := ""
//build a mock backend service that will reply
@@ -115,10 +118,11 @@ func TestLoginCrawl(t *testing.T) {
els := elements[sliceIdx:min(sliceIdx+elPerPage, len(elements))]
t, _ := template.New("list").Parse(listTpl)

hasNext := false
if sliceIdx+elPerPage < len(elements) {
pIdx = pIdx + 1
hasNext = true
}
hasNext := sliceIdx+elPerPage < len(elements)
err = t.Execute(w, map[string]interface{}{
"items": els,
"listurl": listUrl,
@@ -167,19 +171,26 @@ func TestLoginCrawl(t *testing.T) {
//crawl
result := []testElement{}
resChan := make(chan CrawlResult)
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
defer wg.Done()
for {
y := <-resChan
if y.Error != nil {
fmt.Println("Error")
} else if y.Done {
return
} else {
result = append(result, y.Element.(testElement))
}
}
}()

if err := le.Crawl(resChan, testElCrawler{}, &LoginCredentials{user: user, pass: pw}); err != nil {
fmt.Println(err)
}
wg.Wait()
require.Equal(len(elements), len(result))
fmt.Println("DONE")
}
9 changes: 8 additions & 1 deletion paginator.go
@@ -10,6 +10,7 @@ import (

var ErrInvalidSelector = errors.New("Invalid selector for pagination.")
var ErrNoHrefInPagination = errors.New("Pagination found but no href inside.")
var ErrInvalidUrlHrefInPagination = errors.New("Pagination found but no valid URL in href.")

//the interface that must be implemented to paginate from a page.
//Returns request for next page, hasNext bool, and an error
@@ -38,9 +39,15 @@ func (hph HrefPaginationHandler) Paginate(r *http.Response) (*http.Request, bool
}
nextUrl, exist := np.Attr("href")
if !exist {
log.Println("WARNING: no href in xeturl")
log.Println("WARNING: no href matched pagination selelector")
return nil, false, ErrNoHrefInPagination
}
//extract a valid url
nextUrl = urlRegex.FindString(nextUrl)
if nextUrl == "" {
log.Println("WARNING: no valid url found in href")
return nil, false, ErrInvalidUrlHrefInPagination
}
log.Println("Next page: " + nextUrl)
req, _ := http.NewRequest("GET", nextUrl, nil)
return req, true, nil
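For context, a brief sketch (an assumption, not part of the diff) of how a caller might drive the handler above, following the documented Paginate contract of returning the next-page request, a hasNext flag, and an error; resp, client, and hph are placeholder variables:

for {
	// ...crawl the current list page held in resp...
	req, hasNext, err := hph.Paginate(resp)
	if err != nil || !hasNext {
		break // no pagination link, no href, or no valid URL inside it
	}
	if resp, err = client.Do(req); err != nil {
		break
	}
}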
