Skip to content
This repository has been archived by the owner on Apr 9, 2022. It is now read-only.
/ marill Public archive

Commit

Permalink
fix bug with some relative urls during asset search
Browse files Browse the repository at this point in the history
  • Loading branch information
lrstanley committed Oct 18, 2016
1 parent 85b9b17 commit 85e52e4
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion scraper/htmlparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
package scraper

import (
"fmt"
"io"
"net/http"
"regexp"
"strings"

"golang.org/x/net/html"
Expand All @@ -24,6 +26,8 @@ func getAttr(attr string, attrs []html.Attribute) (val string) {
return
}

// nonPrefixMatch matches any string whose first character is an ASCII letter
// (a-z or A-Z). getSrc uses it, together with a check that the value contains
// no "//", to recognize bare relative resource paths (no leading "/", "./" or
// scheme separator) so the request's scheme, host and path can be prepended.
var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)

// getSrc crawls the body of the Results page, yielding all img/script/link resources
// so they can later be fetched.
func getSrc(b io.Reader, req *http.Request) (urls []string) {
Expand Down Expand Up @@ -82,11 +86,15 @@ func getSrc(b io.Reader, req *http.Request) (urls []string) {
req.URL.Path = "/"
}

if !strings.Contains(src, "//") && nonPrefixMatch.MatchString(src) {
src = fmt.Sprintf("%s://%s/%s", req.URL.Scheme, req.URL.Host+strings.TrimRight(req.URL.Path, "/"), src)
}

// site was developed using relative paths. E.g:
// - url: http://domain.com/sub/path and resource: ./something/main.js
// would equal http://domain.com/sub/path/something/main.js
if strings.HasPrefix(src, "./") {
src = req.URL.Scheme + "://" + req.URL.Host + req.URL.Path + strings.SplitN(src, "./", 2)[1]
src = fmt.Sprintf("%s://%s", req.URL.Scheme, req.URL.Host+req.URL.Path+strings.SplitN(src, "./", 2)[1])
}

// site is loading resources from a remote location that supports both
Expand Down

0 comments on commit 85e52e4

Please sign in to comment.