Skip to content
This repository has been archived by the owner on Apr 9, 2022. It is now read-only.

Commit

Permalink
split out tag parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
lrstanley committed Oct 19, 2016
1 parent e7253c4 commit e487921
Showing 1 changed file with 57 additions and 52 deletions.
109 changes: 57 additions & 52 deletions scraper/htmlparse.go
Expand Up @@ -26,8 +26,6 @@ func getAttr(attr string, attrs []html.Attribute) (val string) {
return
}

var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)

// getSrc crawls the body of the Results page, yielding all img/script/link resources
// so they can later be fetched.
func getSrc(b io.Reader, parent *url.URL) (urls []string) {
Expand Down Expand Up @@ -60,76 +58,83 @@ func getSrc(b io.Reader, parent *url.URL) (urls []string) {
if len(rel) > 0 && strings.ToLower(rel) != "stylesheet" && strings.ToLower(rel) != "shortcut icon" {
continue
}

case "script":
src = getAttr("src", t.Attr)

case "img":
src = getAttr("src", t.Attr)

default:
continue
}

src = fmtTagLinks(src, parent)

if len(src) == 0 {
continue
}

// this assumes that the resource is something along the lines of:
// http://something.com/ -- which we don't care about
if len(src) == 0 || strings.HasSuffix(src, "/") {
continue
}
urls = append(urls, src)
}
}
}

// add trailing slash to the end of the path
if len(parent.Path) == 0 {
parent.Path = "/"
}
// nonPrefixMatch matches any string that begins with an ASCII letter. It is
// used together with a "//" check to detect sub-relative resource paths
// (e.g. something/main.js) that have no leading "/" or "./".
var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)

// site was developed using sub-relative paths. E.g:
// - url: http://domain.com/sub/path and resource: something/main.js
// would equal http://domain.com/sub/path/something/main.js
if !strings.Contains(src, "//") && nonPrefixMatch.MatchString(src) {
src = fmt.Sprintf("%s://%s/%s", parent.Scheme, parent.Host+strings.TrimRight(parent.Path, "/"), src)
}
// fmtTagLinks formats the resulting link from its previous form -- whether
// it is a relative link, an absolute link, invalid, etc.
func fmtTagLinks(src string, parent *url.URL) string {
// this assumes that the resource is something along the lines of:
// http://something.com/ -- which we don't care about
if len(src) == 0 || strings.HasSuffix(src, "/") {
return ""
}

// site was developed using relative paths. E.g:
// - url: http://domain.com/sub/path and resource: ./something/main.js
// would equal http://domain.com/sub/path/something/main.js
if strings.HasPrefix(src, "./") {
src = fmt.Sprintf("%s://%s", parent.Scheme, parent.Host+parent.Path+strings.SplitN(src, "./", 2)[1])
}
// add trailing slash to the end of the path
if len(parent.Path) == 0 {
parent.Path = "/"
}

// site is loading resources from a remote location that supports both
// http and https. browsers should natively tack on the current site's
// protocol to the url. E.g:
// - url: http://domain.com/ and resource: //other.com/some-resource.js
// generates: http://other.com/some-resource.js
// - url: https://domain.com/ and resource: //other.com/some-resource.js
// generates: https://other.com/some-resource.js
if strings.HasPrefix(src, "//") {
src = parent.Scheme + ":" + src
}
// site was developed using sub-relative paths. E.g:
// - url: http://domain.com/sub/path and resource: something/main.js
// would equal http://domain.com/sub/path/something/main.js
if !strings.Contains(src, "//") && nonPrefixMatch.MatchString(src) {
src = fmt.Sprintf("%s://%s/%s", parent.Scheme, parent.Host+strings.TrimRight(parent.Path, "/"), src)
}

// root-relative (non-host-absolute) resource, i.e. the resource is loaded
// relative to the docroot of the domain. E.g:
// - url: http://domain.com/ and resource: /some-resource.js
// generates: http://domain.com/some-resource.js
// - url: https://domain.com/sub/resource and resource: /some-resource.js
// generates: https://domain.com/some-resource.js
if strings.HasPrefix(src, "/") {
src = parent.Scheme + "://" + parent.Host + src
}
// site was developed using relative paths. E.g:
// - url: http://domain.com/sub/path and resource: ./something/main.js
// would equal http://domain.com/sub/path/something/main.js
if strings.HasPrefix(src, "./") {
src = fmt.Sprintf("%s://%s", parent.Scheme, parent.Host+parent.Path+strings.SplitN(src, "./", 2)[1])
}

// ignore anything else that isn't http based. E.g. ftp://, and other svg-like
// data urls, as we really can't fetch those.
if parent.Scheme != "http" && parent.Scheme != "https" {
continue
}
// site is loading resources from a remote location that supports both
// http and https. browsers should natively tack on the current site's
// protocol to the url. E.g:
// - url: http://domain.com/ and resource: //other.com/some-resource.js
// generates: http://other.com/some-resource.js
// - url: https://domain.com/ and resource: //other.com/some-resource.js
// generates: https://other.com/some-resource.js
if strings.HasPrefix(src, "//") {
src = parent.Scheme + ":" + src
}

urls = append(urls, src)
}
// root-relative (non-host-absolute) resource, i.e. the resource is loaded
// relative to the docroot of the domain. E.g:
// - url: http://domain.com/ and resource: /some-resource.js
// generates: http://domain.com/some-resource.js
// - url: https://domain.com/sub/resource and resource: /some-resource.js
// generates: https://domain.com/some-resource.js
if strings.HasPrefix(src, "/") {
src = parent.Scheme + "://" + parent.Host + src
}

// ignore anything else that isn't http based (e.g. ftp://, or data urls such
// as inline svg), as we really can't fetch those. NOTE: this check looks at
// the scheme of the parent page, not the scheme of src itself.
if parent.Scheme != "http" && parent.Scheme != "https" {
return ""
}

return src
}

// stripURLDups strips all duplicate src URLs
Expand Down

0 comments on commit e487921

Please sign in to comment.