split out tag parsing

lrstanley · Oct 19, 2016 · e487921 · e487921
1 parent e7253c4
commit e487921
Showing 1 changed file with 57 additions and 52 deletions.
diff --git a/scraper/htmlparse.go b/scraper/htmlparse.go
@@ -26,8 +26,6 @@ func getAttr(attr string, attrs []html.Attribute) (val string) {
 	return
 }
 
-var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)
-
 // getSrc crawls the body of the Results page, yielding all img/script/link resources
 // so they can later be fetched.
 func getSrc(b io.Reader, parent *url.URL) (urls []string) {
@@ -60,76 +58,83 @@ func getSrc(b io.Reader, parent *url.URL) (urls []string) {
 				if len(rel) > 0 && strings.ToLower(rel) != "stylesheet" && strings.ToLower(rel) != "shortcut icon" {
 					continue
 				}
-
 			case "script":
 				src = getAttr("src", t.Attr)
-
 			case "img":
 				src = getAttr("src", t.Attr)
-
 			default:
 				continue
 			}
 
+			src = fmtTagLinks(src, parent)
+
 			if len(src) == 0 {
 				continue
 			}
 
-			// this assumes that the resource is something along the lines of:
-			//   http://something.com/ -- which we don't care about
-			if len(src) == 0 || strings.HasSuffix(src, "/") {
-				continue
-			}
+			urls = append(urls, src)
+		}
+	}
+}
 
-			// add trailing slash to the end of the path
-			if len(parent.Path) == 0 {
-				parent.Path = "/"
-			}
+var nonPrefixMatch = regexp.MustCompile(`^[a-zA-Z]`)
 
-			// site was developed using sub-relative paths. E.g:
-			// - url: http://domain.com/sub/path and resource: something/main.js
-			//   would equal http://domain.com/sub/path/something/main.js
-			if !strings.Contains(src, "//") && nonPrefixMatch.MatchString(src) {
-				src = fmt.Sprintf("%s://%s/%s", parent.Scheme, parent.Host+strings.TrimRight(parent.Path, "/"), src)
-			}
+// fmtTagLinks formats the resulting link from its previous form -- whether it
+// is a relative link, absolute link, invalid, etc.
+func fmtTagLinks(src string, parent *url.URL) string {
+	// this assumes that the resource is something along the lines of:
+	//   http://something.com/ -- which we don't care about
+	if len(src) == 0 || strings.HasSuffix(src, "/") {
+		return ""
+	}
 
-			// site was developed using relative paths. E.g:
-			//  - url: http://domain.com/sub/path and resource: ./something/main.js
-			//    would equal http://domain.com/sub/path/something/main.js
-			if strings.HasPrefix(src, "./") {
-				src = fmt.Sprintf("%s://%s", parent.Scheme, parent.Host+parent.Path+strings.SplitN(src, "./", 2)[1])
-			}
+	// add trailing slash to the end of the path
+	if len(parent.Path) == 0 {
+		parent.Path = "/"
+	}
 
-			// site is loading resources from a remote location that supports both
-			// http and https. browsers should natively tack on the current sites
-			// protocol to the url. E.g:
-			//  - url: http://domain.com/ and resource: //other.com/some-resource.js
-			//    generates: http://other.com/some-resource.js
-			//  - url: https://domain.com/ and resource: //other.com/some-resource.js
-			//    generates: https://other.com/some-resource.js
-			if strings.HasPrefix(src, "//") {
-				src = parent.Scheme + ":" + src
-			}
+	// site was developed using sub-relative paths. E.g:
+	// - url: http://domain.com/sub/path and resource: something/main.js
+	//   would equal http://domain.com/sub/path/something/main.js
+	if !strings.Contains(src, "//") && nonPrefixMatch.MatchString(src) {
+		src = fmt.Sprintf("%s://%s/%s", parent.Scheme, parent.Host+strings.TrimRight(parent.Path, "/"), src)
+	}
 
-			// non-host-absolute resource. E.g. resource is loaded based on the docroot
-			// of the domain. E.g:
-			//  - url: http://domain.com/ and resource: /some-resource.js
-			//    generates: http://domain.com/some-resource.js
-			//  - url: https://domain.com/sub/resource and resource: /some-resource.js
-			//    generates: https://domain.com/some-resource.js
-			if strings.HasPrefix(src, "/") {
-				src = parent.Scheme + "://" + parent.Host + src
-			}
+	// site was developed using relative paths. E.g:
+	//  - url: http://domain.com/sub/path and resource: ./something/main.js
+	//    would equal http://domain.com/sub/path/something/main.js
+	if strings.HasPrefix(src, "./") {
+		src = fmt.Sprintf("%s://%s", parent.Scheme, parent.Host+parent.Path+strings.SplitN(src, "./", 2)[1])
+	}
 
-			// ignore anything else that isn't http based. E.g. ftp://, and other svg-like
-			// data urls, as we really can't fetch those.
-			if parent.Scheme != "http" && parent.Scheme != "https" {
-				continue
-			}
+	// site is loading resources from a remote location that supports both
+	// http and https. browsers should natively tack on the current sites
+	// protocol to the url. E.g:
+	//  - url: http://domain.com/ and resource: //other.com/some-resource.js
+	//    generates: http://other.com/some-resource.js
+	//  - url: https://domain.com/ and resource: //other.com/some-resource.js
+	//    generates: https://other.com/some-resource.js
+	if strings.HasPrefix(src, "//") {
+		src = parent.Scheme + ":" + src
+	}
 
-			urls = append(urls, src)
-		}
+	// non-host-absolute resource. E.g. resource is loaded based on the docroot
+	// of the domain. E.g:
+	//  - url: http://domain.com/ and resource: /some-resource.js
+	//    generates: http://domain.com/some-resource.js
+	//  - url: https://domain.com/sub/resource and resource: /some-resource.js
+	//    generates: https://domain.com/some-resource.js
+	if strings.HasPrefix(src, "/") {
+		src = parent.Scheme + "://" + parent.Host + src
 	}
+
+	// ignore anything else that isn't http based. E.g. ftp://, and other svg-like
+	// data urls, as we really can't fetch those.
+	if parent.Scheme != "http" && parent.Scheme != "https" {
+		return ""
+	}
+
+	return src
 }
 
 // stripURLDups strips all duplicate src URLs