Skip to content

Commit

Permalink
cmd/xurls: add -fix=all to replace temporary redirects too
Browse files Browse the repository at this point in the history
For example, GitHub uses a temporary redirect when redirecting an issue
URL to a pull request, or vice versa. We still want to fix those,
because in many cases the redirects are permanent in practice.
  • Loading branch information
mvdan committed Nov 22, 2022
1 parent e87b85f commit 2f9d359
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 26 deletions.
72 changes: 50 additions & 22 deletions cmd/xurls/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,39 @@ import (
var (
matching = flag.String("m", "", "")
relaxed = flag.Bool("r", false, "")
fix = flag.Bool("fix", false, "")
fix boolString
version = flag.Bool("version", false, "")
)

// boolString is a flag.Value for a flag that can be given bare, like a
// boolean (-fix), or with an explicit string mode (-fix=auto, -fix=all).
//
// After a successful Set the stored string is always one of
// "", "true", "false", "auto", or "all".
type boolString string

// Set implements flag.Value.
//
// The modes "auto" and "all" are stored verbatim, as is the empty string
// (which leaves the flag unset). Every boolean spelling that flag.Bool would
// accept is canonicalized to "true" or "false", so that a value such as "0"
// is not mistaken for an enabled mode just because it is non-empty.
// Any other value is rejected, and the previously stored value is kept.
func (s *boolString) Set(val string) error {
	switch val {
	case "", "auto", "all":
		// Already canonical; store as-is.
	case "1", "t", "T", "true", "TRUE", "True":
		val = "true"
	case "0", "f", "F", "false", "FALSE", "False":
		val = "false"
	default:
		// The flag package prefixes this with the offending value and flag name.
		return fmt.Errorf(`must be a boolean, "auto", or "all"`)
	}
	*s = boolString(val)
	return nil
}

// Get implements flag.Getter, returning the stored value as a plain string.
func (s *boolString) Get() any { return s.String() }

// String implements flag.Value.
//
// The flag package documents that it may call String on a zero-valued
// receiver, such as a nil pointer, so a nil receiver yields the empty string
// rather than panicking.
func (s *boolString) String() string {
	if s == nil {
		return ""
	}
	return string(*s)
}

// IsBoolFlag tells the flag package that the flag may appear without a value
// (plain -fix), in which case Set is called with "true".
func (*boolString) IsBoolFlag() bool { return true }

func init() {
flag.Var(&fix, "fix", "")
flag.Usage = func() {
p := func(format string, a ...interface{}) {
fmt.Fprintf(os.Stderr, format, a...)
}
p("Usage: xurls [-h] [files]\n\n")
p("If no files are given, it reads from standard input.\n\n")
p(" -m <regexp> only match urls whose scheme matches a regexp\n")
p(" example: 'https?://|mailto:'\n")
p(" -r also match urls without a scheme (relaxed)\n")
p(" -fix overwrite urls that redirect\n")
p(" -version print version and exit\n")
fmt.Fprint(os.Stderr, `
Usage: xurls [-h] [files]
xurls extracts urls from text using regular expressions.
If no files are given, it reads from standard input.
-m <regexp> only match urls whose scheme matches a regexp
example: 'https?://|mailto:'
-r also match urls without a scheme (relaxed)
-version print version and exit
When the -fix or -fix=auto flag is used, xurls instead attempts to replace
any urls which result in a permanent redirect (301 or 308).
It also fails if any urls fail to load, so that they may be removed or replaced.
To replace urls which result in temporary redirect as well, use -fix=all.
`[1:])
}
}

Expand All @@ -57,7 +74,7 @@ func scanPath(re *regexp.Regexp, path string) error {
if err != nil {
return err
}
if *fix {
if fix != "" {
outBuf = new(bytes.Buffer)
out = outBuf
}
Expand All @@ -77,7 +94,7 @@ func scanPath(re *regexp.Regexp, path string) error {
for scanner.Scan() {
line := scanner.Text() + "\n"
matches := re.FindAllStringIndex(line, -1)
if !*fix {
if fix == "" {
for _, pair := range matches {
match := line[pair[0]:pair[1]]
fmt.Printf("%s\n", match)
Expand All @@ -104,8 +121,6 @@ func scanPath(re *regexp.Regexp, path string) error {
switch origURL.Scheme {
case "http", "https":
// See if the URL redirects somewhere.
// Only apply a fix if the redirect chain is permanent.
allPermanent := true
client := &http.Client{
Timeout: 10 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
Expand All @@ -114,16 +129,21 @@ func scanPath(re *regexp.Regexp, path string) error {
}
switch req.Response.StatusCode {
case http.StatusMovedPermanently, http.StatusPermanentRedirect:
// "auto" and "all" fix permanent redirects.
case http.StatusFound, http.StatusSeeOther, http.StatusTemporaryRedirect:
// Only "all" fixes temporary redirects.
if fix != "all" {
return http.ErrUseLastResponse
}
default:
allPermanent = false
// Any other redirects are ignored.
return http.ErrUseLastResponse
}
if allPermanent {
// Inherit the fragment if empty.
if req.URL.Fragment == "" {
req.URL.Fragment = origURL.Fragment
}
fixed = req.URL.String()
// Inherit the fragment if empty.
if req.URL.Fragment == "" {
req.URL.Fragment = origURL.Fragment
}
fixed = req.URL.String()
return nil
},
}
Expand Down Expand Up @@ -202,6 +222,14 @@ func main1() int {
fmt.Fprintln(os.Stderr, "-r and -m at the same time don't make much sense")
return 1
}
switch fix {
case "": // disabled by default
case "false": // disabled via -fix=false; normalize
fix = ""
case "auto", "all": // enabled via -fix=auto, -fix=all, etc
case "true": // enabled via -fix; normalize
fix = "auto"
}
var re *regexp.Regexp
if *relaxed {
re = xurls.Relaxed()
Expand Down
3 changes: 3 additions & 0 deletions cmd/xurls/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ func TestScript(t *testing.T) {
handle("HEAD", "/redir-302", func(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, "/plain-head", 302)
})
handle("HEAD", "/redir-303", func(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, "/plain-head", 303)
})
handle("HEAD", "/redir-307", func(w http.ResponseWriter, r *http.Request) {
http.Redirect(w, r, "/plain-head", 307)
})
Expand Down
41 changes: 37 additions & 4 deletions cmd/xurls/testdata/script/fix.txtar
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ expand nothing
cp nothing nothing.orig

expand redirects
expand redirects.golden
expand redirects.golden-auto
expand redirects.golden-all
cp redirects redirects.orig

expand broken
Expand All @@ -16,14 +17,23 @@ cmp nothing nothing.orig

stdin redirects
exec xurls -fix
cmp stdout redirects.golden
cmp stdout redirects.golden-auto
cmp redirects redirects.orig
! stderr .

exec xurls -fix redirects
stdout '^redirects$'
! stderr .
cmp redirects redirects.golden
cmp redirects redirects.golden-auto
cp redirects.orig redirects

exec xurls -fix=auto redirects
cmp redirects redirects.golden-auto
cp redirects.orig redirects

exec xurls -fix=all redirects
cmp redirects redirects.golden-all
cp redirects.orig redirects

! exec xurls -fix broken
stdout -count=1 '^broken$'
Expand Down Expand Up @@ -52,10 +62,11 @@ Permanent redirect codes:

Temporary redirect codes:
* ${SERVER}/redir-302
* ${SERVER}/redir-303
* ${SERVER}/redir-307

Only GET allowed, HEAD fails: ${SERVER}/get-only
-- redirects.golden --
-- redirects.golden-auto --
No redirect: ${SERVER}/plain-head
One redirect: ${SERVER}/plain-head
Two redirects: ${SERVER}/plain-head
Expand All @@ -72,8 +83,30 @@ Permanent redirect codes:

Temporary redirect codes:
* ${SERVER}/redir-302
* ${SERVER}/redir-303
* ${SERVER}/redir-307

Only GET allowed, HEAD fails: ${SERVER}/plain-get
-- redirects.golden-all --
No redirect: ${SERVER}/plain-head
One redirect: ${SERVER}/plain-head
Two redirects: ${SERVER}/plain-head
Redirect inherits fragment: ${SERVER}/plain-head#foo
Redirect replaces fragment: ${SERVER}/plain-head#bar

Three links in one line: ${SERVER}/plain-head + ${SERVER}/plain-head + ${SERVER}/plain-head

Redirect to a longer path ${SERVER}/redir-longtarget with trailing text

Permanent redirect codes:
* ${SERVER}/plain-head
* ${SERVER}/plain-head

Temporary redirect codes:
* ${SERVER}/plain-head
* ${SERVER}/plain-head
* ${SERVER}/plain-head

Only GET allowed, HEAD fails: ${SERVER}/plain-get
-- broken --
One redirect: ${SERVER}/redir-1
Expand Down

0 comments on commit 2f9d359

Please sign in to comment.