From 4af24d9b71ae952dc8e07e15a1e5db5b524fabc5 Mon Sep 17 00:00:00 2001 From: Will Hawkins Date: Fri, 20 Dec 2019 13:49:28 -0500 Subject: [PATCH] Issue #5376: Lenient and strict methods for determing whether a string is a URL From only one method to determine whether a string is a url, create two -- one for a strict check and one for a lenient check. The strict check will guarantee that the URL precisely conforms to ICANN standards (e.g., the TLD in the URL is valid). The lenient check will classify any string that contains ://, ., or : as a URL. --- .../components/support/ktx/kotlin/String.kt | 9 ++ .../support/ktx/kotlin/StringTest.kt | 28 +++++ .../support/utils/URLStringUtils.kt | 62 +++++++++- .../support/utils/URLStringUtilsTest.kt | 114 +++++++++++++----- docs/changelog.md | 1 + 5 files changed, 181 insertions(+), 33 deletions(-) diff --git a/components/support/ktx/src/main/java/mozilla/components/support/ktx/kotlin/String.kt b/components/support/ktx/src/main/java/mozilla/components/support/ktx/kotlin/String.kt index f6839310b07..b558d5f7070 100644 --- a/components/support/ktx/src/main/java/mozilla/components/support/ktx/kotlin/String.kt +++ b/components/support/ktx/src/main/java/mozilla/components/support/ktx/kotlin/String.kt @@ -28,6 +28,15 @@ private val re = object { */ fun String.isUrl() = URLStringUtils.isURLLike(this) +/** + * Checks if this string is a URL. + * + * This method performs a strict check to determine whether the string is a URL. It takes longer + * to execute than isUrl() but checks whether, e.g., the TLD is ICANN-recognized. Consider + * using isUrl() unless these guarantees are required. + */ +fun String.isUrlStrict() = URLStringUtils.isURLLikeStrict(this) + fun String.toNormalizedUrl() = URLStringUtils.toNormalizedURL(this) fun String.isPhone() = re.phoneish.matches(this) diff --git a/components/support/ktx/src/test/java/mozilla/components/support/ktx/kotlin/StringTest.kt b/components/support/ktx/src/test/java/mozilla/components/support/ktx/kotlin/StringTest.kt index 1b385c9e53a..1bb24e80cf0 100644 --- a/components/support/ktx/src/test/java/mozilla/components/support/ktx/kotlin/StringTest.kt +++ b/components/support/ktx/src/test/java/mozilla/components/support/ktx/kotlin/StringTest.kt @@ -43,6 +43,34 @@ class StringTest { assertNotEquals("Tweet:", url) } + @Test + fun isUrlStrict() { + assertTrue("mozilla.org".isUrlStrict()) + assertTrue(" mozilla.org ".isUrlStrict()) + assertTrue("http://mozilla.org".isUrlStrict()) + assertTrue("https://mozilla.org".isUrlStrict()) + assertTrue("file://somefile.txt".isUrlStrict()) + assertTrue("http://mozilla".isUrlStrict()) + assertTrue("http://192.168.255.255".isUrlStrict()) + assertTrue("about:crashcontent".isUrlStrict()) + assertTrue(" about:crashcontent ".isUrlStrict()) + assertTrue("sample:about ".isUrlStrict()) + + assertFalse("sample.notatld".isUrlStrict()) + assertTrue("sample.rocks".isUrlStrict()) + + assertFalse("mozilla".isUrlStrict()) + assertFalse("mozilla android".isUrlStrict()) + assertFalse(" mozilla android ".isUrlStrict()) + assertFalse("Tweet:".isUrlStrict()) + assertFalse("inurl:mozilla.org advanced search".isUrlStrict()) + assertFalse("what is about:crashes".isUrlStrict()) + + val extraText = "Check out @asa’s Tweet: https://twitter.com/asa/status/123456789?s=09" + val url = extraText.split(" ").find { it.isUrlStrict() } + assertNotEquals("Tweet:", url) + } + @Test fun toNormalizedUrl() { val expectedUrl = "http://mozilla.org" diff --git a/components/support/utils/src/main/java/mozilla/components/support/utils/URLStringUtils.kt b/components/support/utils/src/main/java/mozilla/components/support/utils/URLStringUtils.kt index 87ef1161290..b73a184e012 100644 --- a/components/support/utils/src/main/java/mozilla/components/support/utils/URLStringUtils.kt +++ b/components/support/utils/src/main/java/mozilla/components/support/utils/URLStringUtils.kt @@ -6,16 +6,39 @@ package mozilla.components.support.utils import android.net.Uri import android.text.TextUtils +import androidx.annotation.VisibleForTesting +import java.util.regex.Pattern object URLStringUtils { - fun isURLLike(string: String, safe: Boolean = false) = + /** + * Determine whether a string is a URL. + * + * This method performs a strict check to determine whether a string is a URL. It takes longer + * to execute than isURLLike() but checks whether, e.g., the TLD is ICANN-recognized. Consider + * using isURLLike() unless these guarantees are required. + */ + fun isURLLikeStrict(string: String, safe: Boolean = false) = if (safe) { string.matches(WebURLFinder.fuzzyUrlRegex) } else { string.matches(WebURLFinder.fuzzyUrlNonWebRegex) } - fun isSearchTerm(string: String) = !isURLLike(string, false) + /** + * Determine whether a string is a URL. + * + * This method performs a lenient check to determine whether a string is a URL. Anything that + * contains a :, ://, or . and has no internal spaces is potentially a URL. If you need a + * stricter check, consider using isURLLikeStrict(). + */ + fun isURLLike(string: String) = isURLLenient.matcher(string).matches() + + /** + * Determine whether a string is a search term. + * + * This method recognizes a string as a search term as anything other than a URL. + */ + fun isSearchTerm(string: String) = !isURLLike(string) /** * Normalizes a URL String. @@ -28,4 +51,39 @@ object URLStringUtils { } return uri.toString() } + + private val isURLLenient by lazy { + // Be lenient about what is classified as potentially a URL. Anything that contains a :, + // ://, or . and has no internal spaces is potentially a URL. + // + // Use java.util.regex because it is always unicode aware on Android. + // https://developer.android.com/reference/java/util/regex/Pattern.html + + // Use both the \w+ and \S* after the punctuation because they seem to match slightly + // different things. The \S matches any non-whitespace character (e.g., '~') and \w + // matches only word characters. In other words, the regex is requiring that there be a + // non-symbol character somewhere after the ., : or :// and before any other character + // or the end of the string. For example, match + // mozilla.com/~userdir + // and not + // mozilla./~ or mozilla:/ + // Without the [/]* after the :// in the alternation of the characters required to be a + // valid URL, + // file:///home/user/myfile.html + // is considered a search term; it is clearly a URL. + Pattern.compile("^\\s*\\w+(://[/]*|:|\\.)\\w+\\S*\\s*$", flags) + } + + @VisibleForTesting(otherwise = VisibleForTesting.PRIVATE) + internal const val UNICODE_CHARACTER_CLASS: Int = 0x100 + + // To run tests on a non-Android device (like a computer), Pattern.compile + // requires a flag to enable unicode support. Set a value like flags here with a local + // copy of UNICODE_CHARACTER_CLASS. Use a local copy because that constant is not + // available on Android platforms < 24 (Fenix targets 21). At runtime this is not an issue + // because, again, Android REs are always unicode compliant. + // NB: The value has to go through an intermediate variable; otherwise, the linter will + // complain that this value is not one of the predefined enums that are allowed. + @VisibleForTesting(otherwise = VisibleForTesting.PRIVATE) + internal var flags = 0 } diff --git a/components/support/utils/src/test/java/mozilla/components/support/utils/URLStringUtilsTest.kt b/components/support/utils/src/test/java/mozilla/components/support/utils/URLStringUtilsTest.kt index b412d491e09..e4b342897bf 100644 --- a/components/support/utils/src/test/java/mozilla/components/support/utils/URLStringUtilsTest.kt +++ b/components/support/utils/src/test/java/mozilla/components/support/utils/URLStringUtilsTest.kt @@ -7,17 +7,24 @@ package mozilla.components.support.utils import androidx.test.ext.junit.runners.AndroidJUnit4 import mozilla.components.support.utils.URLStringUtils.isSearchTerm import mozilla.components.support.utils.URLStringUtils.isURLLike +import mozilla.components.support.utils.URLStringUtils.isURLLikeStrict import mozilla.components.support.utils.URLStringUtils.toNormalizedURL import org.junit.Assert.assertEquals import org.junit.Assert.assertFalse import org.junit.Assert.assertNotEquals import org.junit.Assert.assertTrue +import org.junit.Before import org.junit.Test import org.junit.runner.RunWith @RunWith(AndroidJUnit4::class) class URLStringUtilsTest { + @Before + fun configurePatternFlags() { + URLStringUtils.flags = URLStringUtils.UNICODE_CHARACTER_CLASS + } + @Test fun toNormalizedURL() { val expectedUrl = "http://mozilla.org" @@ -27,49 +34,94 @@ class URLStringUtilsTest { } @Test - fun isURLLike() { - assertTrue(isURLLike("mozilla.org")) - assertTrue(isURLLike(" mozilla.org ")) - assertTrue(isURLLike("http://mozilla.org")) - assertTrue(isURLLike("https://mozilla.org")) - assertTrue(isURLLike("file://somefile.txt")) - assertFalse(isURLLike("file://somefile.txt", true)) - - assertTrue(isURLLike("http://mozilla")) - assertTrue(isURLLike("http://192.168.255.255")) - assertTrue(isURLLike("192.167.255.255")) - assertTrue(isURLLike("about:crashcontent")) - assertTrue(isURLLike(" about:crashcontent ")) - assertTrue(isURLLike("sample:about ")) - assertTrue(isURLLike("https://mozilla-mobile.com")) - - assertTrue(isURLLike("link.info")) - assertFalse(isURLLike("link.unknown")) - - assertFalse(isURLLike("mozilla")) - assertFalse(isURLLike("mozilla android")) - assertFalse(isURLLike(" mozilla android ")) - assertFalse(isURLLike("Tweet:")) - assertFalse(isURLLike("inurl:mozilla.org advanced search")) - assertFalse(isURLLike("what is about:crashes")) + fun isURLLikeStrict() { + assertTrue(isURLLikeStrict("mozilla.org")) + assertTrue(isURLLikeStrict(" mozilla.org ")) + assertTrue(isURLLikeStrict("http://mozilla.org")) + assertTrue(isURLLikeStrict("https://mozilla.org")) + assertTrue(isURLLikeStrict("file://somefile.txt")) + assertFalse(isURLLikeStrict("file://somefile.txt", true)) + + assertTrue(isURLLikeStrict("http://mozilla")) + assertTrue(isURLLikeStrict("http://192.168.255.255")) + assertTrue(isURLLikeStrict("192.167.255.255")) + assertTrue(isURLLikeStrict("about:crashcontent")) + assertTrue(isURLLikeStrict(" about:crashcontent ")) + assertTrue(isURLLikeStrict("sample:about ")) + assertTrue(isURLLikeStrict("https://mozilla-mobile.com")) + + assertTrue(isURLLikeStrict("link.info")) + assertFalse(isURLLikeStrict("link.unknown")) + + assertFalse(isURLLikeStrict("mozilla")) + assertFalse(isURLLikeStrict("mozilla android")) + assertFalse(isURLLikeStrict(" mozilla android ")) + assertFalse(isURLLikeStrict("Tweet:")) + assertFalse(isURLLikeStrict("inurl:mozilla.org advanced search")) + assertFalse(isURLLikeStrict("what is about:crashes")) val extraText = "Check out @asa’s Tweet: https://twitter.com/asa/status/123456789?s=09" - val url = extraText.split(" ").find { isURLLike(it) } + val url = extraText.split(" ").find { isURLLikeStrict(it) } assertNotEquals("Tweet:", url) - assertFalse(isURLLike("3.14")) - assertFalse(isURLLike("3.14.2019")) + assertFalse(isURLLikeStrict("3.14")) + assertFalse(isURLLikeStrict("3.14.2019")) + + assertFalse(isURLLikeStrict("file://somefile.txt", true)) + assertFalse(isURLLikeStrict("about:config", true)) + } + + @Test + fun isUrlLike() { + assertFalse(isURLLike("inurl:mozilla.org advanced search")) + assertFalse(isURLLike("sf: help")) + assertFalse(isURLLike("mozilla./~")) + assertFalse(isURLLike("cnn.com politics")) + + assertTrue(isURLLike("about:config")) + assertTrue(isURLLike("about:config:8000")) + assertTrue(isURLLike("file:///home/user/myfile.html")) + assertTrue(isURLLike("file://////////////home//user/myfile.html")) + assertTrue(isURLLike("file://C:\\Users\\user\\myfile.html")) + assertTrue(isURLLike("http://192.168.255.255")) + + assertTrue(isURLLike("link.unknown")) + + // Per https://bugs.chromium.org/p/chromium/issues/detail?id=31405, ICANN will accept + // purely numeric gTLDs. + assertTrue(isURLLike("3.14.2019")) + assertTrue(isURLLike(" cnn.com ")) + assertTrue(isURLLike(" cnn.com")) + assertTrue(isURLLike("cnn.com ")) + assertTrue(isURLLike("mozilla.com/~userdir")) - assertFalse(isURLLike("file://somefile.txt", true)) - assertFalse(isURLLike("about:config", true)) + assertTrue(isURLLike("http://faß.de//")) + assertTrue(isURLLike("cnn.cơḿ")) + assertTrue(isURLLike("cnn.çơḿ")) } @Test fun isSearchTerm() { assertTrue(isSearchTerm("inurl:mozilla.org advanced search")) - assertTrue(isSearchTerm("3.14.2019")) + assertTrue(isSearchTerm("sf: help")) + assertTrue(isSearchTerm("mozilla./~")) + assertTrue(isSearchTerm("cnn.com politics")) assertFalse(isSearchTerm("about:config")) + assertFalse(isSearchTerm("about:config:8000")) + assertFalse(isSearchTerm("file:///home/user/myfile.html")) + assertFalse(isSearchTerm("file://////////////home//user/myfile.html")) + assertFalse(isSearchTerm("file://C:\\Users\\user\\myfile.html")) assertFalse(isSearchTerm("http://192.168.255.255")) + // Per https://bugs.chromium.org/p/chromium/issues/detail?id=31405, ICANN will accept + // purely numeric gTLDs. + assertFalse(isSearchTerm("3.14.2019")) + assertFalse(isSearchTerm(" cnn.com ")) + assertFalse(isSearchTerm(" cnn.com")) + assertFalse(isSearchTerm("cnn.com ")) + assertFalse(isSearchTerm("mozilla.com/~userdir")) + assertFalse(isSearchTerm("http://faß.de//")) + assertFalse(isSearchTerm("cnn.cơḿ")) + assertFalse(isSearchTerm("cnn.çơḿ")) } } diff --git a/docs/changelog.md b/docs/changelog.md index 2ad64a57bc8..102334c3061 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -45,6 +45,7 @@ permalink: /changelog/ * **support-ktx** * Added `Context.getDrawableWithTint` extension method to get a drawable resource with a tint applied. + * `String.isUrl` is now using a more lenient check for improved performance. Strictly checking whether a string is a URL or not is supported through the new `String.isUrlStrict` method. * **support-base** * ⚠️ **This is a breaking change**: