From 23b24f42590943ebf0bb21953d035dd7f2c12093 Mon Sep 17 00:00:00 2001 From: "Michael S. Klishin" Date: Tue, 20 Dec 2011 12:28:10 +0400 Subject: [PATCH] Initial commit --- .gitignore | 8 + .travis.yml | 1 + README.md | 31 + project.clj | 11 + .../clojurewerkz/crawlista/extraction.clj | 77 + src/clojure/clojurewerkz/crawlista/string.clj | 34 + src/clojure/clojurewerkz/crawlista/url.clj | 116 ++ src/java/.gitkeep | 0 .../crawlista/test/extraction.clj | 129 ++ test/clojurewerkz/crawlista/test/string.clj | 29 + test/clojurewerkz/crawlista/test/url.clj | 106 ++ test/resources/html/arstechnica.com.html | 1173 +++++++++++++++ test/resources/html/arstechnica.com_full.html | 1278 +++++++++++++++++ test/resources/html/wired.com.html | 64 + test/resources/js/href_value1.js | 1 + 15 files changed, 3058 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 README.md create mode 100644 project.clj create mode 100644 src/clojure/clojurewerkz/crawlista/extraction.clj create mode 100644 src/clojure/clojurewerkz/crawlista/string.clj create mode 100644 src/clojure/clojurewerkz/crawlista/url.clj create mode 100644 src/java/.gitkeep create mode 100644 test/clojurewerkz/crawlista/test/extraction.clj create mode 100644 test/clojurewerkz/crawlista/test/string.clj create mode 100644 test/clojurewerkz/crawlista/test/url.clj create mode 100644 test/resources/html/arstechnica.com.html create mode 100644 test/resources/html/arstechnica.com_full.html create mode 100644 test/resources/html/wired.com.html create mode 100644 test/resources/js/href_value1.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f923792 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +pom.xml +*jar +/lib/ +/classes/ +.lein-failures +.lein-deps-sum +TAGS +checkouts/* \ No newline at end of file diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..4f42080 --- /dev/null +++ b/.travis.yml @@ -0,0 +1 @@ +language: clojure \ No newline at end 
of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a173441 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# What is Crawlista + +Crawlista is a support library for Clojure applications that crawl the Web. + + +## Usage + +### Installation + +With Leiningen + + [clojurewerkz/crawlista "1.0.0-SNAPSHOT"] + +New snapshots are [published to clojars.org](https://clojars.org/clojurewerkz/crawlista) every day (if there are any changes). + + +## Crawlista is a Work In Progress + +Crawlista is a work in progress. Please see our test suite for code examples. + + +## Supported Clojure versions + +Crawlista is built from the ground up for Clojure 1.3 and up. + + +## License + +Copyright (C) 2011 Michael S. Klishin + +Distributed under the Eclipse Public License, the same as Clojure. diff --git a/project.clj b/project.clj new file mode 100644 index 0000000..7af78e1 --- /dev/null +++ b/project.clj @@ -0,0 +1,11 @@ +(defproject clojurewerkz/crawlista "1.0.0-SNAPSHOT" + :description "Support library for Clojure applications that crawl the Web" + :dependencies [[org.clojure/clojure "1.3.0"] + [clj-http "0.2.4"] + [org.jsoup/jsoup "1.6.1"] + [clojurewerkz/urly "1.0.0-SNAPSHOT"]] + :source-path "src/clojure" + :java-source-path "src/java" + :resources-path "src/resources" + :dev-resources-path "test/resources" + :warn-on-reflection true) diff --git a/src/clojure/clojurewerkz/crawlista/extraction.clj b/src/clojure/clojurewerkz/crawlista/extraction.clj new file mode 100644 index 0000000..23e13ba --- /dev/null +++ b/src/clojure/clojurewerkz/crawlista/extraction.clj @@ -0,0 +1,77 @@ +(ns clojurewerkz.crawlista.extraction + (:import [org.jsoup Jsoup] + [org.jsoup.nodes Element] + [java.net URI URL MalformedURLException]) + (:use [clojurewerkz.crawlista.string] + [clojurewerkz.crawlista.url])) + +;; +;; Implementation +;; + +(defn- urls-from + [anchors] + (map (fn [a] (.attr ^Element a "href")) anchors)) + + +;; +;; API +;; + +(defn extract-anchors + [body] + 
(seq (-> (Jsoup/parse body) + (.getElementsByTag "a")))) + +(defn extract-local-anchors + [body uri] + (let [host (.getHost (URL. uri))] + (seq (-> (Jsoup/parse body) + (.getElementsByTag "a"))))) + +(defn extract-local-urls + [body uri] + (let [host (.getHost (URL. (strip-query-string uri))) + anchors (extract-local-anchors body uri) + hrefs (urls-from anchors)] + (distinct (map (fn [^String s] (normalize-url (absolutize s uri))) + (filter (fn [^String s] (local-to? (strip-query-string s) host)) hrefs))))) + +(defn followable? + [^Element anchor] + (let [rel-value (.attr anchor "rel")] + (or (nil? rel-value) + (not (= "nofollow" + (-> rel-value .toLowerCase .trim)))))) + +(defn extract-local-followable-anchors + [body uri] + (filter followable? (extract-local-anchors body uri))) + +(defn extract-local-followable-urls + [body uri] + (let [host (.getHost (URL. uri)) + anchors (extract-local-followable-anchors body (strip-query-string uri)) + urls (filter crawlable-href? (urls-from anchors))] + (distinct (map (fn [^String s] (normalize-url (absolutize s uri))) + (filter (fn [^String s] (local-to? s host)) urls))))) + + +(defn extract-title + [^String body] + (-> (Jsoup/parse body) .title)) + + +(defn has-anchor? 
+ ([body uri] + (let [hrefs (urls-from (extract-anchors body))] + (some (fn [^String s] + (and s + (= (resourcify s) (resourcify uri)))) hrefs))) + ([body uri text] + (let [anchors (extract-anchors body)] + (some (fn [^Element anchor] + (let [href (.attr anchor "href")] + (and href + (= (resourcify href) (resourcify uri)) + (= (.text anchor) text)))) anchors)))) diff --git a/src/clojure/clojurewerkz/crawlista/string.clj b/src/clojure/clojurewerkz/crawlista/string.clj new file mode 100644 index 0000000..865a06c --- /dev/null +++ b/src/clojure/clojurewerkz/crawlista/string.clj @@ -0,0 +1,34 @@ +(ns clojurewerkz.crawlista.string + (:use [clojure.string :only [split blank?]])) + +(defn maybe-prepend + [^String s ^String prefix] + (.toLowerCase (if (.startsWith (.toLowerCase s) (.toLowerCase prefix)) + s + (str prefix s)))) + +(defn maybe-append + [^String s ^String suffix] + (.toLowerCase (if (.endsWith (.toLowerCase s) (.toLowerCase suffix)) + s + (str s suffix)))) + +(defn maybe-chopl "Strips prefix from the start of s, ignoring case. The prefix is matched as literal text, never as a regular expression. Returns the lower-cased remainder when s starts with prefix; otherwise returns s unchanged." + [^String s ^String prefix] + (let [ls (.toLowerCase s) lp (.toLowerCase prefix)] + (if (.startsWith ls lp) + (subs ls (.length lp)) + s))) + +(defn maybe-chopr "Strips suffix from the end of s, ignoring case. The suffix is matched as literal text, never as a regular expression. Returns the lower-cased remainder when s ends with suffix; otherwise returns s unchanged." + [^String s ^String suffix] + (let [ls (.toLowerCase s) lx (.toLowerCase suffix)] + (if (.endsWith ls lx) + (subs ls 0 (- (.length ls) (.length lx))) + s))) + +(defn hex-to-int + [^String s] + (Long/parseLong (if (.startsWith s "0x") + (subs s 2) + s) 16)) diff --git a/src/clojure/clojurewerkz/crawlista/url.clj b/src/clojure/clojurewerkz/crawlista/url.clj new file mode 100644 index 0000000..a6f85a4 --- /dev/null +++ b/src/clojure/clojurewerkz/crawlista/url.clj @@ -0,0 +1,116 @@ +(ns clojurewerkz.crawlista.url + (:import [java.net URI URL MalformedURLException] + [clojurewerkz.urly UrlLike]) + (:use [clojure.string :only [split blank?]] + [clojurewerkz.crawlista.string] + [clojure.string :only [lower-case]] + [clojurewerkz.urly.core :only [path-of]])) + + +(defn strip-query-string + [^String s] + (.replaceAll s "\\?.*$" "")) + +(def resourcify + (comp (fn 
[^String s] + (if-not (re-find #"\.([a-zA-Z0-9]+)$" (path-of s)) + (maybe-append s "/") + s)) + strip-query-string + lower-case)) + +(defn separate-query-string + [^String s] + (split s #"\?")) + +(defn client-side-href? + [^String s] + (or (.startsWith s "#") + (.startsWith s "(") + (.startsWith (.toLowerCase s) "javascript") + (blank? s))) + +(defn crawlable-href? + [^String s] + (and (not (client-side-href? s)) (try + (URI. (strip-query-string s)) + true + (catch java.net.URISyntaxException se + false) + (catch Exception e + false)))) + + +(defprotocol URLNormalization + (normalize-host [input] "Normalizes host by chopping off www. if necessary") + (normalize-url [input] "Normalizes URL by chopping off www. at the beginning and trailing slash at the end, if necessary") + (absolutize [input against] "Returns absolute URL. For strings, the query string is split off before resolution and re-appended afterwards, including any further ? characters it contains") + (relativize [input against] "Returns relative URL")) + +(extend-protocol URLNormalization + String + (normalize-host [input] + (try + (let [url (URL. input) + url* (URL. (.getProtocol url) (maybe-chopl (.toLowerCase (.getHost url)) "www.") (.getPort url) (.getFile url))] + (str url*)) + (catch MalformedURLException e + (maybe-chopl (.toLowerCase input) "www.")))) + (normalize-url [input] + (maybe-chopr (normalize-host input) "/")) + (absolutize [input against] + (let [[input-without-query-string & query-parts] (separate-query-string input) + resolved (.toString (.resolve (URI. against) + (URI. input-without-query-string)))] + (if (seq query-parts) + (apply str resolved "?" (interpose "?" query-parts)) + resolved))) + + + URL + (normalize-host [input] + (URL. (.getProtocol input) (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getFile input))) + + + URI + (normalize-host [input] + (URI. (.getScheme input) nil (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getPath input) nil nil)) + (absolutize [input ^java.net.URI against] + (.resolve against input))) + + + +(defprotocol DomainRoot + (root? 
[input] "Returns true if given URL/URI is the site root")) + +(extend-protocol DomainRoot + String + (root? [input] + (root? (URI. (strip-query-string input)))) + + URI + (root? [input] + (.isEmpty (UrlLike/normalizePath (.getPath input)))) + + URL + (root? [input] + (root? (.toURI input)))) + + +(defn- maybe-prepend-protocol + "Fixes broken URLs like //jobs.arstechnica.com/list/1186 (that parse fine and both have host and are not considered absolute by java.net.URI)" + ([^String uri-str] + (maybe-prepend-protocol uri-str "http")) + ([^String uri-str ^String proto] + (let [uri (URI. uri-str)] + (if (and (not (.isAbsolute uri)) + (not (nil? (.getHost uri)))) + (str proto ":" uri-str) + uri-str)))) + +(defn local-to? + [^String uri-str ^String host] + (let [uri (URI. (-> uri-str strip-query-string (maybe-prepend-protocol "http")))] + (or (and (.getHost uri) + (= (maybe-prepend (.toLowerCase host) "www.") (maybe-prepend (.toLowerCase (.getHost uri)) "www."))) + (not (.isAbsolute uri))))) diff --git a/src/java/.gitkeep b/src/java/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/test/clojurewerkz/crawlista/test/extraction.clj b/test/clojurewerkz/crawlista/test/extraction.clj new file mode 100644 index 0000000..cf52705 --- /dev/null +++ b/test/clojurewerkz/crawlista/test/extraction.clj @@ -0,0 +1,129 @@ +(ns clojurewerkz.crawlista.test.extraction + (:import [java.net URI URL]) + (:use [clojurewerkz.crawlista.extraction] + [clojurewerkz.crawlista.url] + [clojure.test])) + +(deftest test-extract-title + (is (= "Wired.com" (extract-title (slurp (clojure.java.io/resource "html/wired.com.html")))))) + + +(deftest test-extract-local-urls + (let [body (slurp (clojure.java.io/resource "html/wired.com.html")) + result (extract-local-urls body "http://wired.com")] + (is (= + ["http://wired.com" + "http://wired.com/wiredscience/2011/11/absolute-with-www" + "http://wired.comwww.wired.com/wiredscience/2011/11/link-without-http" + 
"http://wired.com/dangerroom/2011/11/absolute-without-domain" + "http://wired.comdangerroom/2011/11/relative" + "http://wired.com/auth?goauth_start=1&goauth_service=linkedin&goauth_action=login&loc=" + "http://wired.com/gadgets/news/2011/11/arms-new-tools-make-it-easier-for-android-devs-to-use-native-code.ars?comments=1"] (vec result))))) + +(deftest test-extract-local-followable-urls-case-1 + (let [body (slurp (clojure.java.io/resource "html/wired.com.html")) + result (extract-local-followable-urls body "http://wired.com")] + (is (= + ["http://wired.com" + "http://wired.com/wiredscience/2011/11/absolute-with-www" + "http://wired.comwww.wired.com/wiredscience/2011/11/link-without-http" + "http://wired.com/dangerroom/2011/11/absolute-without-domain" + "http://wired.comdangerroom/2011/11/relative" + "http://wired.com/gadgets/news/2011/11/arms-new-tools-make-it-easier-for-android-devs-to-use-native-code.ars?comments=1"] (vec result))))) + + +(deftest test-extract-local-followable-urls-case-2 + (let [body (slurp (clojure.java.io/resource "html/arstechnica.com.html")) + result (vec (extract-local-followable-urls body "http://arstechnica.com")) + expected ["http://arstechnica.com" + "http://arstechnica.com/apple" + "http://arstechnica.com/apple/guides/2011/11/can-the-iphone-4s-replace-a-real-digital-camera-for-many-yes.ars" + "http://arstechnica.com/apple/news/2011/12/battery-life-woes-continue-to-plague-iphone-4s-users.ars" + "http://arstechnica.com/apple/news/2011/12/it-appears-the-syrian-government.ars?comments=1" + "http://arstechnica.com/apple/news/2011/12/mac-os-x-1073-includes.ars?comments=1" + "http://arstechnica.com/apple/news/2011/12/thai-flooding-finally-hits-apple-with-2tb-drive-shortage.ars" + "http://arstechnica.com/apple/reviews/2011/11/why-steve-jobs-cried.ars" + "http://arstechnica.com/ask-ars" + "http://arstechnica.com/author/ben-kuchera" + "http://arstechnica.com/author/casey-johnston" + "http://arstechnica.com/author/chris-foresman" + 
"http://arstechnica.com/author/jacqui-cheng" + "http://arstechnica.com/author/john-timmer" + "http://arstechnica.com/author/jon-brodkin" + "http://arstechnica.com/author/matthew-lasar" + "http://arstechnica.com/author/nate-anderson" + "http://arstechnica.com/author/ohrmazd" + "http://arstechnica.com/author/peter-bright" + "http://arstechnica.com/author/ryan-paul" + "http://arstechnica.com/author/scott-johnson" + "http://arstechnica.com/author/sean-gallagher" + "http://arstechnica.com/author/wired-uk" + "http://arstechnica.com/author/wiredcom" + "http://arstechnica.com/business" + "http://arstechnica.com/business/consumerization-of-it" + "http://arstechnica.com/business/news/2011/11/bulldozer-server-benchmarks-are-here-and-theyre-a-catastrophe.ars" + "http://arstechnica.com/business/news/2011/11/private-app-stores-does-your-company-need-its-own.ars" + "http://arstechnica.com/business/news/2011/12/bill-would-end-overtime-pay-requirement-for-many-more-it-workers.ars" + "http://arstechnica.com/business/news/2011/12/week-in-it-the-end-of-internal-e-mail-and-tracking-cell-phone-signals.ars" + "http://arstechnica.com/civis" "http://arstechnica.com/civis/ucp.php?mode=login&return_to=http%3A%2F%2Farstechnica.com%2Findex.php" + "http://arstechnica.com/features" + "http://arstechnica.com/gadgets" + "http://arstechnica.com/gadgets/news/2011/12/apple-managed-to-extend-the.ars?comments=1" + "http://arstechnica.com/gadgets/news/2011/12/mobile-operator-turns-flagship-store-into-androidland.ars" + "http://arstechnica.com/gadgets/news/2011/12/rims-troubles-continue-blackberry-playbook-costing-company-485-million.ars" + "http://arstechnica.com/gadgets/news/2011/12/the-shards-bleeding-edge-anatomy-of-a-21st-century-skyscraper.ars" + "http://arstechnica.com/gadgets/reviews/2011/11/dont-call-it-a-tablet-the-kindle-fire-reviewed.ars" + "http://arstechnica.com/gadgets/reviews/2011/11/lean-mean-consuming-machine-reviewing-the-nook-tablet.ars" + "http://arstechnica.com/gaming" + 
"http://arstechnica.com/gaming/news/2011/11/2011-childs-play-drive-signed-ultima-halo-360-hardware-aliens-themed-nerf-gun.ars" + "http://arstechnica.com/gaming/news/2011/12/drm-free-gaming-distributor-gogcom-trades-convenience-for-safety.ars" + "http://arstechnica.com/gaming/news/2011/12/markus-notch-person-steps-down-as-lead-minecraft-developer.ars" + "http://arstechnica.com/gaming/news/2011/12/microsofts-bid-to-rule-your-living-room-with-the-xbox-360-begins-tomorrow.ars" + "http://arstechnica.com/gaming/reviews/2011/11/modern-warfare-3-single-player-on-pc-the-canonization-of-violence.ars" + "http://arstechnica.com/guides" + "http://arstechnica.com/hardware" + "http://arstechnica.com/media" + "http://arstechnica.com/microsoft" + "http://arstechnica.com/microsoft/news/2011/12/why-microsoft-should-and-shouldnt-support-the-legacy-windows-desktop-on-arm.ars" + "http://arstechnica.com/microsoft/news/2011/12/with-wp7-mango-available-for-all-microsoft-pushes-ahead-with-new-updates.ars" + "http://arstechnica.com/open-source" + "http://arstechnica.com/reviews" + "http://arstechnica.com/science" + "http://arstechnica.com/science/news/2011/11/how-the-collapse-of-a-scientific-hypothesis-led-to-a-lawsuit-and-arrest.ars" + "http://arstechnica.com/science/news/2011/12/-a-us-house-committee.ars?comments=1" + "http://arstechnica.com/science/news/2011/12/-voyager-1-has-confirmed.ars?comments=1" + "http://arstechnica.com/science/news/2011/12/new-approach-to-determining-human-impact-on-climate-gives-same-answer.ars" + "http://arstechnica.com/science/news/2011/12/new-approach-to-determining-human-impact-on-climate-gives-same-answer.ars/2" + "http://arstechnica.com/science/news/2011/12/short-circuiting-the-immune-system-to-block-hiv.ars" + "http://arstechnica.com/science/news/2011/12/transparent-crab-shell-holds-the-secret-to-bendable-screens.ars" + "http://arstechnica.com/science/news/2011/12/week-in-science-collapsing-ideas-and-long-distance-voyagers.ars" + 
"http://arstechnica.com/security" + "http://arstechnica.com/site/tv.ars" + "http://arstechnica.com/software" + "http://arstechnica.com/staff" + "http://arstechnica.com/subscriptions" + "http://arstechnica.com/tech-policy" + "http://arstechnica.com/tech-policy/news/2011/12/apple-motorola-att-sprint-t-mobile-latest-to-be-sued-over-carrier-iq-tracking.ars" + "http://arstechnica.com/tech-policy/news/2011/12/carrier-iq-hit-with-privacy-lawsuits-as-more-security-researchers-weigh-in.ars" + "http://arstechnica.com/tech-policy/news/2011/12/data-caps-a-crude-and-unfair-tool-for-easing-online-congestion.ars" + "http://arstechnica.com/tech-policy/news/2011/12/fcc-to-probe-san-francisco-subway-cell-phone-interruption-policy.ars" + "http://arstechnica.com/tech-policy/news/2011/12/gallery-how-the-surveillance-industry-markets-spyware-to-governments.ars" + "http://arstechnica.com/tech-policy/news/2011/12/setting-intelligent-internet-policy-requires-data-we-dont-have-and-arent-getting.ars" + "http://arstechnica.com/tech-policy/news/2011/12/suspension-of-disbelief-magicians-friends-targeted-by-new-phishing-scam.ars" + "http://arstechnica.com/tech-policy/news/2011/12/verizon-snags-36b-worth-of-spectrum-licenses-as-att-hits-fcc-roadbloack.ars" + "http://arstechnica.com/tech-policy/news/2011/12/week-on-the-web-kindle-vs-nook-att-vs-the-fcc-ie10-vs-windows-7.ars" + "http://arstechnica.com/tech-policy/news/2011/12/wikileaks-founder-julian-assange-will.ars?comments=1" + "http://arstechnica.com/telecom" + "http://arstechnica.com/web"]] + (is (= 81 (count result))) + (is (= (sort expected) (sort result))))) + + +(deftest test-has-anchor + (let [body (slurp (clojure.java.io/resource "html/arstechnica.com_full.html"))] + (is (has-anchor? body "/apple")) + (is (has-anchor? body "/apple" "Apple")) + (is (not (has-anchor? body "/apple" "Open Source"))) + (is (has-anchor? body "/open-source" "Open Source")) + (is (has-anchor? body "http://www.wired.com" "Wired")) + (is (has-anchor? 
body "http://www.style.com" "Style")))) diff --git a/test/clojurewerkz/crawlista/test/string.clj b/test/clojurewerkz/crawlista/test/string.clj new file mode 100644 index 0000000..ae08b9a --- /dev/null +++ b/test/clojurewerkz/crawlista/test/string.clj @@ -0,0 +1,29 @@ +(ns clojurewerkz.crawlista.test.string + (:import (java.net URI URL)) + (:use [clojurewerkz.crawlista.string] + [clojure.test])) + +(deftest test-maybe-prepend-www + (is (= "www.apple.com" (maybe-prepend "apple.com" "www."))) + (is (= "www.apple.com" (maybe-prepend "APPLE.com" "www."))) + (is (= "www.apple.com" (maybe-prepend "www.apple.com" "www.")))) + +(deftest test-maybe-chopl + (is (= (maybe-chopl "www.google.com" "www.") "google.com")) + (is (= (maybe-chopl "google.com" "goo") "gle.com")) + (is (= (maybe-chopl "Google.COM" "google.") "com")) + (is (= (maybe-chopl "www.www2.megacorp.net" "www.") "www2.megacorp.net"))) + +(deftest test-maybe-chopr + (is (= (maybe-chopr "http://www.google.com/" "/") "http://www.google.com")) + (is (= (maybe-chopr "google.com/" ".com/") "google")) + (is (= (maybe-chopr "Google.COM/" "/") "google.com"))) + +(deftest test-hexadecimal-to-int + (are [s i] (is (= (hex-to-int s) i)) + "0xFF00FF" 16711935 + "FF00FF" 16711935 + "0xFF0000" 16711680 + "FF0000" 16711680 + "0x001100" 4352 + "001100" 4352)) diff --git a/test/clojurewerkz/crawlista/test/url.clj b/test/clojurewerkz/crawlista/test/url.clj new file mode 100644 index 0000000..b705d13 --- /dev/null +++ b/test/clojurewerkz/crawlista/test/url.clj @@ -0,0 +1,106 @@ +(ns clojurewerkz.crawlista.test.url + (:import (java.net URI URL)) + (:use [clojurewerkz.crawlista.url] + [clojure.test])) + +(deftest test-relative-anchor-locality + (is (local-to? "http://wired.com/reviews" "wired.com")) + (is (local-to? "http://wired.com/reviews" "www.wired.com")) + (is (local-to? "http://www.wired.com/reviews" "wired.com")) + (is (local-to? "/reviews" "wired.com")) + (is (not (local-to? 
"http://wired.com/reviews" "apple.com"))) + (is (not (local-to? "http://apple.com/iphone" "wired.com"))) + (is (not (local-to? "http://apple.com/iphone" "clojure.org"))) + (is (not (local-to? "http://bit.ly/clojure.org" "clojure.org"))) + (is (not (local-to? "http://jobs.arstechnica.com" "arstechnica.com"))) + (is (not (local-to? "http://jobs.arstechnica.com/list/1186" "arstechnica.com"))) + (is (not (local-to? "//jobs.arstechnica.com/list/1186" "arstechnica.com")))) + + +(deftest test-strip-query-string + (is (= "http://novemberain.com" (strip-query-string "http://novemberain.com?query=string")))) + +(deftest test-resourcification + (is (= "http://novemberain.com/" (resourcify "http://NOVEMBERAIN.com?query=string"))) + (is (= "http://novemberain.com/" (resourcify "http://NOVEMBERAIN.com"))) + (is (= "http://novemberain.com/" (resourcify "http://NOVEMBERAIN.com?query=string#fragment"))) + (is (= "http://novemberain.com/page.html" (resourcify "http://NOVEMBERAIN.com/page.html?query=string"))) + (is (= "http://novemberain.com/page/" (resourcify "http://NOVEMBERAIN.com/page"))) + (is (= "http://novemberain.com/page/" (resourcify "http://NOVEMBERAIN.com/page/")))) + +(deftest test-client-side-hrefs + (is (client-side-href? " ")) + (is (client-side-href? "")) + (is (client-side-href? "#")) + (is (client-side-href? "#/")) + (is (client-side-href? "(0)")) + (is (client-side-href? "javascript: void()(0)")) + (is (client-side-href? "javascript: alert('123')"))) + +(deftest test-crawlable-hrefs + (is (crawlable-href? "http://username:password@host:80/path?query=string#fragment")) + (is (crawlable-href? "http://bit.ly/clojure.org")) + (is (crawlable-href? "http://apple.com/iphone")) + (is (crawlable-href? "www.apple.com/iphone")) + (is (crawlable-href? "apple.com/iphone")) + (is (crawlable-href? "apple.com:80/iphone")) + (is (crawlable-href? "apple.com:/iphone")) + (is (crawlable-href? "/reviews")) + (is (crawlable-href? "/blobs?page=3#comments")) + (is (not (crawlable-href? 
" "))) + (is (not (crawlable-href? ""))) + (is (not (crawlable-href? "#"))) + (is (not (crawlable-href? "#/"))) + (is (not (crawlable-href? "(0)"))) + (is (not (crawlable-href? "\\\"http:\\/\\/wired.com\\/apple\\/review-the-galaxy-nexus-from-an-iphone-owners-perspective\\/\\\""))) + (is (not (crawlable-href? "javascript: void()(0)"))) + (is (not (crawlable-href? (slurp (clojure.java.io/resource "js/href_value1.js"))))) + (is (not (crawlable-href? "javascript: alert('123')")))) + +(deftest test-host-normalization-with-strings + (are [input result] (is (= (normalize-host input) result)) + "http://www.google.com/" "http://google.com/" ;; does not deal with path. MK. + "www.google.com/" "google.com/" + "https://www.apple.com" "https://apple.com" + "http://www.store.apple.com" "http://store.apple.com" + (URL. "http://www.google.com/") (URL. "http://google.com/") + (URI. "http://www.google.com/") (URI. "http://google.com/"))) + +(deftest test-full-url-normalization-with-strings + (are [input result] (is (= (normalize-url input) result)) + "http://www.google.com/" "http://google.com" + "www.google.com/" "google.com" + "https://www.apple.com" "https://apple.com" + "http://www.store.apple.com" "http://store.apple.com")) + +(deftest test-absolutize-with-strings + (is (= (absolutize "" "http://giove.local") "http://giove.local")) + (is (= (absolutize "/" "http://giove.local") "http://giove.local/")) + (is (= (absolutize "/comments?authenticate=1" "http://giove.local") "http://giove.local/comments?authenticate=1")) + (are [input result] (is (= (absolutize input "http://giove.local") result)) + "" "http://giove.local" + "/" "http://giove.local/" + "/reviews" "http://giove.local/reviews" + "/autopia/2011/11/evs-go-off-grid/" "http://giove.local/autopia/2011/11/evs-go-off-grid/")) + +(deftest test-absolutize-with-uris + (are [input result] (is (= (absolutize input (URI. "http://giove.local")) result)) + (URI. "") (URI. "http://giove.local") + (URI. "/") (URI. 
"http://giove.local/") + (URI. "/reviews") (URI. "http://giove.local/reviews") + (URI. "/autopia/2011/11/evs-go-off-grid/") (URI. "http://giove.local/autopia/2011/11/evs-go-off-grid/"))) + +(deftest test-whether-uri-is-root + (is (root? "http://giove.local")) + (is (root? "http://giove.local/")) + (is (root? "https://giove.local")) + (is (root? "https://giove.local/")) + (is (root? "http://www.giove.local")) + (is (root? "HTTPS://www.giove.local/")) + (is (root? "https://subdomain.giove.local")) + (is (root? "https://subdomain.giove.local/")) + (is (root? "https://www.subdomain.giove.local")) + (is (root? "http://www.subdomain.giove.local/")) + (is (not (root? "http://giove.local/path"))) + (is (not (root? "https://giove.local/section/path"))) + (is (not (root? "HTTPS://www.giove.local/search?q=weather")))) diff --git a/test/resources/html/arstechnica.com.html b/test/resources/html/arstechnica.com.html new file mode 100644 index 0000000..17d3d41 --- /dev/null +++ b/test/resources/html/arstechnica.com.html @@ -0,0 +1,1173 @@ + + + + + + Ars Technica + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ +
+
+ +
+ + +
+ + +
+
+ + + +
+
+
+ +
+
+
+
+

New approach to determining human impact on climate gives same answer

+
+
New approach to determining human impact on climate gives same answer
+ +
+

A new study tries to balance the books on the Earth's energy budget, and finds that greenhouse gasses and aerosols have large effects, but largely offset each other.

+
+ + + + + + + + +
+
+
+ +
+
+

Microsoft's bid to rule your living room with the Xbox 360 begins tomorrow

+
+
Microsoft's bid to rule your living room with the Xbox 360 begins tomorrow
+ +
+

Microsoft's latest Xbox 360 update goes live tomorrow, bringing with it expanded options for live and streaming television and movies.

+
+ + + + + + + + +
+
+
+ +
+
+

Apple, Motorola, AT&T, Sprint, T-Mobile latest to be sued over Carrier IQ tracking

+
+ +
+

Apple, Motorola, and three major wireless carriers are the latest to face a class-action lawsuit over a smartphone privacy scandal, with Carrier IQ, HTC, and Samsung are also facing allegations that they spy on users with software installed on smartphones.

+
+ + + + + + + + +
+
+
+ +
+
+

DRM-free gaming distributor GOG.com trades convenience for safety

+
+
DRM-free gaming distributor GOG.com trades convenience for safety
+ +
+

What's the worth of a fast checkout process for digital retailers? GOG.com recently ased its users that, and the data came back showing customers would rather deal with a longer checkout process if that meant their data wasn't kept on the company's servers. What's surprising is that the company is listening.

+
+ + + + + + + + +
+
+
+ +
+
+

Why Microsoft should, and shouldn't, support legacy Windows desktop on ARM

+
+
Why Microsoft should, and shouldn't, support legacy Windows desktop on ARM
+ +
+

Will Windows 8 on ARM include support for the traditional Windows desktop? Last week it was been rumored first that it won't, and then that it will after all. We think that Microsoft should do both.

+
+ + + + + + + + +
+
+
+ +
+
+

Suspension of Disbelief: magicians' friends targeted by new phishing scam

+
+
Suspension of Disbelief: magicians' friends targeted by new phishing scam
+ +
+

The well-worn friends-in-trouble-on-vacation scam that has plagued Facebook for years has evolved, as a husband-and-wife magic act found out the hard way.

+
+ + + + + + + + +
+
+
+ +
+
+

The Shard's bleeding edge: anatomy of a 21st century skyscraper

+
+
The Shard's bleeding edge: anatomy of a 21st century skyscraper
+ +
feature
+
+

As the tallest skyscraper in the EU goes up in London, we talk to the engineers behind it about the technology that makes the Shard possible.

+
+ + + + + + + + +
+
+
+ +
+
+

Gallery: how the surveillance industry markets spyware to governments

+
+
Gallery: how the surveillance industry markets spyware to governments
+ +
+

The latest Wikileaks dump offers a look at the slide presentations and brochures used by security companies to market malware and other spy tools to governments and law enforcement agencies.

+
+ + + + + + + + +
+
+
+ +
+
+

RIM's troubles continue: BlackBerry Playbook costing company $485 million

+
+
RIM's troubles continue: BlackBerry Playbook costing company $485 million
+ +
+

RIM can't catch a break. Not only is the company coping with dismal PlayBook tablet sales, it's also taking a near half-billion dollar hit for sitting inventory that must now be sold at rock-bottom prices.

+
+ + + + + + + + +
+
+
+ +
+
+

Researchers short-circuit the immune system to block HIV

+
+ +
+

Researchers give mice lifelong protection against HIV, without the need for a vaccine.

+
+ + + + + + + + +
+
+
+ +
+
+

Week on the Web: Kindle vs. Nook, AT&T vs. the FCC, IE10 vs. Windows 7

+
+
Week on the Web: Kindle vs. Nook, AT&T vs. the FCC, IE10 vs. Windows 7
+ +
+

A look back at the biggest stories of the week across Ars.

+
+ + + + + + + + +
+
+
+ +
+
+

Week in IT: the end of internal e-mail and tracking cell phone signals

+
+
Week in IT: the end of internal e-mail and tracking cell phone signals
+ +
+

Ars recaps the week's biggest stories from Uptime, our section devoted to IT topics.

+
+ + + + + + + + +
+
+
+ +
+
+

Week in science: collapsing ideas and long-distance Voyagers

+
+
Week in science: collapsing ideas and long-distance Voyagers
+ +
+

The Universe may not be so constant, the Voyager spacecraft continue to be reliable, and, closer to home, researchers have built a squishy robot.

+
+ + + + + + + + +
+
+
+ +
+
+

Thai flooding leaves Apple with 2TB drive shortage

+
+ +
+

Drive shortages caused by massive flooding in Thailand have left Apple without 2TB hard drives to install in build-to-order iMacs. At least one of Western Digital's plants is back up and running.

+
+ + + + + + + + +
+
+
+ +
+
+

Carrier IQ hit with privacy lawsuits as more security researchers weigh in

+
+
Carrier IQ hit with privacy lawsuits as more security researchers weigh in
+ +
+

Carrier IQ has been hit with two class-action lawsuits from users worried about how the company's software tracks their smartphone activity.

+
+ + + + + + + + +
+
+
+ +
+
+

Mobile operator turns flagship store into Androidland

+
+
Mobile operator turns flagship store into Androidland
+ +
+

Australian mobile network operator Telstra has set up a mini Android theme park inside of its flagship Melbourne store.

+
+ + + + + + + + +
+
+
+ +
+
+

Setting smart Internet policy requires data we don't have, aren't getting

+
+
Setting smart Internet policy requires data we don't have, aren't getting
+ +
+

It would be nice to think that we could develop regulations for the Internet based on how people are actually using it. Unfortunately, according to some Harvard researchers, we're not doing enough to gather that data.

+
+ + + + + + + + +
+
+
+ +
+
+

FCC to probe San Francisco subway cell phone "interruption" policy

+
+
FCC to probe San Francisco subway cell phone "interruption" policy
+ +
+

The FCC says it wants to take a look at the legal and First Amendment implications of the new cell phone interruption policy employed by San Francisco and Oakland's Bay Area Rapid Transit (BART).

+
+ + + + + + + + +
+
+
+ +
+
+

Data caps a "crude and unfair tool" for easing online congestion

+
+
Data caps a "crude and unfair tool" for easing online congestion
+ +
+

Two European Internet experts argue that monthly data caps don't actually help with "congestion"—and they have some data to prove it.

+
+ + + + + + + + +
+
+
+ +
+
+

Verizon snags $3.6B worth of spectrum licenses as AT&T hits FCC roadbloack

+
+ +
+

Verizon Wireless has struck a $3.6 billion deal to buy wireless spectrum covering 259 million Americans from Comcast, Time Warner Cable, and Bright House Networks.

+
+ + + + + + + + +
+
+
+ +
+
+

Markus "Notch" Persson steps down as lead Minecraft developer

+
+ +
+

Markus "Notch" Persson has long been the lead developer and face of the game Minecraft, but has announced he will scale back his involvement on the project to get started on something new.

+
+ + + + + + + + +
+
+
+ +
+
+

Bill would end overtime pay requirement for many more IT workers

+
+ +
+

A bill recently introduced in Congress would greatly expand the exemption to the Fair Labor Standards Act for IT employees, ending overtime benefits for many more types of workers, including network, database and security specialists.

+
+ + + + + + + + +
+
+
+ +
+
+

Transparent crab shell holds the secret to bendable screens

+
+
Transparent crab shell holds the secret to bendable screens
+ +
+

Researchers turn a crab shell transparent, an advance that could lead to thin, bendable displays.

+
+ + + + + + + + +
+
+
+ +
+
+

iPhone battery life issues may continue to vex users—even post iOS 5.1

+
+
iPhone battery life issues may continue to vex users—even post iOS 5.1
+ +
+

iPhone 4S users still complain that a software fix for battery life problems hasn't addressed the issue fully, and the first beta of iOS 5.1 reportedly offers no help. What little is known about the problem outside of Apple is that the issue is software, and not hardware, but any fix won't be easy.

+
+ + + + + + + + +
+
+
+ +
+
+

With WP7 Mango available for all, Microsoft pushes ahead with new updates

+
+ +
+

With Mango now available for the last few unpatched handsets, Microsoft is using its streamlined and improved update process to push out other fixes and features. The result is an almost Apple-like update process that's a clear improvement on what Android offers users.

+
+ + + + + + + + +
+
+
+
+ + +
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + diff --git a/test/resources/html/arstechnica.com_full.html b/test/resources/html/arstechnica.com_full.html new file mode 100644 index 0000000..859e028 --- /dev/null +++ b/test/resources/html/arstechnica.com_full.html @@ -0,0 +1,1278 @@ + + + + + + Ars Technica + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+ +
+
+ +
+ + +
+ + +
+
+ + + +
+
+
+ +
+
+
+
+

Tech entrepreneurs attack SOPA on the eve of markup

+
+
Tech entrepreneurs attack SOPA on the eve of markup
+ +
+

With the House Judiciary Committee scheduled to meet on Thursday to consider changes to the Stop Online Piracy Act, the co-founders of some of the nation's leading Internet companies have signed a letter condemning the proposal.

+
+ + + + + + + + +
+
+
+ +
+
+

Sonic CD remains fun, fast, and now includes the Japanese soundtrack

+
+
<em>Sonic CD</em> remains fun, fast, and now includes the Japanese soundtrack
+ +
+

Sonic CD is out now on the Xbox Live Arcade for $5. We'll describe how it looks and plays, but it's a classic, it's $5, and it's coming to a variety of platforms. This one is a no-brainer.

+
+ + + + + + + + +
+
+
+ +
+
+

Stolen iPhone? Your iMessages may still be going to the wrong place

+
+
Stolen iPhone? Your iMessages may still be going to the wrong place
+ +
+

iPhone owners shouldn't have to worry that a thief might receive their messages after they remote wipe and deactivate the account, right? Maybe not. For some users, iMessages seem to go through to their stolen devices even after taking the right precautionary measures.

+
+ + + + + + + + +
+
+
+ +
+
+

The Milky Way's black hole may spring to life in 2013

+
+
The Milky Way's black hole may spring to life in 2013
+ +
+

Astronomers have spotted a cloud of gas with three times the mass of the Earth on a near-collision course with the Milky Way's central black hole.

+
+ + + + + + + + +
+
+
+ +
+
+

"Retina" MacBook Pros shipping next year? It's possible

+
+ +
+

Apple might release a super high resolution "retina" MacBook Pro beginning in 2012, which would be a perfect fit for Lion's HiDPI display technology.

+
+ + + + + + + + +
+
+
+ +
+
+

The NTSB wants you to shut up and drive

+
+
The NTSB wants you to shut up and drive
+ +
+

After studying crash statistics and the causes of a fatal Missouri accident last year, the NTSB has recommended banning the use of all electronic devices while driving.

+
+ + + + + + + + +
+
+
+ +
+
+

Scientists shrink a Stirling heat engine to single microscopic particle

+
+
Scientists shrink a Stirling heat engine to single microscopic particle
+ +
+

Researchers created a microscopic version of the Stirling engine using a single particle. At such small scales, the random fluctuations of Brownian motion affect the position of the particle and the work output of the engine.

+
+ + + + + + + + +
+
+
+ +
+
+

The latest Humble Bundle offers amazing games, promotes multiplatform releases

+
+
The latest Humble Bundle offers amazing games, promotes multiplatform releases
+ +
+

The latest Humble Bundle may offer the best series of games yet, including both Super Meat Boy, and Shank, and the "Bundle Advocate" hints at what's coming in the future for the Humble team.

+
+ + + + + + + + +
+
+
+ +
+
+

Devonian-era lungfish may have faked us out, left tetrapod-like tracks

+
+
Devonian-era lungfish may have faked us out, left tetrapod-like tracks
+ +
+

The modern relatives of the fish that gave rise to vertebrates with four limbs can also go for a walk, using their fins to propel themselves across river floors.

+
+ + + + + + + + +
+
+
+ +
+
+

Leadership change could herald Windows 8-powered phones; what's the hurry?

+
+
Leadership change could herald Windows 8-powered phones; what's the hurry?
+ +
+

Andy Lees, president of Windows Phone at Microsoft has been moved into a new role that bridges Windows Phone and Windows 8. It's hard to tell if this is a promotion or a demotion; this could be a first step towards a unified Windows platform that runs everywhere from the phone to the PC to the living room, or it could be a move to sideline a man whose major product, Windows Phone, hasn't been overwhelmingly successful.

+
+ + + + + + + + +
+
+
+ +
+
+

Facebook looks to fix PHP performance with HipHop virtual machine

+
+
Facebook looks to fix PHP performance with HipHop virtual machine
+ +
+

Based on the open-source PHP-to-C++ translator used to create nearly all of Facebook's production websites, the HipHop virtual machine turns script into bytecode, boosting performance.

+
+ + + + + + + + +
+
+
+ +
+
+

Microsoft squashes Duqu threat with Windows patch

+
+ +
+

A month after releasing a temporary workaround to block malware exploiting a Windows kernel vulnerability, Microsoft today issued a patch for all supported releases of Windows aimed at putting an end to attacks based on the Duqu worm.

+
+ + + + + + + + +
+
+
+ +
+
+

Google pulls 22 apps from Android Market to prevent fraudulent charges

+
+ +
+

Google has reportedly removed 22 malicious applications from the Android Market after security vendors pointed out that the apps had been downloaded 14,000 times and were attempting to trick users into accepting fraudulent charges via SMS.

+
+ + + + + + + + +
+
+
+ +
+
+

RAAM's Shadow DLC adds context, richness to world of Gears of Wars

+
+
<em>RAAM's Shadow</em> DLC adds context, richness to world of <em>Gears of Wars</em>
+ +
+

RAAM's Shadow, a $15 content pack for Gears of War 3, shows what Sera looked like before the war destroyed much of the planet's elegance, and it puts you in control of one of the most powerful characters in the series.

+
+ + + + + + + + +
+
+
+ +
+
+

Apple aiming to improve iOS notifications further with fresh talent

+
+
Apple aiming to improve iOS notifications further with fresh talent
+ +
+

iOS 5's Notification Center is a vast improvement over what was previously available, but that doesn't mean it's perfect. To improve the situation, Apple has been hiring on some fresh eyes.

+
+ + + + + + + + +
+
+
+ +
+
+

Possible Higgs boson signals, but we won't know for sure until next year

+
+
Possible Higgs boson signals, but we won't know for sure until next year
+ +
+

If the Higgs exists at anywhere near the energies that we think it must, then it's probably somewhere right around 125GeV. Both of the LHC's detectors see a signal there, but it doesn't yet rise to the level that constitutes evidence for the Higgs.

+
+ + + + + + + + +
+
+
+ +
+
+

Apple reportedly tweaking iAd terms to better appeal to advertisers

+
+ +
+

Apple's mobile advertising service has seen some success since its launch in 2010, but nowhere near what Steve Jobs initially projected. Now, the company is reportedly changing some of its requirements in order to attract more advertisers.

+
+ + + + + + + + +
+
+
+ +
+
+

FBI using Carrier IQ info for "law enforcement purposes," refuses to release records

+
+
FBI using Carrier IQ info for "law enforcement purposes," refuses to release records
+ +
+

The FBI refuses to release information related to Carrier IQ, the tracking software installed on millions of smartphones. The FBI is using the information in a law enforcement case, but whether it is investigating Carrier IQ for privacy violations or using the information for other investigations is unknown.

+
+ + + + + + + + +
+
+
+ +
+
+

Sponsor waters down online blacklisting bill, but STOP still has issues

+
+ +
+

Rep. Lamar Smith (R-Texas) introduced amendments late Monday to the Stop Online Piracy Act—changes that dramatically water down the measure's scope. Among the amendments is a clarification that "rogue" sites must be foreign, and that rights holders now do not have the power to demand financial institutions and ad networks to stop doing business with infringing sites.

+
+ + + + + + + + +
+
+
+ +
+
+

Judge dismisses "Other OS" class-action suit against Sony

+
+ +
+

A class-action lawsuit against Sony for removing the "Other OS" features of the PlayStation 3 has been dismissed. The Judge noted that while the move may not have been the best thing for customers, there was no legal argument against Sony's actions in the current suit.

+
+ + + + + + + + +
+
+
+ +
+
+

Creative Commons ponders ports and database rights for license update

+
+
Creative Commons ponders ports and database rights for license update
+ +
+

Creative Commons is planning to issue a major update to its suite of licenses next year. Some topics of discussion include harmonizing with database rights and ending regional ports in favor of improving the international applicability of unported licenses.

+
+ + + + + + + + +
+
+
+ +
+
+

Domain seizure oversight lax and broken, targets out of luck

+
+
Domain seizure oversight lax and broken, targets out of luck
+ +
+

Can the government really hold a domain for a year and then return it without so much as an apology? A legal expert tells Ars the courts have have done a poor job overseeing the domain seizure program, but the victims are unlikely to receive significant compensation.

+
+ + + + + + + + +
+
+
+ +
+
+

Azure price cuts, bigger databases, now with node.js and MongoDB support, Hadoop on its way

+
+
Azure price cuts, bigger databases, now with node.js and MongoDB support, Hadoop on its way
+ +
+

Microsoft has released a set of updates to its Azure cloud computing platform to make it cheaper, more scalable, and easier to manage. On top of that, the company has shipped a new SDK for node.js, new guidance for MongoDB, and has started to offer support for Apache Hadoop.

+
+ + + + + + + + +
+
+
+ +
+
+

Apple extending AirPlay to low-power, low-latency Bluetooth 4.0

+
+ +
+

Apple will be enabling future iOS devices to connect to accessories using Bluetooth 4.0 instead of WiFi.

+
+ + + + + + + + +
+
+
+ +
+
+

Microsoft expands presence on iOS and Android with OneNote and Lync apps

+
+ +
+

Microsoft is boosting Office's presence on mobile devices with a new OneNote app designed for Apple's iPad and a Lync application for the major mobile platforms.

+
+ + + + + + + + +
+
+
+
+ + +
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + diff --git a/test/resources/html/wired.com.html b/test/resources/html/wired.com.html new file mode 100644 index 0000000..bbba123 --- /dev/null +++ b/test/resources/html/wired.com.html @@ -0,0 +1,64 @@ + + + + Wired.com + + + + + +
+
+ +
+ + +
+ + diff --git a/test/resources/js/href_value1.js b/test/resources/js/href_value1.js new file mode 100644 index 0000000..c7cc221 --- /dev/null +++ b/test/resources/js/href_value1.js @@ -0,0 +1 @@ +var varname = "wijax_85438735f00440f619effb45051fc5df"; window[varname]="
\n\n\n\n\n\t
\n\t\t\t\t
\n\t\t\t\n\n\t\t\t

4 reasons for Amazon Kindle’s 4X sales boost<\/a><\/h2>\n\t\t\t
\n\t\t\t\tBy Ryan Kim<\/a><\/span>\n\t\t\t\tNov. 28, 2011, 10:31am PT<\/span>\n\t\t\t\t2 Comments<\/a><\/span>\n\t\t\t<\/div>\n\t\t<\/div>\n\t\t
<\/span><\/div><\/div><\/a>\t\t

Amazon did some big Black Friday business with its Kindle devices, recording a 4x increase over the previous year, the company crowed today. The numbers show that Amazon is still accelerating its Kindle business and the Kindle Fire is likely contributing to the sales growth. \n Read More »<\/a><\/p><\/div>\n\t<\/div>\n\n\t\n\t