Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelklishin committed Dec 20, 2011
0 parents commit 23b24f4
Show file tree
Hide file tree
Showing 15 changed files with 3,058 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
pom.xml
*jar
/lib/
/classes/
.lein-failures
.lein-deps-sum
TAGS
checkouts/*
1 change: 1 addition & 0 deletions .travis.yml
@@ -0,0 +1 @@
language: clojure
31 changes: 31 additions & 0 deletions README.md
@@ -0,0 +1,31 @@
# What is Crawlista?

Crawlista is a support library for Clojure applications that crawl the Web.


## Usage

### Installation

With Leiningen

[clojurewerkz/crawlista "1.0.0-SNAPSHOT"]

New snapshots are [published to clojars.org](https://clojars.org/clojurewerkz/crawlista) every day (if there are any changes).


## Crawlista is a Work In Progress

Crawlista is a work in progress. Please see our test suite for code examples.


## Supported Clojure versions

Crawlista is built from the ground up for Clojure 1.3 and up.


## License

Copyright (C) 2011 Michael S. Klishin

Distributed under the Eclipse Public License, the same as Clojure.
11 changes: 11 additions & 0 deletions project.clj
@@ -0,0 +1,11 @@
;; Leiningen project definition for Crawlista.
(defproject clojurewerkz/crawlista "1.0.0-SNAPSHOT"
  :description "Support library for Clojure applications that crawl the Web"
  ;; clj-http for fetching, jsoup for HTML parsing, urly for URL handling.
  ;; NOTE(review): urly is a SNAPSHOT dependency — builds are not reproducible
  ;; until it is pinned to a release version.
  :dependencies [[org.clojure/clojure "1.3.0"]
                 [clj-http "0.2.4"]
                 [org.jsoup/jsoup "1.6.1"]
                 [clojurewerkz/urly "1.0.0-SNAPSHOT"]]
  ;; Clojure and Java sources live side by side under src/.
  :source-path "src/clojure"
  :java-source-path "src/java"
  :resources-path "src/resources"
  :dev-resources-path "test/resources"
  ;; Surface reflective interop calls so type hints can be added.
  :warn-on-reflection true)
77 changes: 77 additions & 0 deletions src/clojure/clojurewerkz/crawlista/extraction.clj
@@ -0,0 +1,77 @@
(ns clojurewerkz.crawlista.extraction
(:import [org.jsoup Jsoup]
[org.jsoup.nodes Element]
[java.net URI URL MalformedURLException])
(:use [clojurewerkz.crawlista.string]
[clojurewerkz.crawlista.url]))

;;
;; Implementation
;;

(defn- urls-from
  "Returns a lazy seq of the href attribute values of the given Jsoup
   anchor elements."
  [anchors]
  (for [^Element a anchors]
    (.attr a "href")))


;;
;; API
;;

(defn extract-anchors
  "Parses body as HTML with Jsoup and returns a seq of all <a> elements,
   or nil when the document contains none."
  [body]
  (let [doc     (Jsoup/parse body)
        anchors (.getElementsByTag doc "a")]
    (seq anchors)))

(defn extract-local-anchors
  "Returns a seq of all <a> elements in body, or nil when there are none.
   NOTE(review): despite the name, no host filtering happens here — the
   original bound (.getHost (URL. uri)) but never used it, so callers such
   as extract-local-urls do the host filtering themselves. The dead binding
   has been removed (which also means a malformed uri no longer throws
   here); the uri parameter is kept for interface compatibility."
  [body uri]
  (seq (-> (Jsoup/parse body)
           (.getElementsByTag "a"))))

(defn extract-local-urls
  "Extracts hrefs from body, keeps those local to the host of uri
   (query strings are stripped before the host comparison), absolutizes
   them against uri, normalizes them, and returns the distinct results."
  [body uri]
  (let [host   (.getHost (URL. (strip-query-string uri)))
        hrefs  (urls-from (extract-local-anchors body uri))
        local? (fn [^String s] (local-to? (strip-query-string s) host))]
    (->> hrefs
         (filter local?)
         (map (fn [^String s] (normalize-url (absolutize s uri))))
         distinct)))

(defn followable?
  "Returns true unless the anchor is marked rel=\"nofollow\".
   The rel attribute is a whitespace-separated list of link types, so the
   value is split and scanned for a nofollow token — the original equality
   check missed multi-valued attributes such as rel=\"external nofollow\".
   NOTE(review): Jsoup's .attr returns \"\" (never nil) for a missing
   attribute; the nil guard is kept for safety but should never fire."
  [^Element anchor]
  (let [rel-value (.attr anchor "rel")]
    (or (nil? rel-value)
        (not-any? #(= "nofollow" %)
                  (-> rel-value .toLowerCase .trim (.split "\\s+"))))))

(defn extract-local-followable-anchors
  "Returns the anchors of body that are local to uri and are not marked
   rel=\"nofollow\"."
  [body uri]
  (->> (extract-local-anchors body uri)
       (filter followable?)))

(defn extract-local-followable-urls
  "Extracts followable (no rel=\"nofollow\"), crawlable, host-local hrefs
   from body, absolutizes them against uri, normalizes them, and returns
   the distinct results.
   NOTE(review): unlike extract-local-urls, the query string is stripped
   from uri before anchor extraction but NOT from each href before the
   local-to? check — confirm this asymmetry is intended."
  [body uri]
  (let [host    (.getHost (URL. uri))
        anchors (extract-local-followable-anchors body (strip-query-string uri))
        hrefs   (filter crawlable-href? (urls-from anchors))
        finish  (fn [^String s] (normalize-url (absolutize s uri)))]
    (->> hrefs
         (filter (fn [^String s] (local-to? s host)))
         (map finish)
         distinct)))


(defn extract-title
  "Parses body as HTML and returns the text of its <title> element."
  [^String body]
  (let [doc (Jsoup/parse body)]
    (.title doc)))


(defn has-anchor?
  "Returns a truthy value when body contains an anchor whose href points
   at uri; the 3-arity additionally requires the anchor's text to equal
   text exactly. Hrefs and uri are compared via resourcify (lowercased,
   query-stripped, trailing-slash normalized), so minor spelling
   differences still match."
  ([body uri]
     (let [hrefs (urls-from (extract-anchors body))]
       ;; Compare resourcified forms so e.g. trailing slashes don't break equality.
       (some (fn [^String s]
               (and s
                    (= (resourcify s) (resourcify uri)))) hrefs)))
  ([body uri text]
     (let [anchors (extract-anchors body)]
       (some (fn [^Element anchor]
               ;; NOTE(review): Jsoup .attr returns "" for a missing attribute,
               ;; so the (and href ...) guard is likely always truthy — confirm.
               (let [href (.attr anchor "href")]
                 (and href
                      (= (resourcify href) (resourcify uri))
                      (= (.text anchor) text)))) anchors))))
34 changes: 34 additions & 0 deletions src/clojure/clojurewerkz/crawlista/string.clj
@@ -0,0 +1,34 @@
(ns clojurewerkz.crawlista.string
(:use [clojure.string :only [split blank?]]))

(defn maybe-prepend
  "Ensures s begins with prefix, comparing case-insensitively.
   The result is always lowercased."
  [^String s ^String prefix]
  (let [ls       (.toLowerCase s)
        lp       (.toLowerCase prefix)
        prefixed (if (.startsWith ls lp)
                   s
                   (str prefix s))]
    (.toLowerCase prefixed)))

(defn maybe-append
  "Ensures s ends with suffix, comparing case-insensitively.
   The result is always lowercased."
  [^String s ^String suffix]
  (let [ls       (.toLowerCase s)
        lsuf     (.toLowerCase suffix)
        suffixed (if (.endsWith ls lsuf)
                   s
                   (str s suffix))]
    (.toLowerCase suffixed)))

(defn maybe-chopl
  "Chops prefix off the left end of s when present (the check is against
   the lowercased s). Returns the lowercased remainder on a match, or s
   unchanged otherwise.
   Uses subs instead of the original replaceAll so regex metacharacters in
   prefix are treated literally — previously prefix was interpolated into
   a regular expression, so e.g. the dot in \"www.\" matched any character
   and a prefix like \"a+\" could strip the wrong characters."
  [^String s ^String prefix]
  (let [ls (.toLowerCase s)]
    (if (.startsWith ls prefix)
      (subs ls (count prefix))
      s)))

(defn maybe-chopr
  "Chops suffix off the right end of s when present (the check is against
   the lowercased s). Returns the lowercased remainder on a match, or s
   unchanged otherwise.
   Uses subs instead of the original replaceAll so regex metacharacters in
   suffix are treated literally — previously suffix was interpolated into
   a regular expression, which could strip the wrong characters."
  [^String s ^String suffix]
  (let [ls (.toLowerCase s)]
    (if (.endsWith ls suffix)
      (subs ls 0 (- (count ls) (count suffix)))
      s)))

(defn hex-to-int
  "Parses a hexadecimal string into a long, with or without a 0x/0X
   prefix. The original only recognized a lowercase \"0x\" prefix, so
   \"0XFF\" failed with a NumberFormatException.
   Throws NumberFormatException on non-hex input."
  [^String s]
  (let [digits (if (or (.startsWith s "0x")
                       (.startsWith s "0X"))
                 (subs s 2)
                 s)]
    (Long/parseLong digits 16)))
116 changes: 116 additions & 0 deletions src/clojure/clojurewerkz/crawlista/url.clj
@@ -0,0 +1,116 @@
(ns clojurewerkz.crawlista.url
(:import [java.net URI URL MalformedURLException]
[clojurewerkz.urly UrlLike])
(:use [clojure.string :only [split blank?]]
[clojurewerkz.crawlista.string]
[clojure.string :only [lower-case]]
[clojurewerkz.urly.core :only [path-of]]))


(defn strip-query-string
  "Removes the query string — everything from the first '?' (inclusive)
   to the end of the line — from s."
  [^String s]
  (clojure.string/replace s #"\?.*$" ""))

(defn resourcify
  "Canonicalizes a URL string for comparison: lowercases it, strips its
   query string, and appends a trailing slash when the path does not end
   in a file extension."
  [^String s]
  (let [base (-> s lower-case strip-query-string)]
    (if (re-find #"\.([a-zA-Z0-9]+)$" (path-of base))
      base
      (maybe-append base "/"))))

(defn separate-query-string
  "Splits s into [path] or [path query-string] on the FIRST '?'.
   The original split on every '?', so a query string that itself
   contained a '?' (legal in URLs) was truncated when absolutize
   reassembled it; a split limit of 2 keeps the full query intact.
   A lone trailing empty component (s ending in '?') is dropped to match
   the original no-limit behavior of clojure.string/split."
  [^String s]
  (let [parts (split s #"\?" 2)]
    (if (and (= 2 (count parts))
             (= "" (peek parts)))
      (pop parts)
      parts)))

(defn client-side-href?
  "True for hrefs that only have meaning in the browser and should not be
   crawled: fragment links (#...), parenthesized expressions, javascript:
   pseudo-URLs, and blank strings.
   (Evaluation order is significant: a nil s throws NPE on the first
   startsWith call, as before.)"
  [^String s]
  (let [punctuation-href? (or (.startsWith s "#")
                              (.startsWith s "("))]
    (or punctuation-href?
        (.startsWith (.toLowerCase s) "javascript")
        (blank? s))))

(defn crawlable-href?
  "True when s is a server-side href (not client-side per
   client-side-href?) whose query-stripped form parses as a java.net.URI.
   Any parse failure yields false."
  [^String s]
  (if (client-side-href? s)
    false
    (try
      (URI. (strip-query-string s))
      true
      ;; The original caught URISyntaxException and Exception separately,
      ;; both returning false — a single catch is equivalent.
      (catch Exception _
        false))))


;; Polymorphic URL normalization over String, java.net.URL and java.net.URI.
;; NOTE(review): the extend-protocol implementations in this file do not all
;; provide every operation (e.g. relativize has no implementation at all);
;; callers must stick to the combinations actually implemented.
(defprotocol URLNormalization
  (normalize-host [input] "Normalizes host by chopping off www. if necessary")
  (normalize-url [input] "Normalizes URL by chopping off www. at the beginning and trailing slash at the end, if necessary")
  (absolutize [input against] "Returns absolute URL")
  (relativize [input against] "Returns relative URL"))

(extend-protocol URLNormalization
  String
  ;; Rebuild the URL with a www.-stripped, lowercased host; if the input
  ;; does not parse as a URL, fall back to chopping www. off the raw string.
  (normalize-host [input]
    (try
      (let [url (URL. input)
            url* (URL. (.getProtocol url) (maybe-chopl (.toLowerCase (.getHost url)) "www.") (.getPort url) (.getFile url))]
        (str url*))
      (catch MalformedURLException e
        (maybe-chopl (.toLowerCase input) "www."))))
  ;; normalize-host plus removal of a trailing slash.
  (normalize-url [input]
    (maybe-chopr (normalize-host input) "/"))
  ;; Resolve input against `against` with the query string held aside
  ;; (java.net.URI resolution would otherwise mangle it), then re-attach it.
  (absolutize [input against]
    (let [[input-without-query-string query-string] (separate-query-string input)
          resolved (.toString (.resolve (URI. against)
                                        (URI. input-without-query-string)))]
      (if query-string
        (str resolved "?" query-string)
        resolved)))


  ;; NOTE(review): only normalize-host is implemented for URL inputs;
  ;; normalize-url/absolutize on a URL will throw at runtime.
  URL
  (normalize-host [input]
    (URL. (.getProtocol input) (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getFile input)))


  ;; NOTE(review): URI inputs get normalize-host and absolutize only.
  ;; The rebuilt URI drops userinfo, query and fragment components.
  URI
  (normalize-host [input]
    (URI. (.getScheme input) nil (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getPath input) nil nil))
  (absolutize [input ^java.net.URI against]
    (.resolve against input)))



;; Predicate protocol answering "is this URL/URI the root of its site?",
;; implemented below for String, java.net.URI and java.net.URL.
(defprotocol DomainRoot
  (root? [input] "Returns true if given URL/URI is the site root"))

(extend-protocol DomainRoot
  String
  ;; Strings are parsed (query string stripped first) and delegated to the
  ;; URI implementation. Throws URISyntaxException on unparseable input.
  (root? [input]
    (root? (URI. (strip-query-string input))))

  URI
  ;; Root means the normalized path is empty; normalization is delegated to
  ;; urly's UrlLike. NOTE(review): presumably "/" normalizes to "" — confirm
  ;; against UrlLike/normalizePath.
  (root? [input]
    (.isEmpty (UrlLike/normalizePath (.getPath input))))

  URL
  ;; URLs are converted to URIs and delegated.
  (root? [input]
    (root? (.toURI input))))


(defn- maybe-prepend-protocol
  "Fixes protocol-relative URLs like //jobs.arstechnica.com/list/1186:
   java.net.URI parses them with a host but does not consider them
   absolute, so the given protocol (default \"http\") is prepended.
   Absolute URIs and URIs without a host are returned unchanged.
   Throws URISyntaxException on unparseable input."
  ([^String uri-str]
     (maybe-prepend-protocol uri-str "http"))
  ([^String uri-str ^String proto]
     (let [uri (URI. uri-str)
           protocol-relative? (and (not (.isAbsolute uri))
                                   (not (nil? (.getHost uri))))]
       (if protocol-relative?
         (str proto ":" uri-str)
         uri-str))))

(defn local-to?
  "Returns a truthy value when uri-str points at host: either the parsed
   URI's host equals host (compared case-insensitively with www. prepended
   to both sides, so www.x.com and x.com are treated as the same host), or
   uri-str is relative (and therefore implicitly local).
   The query string is stripped and protocol-relative //host/... forms get
   http: prepended before parsing. Throws URISyntaxException on
   unparseable input."
  [^String uri-str ^String host]
  (let [uri (URI. (-> uri-str strip-query-string (maybe-prepend-protocol "http")))]
    (or (and (.getHost uri)
             (= (maybe-prepend (.toLowerCase host) "www.") (maybe-prepend (.toLowerCase (.getHost uri)) "www.")))
        (not (.isAbsolute uri)))))
Empty file added src/java/.gitkeep
Empty file.

0 comments on commit 23b24f4

Please sign in to comment.