Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 23b24f4
Showing
15 changed files
with
3,058 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
pom.xml | ||
*jar | ||
/lib/ | ||
/classes/ | ||
.lein-failures | ||
.lein-deps-sum | ||
TAGS | ||
checkouts/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
language: clojure |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# What is Crawlista | ||
|
||
Crawlista is a support library for Clojure applications that crawl the Web. | ||
|
||
|
||
## Usage | ||
|
||
### Installation | ||
|
||
With Leiningen | ||
|
||
[clojurewerkz/crawlista "1.0.0-SNAPSHOT"] | ||
|
||
New snapshots are [published to clojars.org](https://clojars.org/clojurewerkz/crawlista) every day (if there are any changes). | ||
|
||
|
||
## Crawlista is a Work In Progress | ||
|
||
Crawlista is a work in progress. Please see our test suite for code examples. | ||
|
||
|
||
## Supported Clojure versions | ||
|
||
Crawlista is built from the ground up for Clojure 1.3 and up. | ||
|
||
|
||
## License | ||
|
||
Copyright (C) 2011 Michael S. Klishin | ||
|
||
Distributed under the Eclipse Public License, the same as Clojure. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
(defproject clojurewerkz/crawlista "1.0.0-SNAPSHOT" | ||
:description "Support library for Clojure applications that crawl the Web" | ||
:dependencies [[org.clojure/clojure "1.3.0"] | ||
[clj-http "0.2.4"] | ||
[org.jsoup/jsoup "1.6.1"] | ||
[clojurewerkz/urly "1.0.0-SNAPSHOT"]] | ||
:source-path "src/clojure" | ||
:java-source-path "src/java" | ||
:resources-path "src/resources" | ||
:dev-resources-path "test/resources" | ||
:warn-on-reflection true) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
(ns clojurewerkz.crawlista.extraction | ||
(:import [org.jsoup Jsoup] | ||
[org.jsoup.nodes Element] | ||
[java.net URI URL MalformedURLException]) | ||
(:use [clojurewerkz.crawlista.string] | ||
[clojurewerkz.crawlista.url])) | ||
|
||
;; | ||
;; Implementation | ||
;; | ||
|
||
(defn- urls-from | ||
[anchors] | ||
(map (fn [a] (.attr ^Element a "href")) anchors)) | ||
|
||
|
||
;; | ||
;; API | ||
;; | ||
|
||
(defn extract-anchors | ||
[body] | ||
(seq (-> (Jsoup/parse body) | ||
(.getElementsByTag "a")))) | ||
|
||
(defn extract-local-anchors | ||
[body uri] | ||
(let [host (.getHost (URL. uri))] | ||
(seq (-> (Jsoup/parse body) | ||
(.getElementsByTag "a"))))) | ||
|
||
(defn extract-local-urls | ||
[body uri] | ||
(let [host (.getHost (URL. (strip-query-string uri))) | ||
anchors (extract-local-anchors body uri) | ||
hrefs (urls-from anchors)] | ||
(distinct (map (fn [^String s] (normalize-url (absolutize s uri))) | ||
(filter (fn [^String s] (local-to? (strip-query-string s) host)) hrefs))))) | ||
|
||
(defn followable? | ||
[^Element anchor] | ||
(let [rel-value (.attr anchor "rel")] | ||
(or (nil? rel-value) | ||
(not (= "nofollow" | ||
(-> rel-value .toLowerCase .trim)))))) | ||
|
||
(defn extract-local-followable-anchors | ||
[body uri] | ||
(filter followable? (extract-local-anchors body uri))) | ||
|
||
(defn extract-local-followable-urls | ||
[body uri] | ||
(let [host (.getHost (URL. uri)) | ||
anchors (extract-local-followable-anchors body (strip-query-string uri)) | ||
urls (filter crawlable-href? (urls-from anchors))] | ||
(distinct (map (fn [^String s] (normalize-url (absolutize s uri))) | ||
(filter (fn [^String s] (local-to? s host)) urls))))) | ||
|
||
|
||
(defn extract-title | ||
[^String body] | ||
(-> (Jsoup/parse body) .title)) | ||
|
||
|
||
(defn has-anchor? | ||
([body uri] | ||
(let [hrefs (urls-from (extract-anchors body))] | ||
(some (fn [^String s] | ||
(and s | ||
(= (resourcify s) (resourcify uri)))) hrefs))) | ||
([body uri text] | ||
(let [anchors (extract-anchors body)] | ||
(some (fn [^Element anchor] | ||
(let [href (.attr anchor "href")] | ||
(and href | ||
(= (resourcify href) (resourcify uri)) | ||
(= (.text anchor) text)))) anchors)))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
(ns clojurewerkz.crawlista.string | ||
(:use [clojure.string :only [split blank?]])) | ||
|
||
(defn maybe-prepend | ||
[^String s ^String prefix] | ||
(.toLowerCase (if (.startsWith (.toLowerCase s) (.toLowerCase prefix)) | ||
s | ||
(str prefix s)))) | ||
|
||
(defn maybe-append | ||
[^String s ^String suffix] | ||
(.toLowerCase (if (.endsWith (.toLowerCase s) (.toLowerCase suffix)) | ||
s | ||
(str s suffix)))) | ||
|
||
(defn maybe-chopl | ||
[^String s ^String prefix] | ||
(let [ls (.toLowerCase s)] | ||
(if (.startsWith ls prefix) | ||
(.replaceAll ls (str "^" prefix) "") | ||
s))) | ||
|
||
(defn maybe-chopr | ||
[^String s ^String suffix] | ||
(let [ls (.toLowerCase s)] | ||
(if (.endsWith ls suffix) | ||
(.replaceAll ls (str suffix "$") "") | ||
s))) | ||
|
||
(defn hex-to-int | ||
[^String s] | ||
(Long/parseLong (if (.startsWith s "0x") | ||
(subs s 2) | ||
s) 16)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
(ns clojurewerkz.crawlista.url | ||
(:import [java.net URI URL MalformedURLException] | ||
[clojurewerkz.urly UrlLike]) | ||
(:use [clojure.string :only [split blank?]] | ||
[clojurewerkz.crawlista.string] | ||
[clojure.string :only [lower-case]] | ||
[clojurewerkz.urly.core :only [path-of]])) | ||
|
||
|
||
(defn strip-query-string | ||
[^String s] | ||
(.replaceAll s "\\?.*$" "")) | ||
|
||
(def resourcify | ||
(comp (fn [^String s] | ||
(if-not (re-find #"\.([a-zA-Z0-9]+)$" (path-of s)) | ||
(maybe-append s "/") | ||
s)) | ||
strip-query-string | ||
lower-case)) | ||
|
||
(defn separate-query-string | ||
[^String s] | ||
(split s #"\?")) | ||
|
||
(defn client-side-href? | ||
[^String s] | ||
(or (.startsWith s "#") | ||
(.startsWith s "(") | ||
(.startsWith (.toLowerCase s) "javascript") | ||
(blank? s))) | ||
|
||
(defn crawlable-href? | ||
[^String s] | ||
(and (not (client-side-href? s)) (try | ||
(URI. (strip-query-string s)) | ||
true | ||
(catch java.net.URISyntaxException se | ||
false) | ||
(catch Exception e | ||
false)))) | ||
|
||
|
||
(defprotocol URLNormalization | ||
(normalize-host [input] "Normalizes host by chopping off www. if necessary") | ||
(normalize-url [input] "Normalizes URL by chopping off www. at the beginning and trailing slash at the end, if necessary") | ||
(absolutize [input against] "Returns absolute URL") | ||
(relativize [input against] "Returns relative URL")) | ||
|
||
(extend-protocol URLNormalization | ||
String | ||
(normalize-host [input] | ||
(try | ||
(let [url (URL. input) | ||
url* (URL. (.getProtocol url) (maybe-chopl (.toLowerCase (.getHost url)) "www.") (.getPort url) (.getFile url))] | ||
(str url*)) | ||
(catch MalformedURLException e | ||
(maybe-chopl (.toLowerCase input) "www.")))) | ||
(normalize-url [input] | ||
(maybe-chopr (normalize-host input) "/")) | ||
(absolutize [input against] | ||
(let [[input-without-query-string query-string] (separate-query-string input) | ||
resolved (.toString (.resolve (URI. against) | ||
(URI. input-without-query-string)))] | ||
(if query-string | ||
(str resolved "?" query-string) | ||
resolved))) | ||
|
||
|
||
URL | ||
(normalize-host [input] | ||
(URL. (.getProtocol input) (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getFile input))) | ||
|
||
|
||
URI | ||
(normalize-host [input] | ||
(URI. (.getScheme input) nil (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getPath input) nil nil)) | ||
(absolutize [input ^java.net.URI against] | ||
(.resolve against input))) | ||
|
||
|
||
|
||
(defprotocol DomainRoot | ||
(root? [input] "Returns true if given URL/URI is the site root")) | ||
|
||
(extend-protocol DomainRoot | ||
String | ||
(root? [input] | ||
(root? (URI. (strip-query-string input)))) | ||
|
||
URI | ||
(root? [input] | ||
(.isEmpty (UrlLike/normalizePath (.getPath input)))) | ||
|
||
URL | ||
(root? [input] | ||
(root? (.toURI input)))) | ||
|
||
|
||
(defn- maybe-prepend-protocol | ||
"Fixes broken URLs like //jobs.arstechnica.com/list/1186 (that parse fine and both have host and are not considered absolute by java.net.URI)" | ||
([^String uri-str] | ||
(maybe-prepend-protocol uri-str "http")) | ||
([^String uri-str ^String proto] | ||
(let [uri (URI. uri-str)] | ||
(if (and (not (.isAbsolute uri)) | ||
(not (nil? (.getHost uri)))) | ||
(str proto ":" uri-str) | ||
uri-str)))) | ||
|
||
(defn local-to? | ||
[^String uri-str ^String host] | ||
(let [uri (URI. (-> uri-str strip-query-string (maybe-prepend-protocol "http")))] | ||
(or (and (.getHost uri) | ||
(= (maybe-prepend (.toLowerCase host) "www.") (maybe-prepend (.toLowerCase (.getHost uri)) "www."))) | ||
(not (.isAbsolute uri))))) |
Empty file.
Oops, something went wrong.