Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelklishin committed Dec 20, 2011
0 parents commit 23b24f4
Show file tree
Hide file tree
Showing 15 changed files with 3,058 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
pom.xml
*jar
/lib/
/classes/
.lein-failures
.lein-deps-sum
TAGS
checkouts/*
1 change: 1 addition & 0 deletions .travis.yml
@@ -0,0 +1 @@
language: clojure
31 changes: 31 additions & 0 deletions README.md
@@ -0,0 +1,31 @@
# What is Crawlista?

Crawlista is a support library for Clojure applications that crawl the Web.


## Usage

### Installation

With Leiningen

[clojurewerkz/crawlista "1.0.0-SNAPSHOT"]

New snapshots are [published to clojars.org](https://clojars.org/clojurewerkz/crawlista) every day (if there are any changes).


## Crawlista is a Work In Progress

Crawlista is a work in progress. Please see our test suite for code examples.


## Supported Clojure versions

Crawlista is built from the ground up for Clojure 1.3 and up.


## License

Copyright (C) 2011 Michael S. Klishin

Distributed under the Eclipse Public License, the same as Clojure.
11 changes: 11 additions & 0 deletions project.clj
@@ -0,0 +1,11 @@
;; Leiningen project definition for Crawlista.
(defproject clojurewerkz/crawlista "1.0.0-SNAPSHOT"
  :description "Support library for Clojure applications that crawl the Web"
  ;; clj-http for fetching, jsoup for HTML parsing, urly for URL handling.
  ;; NOTE(review): urly is a SNAPSHOT dependency — builds are not reproducible
  ;; until it is pinned to a release version.
  :dependencies [[org.clojure/clojure "1.3.0"]
                 [clj-http "0.2.4"]
                 [org.jsoup/jsoup "1.6.1"]
                 [clojurewerkz/urly "1.0.0-SNAPSHOT"]]
  ;; Clojure and Java sources live side by side under src/.
  :source-path "src/clojure"
  :java-source-path "src/java"
  :resources-path "src/resources"
  :dev-resources-path "test/resources"
  ;; Surface reflective interop calls so type hints can be added.
  :warn-on-reflection true)
77 changes: 77 additions & 0 deletions src/clojure/clojurewerkz/crawlista/extraction.clj
@@ -0,0 +1,77 @@
(ns clojurewerkz.crawlista.extraction
(:import [org.jsoup Jsoup]
[org.jsoup.nodes Element]
[java.net URI URL MalformedURLException])
(:use [clojurewerkz.crawlista.string]
[clojurewerkz.crawlista.url]))

;;
;; Implementation
;;

(defn- urls-from
  "Returns a lazy seq of the href attribute values of the given Jsoup
   anchor elements."
  [anchors]
  (for [^Element a anchors]
    (.attr a "href")))


;;
;; API
;;

(defn extract-anchors
  "Parses body as HTML with Jsoup and returns a seq of all <a> elements,
   or nil when the document contains none."
  [body]
  (let [doc     (Jsoup/parse body)
        anchors (.getElementsByTag doc "a")]
    (seq anchors)))

(defn extract-local-anchors
  "Returns a seq of all <a> elements in body, or nil when there are none.
   NOTE(review): despite the name, no host filtering happens here — the
   original bound (.getHost (URL. uri)) but never used it, so callers such
   as extract-local-urls do the host filtering themselves. The dead binding
   has been removed (which also means a malformed uri no longer throws
   here); the uri parameter is kept for interface compatibility."
  [body uri]
  (seq (-> (Jsoup/parse body)
           (.getElementsByTag "a"))))

(defn extract-local-urls
  "Extracts hrefs from body, keeps those local to the host of uri
   (query strings are stripped before the host comparison), absolutizes
   them against uri, normalizes them, and returns the distinct results."
  [body uri]
  (let [host   (.getHost (URL. (strip-query-string uri)))
        hrefs  (urls-from (extract-local-anchors body uri))
        local? (fn [^String s] (local-to? (strip-query-string s) host))]
    (->> hrefs
         (filter local?)
         (map (fn [^String s] (normalize-url (absolutize s uri))))
         distinct)))

(defn followable?
  "Returns true unless the anchor is marked rel=\"nofollow\".
   The rel attribute is a whitespace-separated list of link types, so the
   value is split and scanned for a nofollow token — the original equality
   check missed multi-valued attributes such as rel=\"external nofollow\".
   NOTE(review): Jsoup's .attr returns \"\" (never nil) for a missing
   attribute; the nil guard is kept for safety but should never fire."
  [^Element anchor]
  (let [rel-value (.attr anchor "rel")]
    (or (nil? rel-value)
        (not-any? #(= "nofollow" %)
                  (-> rel-value .toLowerCase .trim (.split "\\s+"))))))

(defn extract-local-followable-anchors
  "Returns the anchors of body that are local to uri and are not marked
   rel=\"nofollow\"."
  [body uri]
  (->> (extract-local-anchors body uri)
       (filter followable?)))

(defn extract-local-followable-urls
  "Extracts followable (no rel=\"nofollow\"), crawlable, host-local hrefs
   from body, absolutizes them against uri, normalizes them, and returns
   the distinct results.
   NOTE(review): unlike extract-local-urls, the query string is stripped
   from uri before anchor extraction but NOT from each href before the
   local-to? check — confirm this asymmetry is intended."
  [body uri]
  (let [host    (.getHost (URL. uri))
        anchors (extract-local-followable-anchors body (strip-query-string uri))
        hrefs   (filter crawlable-href? (urls-from anchors))
        finish  (fn [^String s] (normalize-url (absolutize s uri)))]
    (->> hrefs
         (filter (fn [^String s] (local-to? s host)))
         (map finish)
         distinct)))


(defn extract-title
  "Parses body as HTML and returns the text of its <title> element."
  [^String body]
  (let [doc (Jsoup/parse body)]
    (.title doc)))


(defn has-anchor?
  "Returns a truthy value when body contains an anchor whose href points
   at uri; the 3-arity additionally requires the anchor's text to equal
   text exactly. Hrefs and uri are compared via resourcify (lowercased,
   query-stripped, trailing-slash normalized), so minor spelling
   differences still match."
  ([body uri]
     (let [hrefs (urls-from (extract-anchors body))]
       ;; Compare resourcified forms so e.g. trailing slashes don't break equality.
       (some (fn [^String s]
               (and s
                    (= (resourcify s) (resourcify uri)))) hrefs)))
  ([body uri text]
     (let [anchors (extract-anchors body)]
       (some (fn [^Element anchor]
               ;; NOTE(review): Jsoup .attr returns "" for a missing attribute,
               ;; so the (and href ...) guard is likely always truthy — confirm.
               (let [href (.attr anchor "href")]
                 (and href
                      (= (resourcify href) (resourcify uri))
                      (= (.text anchor) text)))) anchors))))
34 changes: 34 additions & 0 deletions src/clojure/clojurewerkz/crawlista/string.clj
@@ -0,0 +1,34 @@
(ns clojurewerkz.crawlista.string
(:use [clojure.string :only [split blank?]]))

(defn maybe-prepend
  "Ensures s begins with prefix, comparing case-insensitively.
   The result is always lowercased."
  [^String s ^String prefix]
  (let [ls       (.toLowerCase s)
        lp       (.toLowerCase prefix)
        prefixed (if (.startsWith ls lp)
                   s
                   (str prefix s))]
    (.toLowerCase prefixed)))

(defn maybe-append
  "Ensures s ends with suffix, comparing case-insensitively.
   The result is always lowercased."
  [^String s ^String suffix]
  (let [ls       (.toLowerCase s)
        lsuf     (.toLowerCase suffix)
        suffixed (if (.endsWith ls lsuf)
                   s
                   (str s suffix))]
    (.toLowerCase suffixed)))

(defn maybe-chopl
  "Chops prefix off the left end of s when present (the check is against
   the lowercased s). Returns the lowercased remainder on a match, or s
   unchanged otherwise.
   Uses subs instead of the original replaceAll so regex metacharacters in
   prefix are treated literally — previously prefix was interpolated into
   a regular expression, so e.g. the dot in \"www.\" matched any character
   and a prefix like \"a+\" could strip the wrong characters."
  [^String s ^String prefix]
  (let [ls (.toLowerCase s)]
    (if (.startsWith ls prefix)
      (subs ls (count prefix))
      s)))

(defn maybe-chopr
  "Chops suffix off the right end of s when present (the check is against
   the lowercased s). Returns the lowercased remainder on a match, or s
   unchanged otherwise.
   Uses subs instead of the original replaceAll so regex metacharacters in
   suffix are treated literally — previously suffix was interpolated into
   a regular expression, which could strip the wrong characters."
  [^String s ^String suffix]
  (let [ls (.toLowerCase s)]
    (if (.endsWith ls suffix)
      (subs ls 0 (- (count ls) (count suffix)))
      s)))

(defn hex-to-int
  "Parses a hexadecimal string into a long, with or without a 0x/0X
   prefix. The original only recognized a lowercase \"0x\" prefix, so
   \"0XFF\" failed with a NumberFormatException.
   Throws NumberFormatException on non-hex input."
  [^String s]
  (let [digits (if (or (.startsWith s "0x")
                       (.startsWith s "0X"))
                 (subs s 2)
                 s)]
    (Long/parseLong digits 16)))
116 changes: 116 additions & 0 deletions src/clojure/clojurewerkz/crawlista/url.clj
@@ -0,0 +1,116 @@
(ns clojurewerkz.crawlista.url
(:import [java.net URI URL MalformedURLException]
[clojurewerkz.urly UrlLike])
(:use [clojure.string :only [split blank?]]
[clojurewerkz.crawlista.string]
[clojure.string :only [lower-case]]
[clojurewerkz.urly.core :only [path-of]]))


(defn strip-query-string
  "Removes the query string — everything from the first '?' (inclusive)
   to the end of the line — from s."
  [^String s]
  (clojure.string/replace s #"\?.*$" ""))

(defn resourcify
  "Canonicalizes a URL string for comparison: lowercases it, strips its
   query string, and appends a trailing slash when the path does not end
   in a file extension."
  [^String s]
  (let [base (-> s lower-case strip-query-string)]
    (if (re-find #"\.([a-zA-Z0-9]+)$" (path-of base))
      base
      (maybe-append base "/"))))

(defn separate-query-string
  "Splits s into [path] or [path query-string] on the FIRST '?'.
   The original split on every '?', so a query string that itself
   contained a '?' (legal in URLs) was truncated when absolutize
   reassembled it; a split limit of 2 keeps the full query intact.
   A lone trailing empty component (s ending in '?') is dropped to match
   the original no-limit behavior of clojure.string/split."
  [^String s]
  (let [parts (split s #"\?" 2)]
    (if (and (= 2 (count parts))
             (= "" (peek parts)))
      (pop parts)
      parts)))

(defn client-side-href?
  "True for hrefs that only have meaning in the browser and should not be
   crawled: fragment links (#...), parenthesized expressions, javascript:
   pseudo-URLs, and blank strings.
   (Evaluation order is significant: a nil s throws NPE on the first
   startsWith call, as before.)"
  [^String s]
  (let [punctuation-href? (or (.startsWith s "#")
                              (.startsWith s "("))]
    (or punctuation-href?
        (.startsWith (.toLowerCase s) "javascript")
        (blank? s))))

(defn crawlable-href?
  "True when s is a server-side href (not client-side per
   client-side-href?) whose query-stripped form parses as a java.net.URI.
   Any parse failure yields false."
  [^String s]
  (if (client-side-href? s)
    false
    (try
      (URI. (strip-query-string s))
      true
      ;; The original caught URISyntaxException and Exception separately,
      ;; both returning false — a single catch is equivalent.
      (catch Exception _
        false))))


;; Polymorphic URL normalization over String, java.net.URL and java.net.URI.
;; NOTE(review): the extend-protocol implementations in this file do not all
;; provide every operation (e.g. relativize has no implementation at all);
;; callers must stick to the combinations actually implemented.
(defprotocol URLNormalization
  (normalize-host [input] "Normalizes host by chopping off www. if necessary")
  (normalize-url [input] "Normalizes URL by chopping off www. at the beginning and trailing slash at the end, if necessary")
  (absolutize [input against] "Returns absolute URL")
  (relativize [input against] "Returns relative URL"))

(extend-protocol URLNormalization
  String
  ;; Rebuild the URL with a www.-stripped, lowercased host; if the input
  ;; does not parse as a URL, fall back to chopping www. off the raw string.
  (normalize-host [input]
    (try
      (let [url (URL. input)
            url* (URL. (.getProtocol url) (maybe-chopl (.toLowerCase (.getHost url)) "www.") (.getPort url) (.getFile url))]
        (str url*))
      (catch MalformedURLException e
        (maybe-chopl (.toLowerCase input) "www."))))
  ;; normalize-host plus removal of a trailing slash.
  (normalize-url [input]
    (maybe-chopr (normalize-host input) "/"))
  ;; Resolve input against `against` with the query string held aside
  ;; (java.net.URI resolution would otherwise mangle it), then re-attach it.
  (absolutize [input against]
    (let [[input-without-query-string query-string] (separate-query-string input)
          resolved (.toString (.resolve (URI. against)
                                        (URI. input-without-query-string)))]
      (if query-string
        (str resolved "?" query-string)
        resolved)))


  ;; NOTE(review): only normalize-host is implemented for URL inputs;
  ;; normalize-url/absolutize on a URL will throw at runtime.
  URL
  (normalize-host [input]
    (URL. (.getProtocol input) (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getFile input)))


  ;; NOTE(review): URI inputs get normalize-host and absolutize only.
  ;; The rebuilt URI drops userinfo, query and fragment components.
  URI
  (normalize-host [input]
    (URI. (.getScheme input) nil (maybe-chopl (.toLowerCase (.getHost input)) "www.") (.getPort input) (.getPath input) nil nil))
  (absolutize [input ^java.net.URI against]
    (.resolve against input)))



;; Predicate protocol answering "is this URL/URI the root of its site?",
;; implemented below for String, java.net.URI and java.net.URL.
(defprotocol DomainRoot
  (root? [input] "Returns true if given URL/URI is the site root"))

(extend-protocol DomainRoot
  String
  ;; Strings are parsed (query string stripped first) and delegated to the
  ;; URI implementation. Throws URISyntaxException on unparseable input.
  (root? [input]
    (root? (URI. (strip-query-string input))))

  URI
  ;; Root means the normalized path is empty; normalization is delegated to
  ;; urly's UrlLike. NOTE(review): presumably "/" normalizes to "" — confirm
  ;; against UrlLike/normalizePath.
  (root? [input]
    (.isEmpty (UrlLike/normalizePath (.getPath input))))

  URL
  ;; URLs are converted to URIs and delegated.
  (root? [input]
    (root? (.toURI input))))


(defn- maybe-prepend-protocol
  "Fixes protocol-relative URLs like //jobs.arstechnica.com/list/1186:
   java.net.URI parses them with a host but does not consider them
   absolute, so the given protocol (default \"http\") is prepended.
   Absolute URIs and URIs without a host are returned unchanged.
   Throws URISyntaxException on unparseable input."
  ([^String uri-str]
     (maybe-prepend-protocol uri-str "http"))
  ([^String uri-str ^String proto]
     (let [uri (URI. uri-str)
           protocol-relative? (and (not (.isAbsolute uri))
                                   (not (nil? (.getHost uri))))]
       (if protocol-relative?
         (str proto ":" uri-str)
         uri-str))))

(defn local-to?
  "Returns a truthy value when uri-str points at host: either the parsed
   URI's host equals host (compared case-insensitively with www. prepended
   to both sides, so www.x.com and x.com are treated as the same host), or
   uri-str is relative (and therefore implicitly local).
   The query string is stripped and protocol-relative //host/... forms get
   http: prepended before parsing. Throws URISyntaxException on
   unparseable input."
  [^String uri-str ^String host]
  (let [uri (URI. (-> uri-str strip-query-string (maybe-prepend-protocol "http")))]
    (or (and (.getHost uri)
             (= (maybe-prepend (.toLowerCase host) "www.") (maybe-prepend (.toLowerCase (.getHost uri)) "www.")))
        (not (.isAbsolute uri)))))
Empty file added src/java/.gitkeep
Empty file.

0 comments on commit 23b24f4

Please sign in to comment.