only crawls each url once; more uncrawlable urls excluded; outputs mp3 urls to file
Mary Cook committed Feb 17, 2010
1 parent 496e553 commit 1a33bd1
Showing 2 changed files with 33 additions and 22 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@ clojure-contrib
 clojure-contrib.jar
 clojure.jar
 jline.jar
+scrawl.tmproj
+output.txt
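output.txt joins .gitignore because each crawl step now appends the mp3 urls it finds there (the ds/append-spit call in main.clj below). A one-line sketch of that append, with a hypothetical url as the payload:

(require '[clojure.contrib.duck-streams :as ds])
;; append-spit writes its second argument to the end of the file
(ds/append-spit "output.txt" (println-str '("http://example.com/song.mp3")))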
53 changes: 31 additions & 22 deletions main.clj
@@ -1,37 +1,46 @@
 (ns scrawl
-  (:require [clojure.contrib.http.agent :as http]))
+  (:require [clojure.contrib.http.agent :as http])
+  (:require [clojure.contrib.duck-streams :as ds]))
 
 (defn trim-speech [string]
   (. string substring 0 (- (count string) 1)))
 
 (defn parse-urls-from [start-url]
   (def agnt (http/http-agent start-url :method "GET"))
   (def html (http/string agnt))
-  (re-seq #"http://.+?[\"']" html))
+  (def urls (re-seq #"http://.+?[\"']" html))
+  (map trim-speech urls))
 
 (defn crawlable-url [url]
   (cond
-    (re-matches #"(?i).*?\.css.*?" url) false
-    (re-matches #"(?i).*?\.gif.*?" url) false
-    (re-matches #"(?i).*?\.jpg.*?" url) false
-    (re-matches #"(?i).*?\.jpeg.*?" url) false
-    (re-matches #"(?i).*?\.mp3.*?" url) false
-    (re-matches #"(?i).*?\.cgi.*?" url) false
-    (re-matches #"(?i).*?\.dtd.*?" url) false
-    #(true) true
+    (re-matches #"(?i).*?\.css$" url) false
+    (re-matches #"(?i).*?\.gif$" url) false
+    (re-matches #"(?i).*?\.jpg$" url) false
+    (re-matches #"(?i).*?\.jpeg$" url) false
+    (re-matches #"(?i).*?\.mp3$" url) false
+    (re-matches #"(?i).*?\.cgi$" url) false
+    (re-matches #"(?i).*?\.dtd$" url) false
+    (re-matches #"(?i).*?\.js$" url) false
+    #(true) true
   )
 )
 
-(defn remove-uncrawlable-urls [urls]
-  (filter #(crawlable-url %) urls))
-
-(defn extract-urls-to-save [urls]
-  (filter #(re-matches #"http://.+?\.mp3.*" %) urls))
+(defn remove-dupes-unwanted [f strings already-got]
+  (def unique-strings (remove #(.contains already-got %) strings))
+  (filter f unique-strings))
 
-(defn crawl [urls-to-crawl urls-to-save]
-  (println urls-to-save)
-  (def all-linked-urls (parse-urls-from (first urls-to-crawl)))
-  (def new-urls-to-save (concat urls-to-save (extract-urls-to-save all-linked-urls)))
-  (def new-urls-to-crawl (concat (rest urls-to-crawl) (remove-uncrawlable-urls all-linked-urls)))
-  (crawl new-urls-to-crawl new-urls-to-save))
+(defn crawl [urls-crawled urls-to-crawl urls-saved]
+  (def next-url (first urls-to-crawl))
+  (println (count urls-saved) " " next-url)
+  (def all-linked-urls (seq (into #{} (parse-urls-from next-url)))) ; unique urls on page
+  (def next-urls-crawled (cons next-url urls-crawled))
+  (def latest-urls-to-save (remove-dupes-unwanted #(re-matches #"http://.+?\.mp3.*" %) all-linked-urls urls-saved))
+  (ds/append-spit "output.txt" (println-str latest-urls-to-save))
+  (def next-urls-saved (concat urls-saved latest-urls-to-save))
+  (def latest-urls-to-crawl (remove-dupes-unwanted #(crawlable-url %) all-linked-urls urls-crawled))
+  (def next-urls-to-crawl (concat (rest urls-to-crawl) latest-urls-to-crawl))
+  (crawl next-urls-crawled next-urls-to-crawl next-urls-saved))
 
-(crawl (list "http://www.saidthegramophone.com/") ())
+(crawl () (list "http://www.saidthegramophone.com/") ())
 
 (shutdown-agents)
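The new crawl still builds all of its intermediate state with def inside defn, which creates and rebinds global vars on every call, and the bare self-call in tail position still grows the stack, since Clojure does not eliminate tail calls; a long enough crawl will eventually overflow, and once urls-to-crawl empties it passes nil to http-agent. A minimal sketch of the same loop with let bindings and loop/recur, reusing the commit's own helpers; crawl-loop and anything else beyond the diff is assumed, not part of the commit:

(defn crawl-loop [start-url]
  (loop [urls-crawled #{}
         urls-to-crawl (list start-url)
         urls-saved #{}]
    ;; stop when the queue is empty instead of crawling nil
    (when-let [next-url (first urls-to-crawl)]
      (println (count urls-saved) " " next-url)
      (let [all-linked-urls (distinct (parse-urls-from next-url))
            to-save (remove-dupes-unwanted
                      #(re-matches #"http://.+?\.mp3.*" %) all-linked-urls urls-saved)
            to-crawl (remove-dupes-unwanted
                       #(crawlable-url %) all-linked-urls urls-crawled)]
        (ds/append-spit "output.txt" (println-str to-save))
        ;; recur reuses the stack frame, so the loop can run indefinitely
        (recur (conj urls-crawled next-url)
               (concat (rest urls-to-crawl) to-crawl)
               (into urls-saved to-save))))))

Sets also make the already-seen checks effectively constant-time, where .contains on the lists in the diff scans linearly; and the catch-all #(true) in crawlable-url only works because a function literal happens to be truthy, where :else is the idiomatic final cond clause.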
