-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
only crawls each url once; more uncrawlable urls excluded; outputs mp3 urls to file
- Loading branch information
Mary Cook
committed
Feb 17, 2010
1 parent
496e553
commit 1a33bd1
Showing
2 changed files
with
33 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,5 @@ clojure-contrib | |
clojure-contrib.jar | ||
clojure.jar | ||
jline.jar | ||
scrawl.tmproj | ||
output.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,46 @@ | ||
(ns scrawl | ||
(:require [clojure.contrib.http.agent :as http])) | ||
(:require [clojure.contrib.http.agent :as http]) | ||
(:require [clojure.contrib.duck-streams :as ds])) | ||
|
||
(defn trim-speech
  "Drops the final character of string — the closing quote that the
  URL-extraction regex captures along with each URL.
  Returns the empty string unchanged (the original interop form threw
  StringIndexOutOfBoundsException on \"\")."
  [string]
  ;; subs is the idiomatic replacement for (. string substring ...)
  (if (empty? string)
    string
    (subs string 0 (dec (count string)))))
|
||
(defn parse-urls-from
  "Fetches start-url with an HTTP GET and returns the sequence of
  http:// URLs found in the page body, each with its trailing quote
  stripped by trim-speech. Blocks until the agent has the response."
  [start-url]
  ;; let, not def: def inside defn creates namespace-global vars on
  ;; every call — a leak, and unsafe if crawls ever run concurrently.
  (let [agnt (http/http-agent start-url :method "GET")
        html (http/string agnt)
        ;; each URL runs up to the quote that closed its href attribute
        urls (re-seq #"http://.+?[\"']" html)]
    (map trim-speech urls)))
|
||
(defn crawlable-url
  "True when url is worth fetching for more links; false for asset and
  non-HTML resources, matched case-insensitively by file extension."
  [url]
  ;; One alternation replaces the original chain of per-extension
  ;; re-matches clauses. The original's final cond test was #(true) —
  ;; a function literal that happens to be truthy where :else was
  ;; intended; returning the negated match avoids cond entirely.
  (nil? (re-matches #"(?i).*?\.(?:css|gif|jpg|jpeg|mp3|cgi|dtd|js)$" url)))
|
||
(defn remove-uncrawlable-urls
  "Keeps only the URLs that crawlable-url accepts."
  [urls]
  ;; crawlable-url is already a one-argument predicate; the original's
  ;; #(crawlable-url %) lambda wrapper was redundant.
  (filter crawlable-url urls))
|
||
(defn extract-urls-to-save [urls] | ||
(filter #(re-matches #"http://.+?\.mp3.*" %) urls)) | ||
(defn remove-dupes-unwanted
  "Returns the members of strings that satisfy predicate f and are not
  already present in already-got."
  [f strings already-got]
  ;; let, not def: the original defined a global var on every call.
  ;; Building a set makes membership O(1) instead of the original's
  ;; O(m) .contains linear scan per string; string equality is the
  ;; same under both.
  (let [got (set already-got)
        unique-strings (remove got strings)]
    (filter f unique-strings)))
|
||
(defn crawl [urls-to-crawl urls-to-save] | ||
(println urls-to-save) | ||
(def all-linked-urls (parse-urls-from (first urls-to-crawl))) | ||
(def new-urls-to-save (concat urls-to-save (extract-urls-to-save all-linked-urls))) | ||
(def new-urls-to-crawl (concat (rest urls-to-crawl) (remove-uncrawlable-urls all-linked-urls))) | ||
(crawl new-urls-to-crawl new-urls-to-save)) | ||
(defn crawl
  "Breadth-first crawl. urls-crawled is the list of URLs already
  visited, urls-to-crawl the frontier queue, urls-saved the mp3 URLs
  found so far. Each batch of newly found mp3 URLs is appended to
  output.txt. Terminates when the frontier is empty; the original
  recursed unconditionally (crashing on an empty frontier) and used a
  plain self-call, which grows the JVM stack — recur makes it a loop.
  NOTE(review): a URL queued from two pages before either is crawled
  can still be visited twice; the frontier itself is not deduped."
  [urls-crawled urls-to-crawl urls-saved]
  (when-let [next-url (first urls-to-crawl)]
    (println (count urls-saved) " " next-url)
    ;; let, not def: avoid redefining global vars on every iteration
    (let [all-linked-urls (seq (into #{} (parse-urls-from next-url))) ; unique urls on page
          next-urls-crawled (cons next-url urls-crawled)
          latest-urls-to-save (remove-dupes-unwanted #(re-matches #"http://.+?\.mp3.*" %) all-linked-urls urls-saved)
          ;; dedup against next-urls-crawled (original used urls-crawled,
          ;; so a page linking to itself re-queued the URL just crawled)
          latest-urls-to-crawl (remove-dupes-unwanted crawlable-url all-linked-urls next-urls-crawled)]
      (ds/append-spit "output.txt" (println-str latest-urls-to-save))
      (recur next-urls-crawled
             (concat (rest urls-to-crawl) latest-urls-to-crawl)
             (concat urls-saved latest-urls-to-save)))))
|
||
(crawl (list "http://www.saidthegramophone.com/") ()) | ||
;; Entry point: start with nothing crawled, a single seed URL in the
;; frontier, and no mp3 URLs saved yet.
(crawl () (list "http://www.saidthegramophone.com/") ())

;; Shut down the agent thread pool so the JVM process can exit.
(shutdown-agents)