only crawls each url once; more uncrawlable urls excluded; outputs mp3 urls to file
Mary Cook committed Feb 17, 2010
1 parent 496e553 commit 1a33bd1
Showing 2 changed files with 33 additions and 22 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -2,3 +2,5 @@ clojure-contrib
 clojure-contrib.jar
 clojure.jar
 jline.jar
+scrawl.tmproj
+output.txt
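output.txt joins .gitignore because each crawl step now appends the mp3 urls it finds there (the ds/append-spit call in main.clj below). A one-line sketch of that append, with a hypothetical url as the payload:

(require '[clojure.contrib.duck-streams :as ds])
;; append-spit writes its second argument to the end of the file
(ds/append-spit "output.txt" (println-str '("http://example.com/song.mp3")))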
53 changes: 31 additions & 22 deletions main.clj
@@ -1,37 +1,46 @@
 (ns scrawl
-  (:require [clojure.contrib.http.agent :as http]))
+  (:require [clojure.contrib.http.agent :as http])
+  (:require [clojure.contrib.duck-streams :as ds]))
 
 (defn trim-speech [string]
   (. string substring 0 (- (count string) 1)))
 
 (defn parse-urls-from [start-url]
   (def agnt (http/http-agent start-url :method "GET"))
   (def html (http/string agnt))
-  (re-seq #"http://.+?[\"']" html))
+  (def urls (re-seq #"http://.+?[\"']" html))
+  (map trim-speech urls))
 
 (defn crawlable-url [url]
   (cond
-    (re-matches #"(?i).*?\.css.*?" url) false
-    (re-matches #"(?i).*?\.gif.*?" url) false
-    (re-matches #"(?i).*?\.jpg.*?" url) false
-    (re-matches #"(?i).*?\.jpeg.*?" url) false
-    (re-matches #"(?i).*?\.mp3.*?" url) false
-    (re-matches #"(?i).*?\.cgi.*?" url) false
-    (re-matches #"(?i).*?\.dtd.*?" url) false
-    #(true) true
+    (re-matches #"(?i).*?\.css$" url) false
+    (re-matches #"(?i).*?\.gif$" url) false
+    (re-matches #"(?i).*?\.jpg$" url) false
+    (re-matches #"(?i).*?\.jpeg$" url) false
+    (re-matches #"(?i).*?\.mp3$" url) false
+    (re-matches #"(?i).*?\.cgi$" url) false
+    (re-matches #"(?i).*?\.dtd$" url) false
+    (re-matches #"(?i).*?\.js$" url) false
+    #(true) true
   )
 )
 
-(defn remove-uncrawlable-urls [urls]
-  (filter #(crawlable-url %) urls))
-
-(defn extract-urls-to-save [urls]
-  (filter #(re-matches #"http://.+?\.mp3.*" %) urls))
+(defn remove-dupes-unwanted [f strings already-got]
+  (def unique-strings (remove #(.contains already-got %) strings))
+  (filter f unique-strings))
 
-(defn crawl [urls-to-crawl urls-to-save]
-  (println urls-to-save)
-  (def all-linked-urls (parse-urls-from (first urls-to-crawl)))
-  (def new-urls-to-save (concat urls-to-save (extract-urls-to-save all-linked-urls)))
-  (def new-urls-to-crawl (concat (rest urls-to-crawl) (remove-uncrawlable-urls all-linked-urls)))
-  (crawl new-urls-to-crawl new-urls-to-save))
+(defn crawl [urls-crawled urls-to-crawl urls-saved]
+  (def next-url (first urls-to-crawl))
+  (println (count urls-saved) " " next-url)
+  (def all-linked-urls (seq (into #{} (parse-urls-from next-url)))) ; unique urls on page
+  (def next-urls-crawled (cons next-url urls-crawled))
+  (def latest-urls-to-save (remove-dupes-unwanted #(re-matches #"http://.+?\.mp3.*" %) all-linked-urls urls-saved))
+  (ds/append-spit "output.txt" (println-str latest-urls-to-save))
+  (def next-urls-saved (concat urls-saved latest-urls-to-save))
+  (def latest-urls-to-crawl (remove-dupes-unwanted #(crawlable-url %) all-linked-urls urls-crawled))
+  (def next-urls-to-crawl (concat (rest urls-to-crawl) latest-urls-to-crawl))
+  (crawl next-urls-crawled next-urls-to-crawl next-urls-saved))
 
-(crawl (list "http://www.saidthegramophone.com/") ())
+(crawl () (list "http://www.saidthegramophone.com/") ())
 
 (shutdown-agents)
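The new crawl still builds all of its intermediate state with def inside defn, which creates and rebinds global vars on every call, and the bare self-call in tail position still grows the stack, since Clojure does not eliminate tail calls; a long enough crawl will eventually overflow, and once urls-to-crawl empties it passes nil to http-agent. A minimal sketch of the same loop with let bindings and loop/recur, reusing the commit's own helpers; crawl-loop and anything else beyond the diff is assumed, not part of the commit:

(defn crawl-loop [start-url]
  (loop [urls-crawled #{}
         urls-to-crawl (list start-url)
         urls-saved #{}]
    ;; stop when the queue is empty instead of crawling nil
    (when-let [next-url (first urls-to-crawl)]
      (println (count urls-saved) " " next-url)
      (let [all-linked-urls (distinct (parse-urls-from next-url))
            to-save (remove-dupes-unwanted
                      #(re-matches #"http://.+?\.mp3.*" %) all-linked-urls urls-saved)
            to-crawl (remove-dupes-unwanted
                       #(crawlable-url %) all-linked-urls urls-crawled)]
        (ds/append-spit "output.txt" (println-str to-save))
        ;; recur reuses the stack frame, so the loop can run indefinitely
        (recur (conj urls-crawled next-url)
               (concat (rest urls-to-crawl) to-crawl)
               (into urls-saved to-save))))))

Sets also make the already-seen checks effectively constant-time, where .contains on the lists in the diff scans linearly; and the catch-all #(true) in crawlable-url only works because a function literal happens to be truthy, where :else is the idiomatic final cond clause.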
