Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
103 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import urllib2 | ||
from bs4 import BeautifulSoup | ||
import re | ||
import time | ||
import os | ||
from os import listdir | ||
from os.path import isfile, join | ||
|
||
# set the working directory | ||
wd = "out/" | ||
# set the search keyword | ||
key = "iphone" | ||
onlyfiles = [f for f in listdir(wd) if isfile(join(wd,f))] | ||
|
||
# read the states.txt files from the working directory | ||
# for each file in the working directory | ||
for myfile in onlyfiles: | ||
workingfile = open(wd+myfile, "r") | ||
print "myfile is: " + myfile | ||
|
||
# for each line in the file | ||
# go to the results page and identify the URLs of the ad | ||
|
||
for line in iter(workingfile): | ||
requesturl = line.rstrip("\n") + "/search/laf?zoomToPosting=&query=" + key + "&srchType=A" | ||
print "The result url is: " + requesturl | ||
request = urllib2.Request(requesturl) | ||
request.add_header('User-agent', 'Mozilla 5.10') | ||
resultpage = urllib2.urlopen(request) | ||
results = BeautifulSoup(resultpage) | ||
listURL = results.findAll('a', href=re.compile("^http.*\/laf\/")) | ||
# for each URL go ge the ad content | ||
for ad in listURL: | ||
adurl = ad.get('href') | ||
reqadpage = urllib2.Request(adurl) | ||
reqadpage.add_header('User-agent', 'Mozilla 5.10') | ||
adpage = urllib2.urlopen(reqadpage) | ||
print "Now go to get them: " + adurl | ||
bspage = BeautifulSoup(adpage) | ||
adcontent = bspage.find("section", id="postingbody").stripped_strings | ||
for cleancontent in adcontent: | ||
content = repr(cleancontent) | ||
print "The content cleaned up is: " + content | ||
outfile = open(wd+"out-" + myfile, "a") | ||
print "Dropping content into: " + outfile.name | ||
outfile.write(content) | ||
outfile.close() | ||
# cycle through the adurl and go get the content | ||
|
||
time.sleep(10) | ||
workingfile.close() | ||
time.sleep(60) | ||
|
||
|
||
|
||
|
||
# this loads the lost and found search page for San Francisco | ||
#page = urllib2.urlopen('http://sanfrancisco.craigslist.org/search/laf?zoomToPosting=&query=iphone&srchType=A') | ||
# this puts page into BS | ||
#results = BeautifulSoup(page) | ||
#listURL = results.findAll('a', href=re.compile("^http.*\/laf\/")) | ||
# | ||
#for link in listURL: | ||
# #get whole ad text from listURL | ||
# wholead = urllib2.urlopen(link.get('href')) | ||
# ad = BeautifulSoup(wholead) | ||
# textad = ad.findAll(id='postingbody') | ||
# | ||
# print textad | ||
|
||
#listURL = results.findAll("p", {"class" : "pl"}) | ||
#listURL = results.findAll(True , {"class" : "pl"}) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from bs4 import BeautifulSoup | ||
import urllib2 | ||
import re | ||
import os | ||
|
||
# fetch the craigslist site directory and parse it
page = urllib2.urlopen('http://www.craigslist.org/about/sites')
try:
    soup = BeautifulSoup(page)
finally:
    # close the HTTP response (the original leaked it)
    page.close()

# change working directory to out/ so we don't mix code and results
os.chdir("out/")

# go through the soup: each state is marked by a div with class
# "state_delimiter"; the ul following it lists that state's city links

# lets look at the first continent USA
usa = soup.find("div", attrs={'class': 'colmask'})

# Walk every state delimiter found on the page. The original hard-coded
# range(0,51) (an IndexError if the page ever lists fewer, silently dropping
# any extras) and re-ran the same find_all twice per index; iterating the
# result once handles any count and does the search a single time.
for delimiter in usa.find_all("div", attrs={'class': 'state_delimiter'}):
    # remove whitespace from the state name so it is a safe filename
    state = re.sub(r'\s', '', delimiter.string)
    # every http href inside the ul that follows the delimiter is a city site
    links = delimiter.find_next("ul").find_all("a", href=re.compile(r"^http."))
    statename = open(state + ".txt", "w")
    try:
        # for each href found add it to the state.txt file, one per line
        for link in links:
            statename.write(link.get('href'))
            statename.write("\n")
    finally:
        # always close the file, even if a write fails
        statename.close()
|