
Initial commit
nmar committed Apr 12, 2013
1 parent dbce535 commit 4f97bd3
Showing 2 changed files with 103 additions and 0 deletions.
73 changes: 73 additions & 0 deletions clads.py
@@ -0,0 +1,73 @@
import urllib2
from bs4 import BeautifulSoup
import re
import time
import os
from os import listdir
from os.path import isfile, join

# set the working directory
wd = "out/"
# set the search keyword
key = "iphone"
onlyfiles = [f for f in listdir(wd) if isfile(join(wd,f))]

# read the states.txt files from the working directory
# for each file in the working directory
for myfile in onlyfiles:
    workingfile = open(wd+myfile, "r")
    print "myfile is: " + myfile

    # for each line in the file
    # go to the results page and identify the URLs of the ads

    for line in iter(workingfile):
        requesturl = line.rstrip("\n") + "/search/laf?zoomToPosting=&query=" + key + "&srchType=A"
        print "The result url is: " + requesturl
        request = urllib2.Request(requesturl)
        request.add_header('User-agent', 'Mozilla 5.10')
        resultpage = urllib2.urlopen(request)
        results = BeautifulSoup(resultpage)
        listURL = results.findAll('a', href=re.compile("^http.*\/laf\/"))
        # for each URL go get the ad content
        for ad in listURL:
            adurl = ad.get('href')
            reqadpage = urllib2.Request(adurl)
            reqadpage.add_header('User-agent', 'Mozilla 5.10')
            adpage = urllib2.urlopen(reqadpage)
            print "Now go to get them: " + adurl
            bspage = BeautifulSoup(adpage)
            adcontent = bspage.find("section", id="postingbody").stripped_strings
            # append each cleaned-up string of the ad body to out-<state>.txt
            for cleancontent in adcontent:
                content = repr(cleancontent)
                print "The content cleaned up is: " + content
                outfile = open(wd+"out-" + myfile, "a")
                print "Dropping content into: " + outfile.name
                outfile.write(content)
                outfile.close()
        # after cycling through the ad URLs for this result page, pause before the next request

        time.sleep(10)
    workingfile.close()
    time.sleep(60)




# this loads the lost and found search page for San Francisco
#page = urllib2.urlopen('http://sanfrancisco.craigslist.org/search/laf?zoomToPosting=&query=iphone&srchType=A')
# this puts page into BS
#results = BeautifulSoup(page)
#listURL = results.findAll('a', href=re.compile("^http.*\/laf\/"))
#
#for link in listURL:
#    #get whole ad text from listURL
#    wholead = urllib2.urlopen(link.get('href'))
#    ad = BeautifulSoup(wholead)
#    textad = ad.findAll(id='postingbody')
#
#    print textad

#listURL = results.findAll("p", {"class" : "pl"})
#listURL = results.findAll(True , {"class" : "pl"})
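
A note on the input clads.py expects: each file in out/ should hold one craigslist city base URL per line; those files are produced by clstates.py below. A minimal sketch of the search URL the script builds, assuming a hypothetical input line of http://sfbay.craigslist.org (the city URL is an assumed example, not taken from this commit):

# illustrative only: the city URL below is an assumed example input line
line = "http://sfbay.craigslist.org\n"
key = "iphone"
requesturl = line.rstrip("\n") + "/search/laf?zoomToPosting=&query=" + key + "&srchType=A"
print requesturl
# http://sfbay.craigslist.org/search/laf?zoomToPosting=&query=iphone&srchType=A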

30 changes: 30 additions & 0 deletions clstates.py
@@ -0,0 +1,30 @@
from bs4 import BeautifulSoup
import urllib2
import re
import os

page = urllib2.urlopen('http://www.craigslist.org/about/sites')
soup = BeautifulSoup(page)

# change working directory to out/ so we don't mix code and results
os.chdir("out/")

# go through the soup: stop at each state_delimiter, find the ul that follows it,
# and list the a tags inside it

# let's look at the first section of the sites page, the USA
usa = soup.find("div", attrs={'class':'colmask'})

# this cycles 51 times looking for the class called state_delimiter
# once one is found, it looks for the ul following it and inside that looks for the hrefs
for i in range(0,51):
    statespace = usa.find_all("div", attrs={'class':'state_delimiter'})[i].string
    # remove spaces from state names
    state = re.sub(r'\s', '', statespace)
    statename = open(state+".txt", "w")
    url = usa.find_all("div", attrs={'class':'state_delimiter'})[i].find_next("ul").find_all("a",href=re.compile("^http."))
    # for each href found, add it to the state .txt file
    for link in url:
        statename.write(link.get('href'))
        statename.write("\n")
    statename.close()
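
Taken together, the two scripts appear intended to run in sequence: clstates.py first, to fill out/ with one <State>.txt list of city URLs per state, then clads.py, to scrape each city's lost-and-found results into out/out-<State>.txt. A minimal driver sketch under that assumption (the script and directory names are the ones committed here; the driver itself is not part of the commit):

# run_all.py -- illustrative driver, not part of this commit
import os
import subprocess

if not os.path.isdir("out"):                # both scripts assume out/ already exists
    os.mkdir("out")
subprocess.call(["python", "clstates.py"])  # writes out/<State>.txt city URL lists
subprocess.call(["python", "clads.py"])     # appends ad text to out/out-<State>.txt

Note that clads.py reads every file in out/, so on a second run it would also treat its own out-<State>.txt output files as URL lists.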
