Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
103 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import urllib2 | ||
from bs4 import BeautifulSoup | ||
import re | ||
import time | ||
import os | ||
from os import listdir | ||
from os.path import isfile, join | ||
|
||
# set the working directory | ||
wd = "out/" | ||
# set the search keyword | ||
key = "iphone" | ||
onlyfiles = [f for f in listdir(wd) if isfile(join(wd,f))] | ||
|
||
# read the states.txt files from the working directory | ||
# for each file in the working directory | ||
for myfile in onlyfiles: | ||
workingfile = open(wd+myfile, "r") | ||
print "myfile is: " + myfile | ||
|
||
# for each line in the file | ||
# go to the results page and identify the URLs of the ad | ||
|
||
for line in iter(workingfile): | ||
requesturl = line.rstrip("\n") + "/search/laf?zoomToPosting=&query=" + key + "&srchType=A" | ||
print "The result url is: " + requesturl | ||
request = urllib2.Request(requesturl) | ||
request.add_header('User-agent', 'Mozilla 5.10') | ||
resultpage = urllib2.urlopen(request) | ||
results = BeautifulSoup(resultpage) | ||
listURL = results.findAll('a', href=re.compile("^http.*\/laf\/")) | ||
# for each URL go ge the ad content | ||
for ad in listURL: | ||
adurl = ad.get('href') | ||
reqadpage = urllib2.Request(adurl) | ||
reqadpage.add_header('User-agent', 'Mozilla 5.10') | ||
adpage = urllib2.urlopen(reqadpage) | ||
print "Now go to get them: " + adurl | ||
bspage = BeautifulSoup(adpage) | ||
adcontent = bspage.find("section", id="postingbody").stripped_strings | ||
for cleancontent in adcontent: | ||
content = repr(cleancontent) | ||
print "The content cleaned up is: " + content | ||
outfile = open(wd+"out-" + myfile, "a") | ||
print "Dropping content into: " + outfile.name | ||
outfile.write(content) | ||
outfile.close() | ||
# cycle through the adurl and go get the content | ||
|
||
time.sleep(10) | ||
workingfile.close() | ||
time.sleep(60) | ||
|
||
|
||
|
||
|
||
# this loads the lost and found search page for San Francisco | ||
#page = urllib2.urlopen('http://sanfrancisco.craigslist.org/search/laf?zoomToPosting=&query=iphone&srchType=A') | ||
# this puts page into BS | ||
#results = BeautifulSoup(page) | ||
#listURL = results.findAll('a', href=re.compile("^http.*\/laf\/")) | ||
# | ||
#for link in listURL: | ||
# #get whole ad text from listURL | ||
# wholead = urllib2.urlopen(link.get('href')) | ||
# ad = BeautifulSoup(wholead) | ||
# textad = ad.findAll(id='postingbody') | ||
# | ||
# print textad | ||
|
||
#listURL = results.findAll("p", {"class" : "pl"}) | ||
#listURL = results.findAll(True , {"class" : "pl"}) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from bs4 import BeautifulSoup | ||
import urllib2 | ||
import re | ||
import os | ||
|
||
# fetch the craigslist site directory and parse it
page = urllib2.urlopen('http://www.craigslist.org/about/sites')
try:
    soup = BeautifulSoup(page)
finally:
    # close the HTTP response (the original leaked it)
    page.close()

# change working directory to out/ so we don't mix code and results
os.chdir("out/")

# go through the soup: each state is marked by a div with class
# "state_delimiter"; the ul following it lists that state's city links

# lets look at the first continent USA
usa = soup.find("div", attrs={'class': 'colmask'})

# Walk every state delimiter found on the page. The original hard-coded
# range(0,51) (an IndexError if the page ever lists fewer, silently dropping
# any extras) and re-ran the same find_all twice per index; iterating the
# result once handles any count and does the search a single time.
for delimiter in usa.find_all("div", attrs={'class': 'state_delimiter'}):
    # remove whitespace from the state name so it is a safe filename
    state = re.sub(r'\s', '', delimiter.string)
    # every http href inside the ul that follows the delimiter is a city site
    links = delimiter.find_next("ul").find_all("a", href=re.compile(r"^http."))
    statename = open(state + ".txt", "w")
    try:
        # for each href found add it to the state.txt file, one per line
        for link in links:
            statename.write(link.get('href'))
            statename.write("\n")
    finally:
        # always close the file, even if a write fails
        statename.close()
|