Permalink
Browse files

tweaks to dis/allow repeated urls, with a test

  • Loading branch information...
1 parent c536edb commit f2662400bddf4ee5af21d9e480c306cc2ee22567 @nrabinowitz committed Jul 11, 2011
Showing with 37 additions and 7 deletions.
  1. +2 −0 .gitignore
  2. +8 −7 pjscrape.js
  3. +5 −0 tests/runtests.py
  4. +3 −0 tests/test_jquery_versions.js
  5. +16 −0 tests/test_recursive_allowrepeat.js
  6. +3 −0 tests/test_syntax.js
View
@@ -0,0 +1,2 @@
+*.pyc
+*.pyo
View
@@ -442,12 +442,6 @@ var pjs = (function(){
// look for more urls on this page
var moreUrls = page.evaluate(s.opts.moreUrls);
if (moreUrls && (!s.opts.maxDepth || s.depth < s.opts.maxDepth)) {
- // avoid repeat visits
- moreUrls = moreUrls.map(baseUrl).filter(function(url) {
- console.log("URL: " + url);
- console.log("Visited: " + JSON.stringify(visited));
- return !(url in visited);
- });
if (moreUrls.length) {
log.msg('Found ' + moreUrls.length + ' additional urls to scrape');
// make a new sub-suite
@@ -463,7 +457,14 @@ var pjs = (function(){
var runCounter = 0
function runNext() {
if (runCounter < s.urls.length) {
- s.scrape(baseUrl(s.urls[runCounter++]), scrapePage, complete);
+ url = baseUrl(s.urls[runCounter++]);
+ // avoid repeat visits
+ if (!config.allowRepeatUrls && url in visited) {
+ runNext();
+ } else {
+ // scrape this url
+ s.scrape(url, scrapePage, complete);
+ }
} else {
s.complete();
}
View
@@ -58,6 +58,11 @@ def test_recursive_noloop(self):
self.assertEqual(out, '["Test Page: Loop 1","Test Page: Loop 2"]',
"Failed, got: " + out)
+ def test_recursive_allowrepeat(self):  # runs test_recursive_allowrepeat.js and compares its stripped stdout
+ out = subprocess.check_output(COMMAND_BASE + ['test_recursive_allowrepeat.js']).strip()
+ self.assertEqual(out, '["Test Page: Loop 1","Test Page: Loop 2","Test Page: Loop 1","Test Page: Loop 2","Test Page: Loop 1"]',  # five titles alternating Loop 1 / Loop 2
+ "Failed, got: " + out)
+
def test_csv(self):
out = subprocess.check_output(COMMAND_BASE + ['test_csv.js']).strip()
# not sure why stdout uses \r\r\n, but that seems to be the case
@@ -1,3 +1,6 @@
+pjs.config({
+ allowRepeatUrls: true
+});
pjs.addSuite({
url: 'http://localhost:8888/test_site/jquery_versions.html',
@@ -0,0 +1,16 @@
+pjs.config({
+ allowRepeatUrls: true // urls that were already visited are scraped again instead of skipped
+});
+
+var scraper = function() { // scraped value: text of the first h1 element
+ return $('h1').first().text();
+};
+
+pjs.addSuite({
+ url: 'http://localhost:8888/test_site/loop1.html',
+ moreUrls: function() { // presumably returns the link targets of anchors matching 'li a' -- confirm against _pjs.getAnchorUrls
+ return _pjs.getAnchorUrls('li a');
+ },
+ maxDepth: 4, // more urls are only followed while the suite depth is below 4
+ scraper: scraper
+});
View
@@ -1,3 +1,6 @@
+pjs.config({
+ allowRepeatUrls: true
+});
pjs.addSuite({
url: 'http://localhost:8888/test_site/index.html',

0 comments on commit f266240

Please sign in to comment.