Crawler performance optimizations
pfleidi committed Mar 24, 2011
1 parent ce814fa commit 4b0e045
Showing 1 changed file with 30 additions and 16 deletions.
example/crawler.js: 30 additions & 16 deletions
@@ -8,39 +8,53 @@
 var HttpClient = require('../');
 var Url = require('url');
+var Crypto = require('crypto');
 
 var client = HttpClient.createClient();
 var regex = /<a href=["'](\S*)["'].*>/g;
 
+var crawled = [];
+var failed = [];
+
 var entryUrl = process.argv[2];
 var urlMatch = new RegExp(process.argv[3]);
 
 function getMatches(content) {
   var matches = [];
   var match;
 
   while (match = regex.exec(content)) {
     if (match && urlMatch.test(match)) {
-      matches.push(match[1]);
+      // we need the url only once
+      if (matches.indexOf(match[1]) === -1) {
+        matches.push(match[1]);
+      }
     }
   }
   return matches;
 }
 
 function crawl(url) {
-  console.log('Crawling: ' + url);
-
-  client.get(url)
-    .on('error', function (err) {
-      console.log('Error: ' + err);
-    })
-    .on('http-error', function (data, resp) {
-      console.log('HTTP Status Code > 400');
-      console.log('Response: ' + data);
-    })
-    .on('success', function (data, resp) {
-      processContent(url, data);
-    });
+  var hash = Crypto.createHash('sha1').update(url).digest('hex');
+
+  if (crawled.indexOf(hash) === -1 && failed.indexOf(hash) === -1) {
+    console.log('Crawling: ' + url);
+
+    client.get(url)
+      .on('error', function (err) {
+        console.log('Error: ' + err);
+        failed.push(hash);
+      })
+      .on('http-error', function (data, resp) {
+        console.log('HTTP Status Code > 400 for: ' + url);
+      })
+      .on('success', function (data, resp) {
+        processContent(url, data);
+      })
+      .on('complete', function () {
+        crawled.push(hash);
+      });
+  }
 }
 
 function processContent(lastUrl, content) {
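
For context, a minimal standalone sketch of the dedup technique this diff introduces: each URL is reduced to a SHA-1 hex digest, and a URL is crawled only if its digest has not been recorded yet. The names sha1, seen, and firstVisit below are illustrative, not from the commit; the sketch also swaps the commit's Array#indexOf scan (a linear search per lookup) for a plain object used as a hash set (constant-time lookup), which matters as the list of visited pages grows.

var Crypto = require('crypto');

// Illustrative helper (not part of the commit): reduce a URL to a short key.
function sha1(url) {
  return Crypto.createHash('sha1').update(url).digest('hex');
}

var seen = {}; // digest -> true, a plain object acting as a hash set

// Returns true the first time a URL is offered, false on every repeat.
function firstVisit(url) {
  var hash = sha1(url);
  if (seen[hash]) {
    return false;
  }
  seen[hash] = true;
  return true;
}

console.log(firstVisit('http://example.com/')); // true
console.log(firstVisit('http://example.com/')); // false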
