Skip to content

Commit

Permalink
Updated spider code to fetch and process data. It should look for a, …
Browse files Browse the repository at this point in the history
…script, img, and link tags and pull links out of those. This does nothing for css or dynamically generated links. I'm ok with this for now. Will check those later.

Signed-off-by: Nick Campbell <nicholas.j.campbell@gmail.com>
  • Loading branch information
ncb000gt committed May 13, 2011
1 parent 9fbead0 commit 6aef6cd
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 16 deletions.
42 changes: 42 additions & 0 deletions lib/handler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
var util = require('util');

/**
 * Binds per-status callbacks to a crawler's events.
 *
 * The callbacks found on the prototype (`statuses`) act as defaults. Any
 * entries supplied in `opts.statuses` override them for THIS instance only:
 * the merged table is stored on the instance so the shared prototype table
 * is never mutated.
 *
 * @param {Object} [opts]
 * @param {Object} [opts.statuses] Map of event name (HTTP status code) to
 *     callback; overrides the defaults with the same key.
 * @param {Object} [opts.crawler] Object exposing `on(event, listener)`;
 *     every callback in the merged table is registered on it.
 */
function Handler(opts) {
    opts = opts || {};

    // At this point `this.statuses` still resolves to the prototype's table.
    var defaults = this.statuses || {};
    var merged = {};
    var code;

    for (code in defaults) {
        merged[code] = defaults[code];
    }

    if (opts.statuses) {
        for (code in opts.statuses) {
            merged[code] = opts.statuses[code];
        }
    }

    // Shadow the prototype table with the instance-local copy.
    this.statuses = merged;

    if (opts.crawler) {
        this.crawler = opts.crawler;

        for (code in merged) {
            this.crawler.on(code, merged[code]);
        }
    }
}

exports.createHandler = function(opts) {
return new Handler(opts);
}

// Default callbacks keyed by HTTP status code. Each one just logs the
// payload it receives, tagged with the status code it was registered for.
Handler.prototype.statuses = (function() {
    var codes = [200, 301, 302, 303, 404, 500];
    var table = {};

    // Builds the logging callback for a single status code.
    function makeLogger(code) {
        var prefix = 'Default Handler (' + code + '): ';
        return function(data) {
            console.log(prefix + util.inspect(data, true, 2));
        };
    }

    for (var i = 0; i < codes.length; i++) {
        table[codes[i]] = makeLogger(codes[i]);
    }

    return table;
}());
149 changes: 143 additions & 6 deletions lib/spider.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,47 @@
var emitter = require('events').EventEmitter,
util = require('util');
util = require('util'),
url = require('url'),
htmlparser = require('htmlparser'),
redback = require('redback').createClient(),
http = require('http');

/**
 * Event-emitting crawler for a single host.
 *
 * A timer fires every `grace_period` milliseconds. On each tick it either
 * starts at most one queued request (when fewer than `concurrent_gets` are
 * outstanding) or, once the queue is empty after a crawl was started, emits
 * 'done' with the collected `urls` map, drops all listeners and stops the
 * timer.
 *
 * NOTE: 'done' is decided from the queue alone; requests that are still in
 * flight at that moment are not waited for.
 *
 * @param {Object} [opts]
 * @param {string} [opts.host] Host that requests are sent to by default.
 * @param {number} [opts.grace_period] Milliseconds between ticks; values
 *     that are missing or below 1000 are raised to 1000.
 * @param {number} [opts.concurrent_gets] Maximum outstanding requests;
 *     values that are missing, non-positive or above 10 become 10.
 */
function Spider(opts) {
    emitter.call(this);
    opts = opts || {};

    var MIN_GRACE_PERIOD = 1000,
        MAX_CONCURRENT_GETS = 10;

    this._concurrent_gets = 0;
    this.host = opts.host;
    this.keep_to_same_origin = true; //TODO: make configurable.

    if (!opts.grace_period || opts.grace_period < MIN_GRACE_PERIOD) {
        this.grace_period = MIN_GRACE_PERIOD;
    } else {
        this.grace_period = opts.grace_period;
    }

    // A non-positive limit would stop the crawler from ever fetching, so it
    // is treated like a missing value.
    if (!(opts.concurrent_gets > 0) || opts.concurrent_gets > MAX_CONCURRENT_GETS) {
        this.concurrent_gets = MAX_CONCURRENT_GETS;
    } else {
        this.concurrent_gets = opts.concurrent_gets;
    }

    //TODO: super naive for now...fix to use redis
    this.get_queue = [];
    this.urls = {};
    this.already_processed = {};
    this.started = false;

    var self = this;
    var timer = setInterval(function() {
        if (self.get_queue.length > 0) {
            // Strictly below the limit: `<=` would allow one request too many.
            if (self._concurrent_gets < self.concurrent_gets) {
                self._concurrent_gets++;
                self.get(self.get_queue.shift());
            }
        } else if (self.started) {
            // Stop ticking first so the timer is released even if a 'done'
            // listener throws, and so 'done' is emitted only once.
            clearInterval(timer);
            self.emit('done', self.urls);
            self.removeAllListeners();
        }
    }, self.grace_period);
}
util.inherits(Spider, emitter);

Expand All @@ -11,10 +50,108 @@ exports.createCrawler = function(opts) {
}

/**
 * Queues a request description for fetching and marks the crawl as started.
 *
 * @param {Object} opts Request description; `opts.path` is appended to the
 *     spider's host for the log line, and the object itself is queued.
 */
Spider.prototype.crawl = function(opts) {
    var target = 'http://' + this.host + opts.path;
    console.log('Queued "' + target + '" for checking.');

    this.get_queue.push(opts);
    this.started = true;
};

// make the actual request
/**
 * Issues an HTTP GET for one queued request description.
 *
 * When the response arrives, `opts` is annotated with `status`, `host` and
 * `headers`, an event named after the status code is emitted with `opts`,
 * and the fully received body is passed to `process(body, opts)`.
 *
 * The concurrency slot taken by the scheduler (`_concurrent_gets`) is given
 * back exactly once: when the response arrives or when the request fails.
 *
 * @param {Object} opts
 * @param {string} opts.path Request path.
 * @param {string} [opts.host] Host to contact; defaults to the spider's host.
 */
Spider.prototype.get = function(opts) {
    var self = this;
    var host = opts.host || self.host;
    var released = false;

    // Guarded so that an error arriving after the response cannot decrement
    // the counter a second time.
    function release() {
        if (!released) {
            released = true;
            self._concurrent_gets--;
        }
    }

    console.log('opts: ' + util.inspect(opts, true, 2));
    var req = http.request({
        method: "GET",
        host: host,
        path: opts.path
    },
    function(res) {
        opts.status = res.statusCode;
        // Report the host that was actually contacted.
        opts.host = host;
        opts.headers = res.headers;
        self.emit(res.statusCode, opts);

        // Let the stream decode to text so chunks can simply be appended.
        var body = '';
        res.setEncoding('utf8');
        res.on('data', function(chunk) {
            body += chunk;
        });

        res.on('end', function() {
            self.process(body, opts);
        });
        release();
    });

    req.on('error', function(err) {
        console.log('err on "' + opts.path + '" with: ' + err);
        // Without this a failed request would hold its slot forever.
        release();
    });
    //for now no post data
    req.end();
};

/**
 * Records a link found on a page and, the first time the link is seen,
 * queues it for fetching when it is crawlable.
 *
 * Bookkeeping (`urls`, `already_processed`) stays keyed by the raw link text.
 * For queueing, the link is resolved against the URL of the page it was
 * found on, so relative links, query strings and fragments produce a proper
 * request path. Only `http:` targets are queued; while
 * `keep_to_same_origin` is set the target host must equal the spider's host.
 *
 * @param {string} link Raw href/src value taken from the page.
 * @param {Object} opts Description of the page the link was found on
 *     (`path`, `status`, optionally `host`).
 */
Spider.prototype.handleLink = function(link, opts) {
    var self = this;

    if (!(link in self.urls)) {
        self.urls[link] = {
            // NOTE(review): this is the status of the page the link was found
            // on, not of the link itself -- confirm that is intended.
            status: opts.status,
            from: {}
        };
    }

    if (!(opts.path in self.urls[link].from)) {
        self.urls[link].from[opts.path] = '';
    }

    if (link in self.already_processed) {
        return;
    }
    self.already_processed[link] = ''; // never evaluate the same link twice

    var base = 'http://' + (opts.host || self.host) + (opts.path || '/');
    var target = url.parse(url.resolve(base, link));

    // mailto:, javascript:, https: etc. cannot be fetched by get().
    if (target.protocol !== 'http:') {
        return;
    }

    var same_origin = (target.host === self.host);
    if (self.keep_to_same_origin && !same_origin) {
        return;
    }

    // pathname + search drops any #fragment, which is never sent to a server.
    var new_opts = {path: (target.pathname || '/') + (target.search || '')};
    if (!same_origin) {
        new_opts.host = target.hostname;
    }
    self.get_queue.push(new_opts);
};

// get more links bro!
/**
 * Extracts links from a fetched HTML document and hands each one to
 * `handleLink(link, opts)`.
 *
 * Only responses whose `content-type` header mentions text/html are parsed;
 * anything else, including a response without that header, is ignored.
 * `src` is read from img/script elements and `href` from link/a elements,
 * walking the parsed tree recursively.
 *
 * @param {string} body Response body.
 * @param {Object} opts Page description; `opts.headers` is consulted for the
 *     content type and `opts` is passed through to `handleLink`.
 */
Spider.prototype.process = function(body, opts) {
    var self = this;
    var content_type = (opts && opts.headers && opts.headers['content-type']) || '';

    if (!/text\/html/i.test(content_type)) {
        return;
    }

    // Depth-first walk over parsed nodes; tolerates missing lists and holes.
    function handleEls(els) {
        if (!els) {
            return;
        }

        for (var i = 0; i < els.length; i++) {
            var el = els[i];
            if (!el) {
                continue;
            }

            if (el.attribs) {
                if ((el.name === 'img' || el.name === 'script') && el.attribs.src) {
                    self.handleLink(el.attribs.src, opts);
                } else if ((el.name === 'link' || el.name === 'a') && el.attribs.href) {
                    self.handleLink(el.attribs.href, opts);
                }
            }

            if (el.children && el.children.length > 0) {
                handleEls(el.children);
            }
        }
    }

    var handler = new htmlparser.DefaultHandler(function (err, dom) {
        if (err) {
            // Report the failure rather than walking an undefined tree.
            console.log('err parsing "' + opts.path + '" with: ' + err);
            return;
        }
        handleEls(dom);
    });
    var parser = new htmlparser.Parser(handler);
    parser.parseComplete(body);
};
28 changes: 21 additions & 7 deletions server.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
var redback = require('redback').createClient(),
spider = require('./lib/spider'),
var _spider = require('./lib/spider'),
_handler = require('./lib/handler'),
fs = require('fs'),
url = require('url'),
util = require('util'),
express = require('express');

var LOG_PATH = '/var/log/oh-bot.log',
Expand All @@ -26,15 +28,27 @@ app.get('/stats', function(req, res) {
});

/**
 * POST /crawl -- starts a crawl of the URL submitted in the `domain` form
 * field and renders a confirmation page.
 *
 * The URL must include a host (e.g. "http://example.com/"); otherwise the
 * request is answered with 400 and nothing is queued.
 */
app.post('/crawl', function(req, res) {
    var _domain = req.body && req.body.domain,
        p_domain = url.parse(_domain || '');

    if (!p_domain.hostname) {
        res.statusCode = 400;
        res.end('Please supply a full URL, e.g. http://example.com/');
        return;
    }

    // A URL without an explicit path starts at the site root.
    var start_path = p_domain.pathname || '/',
        crawler = _spider.createCrawler({
            host: p_domain.hostname,
            grace_period: 100 //ms
        });

    // Registers the default per-status logging callbacks on the crawler.
    _handler.createHandler({crawler: crawler});

    crawler.on('done', function(urls) {
        console.log('Site crawl is done.');
        console.log('urls processed: ' + util.inspect(urls, true, 3));
    });

    crawler.crawl({
        path: start_path
    });

    res.render('crawl', {
        host: p_domain.host,
        path: start_path
    });
});

app.get('/', function(req, res) {
Expand Down
2 changes: 2 additions & 0 deletions views/crawl.jade
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
div
| http://#{host}#{path} has been queued for takeoff.
2 changes: 0 additions & 2 deletions views/fire.jade

This file was deleted.

2 changes: 1 addition & 1 deletion views/main.jade
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ div
img(src="http://i.imgur.com/zvAvS.gif")

form(action="/crawl",method="POST")
label(for="domain")
label(for="domain") URL:
input(type="text",name="domain",id="domain")
input(type="submit",value="Crawl")

0 comments on commit 6aef6cd

Please sign in to comment.