Permalink
Browse files

Added --spoof, removed daemon/expresso dependencies, added 2 new buil…

…tins
  • Loading branch information...
1 parent b014b24 commit 3101b0cbb3107cfea059d628d90ebe5b8aed452c @chriso chriso committed Mar 1, 2011
Showing with 168 additions and 17 deletions.
  1. +5 −0 HISTORY.md
  2. +1 −1 builtin/install
  3. +56 −0 builtin/urls.js
  4. +69 −0 builtin/urls_recurse.js
  5. +2 −2 lib/node.io/dom.js
  6. +5 −1 lib/node.io/interfaces/cli.js
  7. +22 −9 lib/node.io/request.js
  8. +6 −1 lib/node.io/utils.js
  9. +2 −3 package.json
View
@@ -1,3 +1,8 @@
+### v0.2.3-2
+ * Removed daemon and expresso as required dependencies
+ * Added --spoof for spoofing user agents
+ * Added built-in modules for recursively scraping urls
+
### v0.2.3-1
* Fixed relative Location header bug
* Moved soupselect to ./vendor as its package.json is broken
View
@@ -1,4 +1,4 @@
mkdir -p ~/.node_libraries
coffee -c *.coffee
cp -f *.js ~/.node_libraries
-rm -rf ./*.js
+
View
@@ -0,0 +1,56 @@
+var nodeio = require('node.io'),
+ resolve = require('url').resolve,
+ options = {timeout: 10, max: 20, spoof: true},
+ pattern = null;
+
+exports.job = new nodeio.Job(options, {
+ run: function (url) {
+ var self = this;
+
+ //Allow a pattern to be specified at the command line
+ if (this.options.args.length) {
+ pattern = new RegExp(this.options.args[0], 'i');
+ }
+
+ this.getHtml(url, function (err, $, data) {
+ try {
+
+ //If there was an error or the page was incomplete, retry
+ if (err) throw err;
+
+ var children = [];
+
+ //Iterate over all links on the page
+ $('a').each('href', function (href) {
+
+ //Resolve relative links
+ href = resolve(url, href);
+
+ //Ignore links to the same page
+ if (href == url || href.substr(url.length, 1).match(/[#?&]/)) {
+ return;
+ }
+
+ //If a pattern has been specified, output the urls that match the pattern
+ if (pattern && href.match(pattern)) {
+
+ children.push(href);
+
+ //Otherwise match urls that are children of the base url
+ } else if (href.indexOf(url) !== -1) {
+
+ children.push(href);
+
+ }
+
+ });
+
+ //Output URLs
+ children.length ? self.emit(children) : self.skip();
+
+ } catch (e) {
+ self.retry();
+ }
+ });
+ }
+});
View
@@ -0,0 +1,69 @@
+var nodeio = require('node.io'),
+ resolve = require('url').resolve,
+ options = {timeout: 10, max: 20, spoof: true},
+ add_pattern = recurse_pattern = null;
+
+exports.job = new nodeio.Job(options, {
+ run: function (url) {
+ var self = this;
+
+ //Allow patterns to be specified at the command line
+ //Pattern 1: Urls to recurse
+ //Pattern 2: Urls to output
+ if (this.options.args.length >= 1) {
+ recurse_pattern = new RegExp(this.options.args[0], 'i');
+ } else if (this.options.args.length >= 2) {
+ add_pattern = new RegExp(this.options.args[1], 'i');
+ }
+
+ this.getHtml(url, function (err, $, data) {
+ try {
+
+ //If there was an error or the page was incomplete, retry
+ if (err) throw err;
+
+ var children = [];
+
+ //Iterate over all links on the page
+ $('a').each('href', function (href) {
+
+ //Resolve relative links
+ href = resolve(url, href);
+
+ //Recurse urls matching the pattern
+ if (recurse_pattern && href.match(recurse_pattern)) {
+ self.add(href);
+ }
+
+ //Ignore links to the same page
+ if (href == url || href.substr(url.length, 1).match(/[#?&]/)) {
+ return;
+ }
+
+ //If a pattern has been specified, output the urls that match the pattern
+ if (add_pattern && href.match(add_pattern)) {
+ children.push(href);
+
+ //Otherwise match urls that are children of the base url
+ } else if (href.indexOf(url) !== -1) {
+ children.push(href);
+ }
+
+ });
+
+ //If there's no recurse pattern, recurse all children
+ if (!recurse_pattern) {
+ chilren.forEach(function (url) {
+ self.add(url);
+ });
+ }
+
+ //Output urls
+ children.length ? self.emit(children) : self.skip();
+
+ } catch (e) {
+ self.retry();
+ }
+ });
+ }
+});
View
@@ -42,8 +42,8 @@ Job.prototype.parseHtml = function (data, callback, response) {
headers = response && response.headers ? response.headers : {};
if (this.options.jsdom) {
var features = {
- FetchExternalResources: false,
- ProcessExternalResources: false,
+ FetchExternalResources: false,
+ ProcessExternalResources: false,
QuerySelector: false
};
var $, window = require('jsdom').jsdom(data, null, {features:features}).createWindow(),
@@ -23,7 +23,8 @@ var usage = ''
+ ' -f, --fork [NUM] Fork NUM workers. If NUM isn\'t specified, a\n'
+ ' process is spawned for each CPU core\n'
+ ' -u, --unpack <PASS> Unpack a job using the specified password\n'
- + ' -d, --daemon Daemonize the process\n'
+ + ' -d, --daemon Daemonize the process (requires daemon.node)\n'
+ + ' --spoof Spoof request headers\n'
+ ' -b, --benchmark Benchmark the operation\n'
+ ' -g, --debug Debug the operation\n'
+ ' -v, --version Display the current version\n'
@@ -79,6 +80,9 @@ exports.cli = function (args, exit) {
case '--debug':
options.debug = true;
break;
+ case '--spoof':
+ options.spoof = true;
+ break;
case '-t':
case '--timeout':
options.global_timeout = args.shift();
View
@@ -21,6 +21,24 @@ var default_headers = {
};
/**
+ * Some user-agents for spoofing
+ */
+var user_agents = [
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+ 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
+ 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 6.0)',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13',
+ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6',
+ 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)',
+ 'Opera/9.20 (Windows NT 6.0; U; en)',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; ru; rv:1.9.2) Gecko/20100115 Firefox/3.6',
+ 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; MS-RTC LM 8)',
+ 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/6.0',
+ 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7; en-us) AppleWebKit/533.4 (KHTML, like Gecko) Version/4.1 Safari/533.4',
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.22 (KHTML, like Gecko) Chrome/11.0.683.0 Safari/534.22'
+];
+
+/**
* Makes a GET request to the specified resource. See doRequest().
*
* @param {String} resource
@@ -207,6 +225,10 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
utils.put(headers, this.next);
this.next = {};
+ if (this.options.spoof) {
+ headers['user-agent'] = user_agents[Math.floor(Math.random() * user_agents.length)];
+ }
+
if (url.search) {
req_url += url.search;
}
@@ -236,23 +258,14 @@ Job.prototype.doRequest = function (method, resource, body, headers, callback, p
//This method is called on each event if the instance is already complete (i.e. timed out)
var cleanup = function () {
-
- //I'll clean up this mess once I figure out which destroy() to call (API is unclear with 0.2.4 => 0.3.1)
if (request) {
if (request.socket && request.socket.destroy) {
request.socket.destroy();
}
- /* In node >=0.3.6 typeof request.destroy === 'function'
- * yet calling it fails..
- if (request.destroy) {
- request.destroy();
- }
- */
}
if (host.destroy) {
host.destroy();
}
-
};
//Watch for errors
View
@@ -6,7 +6,6 @@
var cwd = process.cwd(),
fs = require('fs'),
- daemon = require('daemon'),
exec = require('child_process').exec;
/**
@@ -270,6 +269,12 @@ exports.daemonize = function (arg, callback) {
var lock_file = '/tmp/nodeio.pid',
log_file = '/tmp/nodeio.log';
+ try {
+ var daemon = require('daemon');
+ } catch(e) {
+ exports.status.fatal('Please run `npm install daemon`');
+ }
+
var start = function () {
daemon.run(log_file, lock_file, function (err) {
if (err) return status('Error starting daemon: ' + err, 'error');
View
@@ -1,6 +1,6 @@
{ "name" : "node.io",
"description" : "A distributed data scraping and processing framework for node.js",
- "version" : "0.2.3-1",
+ "version" : "0.2.3-2",
"homepage" : "http://github.com/chriso/node.io",
"keywords" : ["data","mapreduce","map","reduce","scraping","html","parsing","parse","scrape","process","processing","data"],
"author" : "Chris O'Hara <cohara87@gmail.com>",
@@ -17,9 +17,8 @@
"engines": { "node": ">=0.2.5" },
"dependencies": {
"validator": ">= 0.1.1",
- "expresso": ">= 0.7.0",
"coffee-script": ">= 0.9.5",
- "daemon": ">= 0.1.0",
+ "htmlparser": ">= 1.7.3",
"jquery": ">= 1.4.4"
},
"scripts": {

0 comments on commit 3101b0c

Please sign in to comment.