Permalink
Browse files

Initial commit of node-scraper

  • Loading branch information...
0 parents commit 6320b2fb109da06a50d9b4fd7a60d7e049c80e53 @mape committed Dec 5, 2010
Showing with 288 additions and 0 deletions.
  1. +20 −0 LICENSE
  2. +39 −0 README.md
  3. +154 −0 deps/jquery-1.4.2.min.js
  4. +14 −0 examples/advanced.js
  5. +9 −0 examples/simple.js
  6. +38 −0 lib/scraper.js
  7. +14 −0 package.json
20 LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2010 Mathias Pettersson, mape@mape.me
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,39 @@
+# node-scraper
+
+A little module that makes scraping websites a little easier. Uses node.js and jQuery.
+
+## Installation
+
+Via [npm](http://github.com/isaacs/npm):
+
+ $ npm install scraper
+
+## Usage
+
+### Simple
+The first argument is a URL string; the second is a callback that receives an error and a jQuery object whose "body" contains the scraped page.
+
+ var scraper = require('scraper');
+ scraper('http://search.twitter.com/search?q=javascript', function(err, jQuery) {
+ if (err) {throw err}
+
+ jQuery('.msg').each(function() {
+ console.log(jQuery(this).text().trim()+'\n');
+ });
+ });
+### Advanced
+The first argument is an object containing settings for the "request" instance used internally; the second is a callback that receives an error and a jQuery object whose "body" contains the scraped page.
+
+ var scraper = require('scraper');
+ scraper({
+ 'uri': 'http://search.twitter.com/search?q=nodejs'
+ , 'headers': {
+ 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
+ }}
+ , function(err, $) {
+ if (err) {throw err}
+
+ $('.msg').each(function() {
+ console.log($(this).text().trim()+'\n');
+ });
+ });

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -0,0 +1,14 @@
+var scraper = require('scraper');
+
+scraper({
+ 'uri': 'http://search.twitter.com/search?q=nodejs'
+ , 'headers': {
+ 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
+ }}
+ , function(err, $) {
+ if (err) {throw err;}
+
+ $('.msg').each(function() {
+ console.log($(this).text().trim()+'\n');
+ });
+});
@@ -0,0 +1,9 @@
+var scraper = require('scraper');
+
+scraper('http://search.twitter.com/search?q=javascript', function(err, $) {
+ if (err) {throw err;}
+
+ $('.msg').each(function() {
+ console.log($(this).text().trim()+'\n');
+ });
+});
@@ -0,0 +1,38 @@
+var request = require('request');
+var jsdom = require('jsdom');
+
+var defaults = { // Fallback request settings; scrape() copies only these keys, using the value here when the caller's is falsy.
+ 'uri': null // No default target: scrape() reports an error when the uri is still falsy.
+ , 'headers': { // Used as a whole when the caller passes no 'headers' entry (no per-header merge).
+ 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
+ }
+};
+module.exports = function scrape(requestOptions, callback) {
+ var settings = {};
+ Object.keys(defaults).forEach(function(key) {
+ settings[key] = requestOptions[key] || defaults[key];
+ });
+
+ if (typeof requestOptions === 'string') {
+ settings['uri'] = requestOptions;
+ }
+
+ if (!settings['uri']) {
+ callback(new Error('You must supply an uri.'), null, null);
+ }
+
+ request(settings, function (err, response, body) {
+ if (err) {
+ callback(err, null, null);
+ }
+ if (response.statusCode == 200) {
+ var window = jsdom.jsdom().createWindow();
+ jsdom.jQueryify(window, '../deps/jquery-1.4.2.min.js' , function() {
+ window.$('body').append(body);
+ callback(null, window.$);
+ });
+ } else {
+ callback(new Error('Request to '+settings['uri']+' ended with status code: '+response.statusCode), null, null);
+ }
+ });
+};
@@ -0,0 +1,14 @@
+{
+ "name" : "scraper",
+ "description" : "Easier web scraping using jQuery.",
+ "version" : "0.0.1",
+ "author" : "Mathias Pettersson <mape@mape.me>",
+ "engines" : ["node"],
+ "directories" : { "lib" : "./lib" },
+ "main" : "./lib/scraper",
+ "repository" : { "type":"git", "url":"https://github.com/mape/node-scraper.git" },
+ "dependencies" : {
+ "request" : ">=0.10.0",
+ "jsdom" : ">=0.1.20"
+ }
+}

0 comments on commit 6320b2f

Please sign in to comment.