Skip to content
This repository has been archived by the owner on Aug 12, 2021. It is now read-only.

Commit

Permalink
tests, readme file
Browse files Browse the repository at this point in the history
  • Loading branch information
ondrs committed Feb 8, 2014
1 parent 48b8354 commit b15ad9a
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
node_modules
.log
37 changes: 37 additions & 0 deletions README.md
@@ -0,0 +1,37 @@
# node Krawler

## How to install
```
npm install krawler
```

## Basic example

```javascript
var crawler = new Krawler;

crawler
.queue('http://ondraplsek.cz')
.on('data', function($, url, response) {

// $ - cheerio instance
// url of the current webpage
// response object from mikeal/request

})
.on('err', function(err, url) {
// there has been an 'err' on 'url'
})
.on('end', function() {
// all URLs have been fetched
});
```

Krawler provides three types of built-in parsers:
- cheerio (default)
- xml
- json

```javascript
```

7 changes: 4 additions & 3 deletions lib/krawler.js
Expand Up @@ -40,7 +40,7 @@ util.inherits(Krawler, events.EventEmitter);
* @type {number}
* @const
*/
Krawler.prototype.VERSION = 0.5;
Krawler.prototype.VERSION = 0.1;

/**
*
Expand All @@ -53,6 +53,7 @@ Krawler.prototype._options = {};
/**
*
* @param {Array|string} urls
* @returns {Krawler}
*/
Krawler.prototype.queue = function(urls) {
var self = this;
Expand Down Expand Up @@ -80,11 +81,11 @@ Krawler.prototype.queue = function(urls) {
});

}, function(err) {
// no error
// no error can
self.emit('end');
});


return this;
};


Expand Down
2 changes: 1 addition & 1 deletion license.md
@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (c) 2013 Ondřej Plšek (http://www.ondraplsek.cz)
Copyright (c) 2014 Ondřej Plšek (http://www.ondraplsek.cz)

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
Expand Down
13 changes: 6 additions & 7 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "krawler",
"version": "0.3.0",
"version": "0.1.0",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",
"keywords": [
"dom",
Expand Down Expand Up @@ -28,10 +28,9 @@
{
"type": "git",
"url": "https://github.com/ondrs/node-krawler.git"
}
,
},
"dependencies": {
"underscore": "~1.5.2"
"underscore": "~1.5.2",
"async": "0.2.x",
"xml2js": "0.2.x",
"q": "~1.0.0",
Expand All @@ -41,13 +40,13 @@
"request": "~2.33.0"
},
"devDependencies": {
"mocha": "~1.17.1"
"chai": "~1.9.0"
},
"scripts": {
"test": "node test/testrunner.js"
"test": "mocha test/test.js"
},
"engines": [
"node >=0.10.x"
"node >=0.8.x"
],
"directories": {
"lib": "lib"
Expand Down
132 changes: 132 additions & 0 deletions test/test.js
@@ -0,0 +1,132 @@
// Integration tests for Krawler: exercise each built-in parser (cheerio/HTML,
// XML, JSON, raw) via fetchUrl(), plus the queue() event API.
// NOTE: these tests hit live URLs, so they are network-dependent.
var Krawler = require(__dirname + '/../lib/krawler.js');
var expect = require('chai').expect;

describe('Krawler tests', function() {

  // Network-bound tests; allow up to 10s each.
  this.timeout(10000);

  it('should fetch and parse HTML page', function(done) {

    var crawler = new Krawler();

    crawler
      .fetchUrl('http://www.google.com')
      .done(function(result) {
        /** @type {cheerio} */
        var $ = result.data;
        expect($('title').text()).to.be.equal('Google');
        done();
      });
  });

  it('should fetch and parse XML page', function(done) {

    var crawler = new Krawler({
      parser: 'xml'
    });

    crawler
      .fetchUrl('http://www.w3schools.com/xml/note.xml')
      .done(function(result) {
        /** @type {Object} */
        var xml = result.data;
        expect(xml).to.be.instanceOf(Object);
        done();
      });
  });

  it('should fetch and parse JSON page', function(done) {

    var crawler = new Krawler({
      parser: 'json'
    });

    crawler
      .fetchUrl('https://graph.facebook.com/facebook')
      .done(function(result) {
        /** @type {Object} */
        var json = result.data;
        expect(json).to.be.instanceOf(Object);
        done();
      });
  });

  it('should fetch raw page', function(done) {

    // parser: false disables parsing; result.data is the raw response body.
    var crawler = new Krawler({
      parser: false
    });

    crawler
      .fetchUrl('http://www.google.com')
      .done(function(result) {
        var str = result.data;
        expect(str).to.be.equal(str.toString());
        done();
      });
  });


  it('should fetch several HTML pages in queue', function(done) {

    var urls = [],
        fetched = [],
        crawler = new Krawler();

    for(var i = 0; i < 3; ++i) {
      urls.push('https://www.google.cz/?q=' + i);
    }

    crawler
      .queue(urls)
      .on('data', function(data, url, response) {
        // Record the URL that was actually fetched (was `urls`, which pushed
        // the whole array each time and only matched the count by accident).
        fetched.push(url);
      })
      // NOTE(review): README documents this event as 'err' — confirm which
      // name krawler actually emits.
      .on('error', function(err, url) {
        done(err);
      })
      .on('end', function() {
        expect(urls.length).to.be.equal(fetched.length);
        done();
      });

  });

  it('should fetch single HTML page in queue', function(done) {

    var crawler = new Krawler();

    crawler
      .queue('https://www.google.cz')
      .on('data', function(data, url, response) {
        expect(url).to.be.equal('https://www.google.cz');
      })
      .on('error', function(err, url) {
        done(err);
      })
      .on('end', function() {
        done();
      });

  });


  it('forceutf8 - from latin-1', function(done) {

    // Page served as ISO-8859-1; forceUTF8 should re-encode so the umlaut
    // survives into the parsed document.
    var crawler = new Krawler({
      forceUTF8: true
    });

    crawler
      .fetchUrl('http://czyborra.com/charsets/iso8859.html')
      .done(function(result) {
        /** @type {cheerio} */
        var $ = result.data;
        expect($.html()).to.have.string('Jörg');
        done();
      });

  });

});

0 comments on commit b15ad9a

Please sign in to comment.