First commit

node-js-libs · Nov 16, 2010 · ce350aa · ce350aa
commit ce350aa
Show file tree

Hide file tree

Showing 40 changed files with 3,552 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2010 Chris O'Hara <cohara87@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+*This is a _major_ work in progress. Check back soon!*
diff --git a/bin/node.io b/bin/node.io
@@ -0,0 +1,3 @@
+#!/usr/bin/env node
+
+require('node.io').cli(process.argv.slice(2));
diff --git a/examples/duplicates.js b/examples/duplicates.js
@@ -0,0 +1,45 @@
+// This module can find/remove duplicates in a list
+//
+//   1. To remove duplicates from a list and output unique lines:
+//       $ cat list.txt | node.io duplicates
+//
+//   2. To output lines that appear more than once:
+//       $ cat list.txt | node.io duplicates find
+//
+// To output the results to a file, use either:
+//       $ cat list.txt | node.io -s duplicates > unique.txt
+//       $ node.io -i list.txt -o unique.txt duplicates
+
+var Job = require('../lib/node.io/job').Job;
+
+// Lines encountered so far and duplicate lines already output. Both live at
+// module level, so they persist across every call to reduce(). They are
+// prototype-less objects used as sets (line => true), which makes each
+// membership check a keyed lookup instead of a linear array scan.
+// NOTE(review): assumes every line is a string - confirm against the caller.
+var seen_lines = Object.create(null), emitted_lines = Object.create(null);
+
+// Filters a batch of lines against everything seen in earlier batches and
+// passes the surviving lines to this.emit() as an array.
+//
+//   - this.options.args === 'find': emit each line that appears more than
+//     once, exactly one time (on its second occurrence)
+//   - otherwise (default): emit each line the first time it appears
+function reduce(lines) {
+    var find_duplicates = this.options.args === 'find', emit = [];
+
+    lines.forEach(function(line) {
+        if (find_duplicates) {
+
+            //Output duplicate lines
+            if (!(line in seen_lines)) {
+                seen_lines[line] = true;
+            } else if (!(line in emitted_lines)) {
+                emit.push(line);
+                emitted_lines[line] = true; //Only output once
+            }
+
+        } else if (!(line in seen_lines)) {
+
+            //Remove duplicate lines (default)
+            emit.push(line);
+            seen_lines[line] = true;
+
+        }
+    });
+
+    this.emit(emit);
+}
+
+// Expose the job: the first argument holds the job options, the second the
+// job's methods (here only the reduce step).
+exports.job = new Job({max:20},{reduce:reduce});
diff --git a/examples/google_pagerank.js b/examples/google_pagerank.js
@@ -0,0 +1,99 @@
+// This module checks a domain's Google pagerank (rate limits obviously apply)
+//
+//   1. To find the pagerank of a domain:
+//       $ echo "mastercard.com" | node.io -s google_pagerank    
+//          => mastercard.com,7
+
+var Job = require('../').Job;
+
+// Job definition: requests the rank of each input domain and emits
+// "domain,rank", or "domain," when no rank could be read from the response.
+exports.job = new Job({timeout:10, retries:3}, {
+
+    run: function google(input) {
+        var self = this;
+
+        var url = input;
+        if (!~url.indexOf('http://')) url = 'http://'+url;
+
+        //The ch request parameter is derived from the "info:" query by GoogleCH()
+        var ch = '6'+GoogleCH(strord('info:'+url));
+
+        this.get('http://www.google.com/search?client=navclient-auto&ch='+ch+'&features=Rank&q=info:'+encodeURIComponent(url), function(err, data) {
+            //Stop here on failure - data is not usable, and the input must
+            //not be emitted in addition to being retried
+            if (err) return self.retry();
+
+            var marker = 'Rank_1:1:';
+            var body = data ? String(data) : '';
+            var pos = body.indexOf(marker);
+
+            if (pos < 0) {
+                self.emit(input+',');
+            } else {
+                //The rank follows the marker, wherever the marker is found;
+                //trim() drops the line break that ends the response
+                self.emit(input+','+body.substr(pos + marker.length).trim());
+            }
+        });
+    },
+
+    //Called instead of run's callback when the input could not be processed;
+    //outputs the domain with an empty rank
+    fail: function(input) {
+        this.emit(input+',');
+    }
+
+});
+
+// Shifts a right by b bits, treating a as a 32-bit value. When the sign bit
+// of a is set, it is cleared after a first one-bit shift, so that (for
+// b >= 1) zeros rather than copies of the sign bit enter from the left.
+function zF(a,b) {
+    var SIGN_BIT = 0x80000000;
+
+    //Sign bit clear: the native shift is enough
+    if (!(SIGN_BIT & a)) {
+        return a >> b;
+    }
+
+    //Shift once, drop the sign bit, force bit 30, then do the remaining shifts
+    var shifted = ((a >> 1) & ~SIGN_BIT) | 0x40000000;
+    return shifted >> (b - 1);
+}
+
+// Mixes the three values a, b and c: nine steps, each subtracting the other
+// two values from one of them and xor-ing in a shifted copy of a neighbour.
+// Right shifts go through zF(), left shifts use the native << operator.
+// Every step reads the results of the previous ones, so the order matters.
+// Returns the mixed values as the array [a, b, c].
+function mix(a,b,c) { 
+    a-=b; a-=c; a^=(zF(c,13));
+    b-=c; b-=a; b^=(a<<8);
+    c-=a; c-=b; c^=(zF(b,13));
+    a-=b; a-=c; a^=(zF(c,12));
+    b-=c; b-=a; b^=(a<<16);
+    c-=a; c-=b; c^=(zF(b,5));
+    a-=b; a-=c; a^=(zF(c,3));
+    b-=c; b-=a; b^=(a<<10);
+    c-=a; c-=b; c^=(zF(b,15));
+    return (new Array((a),(b),(c)));
+}
+// Computes a numeric hash of url and returns it as a non-negative number.
+//
+//   url    - array of numbers; in this file it is always the array of
+//            character codes produced by strord()
+//   length - number of elements to hash (optional, defaults to url.length)
+function GoogleCH(url,length) {
+    //Called with a single argument: hash the whole array
+    if(arguments.length == 1) length=url.length;
+    var a=0x9E3779B9, b=0x9E3779B9, c=0xE6359A60, k=0, len=length, mx=new Array();
+    //Consume 12 elements per round: four are packed into each of a, b and c
+    //(lowest index in the lowest byte), then the three values are mixed
+    while(len>=12) { 
+        a+=(url[k+0]+(url[k+1]<<8)+(url[k+2]<<16)+(url[k+3]<<24));
+        b+=(url[k+4]+(url[k+5]<<8)+(url[k+6]<<16)+(url[k+7]<<24));
+        c+=(url[k+8]+(url[k+9]<<8)+(url[k+10]<<16)+(url[k+11]<<24));
+        mx=mix(a,b,c);
+        a=mx[0]; b=mx[1]; c=mx[2]; 
+        k+=12; len-=12;
+    }
+    //The total length is added to c
+    c+=length;
+    //Add the remaining (fewer than 12) elements. There is no break on purpose:
+    //each case falls through to all the cases below it. The lowest byte of c
+    //is not used here (the first shift applied to c is <<8).
+    switch(len) { 
+        case 11: c+=url[k+10]<<24;
+        case 10: c+=url[k+9]<<16;
+        case 9:c+=url[k+8]<<8;
+        case 8:b+=(url[k+7]<<24);
+        case 7:b+=(url[k+6]<<16);
+        case 6:b+=(url[k+5]<<8);
+        case 5:b+=(url[k+4]);
+        case 4:a+=(url[k+3]<<24);
+        case 3:a+=(url[k+2]<<16);
+        case 2:a+=(url[k+1]<<8);
+        case 1:a+=(url[k+0]);
+    }
+    mx=mix(a,b,c);
+    //The hash is the third mixed value; a negative value is shifted into the
+    //positive range by adding 2^32
+    if(mx[2]<0) {
+        return(0x100000000+mx[2]);
+    } else { 
+        return(mx[2]);
+    }
+}
+// Converts a string into an array holding the character code of each of its
+// characters, e.g. 'ab' => [97, 98]. An empty string gives an empty array.
+function strord(string) {
+    var result = [];
+    //The counter is declared locally: an undeclared "i" would leak into the
+    //global scope (and is a ReferenceError in strict mode)
+    for (var i = 0; i < string.length; i++) {
+        result[i] = string.charCodeAt(i);
+    }
+    return result;
+}
diff --git a/examples/google_rank.js b/examples/google_rank.js
@@ -0,0 +1,43 @@
+// This module checks a domain's Google rank for a given keyword (rate limits obviously apply)
+//
+//   1. To find the rank of a domain for a given keyword:
+//       $ echo "mastercard.com,Credit Cards" | node.io -s google_rank    
+//          => mastercard.com,Credit Cards,9
+
+var Job = require('../').Job;
+
+// Job definition: each input line is "domain,keyword". The keyword is
+// searched for and the position of the first result link that points to the
+// domain is emitted as "domain,keyword,rank" ("domain,keyword," when the
+// domain is not among the results).
+exports.job = new Job({timeout:10, retries:3}, {
+
+    run: function google(input) {
+        var self = this;
+
+        //parts[0] = domain, parts[1] = keyword
+        var parts = input.split(',');
+
+        this.getHtml('http://www.google.com/search?hl=en&num=100&q='+encodeURIComponent(parts[1]), function(err, $, data) {
+            //Stop here on failure - $ is not usable after an error
+            if (err) return self.retry();
+
+            var rank, i = 0, links = $('a.l');
+
+            if (!links) {
+                //No result links at all: same output as an unranked domain
+                return self.emit(input+',');
+            }
+
+            links.each('href', function(href) {
+                i++;
+                //Keep the first (best) position; later matches must not overwrite it
+                if (rank) return;
+                if (href.indexOf('www.'+parts[0]+'/') >= 0 || href.indexOf('/'+parts[0]+'/') >= 0) {
+                    rank = i;
+                }
+            });
+
+            if (rank) {
+                self.emit(parts[0]+','+parts[1]+','+rank);
+            } else {
+                self.emit(input+',');
+            }
+        });
+    },
+
+    //Called when the input could not be processed; outputs it with an empty rank
+    fail: function(input) {
+        this.emit(input+',');
+    }
+
+});
diff --git a/examples/google_spell.js b/examples/google_spell.js
@@ -0,0 +1,27 @@
+// This module uses Google suggest to spell check a word or list of words (rate limits obviously apply)
+//
+//   1. To output the result of Google suggest:
+//       $ echo "definately" | node.io -s google_spell    
+//          => definitely
+
+var Job = require('../').Job;
+
+// Job definition: searches for each input word and emits the spelling
+// suggestion shown on the result page, or the word itself when there is none.
+exports.job = new Job({timeout:10, retries:3}, {
+
+    run: function google(input) {
+        var self = this;
+
+        this.getHtml('http://www.google.com/search?hl=en&q='+encodeURIComponent(input), function(err, $) {
+            //Stop here on failure - $ is not usable after an error
+            if (err) return self.retry();
+
+            var spell = $('a.spell');
+
+            if (spell) {
+                self.emit(spell.first().fulltext);
+            } else {
+                //No suggestion offered: output the word unchanged, as fail() does
+                self.emit(input);
+            }
+        });
+    },
+
+    //Called when the input could not be processed; outputs the word unchanged
+    fail: function(input) {
+        this.emit(input);
+    }
+
+});
diff --git a/examples/reddit.js b/examples/reddit.js
@@ -0,0 +1,46 @@
+//This module pulls the front page stories and scores from reddit.com
+//There are APIs for doing this - this is just a quick demonstration of
+//parsing HTML using htmlparser and an augmented soupselect
+
+var Job = require('../').Job;
+
+// Job run method: loads the reddit front page, pairs every story title with
+// its score and emits an array of "[score] title" strings. Exits the job
+// when the page cannot be loaded or titles and scores do not line up.
+function reddit() {
+    var self = this;
+
+    this.getHtml('http://www.reddit.com/', function(err, $) {
+        //Handle any http / parsing errors ($ is not usable after an error)
+        if (err) return self.exit(err);
+
+        var titles = [], scores = [], output = [];
+
+        //Select all titles on the page
+        $('a.title').each(function(a) {
+            titles.push(a.text);
+        });
+
+        //Select all scores on the page
+        $('div.score.unvoted').each(function(div) {
+            scores.push(div.text);
+        });
+
+        //Mismatch? page probably didn't load properly
+        if (scores.length != titles.length) {
+            return self.exit('Title / score mismatch');
+        }
+
+        //Output = [score] title
+        for (var i = 0, len = scores.length; i < len; i++) {
+            //Ignore upcoming stories
+            if (scores[i] == '&bull;') continue;
+
+            //Check the data is ok. This must go through self: inside this
+            //callback "this" is not the job
+            self.assert(scores[i]).isInt();
+
+            output.push('['+scores[i]+'] '+titles[i]);
+        }
+
+        self.emit(output);
+    });
+}
+
+// Expose the job, with reddit() as its run method.
+// NOTE(review): once:true / input:false presumably mean the job takes no
+// input and runs a single time - confirm against the Job implementation.
+exports.job = new Job({timeout:10, once:true}, {input:false, run:reddit});
diff --git a/examples/resolve.js b/examples/resolve.js
@@ -0,0 +1,78 @@
+// This module wraps the dns.lookup() method. There are a few different uses:
+// (In each case replace domains.txt with your list of domains)
+//
+//   1. To resolve domains and return "domain,ip":
+//       $ cat domains.txt | node.io resolve
+//
+//   2. To return domains that do not resolve:
+//       $ cat domains.txt | node.io resolve notfound
+//
+//   3. To return domains that do resolve:
+//       $ cat domains.txt | node.io resolve found
+//
+// To output the results to a file, use either:
+//       $ cat domains.txt | node.io -s resolve > result.txt
+//       $ node.io -i domains.txt -o result.txt resolve
+
+var Job = require('../').Job, dns = require('dns');
+
+// Options handed to the Job constructor.
+// NOTE(review): the meaning of each key is defined by the Job implementation,
+// which is not part of this file - presumably max bounds the number of
+// concurrent lookups, timeout is a per-domain limit and retries caps the
+// number of retry() attempts; confirm before changing the values.
+var options = {
+    max: 100,
+    timeout: 10,
+    retries: 3
+}
+
+// Job methods. The output depends on this.options.args:
+//   'notfound' - emit only the domains that do not resolve
+//   'found'    - emit only the domains that do resolve
+//   otherwise  - emit "domain,ip" ("domain," when it does not resolve)
+var methods = {
+
+    //Looks up one domain (IPv4) and emits / skips it according to the mode
+    run: function(domain) {
+        var self = this, type = this.options.args;
+
+        dns.lookup(domain, 4, function(err, ip) {
+            if (err) {
+
+                //errno 4 and 8 are the numeric values this module has always
+                //treated as "not found"; versions of node that report lookup
+                //failures through err.code are recognised as well
+                var notfound = err.errno === 4 || err.errno === 8 ||
+                               err.code === 'ENOTFOUND' || err.code === 'EBADNAME';
+
+                //Any other error: try the lookup again
+                if (!notfound) return self.retry();
+
+                //The domain didn't resolve
+                if (type === 'notfound') {
+                    self.emit(domain);
+                } else if (type === 'found') {
+                    self.skip();
+                } else {
+                    self.emit(domain + ',');
+                }
+
+            } else {
+
+                //The domain resolved successfully
+                if (type === 'notfound') {
+                    self.skip();
+                } else if (type === 'found') {
+                    self.emit(domain);
+                } else {
+                    self.emit(domain + ',' + ip);
+                }
+
+            }
+        });
+    },
+
+    //The domain either timed out or exceeded the max number of retries:
+    //handle it like a domain that did not resolve, producing exactly one
+    //emit() or skip() call
+    fail: function(status, domain) {
+        //type and self only exist inside run(), so the mode is read here again
+        var type = this.options.args;
+
+        if (type === 'notfound') {
+            this.emit(domain);
+        } else if (type === 'found') {
+            this.skip();
+        } else {
+            this.emit(domain + ',');
+        }
+
+    }
+
+};
+
+// Expose the job built from the options and methods of this module.
+exports.job = new Job(options, methods);
diff --git a/examples/resources/domains.txt b/examples/resources/domains.txt
@@ -0,0 +1,10 @@
+google.com
+youtube.com
+download.com
+cnet.com
+wow.com
+google.com.au
+amazon.com
+asdfhkasdhfkashdjkashdk.com
+asdjfh98eua9sdfunm.com
+,,,,,,,,.com