diff --git a/README b/README deleted file mode 100644 index e69de29..0000000 diff --git a/README.rdoc b/README.rdoc new file mode 100644 index 0000000..fffe792 --- /dev/null +++ b/README.rdoc @@ -0,0 +1,33 @@ += ankusa + +Ankusa is a Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage. Because it uses HBase as a backend, the training corpus can be many terabytes in size. + +== Installation +First, install hbaserb: + git clone git://github.com/bmuller/hbaserb.git + cd hbaserb + gem build hbaserb.gemspec && gem install hbaserb + +Then, install ankusa: + git clone git://github.com/livingsocial/ankusa.git + cd ankusa + gem build ankusa.gemspec && gem install ankusa + +== Basic Usage + require 'rubygems' + require 'ankusa' + require 'hbaserb' + + # connect to HBase + client = HBaseRb::Client.new 'localhost' + + c = Ankusa::Classifier.new client + c.train :spam, "This is some spammy text" + c.train :good, "This is not the bad stuff" + + # This will return the most likely class (as a symbol) + puts c.classify "This is some spammy text" + + # This will return a Hash with classes as keys and + # membership probability as values + puts c.classes "This is some spammy text" diff --git a/ankusa.gemspec b/ankusa.gemspec new file mode 100644 index 0000000..a220c76 --- /dev/null +++ b/ankusa.gemspec @@ -0,0 +1,18 @@ +Gem::Specification.new do |s| + s.name = "ankusa" + s.version = "0.0.1" + s.authors = ["Brian Muller"] + s.date = %q{2010-11-29} + s.description = "Naive Bayes classifier with HBase storage" + s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage" + s.email = "brian.muller@livingsocial.com" + s.files = [ + "lib/ankusa.rb", + "lib/ankusa/classifier.rb", + ] + s.homepage = "https://github.com/livingsocial/ankusa" + s.require_paths = ["lib"] + s.rubygems_version = "1.3.5" + s.add_dependency('hbaserb', '>= 0.0.1') + s.add_dependency('stemmer', '>= 1.0.1') +end \ No newline at end of file diff --git a/lib/ankusa.rb b/lib/ankusa.rb new file 
mode 100644 index 0000000..1ebf744 --- /dev/null +++ b/lib/ankusa.rb @@ -0,0 +1,2 @@ +$:.unshift File.dirname(__FILE__) +require 'ankusa/classifier' diff --git a/lib/ankusa/classifier.rb b/lib/ankusa/classifier.rb new file mode 100644 index 0000000..d1cd30f --- /dev/null +++ b/lib/ankusa/classifier.rb @@ -0,0 +1,24 @@ +require 'stemmer' + +module Ankusa + + class Classifier + def initialize(hbase_client) + @hbase = hbase_client + end + + def train(klass, text) + # word.stem + end + + def untrain(klass, text) + end + + def classify(text) + end + + def classes(text) + end + end + +end