Permalink
Browse files

incremented to version 0.0.9, fixed bug w/ getting probs for non-existent classes
  • Loading branch information...
1 parent c4aac3a commit 839fd1ec5f1423d0f7e95d7c73381f5d04956826 @bmuller bmuller committed May 28, 2011
Showing with 49 additions and 31 deletions.
  1. +2 −1 README.rdoc
  2. +4 −26 Rakefile
  3. +20 −0 ankusa.gemspec
  4. +1 −0 lib/ankusa.rb
  5. +3 −0 lib/ankusa/classifier.rb
  6. +8 −4 lib/ankusa/naive_bayes.rb
  7. +3 −0 lib/ankusa/version.rb
  8. +8 −0 test/classifier_base.rb
View
@@ -20,7 +20,8 @@ Using the naive Bayes classifier:
require 'ankusa'
require 'ankusa/hbase_storage'
- # connect to HBase
+ # connect to HBase. Alternatively, just for this test, use in memory storage with
+ # storage = Ankusa::MemoryStorage.new
storage = Ankusa::HBaseStorage.new 'localhost'
c = Ankusa::NaiveBayesClassifier.new storage
View
@@ -1,12 +1,13 @@
require 'rubygems'
-require 'rake'
+require 'bundler'
require 'rake/testtask'
require 'rake/rdoctask'
-require 'rake/gempackagetask'
+
+Bundler::GemHelper.install_tasks
desc "Create documentation"
Rake::RDocTask.new("doc") { |rdoc|
- rdoc.title = "HBaseRb - Naive Bayes classifier with HBase storage"
+ rdoc.title = "Ankusa - Naive Bayes classifier with big data storage"
rdoc.rdoc_dir = 'docs'
rdoc.rdoc_files.include('README.rdoc')
rdoc.rdoc_files.include('lib/**/*.rb')
@@ -39,26 +40,3 @@ Rake::TestTask.new("test_filesystem") { |t|
t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
t.verbose = true
}
-
-spec = Gem::Specification.new do |s|
- s.name = "ankusa"
- s.version = "0.0.8"
- s.authors = ["Brian Muller"]
- s.date = %q{2011-01-05}
- s.description = "Text classifier with HBase or Cassandra storage"
- s.summary = "Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage"
- s.email = "brian.muller@livingsocial.com"
- s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
- s.homepage = "https://github.com/livingsocial/ankusa"
- s.require_paths = ["lib"]
- s.add_dependency('fast-stemmer', '>= 1.0.0')
- s.requirements << "Either hbaserb >= 0.0.3 or cassandra >= 0.7"
-end
-
-Rake::GemPackageTask.new(spec) do |pkg|
- pkg.need_zip = true
- pkg.need_tar = true
-end
-
-desc "Default task: builds gem and runs tests"
-task :default => [ :gem, :test ]
View
@@ -0,0 +1,20 @@
+$:.push File.expand_path("../lib", __FILE__)
+require "ankusa/version"
+require "rake"
+require "date"
+
+Gem::Specification.new do |s|
+ s.name = "ankusa"
+ s.version = Ankusa::VERSION
+ s.authors = ["Brian Muller"]
+ s.date = Date.today.to_s
+ s.description = "Text classifier with HBase or Cassandra storage"
+ s.summary = "Text classifier in Ruby that uses Hadoop's HBase or Cassandra for storage"
+ s.email = "brian.muller@livingsocial.com"
+ s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
+ s.homepage = "https://github.com/livingsocial/ankusa"
+ s.require_paths = ["lib"]
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
+ s.requirements << "Either hbaserb >= 0.0.3 or cassandra >= 0.7"
+ s.rubyforge_project = "ankusa"
+end
View
@@ -1,3 +1,4 @@
+require 'ankusa/version'
require 'ankusa/extensions'
require 'ankusa/classifier'
require 'ankusa/naive_bayes'
@@ -50,6 +50,9 @@ def get_word_probs(word, classnames)
@storage.get_word_counts(word).each { |k,v| probs[k] = v if classnames.include? k }
vs = vocab_sizes
classnames.each { |cn|
+ # if we've never seen the class, the word prob is 0
+ next if not vs.has_key? cn
+
# use a laplacian smoother
probs[cn] = (probs[cn] + 1).to_f / (@storage.get_total_word_count(cn) + vs[cn]).to_f
}
@@ -1,4 +1,5 @@
module Ankusa
+ INFTY = 1.0 / 0.0
class NaiveBayesClassifier
include Classifier
@@ -12,7 +13,7 @@ def classify(text, classes=nil)
def classifications(text, classnames=nil)
result = log_likelihoods text, classnames
result.keys.each { |k|
- result[k] = Math.exp result[k]
+ result[k] = (result[k] == INFTY) ? 0 : Math.exp(result[k])
}
# normalize to get probs
@@ -28,16 +29,19 @@ def log_likelihoods(text, classnames=nil)
TextHash.new(text).each { |word, count|
probs = get_word_probs(word, classnames)
- classnames.each { |k| result[k] += (Math.log(probs[k]) * count) }
+ classnames.each { |k|
+ # log likelihood should be infinity if we've never seen the klass
+ result[k] += probs[k] > 0 ? (Math.log(probs[k]) * count) : INFTY
+ }
}
- # add the prior and exponentiate
+ # add the prior
doc_counts = doc_count_totals.select { |k,v| classnames.include? k }.map { |k,v| v }
doc_count_total = (doc_counts.inject { |x,y| x+y } + classnames.length).to_f
classnames.each { |k|
result[k] += Math.log((@storage.get_doc_count(k) + 1).to_f / doc_count_total)
}
-
+
result
end
@@ -0,0 +1,3 @@
+module Ankusa
+ VERSION = "0.0.9"
+end
@@ -65,6 +65,10 @@ def test_probs
cs = @classifier.classifications("spam is tastey", [:spam, :good])
assert_equal cs[:spam], spam
assert_equal cs[:good], good
+
+ # test for class we didn't train on
+ cs = @classifier.classifications("spam is super tastey if you are a zombie", [:spam, :nothing])
+ assert_equal cs[:nothing], 0
end
def test_prob_result
@@ -107,5 +111,9 @@ def test_distances_result
klass = @classifier.classify("spam is tastey")
assert_equal cs, klass
assert_equal klass, :spam
+
+ # assert distance from class we didn't train with is Infinity (1.0/0.0 is a way to get at Infinity)
+ cs = @classifier.distances("spam is tastey", [:spam, :nothing])
+ assert_equal cs[:nothing], (1.0/0.0)
end
end

0 comments on commit 839fd1e

Please sign in to comment.