Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

creating tests...

  • Loading branch information...
commit c867b05f1fd629987e0d147b43769deb1612be4b 1 parent bfd83b3
@bmuller bmuller authored
View
2  .gitignore
@@ -0,0 +1,2 @@
+docs
+pkg
View
45 Rakefile
@@ -0,0 +1,45 @@
+require 'rubygems'
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/gempackagetask'
+
+desc "Create documentation"
+Rake::RDocTask.new("doc") { |rdoc|
+ rdoc.title = "HBaseRb - Naive Bayes classifier with HBase storage"
+ rdoc.rdoc_dir = 'docs'
+ rdoc.rdoc_files.include('README.rdoc')
+ rdoc.rdoc_files.include('lib/**/*.rb')
+}
+
+# Run the unit tests
+desc "Run all unit tests"
+Rake::TestTask.new("test") { |t|
+ t.libs << "lib"
+ t.test_files = FileList['test/*_test.rb']
+ t.verbose = true
+}
+
+spec = Gem::Specification.new do |s|
+ s.name = "ankusa"
+ s.version = "0.0.1"
+ s.authors = ["Brian Muller"]
+ s.date = %q{2010-11-29}
+ s.description = "Naive Bayes classifier with HBase storage"
+ s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
+ s.email = "brian.muller@livingsocial.com"
+ s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
+ s.homepage = "https://github.com/livingsocial/ankusa"
+ s.require_paths = ["lib"]
+ s.rubygems_version = "1.3.5"
+ s.add_dependency('hbaserb', '>= 0.0.1')
+ s.add_dependency('fast-stemmer', '>= 1.0.0')
+end
+
+Rake::GemPackageTask.new(spec) do |pkg|
+ pkg.need_zip = true
+ pkg.need_tar = true
+end
+
+desc "Default task: builds gem and runs tests"
+task :default => [ :gem, :test ]
View
18 ankusa.gemspec
@@ -1,18 +0,0 @@
-Gem::Specification.new do |s|
- s.name = "ankusa"
- s.version = "0.0.1"
- s.authors = ["Brian Muller"]
- s.date = %q{2010-11-29}
- s.description = "Naive Bayes classifier with HBase storage"
- s.summary = "Naive Bayes classifier in Ruby that uses Hadoop's HBase for storage"
- s.email = "brian.muller@livingsocial.com"
- s.files = [
- "lib/ankusa.rb",
- "lib/ankusa/classifier.rb",
- ]
- s.homepage = "https://github.com/livingsocial/ankusa"
- s.require_paths = ["lib"]
- s.rubygems_version = "1.3.5"
- s.add_dependency('hbaserb', '>= 0.0.1')
- s.add_dependency('fast-stemmer', '>= 1.0.0')
-end
View
3  lib/ankusa.rb
@@ -1,4 +1,3 @@
-$:.unshift File.dirname(__FILE__)
require 'ankusa/classifier'
require 'ankusa/hasher'
-
+require 'ankusa/nbclass'
View
24 lib/ankusa/classifier.rb
@@ -37,14 +37,14 @@ def classify(text)
def classifications(text)
classes = {}
- results = {}
+ result = {}
@classnames.each { |k|
classes[k] = NBClass.new k, summary_table, freq_table
result[k] = 0
}
TextHash.new(text).each { |word,count|
- probs = get_counts(word)
+ probs = get_word_probs(word, classes)
@classnames.each { |k|
result[k] += Math.log(probs[k] / classes[k].word_count)
}
@@ -54,9 +54,10 @@ def classifications(text)
result[k] += Math.log(classes[k].doc_count / doc_count_total)
}
- # todo
- # normalize logs to make probs
- # implement get_counts
+ sum = result.inject { |x,y| x+y }
+ result.keys.each { |klass|
+ result[klass] = result[klass] / sum
+ }
result
end
@@ -91,6 +92,19 @@ def doc_count_total
end
protected
+ def get_word_probs(word, classes)
+ probs = {}
+ @classnames.each { |cn| probs[cn] = 0.0001 }
+ row = freq_table.get_row(word)
+ return probs if row.length == 0
+
+ row.first.columns.each { |colname, cell|
+ classname = colname.split(':')[1]
+ probs[classname] = cell.to_i64.to_f / classes[classname].word_count
+ }
+ probs
+ end
+
def init_tables
if not @hbase.has_table? @ftablename
@hbase.create_table @ftablename, "classes", "total"
View
2  lib/ankusa/hasher.rb
@@ -24,7 +24,7 @@ def add_word(word)
word = word.downcase
if not Ankusa::STOPWORDS.include? word
@word_count += 1
- key = word.intern
+ key = word.stem.intern
store key, fetch(key, 0)+1
end
end
View
33 test/classifier_test.rb
@@ -0,0 +1,33 @@
+require File.join File.dirname(__FILE__), 'helper'
+
+class ClassifierTest < Test::Unit::TestCase
+ def initialize(name)
+ @freq_tablename = "ankusa_word_frequencies_test"
+ @sum_tablename = "ankusa_summary_test"
+ super(name)
+ end
+
+ def setup
+ @hbase = HBaseRb::Client.new CONFIG['hbase_host'], CONFIG['hbase_port']
+ @ankusa = Ankusa::Classifier.new @hbase, @freq_tablename, @sum_tablename
+ @freq_table = @hbase.get_table(@freq_tablename)
+ @sum_table = @hbase.get_table(@sum_tablename)
+ end
+
+ def test_train
+ @ankusa.train :spam, "spam and more spam"
+ @ankusa.train :good, "words for processing"
+ @ankusa.train :good, "good word"
+ assert_equal @freq_table.get(:spam, "classes:spam").first.to_i64, 2
+ assert_equal @freq_table.get(:word, "classes:good").first.to_i64, 2
+ assert_equal @sum_table.get(:good, "totals:wordcount").first.to_i64, 4
+ assert_equal @sum_table.get(:good, "totals:doccount").first.to_i64, 2
+ assert_equal @sum_table.get(:spam, "totals:wordcount").first.to_i64, 2
+ assert_equal @sum_table.get(:spam, "totals:doccount").first.to_i64, 1
+ end
+
+ def teardown
+ @ankusa.drop_tables
+ @hbase.close
+ end
+end
View
2  test/config.yml
@@ -0,0 +1,2 @@
+hbase_host: b63
+hbase_port: 9090
View
9 test/hasher_test.rb
@@ -0,0 +1,9 @@
+require File.join File.dirname(__FILE__), 'helper'
+
+class HasherTest < Test::Unit::TestCase
+ def test_stemming
+ t = Ankusa::TextHash.new "Words word a the at fish fishing fishes? /^/ The at a of! @#$!"
+ assert_equal t.length, 2
+ assert_equal t.word_count, 5
+ end
+end
View
9 test/helper.rb
@@ -0,0 +1,9 @@
+require 'rubygems'
+require 'test/unit'
+require 'yaml'
+require 'hbaserb'
+
+$:.unshift(File.join File.dirname(__FILE__), '..', 'lib') # put the repository's lib/ first on the load path
+require 'ankusa'
+
+CONFIG = YAML.load_file File.join(File.dirname(__FILE__), "config.yml") # test/config.yml; tests read hbase_host and hbase_port
Please sign in to comment.
Something went wrong with that request. Please try again.