Skip to content

Commit

Permalink
simple simplification of counting, only need to pass the totals, not …
Browse files Browse the repository at this point in the history
…the whole classifier
  • Loading branch information
Ian D. Eccles committed May 31, 2011
1 parent bca2e28 commit c6127aa
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 30 deletions.
4 changes: 2 additions & 2 deletions lib/simple_bayes/bayes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def untrain(name, text)
def classifications text, default_prob = 0.005
doc = Document.new text
categories.values.map do |cat|
prob_cat = cat.probability self
prob_cat = cat.probability total_unique.to_f
prob_doc = cat.probability_of_document(doc, default_prob)
[prob_cat * prob_doc, cat]
end
Expand All @@ -61,7 +61,7 @@ def classifications text, default_prob = 0.005
def log_classifications text, default_prob = 0.005
doc = Document.new text
categories.values.map do |cat|
prob_cat = cat.log_probability self
prob_cat = cat.log_probability total_unique.to_f
prob_doc = cat.log_probability_of_document(doc, default_prob)
[prob_cat + prob_doc, cat]
end
Expand Down
25 changes: 10 additions & 15 deletions lib/simple_bayes/category.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module SimpleBayes
# Categories are going to be a shared idea, so why not break out the
# common category functionality into its own class.
class Category
MIN_LOG_PROBABILITY = -Float::MAX
include TermOccurrence

attr_reader :name, :term_occurrences
Expand All @@ -17,18 +18,13 @@ def initialize name
@term_occurrences = Hash.new 0
end

# Natural log of the ratio between this category's unique-term total and
# the classifier's unique-term total.
#
# classifier - object responding to total_unique.
#
# Returns a Float: the log ratio when both totals are positive, otherwise
# -Float::MAX.
def log_probability(classifier)
  overall = classifier.total_unique.to_f
  own = total_unique
  both_positive = overall > 0 && own > 0
  both_positive ? Math.log(own) - Math.log(overall) : -Float::MAX
end
# Natural log of this category's share of unique terms: its own
# total_unique divided by the supplied overall total.
#
# uniqs - overall unique-term total to normalise against. May be an Integer
#         or a Float.
#
# Returns a Float: the log of the ratio when both totals are positive,
# otherwise MIN_LOG_PROBABILITY.
def log_probability(uniqs)
  tot = total_unique
  if uniqs > 0 && tot > 0
    # Force float division: with two Integers, tot / uniqs truncates to 0
    # and Math.log(0) yields -Infinity instead of the real log ratio.
    Math.log(tot.to_f / uniqs)
  else
    MIN_LOG_PROBABILITY
  end
end

def probability classifier
Math.exp log_probability(classifier)
# Converts this category's log-probability back into a plain probability.
#
# uniqs - total handed straight through to log_probability.
#
# Returns Math.exp of log_probability(uniqs).
def probability(uniqs)
  log_prob = log_probability(uniqs)
  Math.exp(log_prob)
end

# Calculates the probability of a document given this category, ie:
Expand All @@ -44,13 +40,12 @@ def probability_of_document doc, default_prob = 0.005
end

# Computes the log-probability of a document given this category by adding
# up one log term for every entry yielded by doc.
#
# doc          - enumerable whose entries destructure as (term, _); only the
#                term is used here.
# default_prob - probability substituted for a term this category has no
#                occurrences of (defaults to 0.005).
#
# Returns the most negative finite Float when the category's total
# occurrences are below 1, otherwise the accumulated sum of logs.
#
# NOTE(review): this listing appears to carry two variants of the same
# statements side by side (all_occurs/all, t_occurs/term); confirm which
# set is live before relying on it.
def log_probability_of_document doc, default_prob = 0.005
all_occurs = total_occurrences.to_f
return -Float::MAX if all_occurs < 1
all = total_occurrences.to_f
return MIN_LOG_PROBABILITY if all < 1

doc.inject(0) do |sum, (t,_)|
t_occurs = occurrences_of(t)
log_prob_t = (t_occurs > 0 ? (Math.log(t_occurs) - Math.log(all_occurs)) : Math.log(default_prob) )
sum + log_prob_t
# all was converted with to_f above, so term/all is float division.
term = occurrences_of(t)
sum + Math.log(term > 0 ? term/all : default_prob)
end
end
end
Expand Down
19 changes: 6 additions & 13 deletions spec/simple_bayes/category_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,33 @@

describe SimpleBayes::Category do
let(:category) { SimpleBayes::Category.new 'unnamed' }
let(:classifier) { mock('classifier') }

it "should include TermOccurrence" do
category.should be_a_kind_of(SimpleBayes::TermOccurrence)
end

it "should calculate its log probability based on the unique terms of itself and a classifier" do
category.stub(:total_unique => 8)
classifier.stub(:total_unique => 32)
category.log_probability(classifier).should be_within(1.0e-10).of(Math.log 0.25)
category.log_probability(32.0).should be_within(1.0e-10).of(Math.log 0.25)
end

it "should return the 'biggest' negative float if either it or the classifier lack unique terms" do
category.stub(:total_unique => 0)
classifier.stub(:total_unique => 32)
category.log_probability(classifier).should == -Float::MAX
category.log_probability(32.0).should == -Float::MAX
category.stub(:total_unique => 8)
classifier.stub(:total_unique => 0)
category.log_probability(classifier).should == -Float::MAX
category.log_probability(0.0).should == -Float::MAX
end

it "should calculate its probability based on the unique terms of itself and a classifier" do
category.stub(:total_unique => 8)
classifier.stub(:total_unique => 32)
category.probability(classifier).should be_within(1.0e-10).of(0.25)
category.probability(32.0).should be_within(1.0e-10).of(0.25)
end

it "should be close to 0 if either it or the classifier lack unique terms" do
category.stub(:total_unique => 0)
classifier.stub(:total_unique => 32)
category.probability(classifier).should be_within(1.0e-10).of(0.0)
category.probability(32.0).should be_within(1.0e-10).of(0.0)
category.stub(:total_unique => 8)
classifier.stub(:total_unique => 0)
category.probability(classifier).should be_within(1.0e-10).of(0.0)
category.probability(0.0).should be_within(1.0e-10).of(0.0)
end

it "should calculate the log probability of a document given the category" do
Expand Down

0 comments on commit c6127aa

Please sign in to comment.