Skip to content

Commit

Permalink
adding classifications2
Browse files Browse the repository at this point in the history
  • Loading branch information
Ian D. Eccles committed May 25, 2011
1 parent cce4569 commit 5e01d50
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .rspec
@@ -0,0 +1,2 @@
-f p
-c
33 changes: 30 additions & 3 deletions lib/simple_bayes/bayes.rb
Expand Up @@ -21,7 +21,7 @@ def initialize(*categories)
@categories = Hash.new

categories.each do |category|
@categories[category] = Hash.new
@categories[category] = Hash.new { |h,k| h[k] = 0 }
end

@total_words = 0
Expand All @@ -37,7 +37,6 @@ def initialize(*categories)

def train(category, text)
WordHash.new(text).each do |word, count|
@categories[category][word] ||= 0
@categories[category][word] += count
@total_words += count
end
Expand All @@ -55,7 +54,6 @@ def untrain(category, text)
WordHash.new(text).each do |word, count|
if @total_words >= 0
orig = @categories[category][word]
@categories[category][word] ||= 0
@categories[category][word] -= count
if @categories[category][word] <= 0
@categories[category].delete(word)
Expand All @@ -73,11 +71,40 @@ def classifications(text)
total = category_words.values.inject(0) {|sum, element| sum+element}
WordHash.new(text).each do |word, count|
s = category_words.has_key?(word) ? category_words[word] : 0.1
# This is only (kind of) bayes if P(A) = P(B) = 1.0
score[category.to_s] += Math.log(s/total.to_f)
end
end
score
end

# Scores +text+ against every trained category using Bayes' Theorem,
# P(A|B) = P(B|A) * P(A) / P(B), where A is "the document belongs to the
# category" and B is "the word occurs". The per-word posteriors are averaged
# over the distinct words that WordHash yields for +text+.
#
# Reads @categories (category => { word => count }) and @total_words; neither
# is modified. Counts are looked up with Hash#fetch so that hashes created
# with an auto-assigning default block do not gain zero-count entries as a
# side effect of classifying.
#
# text - the text to classify; it is handed to WordHash.new, which is
#        iterated as (word, count) pairs.
#
# Returns a Hash mapping each category name (via #to_s) to a Float score.
# A category scores 0.0 when +text+ yields no words, when the category has no
# trained words, or when @total_words is zero. A word that was never trained
# in any category adds nothing to the sum but still counts toward the average.
def classifications2(text)
  # Collect the distinct words once so the per-word totals below are computed
  # a single time instead of once per category.
  words = []
  WordHash.new(text).each { |word, _count| words << word }
  unique_words = words.size
  total_words = @total_words.to_f

  # Numerator of P(B): how often each word occurs across all categories.
  word_totals = {}
  words.each do |word|
    occurrences = @categories.inject(0) { |sum, (_cat, cws)| sum + cws.fetch(word, 0) }
    word_totals[word] = occurrences.to_f
  end

  score = {}
  @categories.each do |category, category_words|
    key = category.to_s
    score[key] = 0.0
    cat_total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
    # Nothing meaningful to compute; also avoids 0/0 (NaN / ZeroDivisionError).
    next if unique_words.zero? || cat_total.zero? || total_words.zero?

    # P(A), roughly
    cat_prob = cat_total / total_words
    posterior_sum = 0.0
    words.each do |word|
      word_total = word_totals[word]
      # P(B) would be zero for a word never seen in training; skip it rather
      # than dividing by zero.
      next if word_total.zero?

      # P(B)
      word_prob = word_total / total_words
      # P(B|A)
      word_given_prob = category_words.fetch(word, 0) / cat_total
      # Bayes' Theorem: P(A|B) = P(B|A) * P(A) / P(B)
      posterior_sum += word_given_prob * cat_prob / word_prob
    end

    # Average exactly once per category. Normalising the whole score hash
    # inside this loop would divide earlier categories repeatedly.
    score[key] = posterior_sum / unique_words
  end
  score
end

def classify(text)
(classifications(text).sort_by { |a| -a[1] })[0][0]
Expand Down

0 comments on commit 5e01d50

Please sign in to comment.