Skip to content

Commit

Permalink
term vectors and cosine similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
Ian D. Eccles committed May 31, 2011
1 parent 35134af commit bca2e28
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 3 deletions.
4 changes: 2 additions & 2 deletions Gemfile
Expand Up @@ -4,6 +4,6 @@ gemspec

group :development do
gem 'guard', '~> 0.3.4'
gem 'guard-rspec', '~> 0.3.1'
gem 'guard-rspec', '~> 0.3.1'
gem 'growl', '~> 1.0.3'
end
end
2 changes: 2 additions & 0 deletions lib/simple_bayes.rb
Expand Up @@ -32,4 +32,6 @@
require 'simple_bayes/term_occurrence'
require 'simple_bayes/document'
require 'simple_bayes/category'
require 'simple_bayes/term_vector'
require 'simple_bayes/bayes'
require 'simple_bayes/cosine_similar'
10 changes: 10 additions & 0 deletions lib/simple_bayes/categorical.rb
Expand Up @@ -25,5 +25,15 @@ def add_category name
# Removes the category stored under +name+. The name is converted to a
# Symbol before the lookup; the result of the delete call on +categories+
# is returned.
def remove_category name
  key = name.to_sym
  categories.delete key
end

# Returns the categories in which +term+ occurs at least once, i.e. those
# whose occurrences_of(term) is greater than zero.
def categories_including term
  categories.each_value.select do |category|
    category.occurrences_of(term) > 0
  end
end

# Inverse frequency of +term+ across the categories: the natural log of
# (total number of categories / number of categories containing the term).
# Evaluates to 0 when no category contains the term.
def inverse_frequency_of term
  matching = categories_including(term).size
  if matching.zero?
    0
  else
    Math.log(categories.size / matching.to_f)
  end
end
end
end
42 changes: 42 additions & 0 deletions lib/simple_bayes/cosine_similar.rb
@@ -0,0 +1,42 @@
# encoding: utf-8

# Author:: Ian D. Eccles
# Copyright:: Copyright (c) 2011 Ian D. Eccles
# License:: LGPL

module SimpleBayes
  # Classifier that scores a text against every category using the cosine
  # between their tf-idf term vectors.
  class CosineSimilar
    include Categorical

    attr_reader :categories

    # Starts with an empty category table and registers each of +cats+
    # through create_categories.
    def initialize *cats
      @categories = {}
      create_categories cats
    end

    # Wraps +text+ in a Document and stores it in the category +name+.
    def train name, text
      document = Document.new(text)
      category(name).store_document document
    end

    # Wraps +text+ in a Document and removes it from the category +name+.
    def untrain name, text
      document = Document.new(text)
      category(name).remove_document document
    end

    # Returns one [cosine, category] pair per known category, where the
    # cosine is taken between the category's tf-idf vector and the tf-idf
    # vector of +text+.
    def classifications text
      query = TermVector.tf_idf(Document.new(text), self)
      categories.values.map do |cat|
        similarity = TermVector.tf_idf(cat, self).cosine(query)
        [ similarity, cat ]
      end
    end

    # Returns the name of the best scoring category for +text+.
    def classify text
      # Scores are expected to lie in [0, 1], so the -1 seed loses to any
      # real score. The comparison is strict, so on a tie the later
      # category wins.
      best = [-1, nil]
      classifications(text).each do |scored|
        best = scored unless best.first > scored.first
      end
      best.last.name
    end
  end
end
6 changes: 5 additions & 1 deletion lib/simple_bayes/term_occurrence.rb
Expand Up @@ -37,11 +37,15 @@ def remove_term term, count=nil
0
end
end

# Number of times +term+ has been recorded in term_occurrences, or 0 when
# the term has no entry.
def occurrences_of term
  term_occurrences.fetch(term, 0)
end

# Relative frequency of +term+: its occurrence count divided by the total
# number of occurrences, as a Float.
#
# Returns 0.0 when nothing has been stored at all. Without that guard the
# division would be 0 / 0.0, which is NaN and would poison every tf-idf
# component computed from it.
def frequency_of term
  total = total_occurrences
  return 0.0 if total.zero?
  occurrences_of(term) / total.to_f
end

# Sum of the counts of every term held in term_occurrences (0 when empty).
def total_occurrences
  term_occurrences.values.inject(0, :+)
end
Expand Down
50 changes: 50 additions & 0 deletions lib/simple_bayes/term_vector.rb
@@ -0,0 +1,50 @@
# encoding: utf-8

# Author:: Ian D. Eccles
# Copyright:: Copyright (c) 2011 Ian D. Eccles
# License:: LGPL


# There is probably some factoring / composing to be done between this
# and TermOccurrence to get one module/class to handle all the term related
# counting business.
module SimpleBayes
  # A sparse vector with one component per term of a document-like object.
  # Terms without a component read as 0.
  class TermVector
    # Measurement used when no block is given: every term weighs 1.
    DEFAULT_MEASUREMENT = lambda { |t| 1 }

    # Builds one component per key of doc.term_occurrences. The component
    # value is whatever the +measure+ block returns for the term (the stored
    # occurrence count itself is ignored); without a block every component
    # is 1.
    def initialize doc, &measure
      measure ||= DEFAULT_MEASUREMENT
      @vector = Hash[doc.term_occurrences.map { |t, _| [t, measure.call(t)] }]
    end

    # Component for +term+, or 0 when the vector has no such term.
    def [] term
      @vector.key?(term) ? @vector[term] : 0
    end

    # The terms that have a component in this vector.
    def terms; @vector.keys; end

    # Euclidean length of the vector.
    def norm
      Math.sqrt( terms.inject(0) { |sum, t| sum + self[t]*self[t] } )
    end

    # Returns cos A, where A is the angle between this vector and the
    # other.
    #
    # A vector of length zero has no direction, so when either norm is zero
    # the result is 0.0 rather than the NaN that 0 / 0.0 would produce.
    def cosine other
      norms = norm * other.norm
      return 0.0 if norms.zero?
      # Only this vector's terms are visited; any term missing from +other+
      # contributes 0 through other[t].
      dot_prod = terms.inject(0) do |sum, t|
        sum + self[t] * other[t]
      end
      dot_prod / norms.to_f
    end

    class << self
      # With the ability to pass a component calculation via a block,
      # it doesn't make sense to keep a separate TermVector subclass around.
      #
      # Builds a vector whose components are
      # doc.frequency_of(term) * idf.inverse_frequency_of(term).
      def tf_idf doc, idf
        new(doc) do |t|
          doc.frequency_of(t) * idf.inverse_frequency_of(t)
        end
      end
    end
  end
end
15 changes: 15 additions & 0 deletions spec/simple_bayes/categorical_spec.rb
Expand Up @@ -46,4 +46,19 @@
classifier.remove_category :lame
classifier.category_names.should =~ ['CUIDADO LLAMA', :blather_blather]
end

# categories_including should list exactly the categories that have stored
# the term, and be empty for a term that no category has stored.
it "should return the categories with at least one occurrence of a term" do
  classifier.create_categories [:cat1, 'CUIDADO LLAMA', :blather_blather]
  classifier.category(:cat1).store_term 'hello'
  classifier.category(:blather_blather).store_term 'hello'
  classifier.categories_including('hello').should =~ [ classifier.category(:cat1),
    classifier.category(:blather_blather) ]
  classifier.categories_including('sinner').should be_empty
end

# With five categories, two of which are stubbed as containing the term,
# the inverse frequency must be log(5 / 2).
it "should compute the inverse frequency of a term" do
  names = [:cat1, :cat2, :cat3, :cat4, :cat5]
  classifier.create_categories names
  classifier.stub(:categories_including => [1, 2])
  expected = Math.log(5 / 2.0)
  classifier.inverse_frequency_of('test').should == expected
end
end
44 changes: 44 additions & 0 deletions spec/simple_bayes/cosine_similar_spec.rb
@@ -0,0 +1,44 @@
require 'spec_helper'

describe SimpleBayes::CosineSimilar do
  let(:classifier) do
    SimpleBayes::CosineSimilar.new(:interesting, :uninteresting)
  end
  let(:category) { mock('category') }
  let(:document) { mock('document') }

  # Training must hand the freshly built document to the named category.
  it "should train categories" do
    SimpleBayes::Document.stub(:new => document)
    classifier.stub(:category => category)
    category.should_receive(:store_document).with(document)
    classifier.train :some_category, "this is a bit of text"
  end

  # Untraining must ask the named category to remove the document.
  it "should untrain categories" do
    SimpleBayes::Document.stub(:new => document)
    classifier.stub(:category => category)
    category.should_receive(:remove_document).with(document)
    classifier.untrain :some_category, "this is a bit of text"
  end

  # Once factored, Bayes had a very simple expression for classifications,
  # not quite so with CosineSimilar
  it "should weight categories based on the angle between them and the text" do
    # idf of each term after training:
    #   stuff         => log(2/2)
    #   uninteresting => log(2/1)
    #   text          => log(2/1)
    #   about         => log(2/1)
    #   interesting   => log(2/1)
    #   monotremes    => log(2/1)
    #   platypus      => 0 (ish)
    classifier.train :uninteresting, "uninteresting text stuff"
    classifier.train :interesting, "about interesting monotremes stuff"
    # tf of each term in the text being classified:
    #   stuff      => 1/5
    #   monotremes => 2/5
    #   about      => 1/5
    #   platypus   => 1/5
    scores = classifier.classifications("monotremes stuff about monotremes platypus")
    scores.should =~ [
      [0.7745966692414833, classifier.category(:interesting)],
      [0, classifier.category(:uninteresting)]
    ]
  end
end
9 changes: 9 additions & 0 deletions spec/simple_bayes/term_occurrence_spec.rb
Expand Up @@ -81,4 +81,13 @@
term_occurrence.remove_term 'hello', 1
term_occurrence.total_occurrences.should == 7
end

# After storing 5 x 'hello' and 3 x 'world' and removing one 'hello', the
# frequencies are taken over the 7 remaining occurrences.
it "should calculate the frequency of a term" do
  [['hello', 5], ['world', 3]].each do |term, count|
    term_occurrence.store_term term, count
  end
  term_occurrence.remove_term 'hello', 1
  [['hello', 4 / 7.0], ['world', 3 / 7.0], ['nothing', 0.0]].each do |term, expected|
    term_occurrence.frequency_of(term).should == expected
  end
end
end
55 changes: 55 additions & 0 deletions spec/simple_bayes/term_vector_spec.rb
@@ -0,0 +1,55 @@
require 'spec_helper'

describe SimpleBayes::TermVector do
  let(:terms) do
    counts = { 'hello' => 19, 'there' => 310, 'testing' => 9, 'faces' => 2 }
    mock('terms', :term_occurrences => counts)
  end
  let(:vector) { SimpleBayes::TermVector.new(terms) }

  # Without a block every known term measures 1; unknown terms read as 0.
  it "should access components with []" do
    vector['hello'].should == 1
    vector['faces'].should == 1
    vector['nothing'].should == 0
  end

  it "should return a list of its terms" do
    vector.terms.should =~ %w[hello there testing faces]
  end

  # Four components of 1 give a length of sqrt(4).
  it "should calculate a norm" do
    vector.norm.should == 2.0
  end

  it "should measure components with a provided block" do
    by_length = SimpleBayes::TermVector.new(terms) { |t| t.length }
    by_length['hello'].should == 5
    by_length['testing'].should == 7
    by_length['nothing'].should == 0
    by_length.norm.should == Math.sqrt(5*5 + 5*5 + 7*7 + 5*5)
  end

  it "should calculate the cosine of the angle between it and another" do
    other_counts = {
      'fresh' => 1,
      'testing' => 5,
      'there' => 3,
      'help' => 2,
      'pants' => 4,
      'fridge' => 9,
      'lost' => 6,
      'kitten' => 8,
      'popsicle' => 7
    }
    other_terms = mock('other terms', :term_occurrences => other_counts)
    other_vector = SimpleBayes::TermVector.new(other_terms)
    # The norms are 2.0 and 3.0; the two shared terms ('testing' and
    # 'there') give a dot product of 2.
    vector.cosine(other_vector).should == 1 / 3.0
  end
end

0 comments on commit bca2e28

Please sign in to comment.