Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Ian D. Eccles
committed
May 31, 2011
1 parent
35134af
commit bca2e28
Showing
10 changed files
with
234 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# encoding: utf-8 | ||
|
||
# Author:: Ian D. Eccles | ||
# Copyright:: Copyright (c) 2011 Ian D. Eccles | ||
# License:: LGPL | ||
|
||
module SimpleBayes
  # Classifier that scores a piece of text against every known category by
  # the cosine of the angle between their tf-idf term vectors.
  #
  # The +category+ and +create_categories+ helpers are not defined in this
  # class; presumably they come from the Categorical mixin (confirm there).
  # The classifier passes itself to TermVector.tf_idf as the source of
  # inverse document frequencies.
  class CosineSimilar
    include Categorical

    # Hash holding the classifier's categories (populated via
    # +create_categories+).
    attr_reader :categories

    # cats:: category names to create up front; may be empty.
    def initialize *cats
      @categories = {}
      create_categories cats
    end

    # Wraps +text+ in a Document and stores it in the category +name+.
    def train name, text
      doc = Document.new text
      category(name).store_document doc
    end

    # Wraps +text+ in a Document and removes it from the category +name+.
    def untrain name, text
      doc = Document.new text
      category(name).remove_document doc
    end

    # Returns an array of <tt>[score, category]</tt> pairs, one per known
    # category, where score is the cosine between the category's tf-idf
    # vector and the tf-idf vector of +text+.
    def classifications text
      text_vector = TermVector.tf_idf(Document.new(text), self)
      categories.values.map do |cat|
        [ TermVector.tf_idf(cat, self).cosine(text_vector), cat ]
      end
    end

    # Returns the name of the best scoring category for +text+, or nil when
    # the classifier has no categories at all (previously this raised
    # NoMethodError by calling +name+ on nil).
    #
    # On equal scores the category that appears later in +classifications+
    # wins, because the running maximum is only kept when strictly greater.
    def classify text
      # classifications will be in the range [0, 1], so -1 is a safe seed
      best = classifications(text).inject([-1, nil]) do |max, c_pair|
        max.first > c_pair.first ? max : c_pair
      end
      winner = best.last
      winner && winner.name
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# encoding: utf-8 | ||
|
||
# Author:: Ian D. Eccles | ||
# Copyright:: Copyright (c) 2011 Ian D. Eccles | ||
# License:: LGPL | ||
|
||
|
||
# There is probably some factoring / composing to be done between this | ||
# and TermOccurrence to get one module/class to handle all the term related | ||
# counting business. | ||
module SimpleBayes
  # Sparse vector keyed by term. One component is created for every term in
  # the source object's +term_occurrences+; the component's value is produced
  # by a measurement block (or DEFAULT_MEASUREMENT when none is given).
  class TermVector
    # Measurement used when no block is supplied: every present term weighs 1.
    DEFAULT_MEASUREMENT = lambda { |t| 1 }

    # doc::     any object responding to +term_occurrences+ with an
    #           enumerable of (term, count) pairs; only the terms are used.
    # measure:: optional block receiving a term and returning its component.
    def initialize doc, &measure
      measure ||= DEFAULT_MEASUREMENT
      @vector = Hash[doc.term_occurrences.map { |t, _| [t, measure.call(t)] }]
    end

    # Component for +term+; terms absent from the vector have component 0.
    def [] term
      @vector.fetch(term, 0)
    end

    # Terms that have a component stored in this vector.
    def terms; @vector.keys; end

    # Euclidean length of the vector.
    def norm
      Math.sqrt( @vector.values.inject(0) { |sum, x| sum + x * x } )
    end

    # Returns cos A, where A is the angle between this vector and the
    # other.
    #
    # The angle is undefined when either vector has zero length (no terms,
    # or all components zero). Dividing by the zero norm product used to
    # yield NaN; such vectors are now reported as having no similarity, 0.0.
    def cosine other
      norms = (norm * other.norm).to_f
      return 0.0 if norms.zero?

      # Only this vector's terms can contribute: any other term has a zero
      # component on this side of the product.
      dot_prod = terms.inject(0) do |sum, t|
        sum + self[t] * other[t]
      end
      dot_prod / norms
    end

    class << self
      # Builds a vector whose components are tf-idf weights:
      # <tt>doc.frequency_of(term) * idf.inverse_frequency_of(term)</tt>.
      #
      # With the ability to pass a component calculation via a block,
      # it doesn't make sense to keep a separate TermVector subclass around.
      def tf_idf doc, idf
        new(doc) do |t|
          doc.frequency_of(t) * idf.inverse_frequency_of(t)
        end
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
require 'spec_helper' | ||
|
||
describe SimpleBayes::CosineSimilar do
  let(:classifier) do
    SimpleBayes::CosineSimilar.new :interesting, :uninteresting
  end
  let(:category) { mock('category') }
  let(:document) { mock('document') }

  # Training wraps the text in a Document and hands it to the named category.
  it "should train categories" do
    SimpleBayes::Document.stub(:new => document)
    classifier.stub(:category => category)
    category.should_receive(:store_document).with(document)

    classifier.train :some_category, "this is a bit of text"
  end

  # Untraining wraps the text in a Document and removes it from the category.
  it "should untrain categories" do
    SimpleBayes::Document.stub(:new => document)
    classifier.stub(:category => category)
    category.should_receive(:remove_document).with(document)

    classifier.untrain :some_category, "this is a bit of text"
  end

  # Once factored, Bayes had a very simple expression for classifications,
  # not quite so with CosineSimilar
  it "should weight categories based on the angle between them and the text" do
    # Expected inverse document frequencies after training:
    #   stuff         => log(2/2)
    #   uninteresting => log(2/1)
    #   text          => log(2/1)
    #   about         => log(2/1)
    #   interesting   => log(2/1)
    #   monotremes    => log(2/1)
    #   platypus      => 0 (ish)
    classifier.train :uninteresting, "uninteresting text stuff"
    classifier.train :interesting, "about interesting monotremes stuff"

    # Term frequencies within the text being classified:
    #   stuff      => 1/5
    #   monotremes => 2/5
    #   about      => 1/5
    #   platypus   => 1/5
    weighted = classifier.classifications("monotremes stuff about monotremes platypus")

    weighted.should =~ [
      [0.7745966692414833, classifier.category(:interesting)],
      [0, classifier.category(:uninteresting)]
    ]
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
require 'spec_helper' | ||
|
||
describe SimpleBayes::TermVector do
  # Stand-in for a document: only term_occurrences is consulted.
  let(:terms) do
    occurrences = {
      'hello' => 19,
      'there' => 310,
      'testing' => 9,
      'faces' => 2
    }
    mock('terms', :term_occurrences => occurrences)
  end
  let(:vector) { SimpleBayes::TermVector.new(terms) }

  it "should access components with []" do
    # Without a block, present terms measure 1 and absent terms 0.
    vector['hello'].should == 1
    vector['faces'].should == 1
    vector['nothing'].should == 0
  end

  it "should return a list of its terms" do
    vector.terms.should =~ ['hello', 'there', 'testing', 'faces']
  end

  it "should calculate a norm" do
    # Four components of 1 each: sqrt(4)
    vector.norm.should == 2.0
  end

  it "should measure components with a provided block" do
    by_length = SimpleBayes::TermVector.new(terms) { |term| term.length }

    by_length['hello'].should == 5
    by_length['testing'].should == 7
    by_length['nothing'].should == 0
    by_length.norm.should == Math.sqrt(25 + 25 + 49 + 25)
  end

  it "should calculate the cosine of the angle between it and another" do
    other_occurrences = {
      'fresh' => 1,
      'testing' => 5,
      'there' => 3,
      'help' => 2,
      'pants' => 4,
      'fridge' => 9,
      'lost' => 6,
      'kitten' => 8,
      'popsicle' => 7
    }
    other_terms = mock('other terms', :term_occurrences => other_occurrences)
    other_vector = SimpleBayes::TermVector.new(other_terms)

    # Norms should be 2.0 and 3.0
    # dot product should be 2
    vector.cosine(other_vector).should == 1 / 3.0
  end
end