working on testing

livingsocial · Dec 19, 2010 · a23658a · a23658a
commit a23658a
Show file tree

Hide file tree

Showing 11 changed files with 1,014 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+docs
+pkg
diff --git a/LICENSE b/LICENSE
diff --git a/README.rdoc b/README.rdoc
@@ -0,0 +1,127 @@
+= ABAnalyzer
+
+ABAnalyzer is a Ruby library that performs statistical tests (chi-square and G-test) to determine whether there is a statistically significant difference in categorical data, such as the results of an A/B test.
+
+ABAnalyzer 
+
+== Installation
+First, install HBase/Hadoop or Cassandra (>= 0.7.0-rc2).  Then, install the appropriate gem:
+  gem install hbaserb
+  # or
+  gem install cassandra
+
+If you're using HBase, make sure the HBase Thrift interface has been started as well.  Then:
+  gem install ankusa
+
+== Basic Usage
+Using the naive Bayes classifier:
+
+  require 'rubygems'
+v = { :rep => { :male => 200, :female => 250 }, :dem => { :male => 150, :female => 300}, :ind => { :male => 50, :female => 50 }}
+v = { :m => { :g => 127, :s => 99, :b => 264 }, :g => { :g => 116, :s => 67, :b => 161 } }
+v = { :non => { :bought => 80, :didnt => 9920 }, :sent => { :bought => 160, :didnt => 9840 } }
+r = ABAnalyzer::ABTest.new v
+puts r.different?
+
+
+
+
+
+
+
+  require 'ankusa'
+  require 'ankusa/hbase_storage'
+
+  # connect to HBase 
+  storage = Ankusa::HBaseStorage.new 'localhost'
+  c = Ankusa::NaiveBayesClassifier.new storage
+
+  # Each of these calls will return a bag-of-words
+  # hash with stemmed words as keys and counts as values
+  c.train :spam, "This is some spammy text"
+  c.train :good, "This is not the bad stuff"
+
+  # This will return the most likely class (as symbol)
+  puts c.classify "This is some spammy text"
+
+  # This will return Hash with classes as keys and 
+  # membership probability as values
+  puts c.classifications "This is some spammy text"
+
+  # If you have a large corpus, the probabilities will
+  # likely all be 0.  In that case, you must use log
+  # likelihood values
+  puts c.log_likelihoods "This is some spammy text"
+
+  # get a list of all classes
+  puts c.classnames
+
+  # close connection
+  storage.close
+
+
+== KL Divergence Classifier
+There is a Kullback–Leibler divergence classifier as well.  KL divergence is a distance measure (though not a true metric because it does not satisfy the triangle inequality).  The KL classifier simply measures the relative entropy between the text you want to classify and each of the classes.  The class with the shortest "distance" is the best class.  You may find that for an especially large corpus it may be slightly faster to use this classifier (since prior probabilities are never calculated, only likelihoods).
+
+The API is the same as the NaiveBayesClassifier, except rather than calling "classifications" if you want actual numbers you call "distances".  
+
+  require 'rubygems'
+  require 'ankusa'
+  require 'ankusa/hbase_storage'
+
+  # connect to HBase 
+  storage = Ankusa::HBaseStorage.new 'localhost'
+  c = Ankusa::KLDivergenceClassifier.new storage
+
+  # Each of these calls will return a bag-of-words
+  # hash with stemmed words as keys and counts as values
+  c.train :spam, "This is some spammy text"
+  c.train :good, "This is not the bad stuff"
+
+  # This will return the most likely class (as symbol)
+  puts c.classify "This is some spammy text"
+
+  # This will return Hash with classes as keys and 
+  # distances >= 0 as values
+  puts c.distances "This is some spammy text"
+
+  # get a list of all classes
+  puts c.classnames
+
+  # close connection
+  storage.close
+
+== Storage Methods
+Ankusa has a generalized storage interface that has been implemented for HBase, Cassandra, and in-memory storage.
+
+Memory storage can be used when you have a very small corpus:
+  require 'ankusa/memory_storage'
+  storage = Ankusa::MemoryStorage.new
+
+HBase storage:
+  require 'ankusa/hbase_storage'
+  # defaults: host='localhost', port=9090, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary"
+  storage = Ankusa::HBaseStorage.new host, port, frequency_tablename, summary_tablename
+
+For Cassandra storage:
+* You will need Cassandra version 0.7.0-rc2 or greater.
+* You will need to set a maximum number of classes, since the current implementation of the Ruby Cassandra client doesn't support table scans.
+* Prior to using the Cassandra storage you will need to run the following command from the cassandra-cli: "create keyspace ankusa with replication_factor = 1".  This should be fixed with a new release candidate for Cassandra.
+
+To use the Cassandra storage class:
+  require 'ankusa/cassandra_storage'
+  # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
+  storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
+
+
+== Running Tests
+You can run the tests for any of the three storage methods.  For instance, for memory storage:
+  rake test_memory
+
+For the other methods you will need to edit the file test/config.yml and set the configuration params.  Then:
+  rake test_hbase
+  # or
+  rake test_cassandra
+
+
+
diff --git a/Rakefile b/Rakefile
@@ -0,0 +1,42 @@
+require 'rubygems'
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/gempackagetask'
+
+desc "Create documentation"
+Rake::RDocTask.new("doc") { |rdoc|
+  rdoc.title = "ABAnalyzer - A/B test analysis library for Ruby"
+  rdoc.rdoc_dir = 'docs'
+  rdoc.rdoc_files.include('README.rdoc')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+}
+
+desc "Run all unit tests"
+Rake::TestTask.new("test") { |t|
+  t.libs << "lib"
+  t.test_files = FileList['test/*_test.rb']
+  t.verbose = true
+}
+
+spec = Gem::Specification.new do |s|
+  s.name = "abanalyzer"
+  s.version = "0.0.1"
+  s.authors = ["Brian Muller"]
+  s.date = %q{2010-12-19}
+  s.description = "A/B test analysis library for Ruby - performs Chi-Square tests and G-tests on A/B results."
+  s.summary = "Performs statistical tests for significant differences in categorical data."
+  s.email = "brian.muller@livingsocial.com"
+  s.files = FileList["lib/**/*", "[A-Z]*", "Rakefile", "docs/**/*"]
+  s.homepage = "https://github.com/livingsocial/abanalyzer"
+  s.require_paths = ["lib"]
+  s.add_dependency('statistics', '>= 0.54')
+end
+
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.need_zip = true
+  pkg.need_tar = true
+end
+
+desc "Default task: builds gem and runs tests"
+task :default => [ :gem, :test ]
diff --git a/lib/abanalyzer.rb b/lib/abanalyzer.rb
@@ -0,0 +1,3 @@
+require 'abanalyzer/matrix'
+require 'abanalyzer/abtest'
+require 'abanalyzer/exceptions'
diff --git a/lib/abanalyzer/abtest.rb b/lib/abanalyzer/abtest.rb
@@ -0,0 +1,51 @@
+require 'statistics2'
+
+module ABAnalyzer
+
+  class ABTest
+    # values should be hash of hashes, with top level hash the group names:
+    # { :groupa => { :yes => 20, :no => 10 }, :groupb => { :yes => 18, :no => 8 } }
+    def initialize(values)
+      @values = Matrix.new values
+    end
+
+    def different?(sig=0.05)
+      gtest_p < sig
+    end
+
+    def chisquare_p
+      sum = 0
+      @values.each_cell { |colname, rowname, value|
+        ex = expected(colname, rowname)
+        test_sufficient_data(colname, rowname, ex, value)
+        sum += ((value - ex) ** 2) / ex
+      }
+      1 - Statistics2.chi2dist(df, sum)
+    end
+
+    def gtest_p
+      sum = 0
+      @values.each_cell { |colname, rowname, value|
+        ex = expected(colname, rowname)
+        test_sufficient_data(colname, rowname, ex, value)
+        sum += value * Math.log(value / ex)
+      }
+      1 - Statistics2.chi2dist(df, 2*sum)      
+    end
+
+    private
+    def test_sufficient_data(colname, rowname, expected, value)
+      msg = "Insufficient data size for column #{colname} row #{rowname}.  Expected value must be >= 5, and value must be > 0."
+      raise InsufficientDataError, msg if expected < 5 or value == 0
+    end
+
+    def expected(colname, rowname)
+      (@values.row_sum(rowname) * @values.column_sum(colname)).to_f / @values.total_sum.to_f
+    end
+
+    def df
+      (@values.columns.length - 1) * (@values.rows.length - 1)
+    end
+  end
+
+end
diff --git a/lib/abanalyzer/exceptions.rb b/lib/abanalyzer/exceptions.rb
@@ -0,0 +1,7 @@
+module ABAnalyzer
+  class InsufficientDataError < RuntimeError
+  end
+
+  class MatrixFormatError < RuntimeError
+  end
+end
diff --git a/lib/abanalyzer/matrix.rb b/lib/abanalyzer/matrix.rb
@@ -0,0 +1,58 @@
+module ABAnalyzer
+
+  class Matrix
+    attr_reader :columns, :rows
+    def initialize(values)
+      @values = values
+      @columns = values.keys
+      @rows = values[@columns.first].keys
+      validate
+    end
+
+    def validate
+      @values.each { |colname, column|
+        if column.keys.map { |s| s.to_s }.sort != @rows.map { |s| s.to_s }.sort
+          raise MatrixFormatError, "Column #{colname} has row names that don't match the first column's."
+        end
+      }
+      coltotal = @columns.map { |col| column_sum(col) }.inject { |a,b| a+b }
+      rowtotal = @rows.map { |col| row_sum(col) }.inject { |a,b| a+b }
+      raise MatrixFormatError, "Column sums do not equal row sums" if coltotal != rowtotal
+    end
+
+    def get_column(name)
+      @values[name].values
+    end
+
+    def get_row(name)
+      @values.map { |colname, rows|
+        rows[name]
+      }
+    end
+
+    def each_cell
+      @columns.each { |colname|
+        @rows.each { |rowname|
+          yield colname, rowname, get(colname, rowname)
+        }
+      }
+    end
+
+    def get(colname, rowname)
+      @values[colname][rowname]
+    end
+
+    def column_sum(name)
+      get_column(name).inject { |a,b| a+b }
+    end
+
+    def row_sum(name)
+      get_row(name).inject { |a,b| a+b }
+    end
+
+    def total_sum
+      @columns.map { |col| column_sum(col) }.inject { |a,b| a+b }      
+    end
+  end
+
+end
diff --git a/test/abtest_test.rb b/test/abtest_test.rb
@@ -0,0 +1,24 @@
+require File.join File.dirname(__FILE__), 'helper'
+
+class ABTestTest < Test::Unit::TestCase
+  def setup
+    @values = { :rep => { :male => 200, :female => 250 }, :dem => { :male => 150, :female => 300}, :ind => { :male => 50, :female => 50 }}
+  end
+
+  def test_test_creation
+    assert_raise ABAnalyzer::InsufficientDataError do 
+      m = ABAnalyzer::ABTest.new({ :one => { :a => 10, :b => 20 }, :two => { :a => 5, :b => 0 } })
+      p = m.gtest_p
+    end
+
+    assert_raise ABAnalyzer::InsufficientDataError do 
+      m = ABAnalyzer::ABTest.new({ :one => { :a => 1, :b => 1 }, :two => { :a => 1, :b => 1 } })
+      p = m.gtest_p
+    end
+  end
+
+  def test_matrix_functions
+    m = ABAnalyzer::Matrix.new @values
+    assert_equal 1, 1
+  end
+end
diff --git a/test/helper.rb b/test/helper.rb
@@ -0,0 +1,5 @@
+require 'rubygems'
+require 'test/unit'
+
+# Put this checkout's lib/ directory at the front of the load path so that
+# `require 'abanalyzer'` resolves to it first.
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'abanalyzer'
diff --git a/test/matrix_test.rb b/test/matrix_test.rb
@@ -0,0 +1,21 @@
+require File.join File.dirname(__FILE__), 'helper'
+
+class MatrixTest < Test::Unit::TestCase
+  def setup
+    @values = { :rep => { :male => 200, :female => 250 }, :dem => { :male => 150, :female => 300}, :ind => { :male => 50, :female => 50 }}
+  end
+
+  def test_matrix_creation
+    assert_raise ABAnalyzer::MatrixFormatError do 
+      ABAnalyzer::Matrix.new({ :one => { :a => 10, :b => 20 }, :two => { :a => 5 } })
+    end
+  end
+
+  def test_matrix_functions
+    m = ABAnalyzer::Matrix.new @values
+    assert_equal m.get(:rep, :male), 200
+    assert_equal m.column_sum(:rep), 450
+    assert_equal m.row_sum(:male), 400
+    assert_equal m.total_sum, 1000
+  end
+end