Skip to content

Commit

Permalink
Merge branch 'neighbor_regions'
Browse files Browse the repository at this point in the history
Conflicts:
	results.txt
  • Loading branch information
nirvdrum committed Aug 24, 2009
2 parents 939c266 + 9a65743 commit cc28e50
Show file tree
Hide file tree
Showing 25 changed files with 9,158 additions and 142 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
.idea
cache/
test/cache/
13 changes: 13 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
require 'rake'
require 'rake/testtask'

desc 'Default: run unit tests.'
task :default => :test

desc 'Test the app.'
# Configure the :test task to run every *_test.rb file under test/,
# with both the project root and test/ on the load path so tests can
# require application files directly.
Rake::TestTask.new(:test) do |t|
  t.libs << '.'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end
25 changes: 25 additions & 0 deletions cache.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require 'memcache'

class Cache

  # Root directory for marshalled cache entries (created on demand by fetch).
  def self.cache_dir
    # Class instance variable instead of a @@class variable: same behavior
    # here, but avoids the shared-across-subclasses pitfall.
    @cache_dir ||= 'cache'
  end

  # Returns the value for +key+, computing it with the given block on a miss.
  #
  # Lookup order:
  #   1. the in-memory hash
  #   2. a marshalled file under cache_dir
  #   3. the block (result is memoized in memory and persisted to disk)
  #
  # NOTE: a nil result is never cached, so blocks that legitimately
  # produce nil are re-run on every call.
  def self.fetch(key, &block)
    @cache ||= {}

    # Return the cached value if already in memory.
    return @cache[key] unless @cache[key].nil?

    path = File.join(cache_dir, key)

    # Try to load from file. Marshal data is only safe because we wrote it
    # ourselves; never point cache_dir at externally supplied files.
    if File.file?(path)
      begin
        @cache[key] = File.open(path, 'rb') { |io| Marshal.load(io) }
      rescue StandardError
        # Treat an unreadable or corrupt cache file as a miss.
        @cache[key] = nil
      end
      return @cache[key] unless @cache[key].nil?
    end

    # Barring all else, perform the operation to obtain the value to cache.
    @cache[key] = block.call

    # Fix: the original crashed with Errno::ENOENT when persisting a fresh
    # value while the cache directory did not exist (only the load was
    # rescued, not the dump). Create the directory before writing.
    Dir.mkdir(cache_dir) unless File.directory?(cache_dir)
    File.open(path, 'wb') { |f| Marshal.dump(@cache[key], f) }

    @cache[key]
  end
end
95 changes: 54 additions & 41 deletions data_loader.rb
Original file line number Diff line number Diff line change
@@ -1,65 +1,78 @@
require 'rubygems'
require 'ai4r'
require 'memoize'

require 'repository'
require 'watcher'
require 'cache'

class DataLoader

def self.load_watchings(data_dir='data')
data_labels = ['user_id', 'repo_id']
data_items = []
def self.load_watchings
Cache.fetch('watchings') do
data_labels = ['user_id', 'repo_id']
data_items = []

IO.foreach(File.join(data_dir, 'data.txt')) do |line|
data_items << line.strip.split(':')
end
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
data_items << line.strip.split(':')
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
end
end

def self.load_repositories(data_dir='data')
repositories = {}

relationships = {}
def self.load_repositories
Cache.fetch('repositories') do
@@repositories ||= {}
return @@repositories unless @@repositories.empty?

IO.foreach(File.join(data_dir, 'repos.txt')) do |line|
repo_id, repo_data = line.strip.split(':')
name, created_at, parent_id = repo_data.split(',')
relationships = {}

# Add the repository to the result hash.
repositories[repo_id] = Repository.new(repo_id, name, created_at)
IO.foreach(File.join(data_dir, 'repos.txt')) do |line|
repo_id, repo_data = line.strip.split(':')
name, created_at, parent_id = repo_data.split(',')

# Keep track of parent-child relationships.
relationships[repo_id] = parent_id unless parent_id.nil?
end
# Add the repository to the result hash.
@@repositories[repo_id] = Repository.new(repo_id, name, created_at)

# Now that all the repositories have been loaded, establish any parent-child relationships.
relationships.each do |child_id, parent_id|
repositories[child_id].parent = repositories[parent_id]
end
# Keep track of parent-child relationships.
relationships[repo_id] = parent_id unless parent_id.nil?
end

# Load in the watchers.
watchers = {}
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
user_id, repo_id = line.strip.split(':')
watcher = watchers[user_id] || Watcher.new(user_id)
watchers[user_id] = watcher
repositories[repo_id].watchers << watcher
end
# Now that all the repositories have been loaded, establish any parent-child relationships.
relationships.each do |child_id, parent_id|
@@repositories[child_id].parent = @@repositories[parent_id]
end

repositories
# Load in the watchers.
watchers = {}
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
user_id, repo_id = line.strip.split(':')
watcher = watchers[user_id] || Watcher.new(user_id)
watchers[user_id] = watcher
@@repositories[repo_id].watchers << watcher
end

@@repositories
end
end

def self.load_predictings(data_file='data/test.txt')
data_labels = ['user_id']
data_items = []
def self.load_predictings
Cache.fetch('predictings') do
data_labels = ['user_id']
data_items = []

IO.foreach(data_file) do |line|
data_items << [line.strip]
IO.foreach(File.join(data_dir, 'test.txt')) do |line|
data_items << [line.strip]
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
end
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
def self.data_dir
@@data_dir ||= 'data'
end
end
end
30 changes: 30 additions & 0 deletions data_set_utilities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,34 @@ def cross_validation(num_folds)
folds
end

# Converts this data set's raw (user_id, repo_id) samples into model objects.
#
# Returns a hash with two entries:
#   :watchers     => {user_id => Watcher}
#   :repositories => {repo_id => Repository}
def to_models
  watchers = {}
  repositories = {}

  raw_repositories = DataLoader.load_repositories

  # Discover watchers, repositories, and watcher <=> repository mappings.
  data_items.each do |sample|
    user_id, repo_id = sample

    watchers[user_id] ||= Watcher.new user_id

    unless repo_id.nil?
      raw = raw_repositories[repo_id]
      # Fix: this line was corrupted in the source ("Repository.new attr_reader
      # :epo_id, ..."); reconstructed to pass the repo id as the first argument,
      # matching Repository.new(repo_id, name, created_at) used elsewhere.
      # NOTE(review): assumes Repository exposes #owner, #name and #created_at
      # readers — confirm against repository.rb.
      repositories[repo_id] ||= Repository.new repo_id, "#{raw.owner}/#{raw.name}", raw.created_at
      watchers[user_id].associate repositories[repo_id]
    end
  end

  # Map parent-child repo relationships. Since raw_repositories may consist of repo <=> watchers or repo <=> repo
  # that do not exist in the data set, make sure we always look up in the local repo list.
  raw_repositories.each do |repo_id, repo|
    if !repositories[repo_id].nil? && !repo.parent.nil? && !repositories[repo.parent.id].nil?
      repositories[repo_id].parent = repositories[repo.parent.id]
    end
  end

  {:watchers => watchers, :repositories => repositories}
end


end
15 changes: 12 additions & 3 deletions ext/data_set.rb
Original file line number Diff line number Diff line change
@@ -1,22 +1,31 @@
require 'rubygems'
require 'ai4r'
require 'enumerator'
require 'memoize'

require 'data_set_utilities'

module Ai4r
module Data
class DataSet

include Memoize
include DataSetUtilities

alias :old_initialize :initialize
def initialize(*args)
old_initialize *args

memoize :to_models
end

def stratify(num_folds)
# Although the data will ultimately be sorted by class value, the entries within that class value should be
# randomized to start. Otherwise, stratification will always lead to the same resulting folds.
randomized = data_items.sort_by { rand }
randomized = data_items #.sort_by { rand }

# Sort the data items by class so we can ensure the folds match the underlying distribution.
sorted = data_items.sort { |x,y| x.last <=> y.last }
sorted = randomized.sort { |x,y| x.last <=> y.last }

# Split the sorted data into folds by grabbing every num_folds item out of the data. This should ensure
# that each fold matches the underlying data distribution.
Expand All @@ -33,7 +42,7 @@ def stratify(num_folds)

# Randomize the data again to guard against classifiers that are prone to choosing the first
# sample drawn from the data set.
folds << Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => fold.sort_by { rand })
folds << Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => fold)#.sort_by { rand })
end

folds
Expand Down
Loading

0 comments on commit cc28e50

Please sign in to comment.