Skip to content

Commit

Permalink
Merge branch 'neighbor_regions'
Browse files Browse the repository at this point in the history
Conflicts:
	results.txt
  • Loading branch information
nirvdrum committed Aug 24, 2009
2 parents 939c266 + 9a65743 commit cc28e50
Show file tree
Hide file tree
Showing 25 changed files with 9,158 additions and 142 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
.idea
cache/
test/cache/
13 changes: 13 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
require 'rake'
require 'rake/testtask'

desc 'Default: run unit tests.'
task :default => :test

desc 'Test the app.'
# Configure the :test task to run every *_test.rb file under test/,
# with both the project root and test/ on the load path so tests can
# require application files directly.
Rake::TestTask.new(:test) do |t|
  t.libs << '.'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end
25 changes: 25 additions & 0 deletions cache.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
require 'memcache'

class Cache

  # Root directory for marshalled cache entries (created on demand by fetch).
  def self.cache_dir
    # Class instance variable instead of a @@class variable: same behavior
    # here, but avoids the shared-across-subclasses pitfall.
    @cache_dir ||= 'cache'
  end

  # Returns the value for +key+, computing it with the given block on a miss.
  #
  # Lookup order:
  #   1. the in-memory hash
  #   2. a marshalled file under cache_dir
  #   3. the block (result is memoized in memory and persisted to disk)
  #
  # NOTE: a nil result is never cached, so blocks that legitimately
  # produce nil are re-run on every call.
  def self.fetch(key, &block)
    @cache ||= {}

    # Return the cached value if already in memory.
    return @cache[key] unless @cache[key].nil?

    path = File.join(cache_dir, key)

    # Try to load from file. Marshal data is only safe because we wrote it
    # ourselves; never point cache_dir at externally supplied files.
    if File.file?(path)
      begin
        @cache[key] = File.open(path, 'rb') { |io| Marshal.load(io) }
      rescue StandardError
        # Treat an unreadable or corrupt cache file as a miss.
        @cache[key] = nil
      end
      return @cache[key] unless @cache[key].nil?
    end

    # Barring all else, perform the operation to obtain the value to cache.
    @cache[key] = block.call

    # Fix: the original crashed with Errno::ENOENT when persisting a fresh
    # value while the cache directory did not exist (only the load was
    # rescued, not the dump). Create the directory before writing.
    Dir.mkdir(cache_dir) unless File.directory?(cache_dir)
    File.open(path, 'wb') { |f| Marshal.dump(@cache[key], f) }

    @cache[key]
  end
end
95 changes: 54 additions & 41 deletions data_loader.rb
Original file line number Diff line number Diff line change
@@ -1,65 +1,78 @@
require 'rubygems'
require 'ai4r'
require 'memoize'

require 'repository'
require 'watcher'
require 'cache'

class DataLoader

def self.load_watchings(data_dir='data')
data_labels = ['user_id', 'repo_id']
data_items = []
def self.load_watchings
Cache.fetch('watchings') do
data_labels = ['user_id', 'repo_id']
data_items = []

IO.foreach(File.join(data_dir, 'data.txt')) do |line|
data_items << line.strip.split(':')
end
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
data_items << line.strip.split(':')
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
end
end

def self.load_repositories(data_dir='data')
repositories = {}

relationships = {}
def self.load_repositories
Cache.fetch('repositories') do
@@repositories ||= {}
return @@repositories unless @@repositories.empty?

IO.foreach(File.join(data_dir, 'repos.txt')) do |line|
repo_id, repo_data = line.strip.split(':')
name, created_at, parent_id = repo_data.split(',')
relationships = {}

# Add the repository to the result hash.
repositories[repo_id] = Repository.new(repo_id, name, created_at)
IO.foreach(File.join(data_dir, 'repos.txt')) do |line|
repo_id, repo_data = line.strip.split(':')
name, created_at, parent_id = repo_data.split(',')

# Keep track of parent-child relationships.
relationships[repo_id] = parent_id unless parent_id.nil?
end
# Add the repository to the result hash.
@@repositories[repo_id] = Repository.new(repo_id, name, created_at)

# Now that all the repositories have been loaded, establish any parent-child relationships.
relationships.each do |child_id, parent_id|
repositories[child_id].parent = repositories[parent_id]
end
# Keep track of parent-child relationships.
relationships[repo_id] = parent_id unless parent_id.nil?
end

# Load in the watchers.
watchers = {}
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
user_id, repo_id = line.strip.split(':')
watcher = watchers[user_id] || Watcher.new(user_id)
watchers[user_id] = watcher
repositories[repo_id].watchers << watcher
end
# Now that all the repositories have been loaded, establish any parent-child relationships.
relationships.each do |child_id, parent_id|
@@repositories[child_id].parent = @@repositories[parent_id]
end

repositories
# Load in the watchers.
watchers = {}
IO.foreach(File.join(data_dir, 'data.txt')) do |line|
user_id, repo_id = line.strip.split(':')
watcher = watchers[user_id] || Watcher.new(user_id)
watchers[user_id] = watcher
@@repositories[repo_id].watchers << watcher
end

@@repositories
end
end

def self.load_predictings(data_file='data/test.txt')
data_labels = ['user_id']
data_items = []
def self.load_predictings
Cache.fetch('predictings') do
data_labels = ['user_id']
data_items = []

IO.foreach(data_file) do |line|
data_items << [line.strip]
IO.foreach(File.join(data_dir, 'test.txt')) do |line|
data_items << [line.strip]
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
end
end

data_set = Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => data_items)
data_set
def self.data_dir
@@data_dir ||= 'data'
end
end
end
30 changes: 30 additions & 0 deletions data_set_utilities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,34 @@ def cross_validation(num_folds)
folds
end

# Converts this data set's raw (user_id, repo_id) samples into model objects.
#
# Returns a hash with two entries:
#   :watchers     => {user_id => Watcher}
#   :repositories => {repo_id => Repository}
def to_models
  watchers = {}
  repositories = {}

  raw_repositories = DataLoader.load_repositories

  # Discover watchers, repositories, and watcher <=> repository mappings.
  data_items.each do |sample|
    user_id, repo_id = sample

    watchers[user_id] ||= Watcher.new user_id

    unless repo_id.nil?
      raw = raw_repositories[repo_id]
      # Fix: this line was corrupted in the source ("Repository.new attr_reader
      # :epo_id, ..."); reconstructed to pass the repo id as the first argument,
      # matching Repository.new(repo_id, name, created_at) used elsewhere.
      # NOTE(review): assumes Repository exposes #owner, #name and #created_at
      # readers — confirm against repository.rb.
      repositories[repo_id] ||= Repository.new repo_id, "#{raw.owner}/#{raw.name}", raw.created_at
      watchers[user_id].associate repositories[repo_id]
    end
  end

  # Map parent-child repo relationships. Since raw_repositories may consist of repo <=> watchers or repo <=> repo
  # that do not exist in the data set, make sure we always look up in the local repo list.
  raw_repositories.each do |repo_id, repo|
    if !repositories[repo_id].nil? && !repo.parent.nil? && !repositories[repo.parent.id].nil?
      repositories[repo_id].parent = repositories[repo.parent.id]
    end
  end

  {:watchers => watchers, :repositories => repositories}
end


end
15 changes: 12 additions & 3 deletions ext/data_set.rb
Original file line number Diff line number Diff line change
@@ -1,22 +1,31 @@
require 'rubygems'
require 'ai4r'
require 'enumerator'
require 'memoize'

require 'data_set_utilities'

module Ai4r
module Data
class DataSet

include Memoize
include DataSetUtilities

alias :old_initialize :initialize
def initialize(*args)
old_initialize *args

memoize :to_models
end

def stratify(num_folds)
# Although the data will ultimately be sorted by class value, the entries within that class value should be
# randomized to start. Otherwise, stratification will always lead to the same resulting folds.
randomized = data_items.sort_by { rand }
randomized = data_items #.sort_by { rand }

# Sort the data items by class so we can ensure the folds match the underlying distribution.
sorted = data_items.sort { |x,y| x.last <=> y.last }
sorted = randomized.sort { |x,y| x.last <=> y.last }

# Split the sorted data into folds by grabbing every num_folds item out of the data. This should ensure
# that each fold matches the underlying data distribution.
Expand All @@ -33,7 +42,7 @@ def stratify(num_folds)

# Randomize the data again to guard against classifiers that are prone to choosing the first
# sample drawn from the data set.
folds << Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => fold.sort_by { rand })
folds << Ai4r::Data::DataSet.new(:data_labels => data_labels, :data_items => fold)#.sort_by { rand })
end

folds
Expand Down
Loading

0 comments on commit cc28e50

Please sign in to comment.