Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 251 lines (198 sloc) 7.77 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
# Copyright 2009 Kevin J. Menard Jr.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'rubygems'
require 'pp'
require 'ai4r'
require 'logger'

require 'data_loader'
require 'data_exporter'
require 'ext/data_set'
require 'nearest_neighbors'
require 'cache'


require 'irb'

module IRB # :nodoc:
  # Opens an interactive IRB session whose workspace is the given +binding+,
  # so the caller's local variables can be inspected from the prompt.
  #
  # IRB is set up only on the first call; later calls reuse that setup.
  # Returns when the session throws :IRB_EXIT (e.g. the user types `exit`).
  def self.start_session(binding)
    unless @__initialized
      # IRB.setup may consume entries from ARGV while parsing its options.
      # Snapshot a *copy* (not an alias of ARGV itself) so the script's own
      # arguments can be put back afterwards, even if setup raises.
      saved_args = ARGV.dup
      begin
        IRB.setup(nil)
      ensure
        ARGV.replace(saved_args)
      end
      @__initialized = true
    end

    workspace = WorkSpace.new(binding)
    irb = Irb.new(workspace)

    # Run the user's IRB_RC hook, if one is configured, and register this
    # session's context as the main one.
    @CONF[:IRB_RC].call(irb.context) if @CONF[:IRB_RC]
    @CONF[:MAIN_CONTEXT] = irb.context

    catch(:IRB_EXIT) do
      irb.eval_input
    end
  end
end

# Small numeric helpers added to Array for summarising lists of distances.
class Array
  # Sum of the elements, starting from +init+.
  #
  # +init+ defaults to 0.0 so that a plain +sum+ always yields a Float, which
  # is what this file has always done. The optional +init+ argument and block
  # keep this definition call-compatible with Ruby's built-in Array#sum, so
  # other code calling +sum(0)+ or +sum { ... }+ is not broken by the override.
  def sum(init = 0.0)
    if block_given?
      inject(init) { |total, e| total + yield(e) }
    else
      inject(init) { |total, e| total + e }
    end
  end

  # Arithmetic mean of the elements; 0 for an empty array.
  def mean
    empty? ? 0 : sum / length
  end

  # Median of the elements (expects mutually comparable, numeric values).
  #
  # Returns nil for an empty array, the middle element for an odd length, and
  # the mean of the two middle elements (as a Float) for an even length.
  def median
    return nil if empty?

    ordered = sort
    mid = length / 2
    length.odd? ? ordered[mid] : (ordered[mid - 1] + ordered[mid]) / 2.0
  end
end

# Logs diagnostics (via $LOG) comparing one round of predictions against the
# held-out test data.
#
# test_data       - hash read through [:watchers]; each watcher responds to
#                   +id+ and +repositories+.
# training_data   - hash read through [:watchers] and [:repositories]; each
#                   repository responds to +owner+, +parent+ and +children+.
# prediction      - the predictions being scored (one entry per watcher).
# all_predictions - a wider prediction list, index-aligned with +prediction+.
# classifier      - responds to +training_regions+, indexed by repository id.
# evaluations     - indexed as evaluations[watcher_id][repo_id]; the stored
#                   value responds to +mean+.
#
# Side effect: repositories absent from the training data are removed in
# place from the matching test watcher before its accuracy is logged.
def analyze(test_data, training_data, prediction, all_predictions, classifier, evaluations)
  test_watchers = test_data[:watchers]
  known_repos = training_data[:repositories]
  known_watchers = training_data[:watchers]
  percent = lambda { |part, whole| (part / whole.to_f) * 100 }

  # Pass 1: how many test repositories could be predicted at all, and how
  # often the actual repository is its region's most popular / most forked.
  candidates = 0
  predictable = 0
  popular_hits = 0
  forked_hits = 0
  test_watchers.values.each do |watcher|
    candidates += watcher.repositories.size

    watcher.repositories.each do |repo_id|
      next if known_repos[repo_id].nil?

      predictable += 1
      region = classifier.training_regions[repo_id]
      next if region.nil?

      popular_hits += 1 if region.most_popular.id == repo_id
      forked_hits += 1 if region.most_forked.id == repo_id
    end
  end

  # Pass 2: report predictable repositories that were never evaluated.
  test_watchers.values.each do |watcher|
    watcher.repositories.each do |repo_id|
      next if known_repos[repo_id].nil?

      watcher_evaluations = evaluations[watcher.id]
      next if watcher_evaluations.nil?

      if known_watchers[watcher.id].nil?
        $LOG.info { "No training data for watcher #{watcher.id} -- impossible to predict" }
        next
      end

      next unless watcher_evaluations[repo_id].nil?

      $LOG.info { "Failed to find #{watcher.id}:#{repo_id}" }
    end
  end

  # Pass 3: report false positives, then per-watcher accuracy.
  prediction.each do |predicted|
    predicted.repositories.each do |repo|
      next if test_watchers[predicted.id].repositories.include?(repo)

      $LOG.info "Bad prediction #{predicted.id}:#{repo} with distance #{evaluations[predicted.id][repo].mean}"
    end

    next if known_watchers[predicted.id].nil?

    actual = test_watchers[predicted.id]
    # Mutates the test watcher: unpredictable repositories are dropped.
    actual.repositories.delete_if { |r| known_repos[r].nil? }
    $LOG.info "Accuracy for watcher #{predicted.id}: #{NearestNeighbors.accuracy(actual, predicted) * 100}%"
  end

  # Pass 4: report correct repositories that only the wider list contains.
  all_predictions.each_with_index do |candidate, index|
    candidate.repositories.each do |repo|
      next unless test_watchers[candidate.id].repositories.include?(repo)
      next if prediction[index].repositories.include?(repo)

      $LOG.info "Missing prediction #{candidate.id}:#{repo} with distance #{evaluations[candidate.id][repo].mean}"
    end
  end

  # Pass 5: structural statistics about the (now filtered) test repositories.
  with_parent = 0
  with_children = 0
  shared_owner = 0
  repo_total = 0
  test_watchers.values.each do |watcher|
    next if watcher.nil?

    watcher.repositories.each do |repo_id|
      repo_total += 1
      repo = known_repos[repo_id]
      next if repo.nil?

      with_parent += 1 unless repo.parent.nil?
      with_children += 1 unless repo.children.empty?

      trained = known_watchers[watcher.id]
      next if trained.nil?

      shared_owner += trained.repositories.count { |id| known_repos[id].owner == repo.owner }
    end
  end

  $LOG.info "Has parent ratio: #{percent.call(with_parent, repo_total)}%"
  $LOG.info "Has children ratio: #{percent.call(with_children, repo_total)}%"
  $LOG.info "Same owner ratio: #{percent.call(shared_owner, repo_total)}%"

  $LOG.info ">>> Best possible prediction accuracy: #{percent.call(predictable, candidates)}%"
  $LOG.info ">>> Actual repo was most popular: #{percent.call(popular_hits, candidates)}%"
  $LOG.info ">>> Actual repo was most forked: #{percent.call(forked_hits, candidates)}%"
end



# Global logger used by the whole script (including analyze): INFO level,
# written to standard output.
$LOG = Logger.new(STDOUT)
$LOG.level = Logger::INFO
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"


$LOG.info "Loading data."
data_set = DataLoader.load_watchings

$LOG.info "Building classifier."
count = 0
# Maps each prediction result to the classifiers that produced it.
predictions = {}
# NOTE(review): presumably a 10-fold cross-validation -- confirm against
# ext/data_set. Only the first fold is ever processed because of the
# unconditional `break` at the end of this block.
data_set.cross_validation(10) do |training_set, large_test_set|

  # NOTE(review): stratify(100) presumably splits the large test set into
  # smaller test sets -- confirm. Only the first one is processed (see the
  # `break` at the end of the inner block), and reduced_data_set is not
  # read anywhere in this file. The inner block's body is indented at the
  # same level as this line.
  reduced_data_set = large_test_set.stratify(100).each do |test_set|

  test_data = test_set.to_models
  training_data = training_set.to_models

  $LOG.info ">>> Starting fold #{count + 1}."
  $LOG.info ">>> Training."
  knn = NearestNeighbors.new(training_set)

  $LOG.info ">>> Classifying."
  #test_set = Ai4r::Data::DataSet.new(:data_items => [['83']])
  evaluations = knn.evaluate(test_set)
  # Two prediction lists are built from the same evaluations: one limited
  # to 10 and one limited to 10000 (the second argument to predict).
  prediction = NearestNeighbors.predict(evaluations, 10)
  all_predictions = NearestNeighbors.predict(evaluations, 10000)

  predictions[prediction] ||= []
  predictions[prediction] << knn

  analyze(test_data, training_data, prediction, all_predictions, knn, evaluations)

  $LOG.info ">>> Results for fold #{count + 1}: #{NearestNeighbors.score(test_set, prediction) * 100}% / #{NearestNeighbors.score(test_set, all_predictions) * 100}%"

  # Drop into an interactive session with this block's locals in scope;
  # execution continues once the session exits.
  IRB.start_session(binding)

  count += 1
    # Stop after the first stratified test set.
    break
  end
  # Stop after the first cross-validation fold.
  break
end






#
#$LOG.info "Training."
#knn = NearestNeighbors.new(data_set)
#
#$LOG.info "Evaluating."
#predictings = DataLoader.load_predictings
#evaluations = knn.evaluate(predictings)
#predictions = NearestNeighbors.predict(evaluations, 10)
#
#repos_by_popularity = []
#sorted_regions = knn.training_regions.values.sort { |x,y| y.most_popular.watchers.size <=> x.most_popular.watchers.size }
#repos_by_popularity = sorted_regions.collect {|x| x.most_popular.id}
#
#$LOG.info "Printing results file."
#File.open('results.txt', 'w') do |file|
#
# predictions.each do |watcher|
# # Add the ten most popular repositories that the user is not already a watcher of to his repo list if
# # we don't have any predictions.
# if watcher.repositories.empty?
# if knn.training_watchers[watcher.id].nil?
# puts "No data for watcher: #{watcher.id}"
# repos_by_popularity[0..10].each do |repo_id|
# watcher.repositories << repo_id
# end
# else
# added_repo_count = 0
# repos_by_popularity.each do |suggested_repo_id|
# unless knn.training_watchers[watcher.id].repositories.include?(suggested_repo_id)
# watcher.repositories << suggested_repo_id
# added_repo_count += 1
# end
#
# break if added_repo_count == 10
# end
# end
# end
#
## $LOG.debug "Score (#{watcher.id}): #{NearestNeighbors.accuracy(knn.training_watchers[watcher.id], watcher)} -- #{watcher.to_s}"
# file.puts watcher.to_s
# end
#end



Something went wrong with that request. Please try again.