diff --git a/Procfile b/Procfile index e093165..091de96 100644 --- a/Procfile +++ b/Procfile @@ -1,2 +1,2 @@ unicorn: bundle exec unicorn -p 3000 -c ./config/unicorn.rb -sidekiq: bundle exec sidekiq -v -q apk_decompiler -q apk_downloader -q app_query_fetcher,10 -q app_query_dispatcher,10 -c 10 +sidekiq: bundle exec sidekiq -v -q search_app -q process_app,20 -t 60 -c 10 diff --git a/app/models/app.rb b/app/models/app.rb index 2484fb9..e5d381b 100644 --- a/app/models/app.rb +++ b/app/models/app.rb @@ -75,11 +75,14 @@ def self.from_market(app) raise ParseError.new "#{e.class}: #{e}\n#{app.inspect}" end - def repo(options={}) - Repository.new(self.id, options) + def self.discovered_app(app_id) + if Redis.instance.sadd('apps', app_id) + # New app! + ProcessApp.perform_async(app_id) + end end - def id_version - @id_version ||= "#{id}-#{version_code}" + def self.discovered_apps(app_ids) + app_ids.each { |app_id| discovered_app(app_id) } end end diff --git a/app/models/lib.rb b/app/models/lib.rb deleted file mode 100644 index 292278e..0000000 --- a/app/models/lib.rb +++ /dev/null @@ -1,74 +0,0 @@ -class Lib - include Mongoid::Document - include Mongoid::Timestamps - - field :name - field :_id, :as => :name - field :num_apks, :type => Integer - - field :display_name - field :url - field :category - - has_many :apks, :foreign_key => :lib_names - index :num_apks => 1 - - default_scope order_by(:num_apks => -1) - - CATEGORY_NAMES = { - "android" => "Android Core", - "ads" => "Advertising Platform", - "api" => "Service API", - "app_framwork" => "Application Framework", - "core" => "Core", - "analytics" => "Analytics", - "bug_tracking" => "Bug Tracking", - "barcode" => "Barcode", - "cloud_storage" => "Cloud Storage", - "billing" => "Billing", - "data" => "Data", - "mail" => "Email", - "oauth" => "OAuth", - "html" => "XML/HTML parser", - "ui" => "UI Tools", - "orm" => "ORM", - "lang" => "Lang", - "malware" => "Malware", - "gfx" => "Graphics Engine", - "logger" => "Logger", - "maps" => "Maps", - "audio" => "Audio", - "gaming" => "Gaming", - "app_maker" => "App Maker" - } - - def self.lib_re - Regexp.new "^(#{Lib.all.map { |l| l.name.gsub('.','/') }.join('|')})" - end - - def self.search_in_sources(query, options={}) - size = options[:size] || 10 - field = options[:field] || :path - libs = options[:libs] - lib_re = self.lib_re if libs == :filtered - - res = Source.tire.search(:per_page => 0) do - query do - filtered do - query { query == '*' ? all : @value = { :wildcard => { :path => query } } } - filter :not, :exists => {:field => :lib} if !libs || libs == :filtered - end - end - facet(field) { terms :field => field, :size => size } - end - - detail = Hash[res.facets[field.to_s]['terms'].map { |f| [f['term'], f['count']] }] - detail = detail.reject { |k,v| k =~ lib_re } if libs == :filtered - - { :total => res.total, :detail => detail } - end - - def discover! - LibFinder.perform_async(id) - end -end diff --git a/app/models/repository.rb b/app/models/repository.rb index 958f9a9..c8b1b28 100644 --- a/app/models/repository.rb +++ b/app/models/repository.rb @@ -6,7 +6,9 @@ class EmptyCommit < RuntimeError; end # Rugged gets its hands on new, not initialize def self.new(app_id, options={}) - path = REPO_PATH.join(*app_id.split('.')) + componants = app_id.split('.') + componants.last << '.git' + path = REPO_PATH.join(*componants) if options[:auto_create] && !path.exist? init_at(path.to_s, :bare) diff --git a/app/models/source.rb b/app/models/source.rb deleted file mode 100644 index 933bddd..0000000 --- a/app/models/source.rb +++ /dev/null @@ -1,79 +0,0 @@ -class Source - include Tire::Model::Persistence - - property :apk_eid - property :path - property :lib - property :core - property :lines - property :num_lines - property :size - - tire.mapping :_all => {:enabled => false} do - indexes :apk_eid, :index => :not_analyzed, :store => :yes - indexes :path, :index => :not_analyzed, :store => :yes - indexes :lib, :index => :not_analyzed, :store => :yes - indexes :core, :type => :boolean, :store => :yes - indexes :lines, :analyzer => :simple - indexes :num_lines, :type => :integer, :store => :yes - indexes :size, :type => :integer, :store => :yes - end - - def self.index_sources!(apk) - base_len = nil - lib_re = Lib.lib_re - core_re = Regexp.new("^#{apk.package_name.gsub('.', '/')}/") - - sources = [] - apk.source_dir.find do |f| - base_len = f.to_s.size if base_len.nil? - - if f.file? && f.extname == '.java' - lines = f.open.lines.map(&:chomp) - path = f.to_s[base_len+1..-1] - attrs = { :id => Moped::BSON::ObjectId.new.to_s, - :apk_eid => apk.eid, - :path => path, - :lines => lines, - :num_lines => lines.count, - :size => f.size} - attrs[:lib] = $1.gsub('/', '.') if path =~ lib_re - attrs[:core] = true if path =~ core_re - sources << new(attrs) - end - end - - index.import sources - true - end - - def self.purge_index!(apk) - Tire::Configuration.client.delete "#{index.url}/_query?q=apk_eid:#{apk.eid}" - end - - def self.search(query, options={}) - tire.search(options) do - query { string query, :default_field => :lines, :default_operator => 'AND' } - highlight :lines => {:fragment_size => 300, :number_of_fragments => 100000}, :options => {:tag => ''} - fields :apk_eid, :path - - facet(:num_lines) { statistical :num_lines } - facet(:size) { statistical :size } - end - end - - def self.filter_lines(results, regex=nil) - per_file_lines = [] - results.each do |source| - next unless source.highlight - matched_lines = source.highlight[:lines] - matched_lines = matched_lines.grep(regex) if regex - next if matched_lines.empty? - - per_file_lines << {:apk_eid => source.apk_eid, - :path => source.path, - :lines => matched_lines} - end - per_file_lines - end -end diff --git a/app/worker/apk_core_finder.rb b/app/worker/apk_core_finder.rb deleted file mode 100644 index 4c7d9e7..0000000 --- a/app/worker/apk_core_finder.rb +++ /dev/null @@ -1,27 +0,0 @@ -class ApkCoreFinder - include Sidekiq::Worker - sidekiq_options :queue => name.underscore - - def perform(apk_id) - apk = Apk.find(apk_id) - - has_core = false - transform_proc = proc do |doc| - has_core = true - doc[:core] = true - doc - end - - Source.index.reindex('sources', :transform => transform_proc) do - query do - boolean do - must { @value = { :wildcard => { :path => "#{apk.package_name.gsub(/\./, '/')}/*" } } } - must { term 'apk_eid', apk.eid } - must_not { term 'core', true } - end - end - end - - apk.update_attributes(:has_core => true) if has_core - end -end diff --git a/app/worker/apk_core_writer.rb b/app/worker/apk_core_writer.rb deleted file mode 100644 index b042471..0000000 --- a/app/worker/apk_core_writer.rb +++ /dev/null @@ -1,26 +0,0 @@ -class ApkCoreWriter - include Sidekiq::Worker - sidekiq_options :queue => name.underscore - - def perform(apk_id) - apk = Apk.find(apk_id) - - res = Source.tire.search do - size 1000000 - query do - boolean do - must { term 'core', true } - must { term 'apk_eid', apk.eid } - end - end - fields :lines, :path - end - - dir = Rails.root.join('play', 'src') - res.results.each do |file| - path = dir.join file.path - FileUtils.mkdir_p path.dirname - File.open(path, 'w') { |f| f.write((file.lines << '').join("\n")) } - end - end -end diff --git a/app/worker/apk_decompiler.rb b/app/worker/apk_decompiler.rb deleted file mode 100644 index 2cd041d..0000000 --- a/app/worker/apk_decompiler.rb +++ /dev/null @@ -1,31 +0,0 @@ -class ApkDecompiler - include Sidekiq::Worker - sidekiq_options :queue => name.underscore, :retry => 2 - - def perform(apk_id) - Helpers.has_java_exceptions do - apk = Apk.find(apk_id) - Source.purge_index!(apk) - begin - Decompiler.decompile(apk.file, apk.source_dir) - Source.index_sources!(apk) - apk.update_attributes(:decompiled => true) - rescue Timeout::Error - # swallow - rescue Exception => e - if e.message =~ /Crashed/ - # swallow - elsif e.message =~ /dex2jar failed/ - # swallow - elsif e.message =~ /Couldn't decompile/ - # swallow - else - Source.purge_index!(apk) - raise e - end - ensure - FileUtils.rm_rf(apk.source_dir) - end - end - end -end diff --git a/app/worker/app_query_dispatcher.rb b/app/worker/app_query_dispatcher.rb deleted file mode 100644 index 7d4123f..0000000 --- a/app/worker/app_query_dispatcher.rb +++ /dev/null @@ -1,25 +0,0 @@ -class AppQueryDispatcher - include Sidekiq::Worker - sidekiq_options :queue => name.underscore - - def perform(app_query_id) - Helpers.has_java_exceptions do - query = AppQuery.find(app_query_id) - results = query.crawler.crawl - query.update_attributes(:fetched_at => Time.at(Time.now.to_i), # no millisecs - :total_apps => results.num_apps, - :total_apps_fetched => 0) - - num_apps_to_fetch = [query.total_apps, Crawler::App::MAX_START].min - increment = Crawler::App::PER_PAGE - 1 # for page aliasing / races on google backend - (num_apps_to_fetch / increment.to_f).ceil.times do |page| - if (page == 0) - # We already have it - AppQueryFetcher.save_apps(query, results.apps) - else - AppQueryFetcher.perform_async(query.id, page) - end - end - end - end -end diff --git a/app/worker/app_query_fetcher.rb b/app/worker/app_query_fetcher.rb deleted file mode 100644 index 9d95ae7..0000000 --- a/app/worker/app_query_fetcher.rb +++ /dev/null @@ -1,27 +0,0 @@ -class AppQueryFetcher - include Sidekiq::Worker - sidekiq_options :queue => name.underscore - - def perform(app_query_id, page) - Helpers.has_java_exceptions do - query = AppQuery.find(app_query_id) - start = page * Crawler::App::PER_PAGE - # google is tight on 401 - start = [start, Crawler::App::MAX_START - Crawler::App::PER_PAGE].min - self.class.save_apps(query, query.crawler(:start => start).crawl.apps) - end - end - - def self.save_apps(query, apps) - apps.each do |app| - next if App.where(:app_id => app[:app_id]).count > 0 - - app = App.new(app) - app.upsert - unless app.price - app = App.where(:id => app.id).first # mongoid workaround - app.download_latest_apk! - end - end - end -end diff --git a/app/worker/lib_finder.rb b/app/worker/lib_finder.rb deleted file mode 100644 index 488d115..0000000 --- a/app/worker/lib_finder.rb +++ /dev/null @@ -1,32 +0,0 @@ -class LibFinder - include Sidekiq::Worker - sidekiq_options :queue => name.underscore - - def perform(lib_id) - lib = Lib.find(lib_id) - mark_sources(lib) - mark_apks(lib) - end - - def mark_sources(lib) - transform_proc = proc { |doc| doc[:lib] = lib.name; doc } - Source.index.reindex('sources', :transform => transform_proc) do - query do - boolean do - must { @value = { :wildcard => { :path => "#{lib.name.gsub(/\./, '/')}/*" } } } - must_not { term 'lib', lib.name } - end - end - end - end - - def mark_apks(lib) - res = Source.tire.search(:per_page => 0) do - query { term 'lib', lib.name } - facet(:apk_eid) { terms :field => :apk_eid, :size => 10000000 } - end - apk_eids = res.facets['apk_eid']['terms'].map { |t| t['term'] } - Apk.where(:eid.in => apk_eids).add_to_set(:lib_names, lib.name) - self.update_attributes(:num_apks => lib.apks.count) - end -end diff --git a/app/worker/process_app.rb b/app/worker/process_app.rb new file mode 100644 index 0000000..37d9758 --- /dev/null +++ b/app/worker/process_app.rb @@ -0,0 +1,8 @@ +class ProcessApp + include Sidekiq::Worker + sidekiq_options :queue => name.underscore, :backtrace => true + + def perform(app_id) + Stack.process_app(app_id) + end +end diff --git a/app/worker/search_app.rb b/app/worker/search_app.rb new file mode 100644 index 0000000..117abcc --- /dev/null +++ b/app/worker/search_app.rb @@ -0,0 +1,13 @@ +class SearchApp + include Sidekiq::Worker + sidekiq_options :queue => name.underscore, :backtrace => true + + def perform(app_id, raw_url=nil) + results = Market.search(app_id, :raw_url => raw_url) + App.discovered_apps(results.app_ids) + + if results.next_page_url + self.class.perform_async(app_id, results.next_page_url) + end + end +end diff --git a/config/unicorn.rb b/config/unicorn.rb index 3e6892f..9f11fe7 100644 --- a/config/unicorn.rb +++ b/config/unicorn.rb @@ -69,6 +69,6 @@ Redis.instance.client.connect Sidekiq.configure_client do |config| - config.redis = { :url => Redis.instance.client.id, :namespace => 'google-play-crawler' } + config.redis = { :url => Redis.instance.client.id } end end diff --git a/lib/decompiler.rb b/lib/decompiler.rb deleted file mode 100644 index ee1f154..0000000 --- a/lib/decompiler.rb +++ /dev/null @@ -1,71 +0,0 @@ -require 'timeout' - -module Decompiler - unless File.exists?(Rails.root.join 'vendor', 'libjd-intellij.so') - raise "Please install libjd-intellij.so in ./vendor.\n" + - "It can be found at https://bitbucket.org/bric3/jd-intellij" - end - - - def self.script_exec(*args) - # XXX Workaround. jdcore crashes often. We need to fork. - # JRuby does not allow forking. - # XXX Workaround. JRuby spawn() is broken, it wait for our child - # and get it before we can do Process.wait() yielding a -ECHILD, - # Using Spoon works. - # XXX Workatound. jdcore sometimes uses > 60Gb of RAM. We need - # to limit that. rlimit is not supported easily with Spoon. - # Using script/decompile - # XXX Workaround. we need to timeout the execution, but timeout - # doesn't work with Process.wait() for some reason. We need to - # poll every second. - # XXX Workaround. Sometimes when jdcore crashes, we don't see it in - # $?.success? JRuby is broken. - - pid = Spoon.spawnp(*args.map(&:to_s)) - begin - Timeout.timeout(1.minute) do - loop do - break unless Process.wait(pid, Process::WNOHANG).to_i == 0 - sleep 1 - end - end - rescue Timeout::Error => e - Process.kill('KILL', pid) - Process.wait(pid) - raise e - end - - # $?.success? appears to be unreliable... JRuby is weird... - unless $?.success? - if $?.termsig - raise "Couldn't decompile jar properly. Crashed." - else - raise "Couldn't decompile jar properly" - end - end - end - - def self.jar2java(jar, out_dir) - script_exec("./script/decompile", jar, out_dir) - - unless Dir.exists?(out_dir) - raise "Couldn't decompile jar properly" - end - end - - def self.dex2jar(apk, jar) - script_exec("./script/dex2jar", "-f", apk, "-o", jar) - end - - def self.decompile(apk, out_dir) - jar = Tempfile.new(['apk', '.jar'], '/tmp') - begin - dex2jar(apk, jar.path) - jar2java(jar.path, out_dir) - ensure - jar.close - jar.unlink - end - end -end diff --git a/lib/market.rb b/lib/market.rb index e5963c6..f50ca5a 100644 --- a/lib/market.rb +++ b/lib/market.rb @@ -19,25 +19,36 @@ def self.api class SearchResult < Struct.new(:payload) def doc - payload[:payload][:search_response][:doc][0] + @doc ||= payload[:payload][:search_response][:doc][0] rescue nil end - def num_apps + def total_apps + return 0 unless doc doc[:container_metadata][:estimated_results] end + def next_page_url + return nil unless doc + doc[:container_metadata][:next_page_url] + end + def app_ids + return [] unless doc doc[:child].map { |app| app[:docid] } end end def self.search(query, options={}) - params = {} - params[:c] = 3 # App category - params[:q] = query - params[:n] = options[:per_page] if options[:per_page] - params[:o] = options[:start] if options[:start] - SearchResult.new api.get('search', params).body + if options[:raw_url] + SearchResult.new api.get(options[:raw_url]).body + else + params = {} + params[:c] = 3 # App category + params[:q] = query + params[:n] = options[:per_page] if options[:per_page] + params[:o] = options[:start] if options[:start] + SearchResult.new api.get('search', params).body + end end # def self.details_bulk(app_ids) diff --git a/lib/stack.rb b/lib/stack.rb index 544f20d..b0ed379 100644 --- a/lib/stack.rb +++ b/lib/stack.rb @@ -2,7 +2,7 @@ module Stack # Each step in a stack must be idempotent, so the chain can fail at any point, # and we can retry the whole thing. - def self.process_free_app(app_id) + def self.process_app(app_id) @create_app_stack ||= ::Middleware::Builder.new do use PrepareFS use FetchMarketDetails diff --git a/lib/stack/base.rb b/lib/stack/base.rb index 792f305..d071bf0 100644 --- a/lib/stack/base.rb +++ b/lib/stack/base.rb @@ -9,4 +9,24 @@ def initialize(stack, options={}) def call(env) @stack.call(env) end + + def exec_and_capture(*args) + options = args.extract_options! + args = args.map(&:to_s) + IO.popen('-') do |io| + unless io + trap("SIGINT", "IGNORE") + trap("SIGTERM", "IGNORE") + $stderr.reopen($stdout) + begin + exec(*args, options) + rescue Exception => e + STDERR.puts "#{e} while running #{args}, #{options}" + end + exit! 1 + end + + io.read + end + end end diff --git a/lib/stack/decompile_apk.rb b/lib/stack/decompile_apk.rb index 8a8c75b..0f3b86d 100644 --- a/lib/stack/decompile_apk.rb +++ b/lib/stack/decompile_apk.rb @@ -6,7 +6,9 @@ class DecompilationError < RuntimeError; end use_git :branch => :src def persist_to_git(env, git) - output = `script/decompile #{env[:scratch]} #{env[:apk_path].basename}` + return unless env[:app].free + + output = exec_and_capture('script/decompile', env[:scratch], env[:apk_path].basename) raise DecompilationError.new(output) unless $?.success? env[:src_dir] = env[:scratch].join('src') diff --git a/lib/stack/download_apk.rb b/lib/stack/download_apk.rb index 1f875bd..75cc9b0 100644 --- a/lib/stack/download_apk.rb +++ b/lib/stack/download_apk.rb @@ -10,7 +10,10 @@ class DownloadError < RuntimeError; end use_git :branch => :apk def persist_to_git(env, git) - download_info = Market.purchase(env[:app].id, env[:app].version_code) + app = env[:app] + return unless app.free + + download_info = Market.purchase(app.id, app.version_code) conn = Faraday.new(:ssl => {:verify => false}) do |f| f.response :follow_redirects @@ -30,7 +33,7 @@ def persist_to_git(env, git) end end - apk_filename = "#{env[:app].id_version}.apk" + apk_filename = "#{app.id}-#{app.version_code}.apk" git.commit do |index| index.add_file(apk_filename, response.body) @@ -41,7 +44,8 @@ def persist_to_git(env, git) end def parse_from_git(env, git) - apk_filename = "#{env[:app].id_version}.apk" + app = env[:app] + apk_filename = "#{app.id}-#{app.version_code}.apk" env[:apk_path] = env[:scratch].join(apk_filename) env[:apk_path].open('wb') { |f| f.write(git.read_file(apk_filename)) } end diff --git a/lib/stack/prepare_fs.rb b/lib/stack/prepare_fs.rb index b180420..d8436ff 100644 --- a/lib/stack/prepare_fs.rb +++ b/lib/stack/prepare_fs.rb @@ -12,7 +12,7 @@ def call(env) # It would be much more efficient to write a pack directly. # Expect horrible performance when saving sources. - output = `cd #{env[:repo].path} 2>&1 && git gc --prune=now -q 2>&1` + output = exec_and_capture("git gc --prune=now -q", :chdir => env[:repo].path) Rails.logger.info "Cannot garbage collect the repository: #{output}" unless $?.success? end end