
Commit

tidying parameters and code
mangled committed Nov 10, 2010
1 parent dee2830 commit 7731440
Showing 5 changed files with 32 additions and 39 deletions.
9 changes: 3 additions & 6 deletions scripts/lib/crawler.rb
@@ -33,14 +33,11 @@ def Poodle.crawl(options, cache, logger)

workers = ThreadsWait.new
1.upto(options[:threads]) do |i|
workers.join_nowait(
Thread.new do
Crawler.crawl(options, SolrIndexer.new(options), Analyzer.new, urls_to_crawl)
end
)
thread = Thread.new { Crawler.crawl(options, SolrIndexer.new(options), Analyzer.new, urls_to_crawl) }
workers.join_nowait(thread)
end
workers.all_waits

# There are smarter ways of finding unused items!
processed = urls_to_crawl.processed
cache.delete
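For reference, the worker pattern the tidied loop uses comes from Ruby's thwait standard library: each worker thread is registered with join_nowait and the pool is drained with all_waits. A minimal standalone sketch, with a trivial stand-in for the real Crawler.crawl call:

require 'thwait'

workers = ThreadsWait.new
1.upto(4) do |i|
  # stand-in for Crawler.crawl(options, SolrIndexer.new(options), Analyzer.new, urls_to_crawl)
  thread = Thread.new { sleep(0.1); puts "worker #{i} finished" }
  workers.join_nowait(thread)
end
workers.all_waits   # blocks until every registered worker thread has terminated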
26 changes: 16 additions & 10 deletions scripts/lib/indexers.rb
@@ -4,6 +4,7 @@
require 'tempfile'
require 'pathname'
require 'cgi'
require 'digest/md5'

module Poodle
class SolrIndexer
@@ -13,19 +14,19 @@ def initialize(params)
@log = params[:log]
end

def index(item)
# Might be better having a class or struct - for readability?
temp_file = SolrIndexer.new_temp(item[:content])
def index(uri, content, title)
begin
if item[:title]
solr_url = URI.join(@solr.to_s, "update/extract?literal.id=#{item[:id]}&literal.crawled_title=#{CGI.escape(item[:title])}&commit=true&literal.url=#{CGI.escape(item[:uri].to_s)}")
temp_file = SolrIndexer.new_temp(content)
id = unique_id(uri)
if title
solr_url = URI.join(@solr.to_s, "update/extract?literal.id=#{id}&literal.crawled_title=#{CGI.escape(title)}&commit=true&literal.url=#{CGI.escape(uri.to_s)}")
else
solr_url = URI.join(@solr.to_s, "update/extract?literal.id=#{item[:id]}&commit=true&literal.url=#{CGI.escape(item[:uri].to_s)}")
solr_url = URI.join(@solr.to_s, "update/extract?literal.id=#{id}&commit=true&literal.url=#{CGI.escape(uri.to_s)}")
end
solr_args = "--silent \"#{solr_url}\" -H '#{CGI.escape("Content-type:" + item[:content].content_type)}' -F \"myfile=@#{Pathname.new(temp_file.path)}\""
@log.warn("#{item[:uri]} Curl failed") unless SolrIndexer.curl(solr_args)
solr_args = "--silent \"#{solr_url}\" -H '#{CGI.escape("Content-type:" + content.content_type)}' -F \"myfile=@#{Pathname.new(temp_file.path)}\""
@log.warn("#{uri} Curl failed") unless SolrIndexer.curl(solr_args)
ensure
temp_file.unlink()
temp_file.unlink() if temp_file
end
end

@@ -37,7 +38,12 @@ def SolrIndexer.new_temp(content)
temp_file.close()
temp_file
end


def unique_id(uri)
digest = Digest::MD5.new().update(uri.normalize().to_s)
digest.hexdigest
end

# This is here to simplify unit-testing; couldn't be bothered overriding back-ticks
def SolrIndexer.curl(s)
`curl #{s}`
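The id scheme that unique_id now encapsulates inside SolrIndexer is an MD5 hex digest of the normalized URI, interpolated into Solr's update/extract URL as literal.id. A self-contained sketch of that piece using only Ruby's standard libraries; the Solr base URL and example page below are placeholders, not values from the repository:

require 'uri'
require 'cgi'
require 'digest/md5'

uri   = URI.parse("HTTP://Example.com/docs/page.html")
# normalize lower-cases the scheme and host, so those variants hash to the same id
id    = Digest::MD5.new.update(uri.normalize.to_s).hexdigest
title = "Example page"
solr  = URI.parse("http://localhost:8983/solr/")   # placeholder endpoint

solr_url = URI.join(solr.to_s,
  "update/extract?literal.id=#{id}" +
  "&literal.crawled_title=#{CGI.escape(title)}" +
  "&commit=true&literal.url=#{CGI.escape(uri.to_s)}")
puts solr_url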
17 changes: 2 additions & 15 deletions scripts/lib/web.rb
@@ -1,8 +1,6 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'uri'
require 'digest/md5'
require 'set'

$:.unshift File.join(File.dirname(__FILE__), ".")
require 'analyzer'
@@ -14,8 +12,7 @@ class Crawler

def Crawler.crawl(params, indexer = nil, analyzer = Analyzer.new, urls = WorkQueue.new)
begin
urls.remove do |item|
uri, referer = item
urls.remove do |uri, referer|
if Crawler.should_analyze?(uri, params[:ignore], params[:accept])
sleep(params[:wait]) if params[:wait]
Crawler.analyze_and_index(uri, referer, params, urls, indexer, analyzer)
@@ -24,35 +21,25 @@ end
end
end
end while !urls.done?
urls.processed
end

def Crawler.analyze_and_index(uri, referer, params, urls, indexer, analyzer)
begin

analyzer.extract_links(uri, referer, urls.last_crawled_site_at, params) do |title, new_links, content|

# Note: because links are added here, they will be filtered on the current accept rules
# (on the parent); if these cmd line options change then the database is basically invalid?
new_links.each {|link| urls.add(link[0], link[1]) }

if Crawler.should_index?(uri, (params[:index] and indexer))
uri_id = Crawler.unique_id(uri)
indexer.index({ :uri => uri, :content => content, :id => uri_id, :title => title })
indexer.index(uri, content, title)
params[:log].info("Indexed #{uri}")
else
params[:log].warn("Skipping indexing #{uri}") unless params[:quiet]
end
end
rescue AnalyzerError => e
# Analyzer will have logged
end
end

def Crawler.unique_id(uri)
digest = Digest::MD5.new().update(uri.normalize().to_s)
digest.hexdigest
end

def Crawler.should_analyze?(uri, ignore, accept)
return false if uri.scheme != 'http' or uri.fragment
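The change to urls.remove do |uri, referer| leans on Ruby's block auto-splatting: when an array is yielded to a block that declares more than one parameter, it is destructured automatically, so the explicit uri, referer = item line becomes redundant. A minimal illustration outside the project, with made-up URLs:

require 'uri'

queue = [
  [URI.parse("http://example.com/a"), "http://example.com/"],
  [URI.parse("http://example.com/b"), "http://example.com/a"]
]

# Each [uri, referer] pair is split across the two block parameters automatically.
queue.each do |uri, referer|
  puts "#{uri} (linked from #{referer})"
end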
4 changes: 3 additions & 1 deletion scripts/test/test_indexers.rb
@@ -71,7 +71,9 @@ def add_expect_uri(url, body = 'Hello world', content_type = "text/html", status
end

def crawl(p)
Crawler.crawl(p, SolrIndexer.new(p), Poodle::Analyzer.new, Poodle::WorkQueue.new([p[:url], ""]))
queue = Poodle::WorkQueue.new([p[:url], ""])
Crawler.crawl(p, SolrIndexer.new(p), Poodle::Analyzer.new, queue)
queue.processed
end
end
end
15 changes: 8 additions & 7 deletions scripts/test/test_web.rb
@@ -28,8 +28,8 @@ def initialize(expectations = nil)
@expectations = expectations
end

def index(item)
@items << item
def index(uri, content, title)
@items << { :uri => uri, :content => content, :title => title }
@expectations.shift.call(@items[-1]) if @expectations
end
end
@@ -173,7 +173,9 @@ def params(url)
end

def crawl(url, indexer = FakeIndexer.new, analyzer = Poodle::Analyzer.new)
Crawler.crawl(params(url), indexer, analyzer, Poodle::WorkQueue.new([URI.parse(url), ""]))
queue = Poodle::WorkQueue.new([URI.parse(url), ""])
Crawler.crawl(params(url), indexer, analyzer, queue)
queue.processed
end

def add_expect_uri(url, body = 'Hello world', content_type = "text/html", status = ["200", "OK"])
@@ -204,16 +206,15 @@ def check_crawler(graph, ignore = [], index = true)
error_pages[url.to_s] = page if page[:status][0] != "200"
end

ids = Set.new
p = params(urll).merge({:ignore => ignore, :index => index})

indexer = FakeIndexer.new
links = Crawler.crawl(p, indexer, Poodle::Analyzer.new, Poodle::WorkQueue.new([p[:url], ""]))
queue = Poodle::WorkQueue.new([p[:url], ""])
Crawler.crawl(p, indexer, Poodle::Analyzer.new, queue)
links = queue.processed

indexer.items.each do |item|
assert(!error_pages.keys.include?(item[:uri].to_s))
assert(!ids.include?(item[:id]))
ids.add(item[:id])
assert_nil item[:title]
assert(urls.delete? item[:uri].to_s)
assert_equal(item[:content].content_type, pages[item[:uri].to_s][:content_type])
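FakeIndexer's expectations array (further up this file) is a small per-call verification hook: one lambda per expected index call, shifted off the array and invoked with the item just recorded. A standalone illustration of the same pattern, with hypothetical expectations and items:

expectations = [
  lambda { |item| raise "expected a title" unless item[:title] },
  lambda { |item| raise "expected no title" if item[:title] }
]

recorded = [{ :title => "First page" }, { :title => nil }]
recorded.each { |item| expectations.shift.call(item) }
puts "all expectations satisfied"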
