From 4f62f365efce1c18f8e035f491fc66f0c4cebc0e Mon Sep 17 00:00:00 2001 From: Andrew Korzhuev Date: Tue, 25 Dec 2012 15:37:04 +0400 Subject: [PATCH 1/3] Fixing errors on reading files with compound names --- lib/ruby_tika_app.rb | 7 +++---- ...implex11.pdf => graph sampling simplex - 11.pdf} | Bin spec/ruby_tika_app_spec.rb | 7 +++---- 3 files changed, 6 insertions(+), 8 deletions(-) rename spec/docs/{graph_sampling_simplex11.pdf => graph sampling simplex - 11.pdf} (100%) diff --git a/lib/ruby_tika_app.rb b/lib/ruby_tika_app.rb index 0ecea02..01aa5c5 100644 --- a/lib/ruby_tika_app.rb +++ b/lib/ruby_tika_app.rb @@ -16,14 +16,13 @@ def initialize status end def initialize(document) - - @document = document + @document = "file://#{document}" java_cmd = 'java' java_args = '-server -Djava.awt.headless=true' tika_path = "#{File.join(File.dirname(__FILE__))}/../ext/tika-app-1.2.jar" - @tika_cmd = "#{java_cmd} #{java_args} -jar #{tika_path}" + @tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}'" end def to_xml @@ -54,7 +53,7 @@ def to_metadata def run_tika(option) - final_cmd = "#{@tika_cmd} #{option} #{@document}" + final_cmd = "#{@tika_cmd} #{option} '#{@document}'" result = [] diff --git a/spec/docs/graph_sampling_simplex11.pdf b/spec/docs/graph sampling simplex - 11.pdf similarity index 100% rename from spec/docs/graph_sampling_simplex11.pdf rename to spec/docs/graph sampling simplex - 11.pdf diff --git a/spec/ruby_tika_app_spec.rb b/spec/ruby_tika_app_spec.rb index f05e965..f252c7c 100644 --- a/spec/ruby_tika_app_spec.rb +++ b/spec/ruby_tika_app_spec.rb @@ -3,7 +3,7 @@ describe RubyTikaApp do before(:each) do - @test_file = "#{File.join(File.dirname(__FILE__))}/docs/graph_sampling_simplex11.pdf" + @test_file = "#{File.join(File.dirname(__FILE__))}/docs/graph sampling simplex - 11.pdf" end describe "#to_xml" do @@ -18,7 +18,7 @@ xml_size = xml.size / 2 - xml[xml_size..(xml_size + 100)].should == "(Section IV). Besides,\nMHRW performs better in well connected graphs than in\nloosely connected graphs" + xml[xml_size..(xml_size + 100)].should == "sides,\nMHRW performs better in well connected graphs than in\nloosely connected graphs, as it was orig" end end @@ -30,7 +30,7 @@ it "middle" do rta = RubyTikaApp.new(@test_file) - rta.to_html[1000 ... 1100].should == "rceName\" content=\"graph_sampling_simplex11.pdf\"/>\n\n Date: Wed, 26 Dec 2012 20:52:36 +0400 Subject: [PATCH 2/3] Do not treat 'INFO - ' in stderr output as error --- lib/ruby_tika_app.rb | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/ruby_tika_app.rb b/lib/ruby_tika_app.rb index 01aa5c5..7fb8331 100644 --- a/lib/ruby_tika_app.rb +++ b/lib/ruby_tika_app.rb @@ -52,7 +52,6 @@ def to_metadata private def run_tika(option) - final_cmd = "#{@tika_cmd} #{option} '#{@document}'" result = [] @@ -62,7 +61,7 @@ def run_tika(option) stdout_result = stdout.read.strip stderr_result = stderr.read.strip - unless stderr_result.strip == "" then + unless strip_stderr(stderr_result).empty? raise(CommandFailedError.new(stderr_result), "execution failed with status #{stderr_result}: #{final_cmd}") end @@ -74,4 +73,8 @@ def run_tika(option) stderr.close end + def strip_stderr(s) + s.gsub(/^info - .*$/i, '').strip + end + end From fad8c86cfdeabd2673b790d8eb0db93f0cf81a7f Mon Sep 17 00:00:00 2001 From: Andrew Korzhuev Date: Fri, 4 Jan 2013 16:59:27 +0400 Subject: [PATCH 3/3] Ignore warnings along with infos --- lib/ruby_tika_app.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ruby_tika_app.rb b/lib/ruby_tika_app.rb index 7fb8331..fae69f7 100644 --- a/lib/ruby_tika_app.rb +++ b/lib/ruby_tika_app.rb @@ -74,7 +74,7 @@ def run_tika(option) end def strip_stderr(s) - s.gsub(/^info - .*$/i, '').strip + s.gsub(/^(info|warn) - .*$/i, '').strip end end