Permalink
Browse files

first commit

  • Loading branch information...
mrcsparker committed Nov 30, 2011
0 parents commit e474b00ab8cb75d37e7363d9649c340bd366ca09
@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*
2 .rspec
@@ -0,0 +1,2 @@
+--color
+--format documentation
@@ -0,0 +1,4 @@
# Fetch gems over HTTPS rather than plain HTTP so downloads are encrypted in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in ruby_tika_app.gemspec
gemspec
2 README
@@ -0,0 +1,2 @@
+This is a simple Ruby frontend to the command-line application
+(the tika-app jar) of the Java Tika parser.
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
Binary file not shown.
@@ -0,0 +1,80 @@
+# Based on the rake remote task code
+
+require 'rubygems'
+require 'stringio'
+require 'open4'
+
# Thin wrapper that runs the bundled tika-app jar through `java` and returns
# whatever the jar prints on stdout for a single document.
#
#   RubyTikaApp.new("/path/to/file.pdf").to_text
#
class RubyTikaApp

  VERSION = "0.1"

  # Base class for every error raised by this library.
  class Error < RuntimeError; end

  # Raised when the Tika process writes anything to stderr.
  class CommandFailedError < Error
    # The (stripped) stderr output of the failed run.
    attr_reader :status

    # @param status [String] stderr output captured from the process
    def initialize(status)
      super()
      @status = status
    end
  end

  # @param document [#to_s] path of the document handed to Tika
  def initialize(document)
    @document = document

    tika_path = File.expand_path('../ext/tika-app-0.10.jar', File.dirname(__FILE__))

    # Kept as an argument vector (not one shell string) so that nothing in the
    # command line is ever interpreted by a shell.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_path]
  end

  # @return [String] stdout of the jar run with `--xml`
  def to_xml
    run_tika('--xml')
  end

  # @return [String] stdout of the jar run with `--html`
  def to_html
    run_tika('--html')
  end

  # @return [String] stdout of the jar run with `--json`
  def to_json
    run_tika('--json')
  end

  # @return [String] stdout of the jar run with `--text`
  def to_text
    run_tika('--text')
  end

  # @return [String] stdout of the jar run with `--text-main`
  def to_text_main
    run_tika('--text-main')
  end

  # @return [String] stdout of the jar run with `--metadata`
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs the jar with +option+ followed by the document path.
  #
  # @param option [String] command-line flag passed to the jar
  # @return [String] stripped stdout of the process
  # @raise [CommandFailedError] when the process wrote anything to stderr
  def run_tika(option)
    argv = @tika_cmd + [option, @document.to_s]
    stdout_result = nil
    stderr_result = nil

    # Passing separate arguments means the document path reaches the process
    # verbatim (spaces and shell metacharacters included). The block form
    # closes the pipes and waits for the child, so no file descriptors or
    # zombie processes are left behind even when an exception is raised.
    Open4.popen4(*argv) do |_pid, stdin, stdout, stderr|
      stdin.close # nothing is ever written to the child

      # Drain stderr concurrently: reading stdout to EOF first can deadlock
      # if the child blocks on a full stderr pipe.
      stderr_reader = Thread.new { stderr.read }
      stdout_result = stdout.read.strip
      stderr_result = stderr_reader.value.strip
    end

    unless stderr_result.empty?
      raise(CommandFailedError.new(stderr_result),
            "execution failed with status #{stderr_result}: #{argv.join(' ')}")
    end

    stdout_result
  end

end
@@ -0,0 +1,3 @@
# RubyTikaApp is declared as a class in lib/ruby_tika_app.rb; reopening it as a
# module raises TypeError when both files are loaded, so it is reopened as a
# class here as well.
class RubyTikaApp
  # Only define VERSION when it is not already set, so loading this file after
  # the main library neither warns nor overrides the version.
  VERSION = "0.0.1" unless const_defined?(:VERSION, false)
end
@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "ruby_tika_app"
+
# Packaging metadata for the ruby_tika_app gem.
Gem::Specification.new do |spec|
  # Lists files tracked by git, optionally restricted by extra arguments.
  tracked = lambda { |scope = nil| `git ls-files #{scope}`.split("\n") }

  # Identity
  spec.name    = "ruby_tika_app"
  spec.version = RubyTikaApp::VERSION

  # Author and project information
  spec.authors     = ["Chris Parker"]
  spec.email       = ["mrcsparker@gmail.com"]
  spec.homepage    = "http://github.com"
  spec.summary     = %q{Wrapper around the tika-app jar}
  spec.description = %q{Wrapper around the tika-app jar}

  spec.rubyforge_project = "ruby_tika_app"

  # Package contents, all taken from what git tracks
  spec.files         = tracked.call
  spec.test_files    = tracked.call("-- {test,spec,features}/*")
  spec.executables   = tracked.call("-- bin/*").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Runtime dependency
  spec.add_dependency("open4")

  # Development-only dependencies
  spec.add_development_dependency("rspec", "~> 2.7.0")
  spec.add_development_dependency("bundler", ">= 1.0.15")
end
Binary file not shown.
@@ -0,0 +1,53 @@
+require 'spec_helper'
+
# Specs run RubyTikaApp against the sample PDF in spec/docs and compare fixed
# slices of the output with known text.
describe RubyTikaApp do

  # Path of the PDF fixture, resolved relative to this spec file.
  let(:test_file) { "#{File.dirname(__FILE__)}/docs/graph_sampling_simplex11.pdf" }

  # A fresh wrapper instance for every example.
  let(:rta) { RubyTikaApp.new(test_file) }

  describe "#to_xml" do
    it "header" do
      xml_declaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
      rta.to_xml[0..37].should == xml_declaration
    end

    it "middle" do
      xml = rta.to_xml
      midpoint = xml.size / 2
      excerpt = xml[midpoint..(midpoint + 100)]

      excerpt.should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      opening_tag = "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
      rta.to_html[0..42].should == opening_tag
    end

    it "middle" do
      excerpt = rta.to_html[1000 ... 1100]
      excerpt.should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  # The remaining output formats have no examples yet.
  describe "#to_json" do
  end

  describe "#to_text" do
  end

  describe "#to_text_main" do
  end

  describe "#to_metadata" do
  end

end
@@ -0,0 +1,9 @@
+require 'rubygems'
+require 'bundler/setup'
+
+require 'ruby_tika_app'
+require 'rspec'
+
# No custom RSpec settings are applied; the configuration block is intentionally empty.
RSpec.configure { |_config| }

0 comments on commit e474b00

Please sign in to comment.