Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

first commit

  • Loading branch information...
commit e474b00ab8cb75d37e7363d9649c340bd366ca09 0 parents
Chris Parker authored November 29, 2011
4  .gitignore
... ...
@@ -0,0 +1,4 @@
  1
+*.gem
  2
+.bundle
  3
+Gemfile.lock
  4
+pkg/*
2  .rspec
... ...
@@ -0,0 +1,2 @@
  1
+--color
  2
+--format documentation
4  Gemfile
... ...
@@ -0,0 +1,4 @@
  1
# Fetch gems over TLS so they cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in ruby_tika_app.gemspec
gemspec
2  README
... ...
@@ -0,0 +1,2 @@
  1
+This is a simple Ruby front end to the Apache Tika parser's
  2
+command-line application (the tika-app jar).
1  Rakefile
... ...
@@ -0,0 +1 @@
  1
+require "bundler/gem_tasks"
BIN  ext/tika-app-0.10.jar
Binary file not shown
80  lib/ruby_tika_app.rb
... ...
@@ -0,0 +1,80 @@
  1
+# Based on the rake remote task code
  2
+
  3
+require 'rubygems'
  4
+require 'stringio'
  5
+require 'open4'
  6
+
  7
require 'open3' # stdlib; RubyTikaApp#run_tika uses it to spawn the JVM

# Thin wrapper around the bundled tika-app jar.
#
# Each +to_*+ method runs the jar once against the document given to the
# constructor and returns whatever Tika wrote to stdout, stripped of
# leading/trailing whitespace.
class RubyTikaApp

  VERSION = "0.1"

  # Base class for every error raised by this library.
  class Error < RuntimeError; end

  # Raised when the Tika process reports a failure.
  class CommandFailedError < Error
    # What the process reported: its stripped stderr text, or a description
    # of the exit status when it failed without writing to stderr.
    attr_reader :status

    def initialize(status)
      @status = status
    end
  end

  # @param document [#to_s] path (or other argument Tika accepts) of the
  #   document to parse. It is handed to the JVM as a single argument, so
  #   spaces and shell metacharacters are taken literally and no shell
  #   expansion (globs, +~+) is performed.
  def initialize(document)
    # to_s keeps non-String arguments working the way string interpolation did.
    @document = document.to_s

    tika_jar = File.join(File.dirname(__FILE__), '..', 'ext', 'tika-app-0.10.jar')

    # Kept as an argv array so the command never passes through a shell.
    @tika_cmd = ['java', '-server', '-Djava.awt.headless=true', '-jar', tika_jar]
  end

  # @return [String] output of Tika run with --xml
  def to_xml
    run_tika('--xml')
  end

  # @return [String] output of Tika run with --html
  def to_html
    run_tika('--html')
  end

  # @return [String] output of Tika run with --json
  def to_json
    run_tika('--json')
  end

  # @return [String] output of Tika run with --text
  def to_text
    run_tika('--text')
  end

  # @return [String] output of Tika run with --text-main
  def to_text_main
    run_tika('--text-main')
  end

  # @return [String] output of Tika run with --metadata
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs Tika with +option+ against the document and returns its stdout.
  #
  # Open3.capture3 drains stdout and stderr concurrently and waits for the
  # child, so a chatty stderr cannot dead-lock the read and no zombie process
  # is left behind.
  #
  # @param option [String] Tika command-line switch, e.g. "--text"
  # @return [String] stripped stdout of the process
  # @raise [CommandFailedError] if the process wrote anything to stderr or
  #   did not exit successfully
  # @raise [SystemCallError] if the java executable cannot be started
  def run_tika(option)
    command = @tika_cmd + [option, @document]

    out, err, exit_status = Open3.capture3(*command)
    out = out.strip
    err = err.strip

    return out if err.empty? && exit_status.success?

    # Prefer the process's own words; fall back to the exit status when it
    # died silently (e.g. killed by a signal).
    detail = err.empty? ? exit_status.to_s : err
    raise CommandFailedError.new(detail),
          "execution failed with status #{detail}: #{command.join(' ')}"
  end

end
3  lib/ruby_tika_app/version.rb
... ...
@@ -0,0 +1,3 @@
  1
# Version constant for the ruby_tika_app gem.
#
# RubyTikaApp is declared as a class in lib/ruby_tika_app.rb, so it must be
# reopened as a class here; declaring it as a module raises TypeError as soon
# as both files are loaded. The guard avoids an "already initialized constant"
# warning when the library has already set VERSION.
class RubyTikaApp
  VERSION = "0.0.1" unless const_defined?(:VERSION, false)
end
26  ruby_tika_app.gemspec
... ...
@@ -0,0 +1,26 @@
  1
+# -*- encoding: utf-8 -*-
  2
# Put lib/ on the load path so the gem version can be read from the library.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "ruby_tika_app"

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "ruby_tika_app"
  spec.version  = RubyTikaApp::VERSION
  spec.authors  = ["Chris Parker"]
  spec.email    = ["mrcsparker@gmail.com"]
  spec.homepage = "http://github.com"

  # Summary and description share the same one-line text.
  spec.summary     = "Wrapper around the tika-app jar"
  spec.description = "Wrapper around the tika-app jar"

  spec.rubyforge_project = "ruby_tika_app"

  # Package contents are whatever git currently tracks.
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map do |path|
    File.basename(path)
  end
  spec.require_paths = ["lib"]

  # Runtime dependency
  spec.add_dependency "open4"

  # Development-only dependencies
  spec.add_development_dependency "rspec", "~> 2.7.0"
  spec.add_development_dependency "bundler", ">= 1.0.15"
end
BIN  spec/docs/graph_sampling_simplex11.pdf
Binary file not shown
53  spec/ruby_tika_app_spec.rb
... ...
@@ -0,0 +1,53 @@
  1
+require 'spec_helper'
  2
+
  3
describe RubyTikaApp do

  # Sample PDF under spec/docs that every example parses.
  let(:test_file) do
    File.join(File.dirname(__FILE__), "docs/graph_sampling_simplex11.pdf")
  end

  # A fresh wrapper around the sample PDF for each example.
  let(:rta) { RubyTikaApp.new(test_file) }

  describe "#to_xml" do
    it "header" do
      # The first 38 characters are the XML declaration.
      declaration = rta.to_xml[0, 38]
      declaration.should == "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
    end

    it "middle" do
      xml = rta.to_xml
      midpoint = xml.size / 2

      # 101 characters starting at the midpoint of the output.
      excerpt = xml[midpoint, 101]
      excerpt.should == "HRW considers all the duplicated nodes as valid nodes.\nThese duplicated nodes make the node distribut"
    end
  end

  describe "#to_html" do
    it "header" do
      # The first 43 characters are the opening <html> tag.
      opening_tag = rta.to_html[0, 43]
      opening_tag.should == "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
    end

    it "middle" do
      # Characters 1000 up to (not including) 1100.
      excerpt = rta.to_html[1000, 100]
      excerpt.should == "ersity of Goettingen, Germany\n3 Department of Computer Science, U.C. Santa Barbara, USA\n4 Deutsche T"
    end
  end

  # No examples have been written yet for the remaining converters.
  describe "#to_json" do
  end

  describe "#to_text" do
  end

  describe "#to_text_main" do
  end

  describe "#to_metadata" do
  end

end
9  spec/spec_helper.rb
... ...
@@ -0,0 +1,9 @@
  1
+require 'rubygems'
  2
+require 'bundler/setup'
  3
+
  4
+require 'ruby_tika_app'
  5
+require 'rspec'
  6
+
  7
+RSpec.configure do |config|
  8
+
  9
+end

0 notes on commit e474b00

Please sign in to comment.
Something went wrong with that request. Please try again.